专利数据定时获取简单实现
爬取内容
专利基本信息、摘要、专利说明书等。
技术架构
爬虫框架:Scrapy
URL队列:Redis
数据库:PostgreSQL(patent_detail表、patent_task表)
运行方式
手动或定时将patent_task表中的关键词构造成专利URL,并加入Redis队列中
定时启动爬虫(支持分布式),从Redis队列中获取待爬取URL,自动获取指定关键词最新/相关度最高/专利评级最高的专利信息,写入数据库。
数据成果示例
- 北京大学的专利
- 其他
反爬处理
- 随机User-Agent(浏览器)
- 随机访问频率
- 对于专利摘要爬取(无需登录)禁用Cookie
- 代理IP
表结构
- 专利详情表
-- Table: public.patent_detail
-- Stores one row per patent. Rows are keyed by a surrogate id, and both
-- apply_number and public_number are declared unique.
-- DROP TABLE IF EXISTS public.patent_detail;

-- The id column's default reads from this sequence, so it has to exist before
-- the table is created; without it the CREATE TABLE below fails on a fresh
-- database with "relation patent_detail_id_seq does not exist".
CREATE SEQUENCE IF NOT EXISTS public.patent_detail_id_seq;

CREATE TABLE IF NOT EXISTS public.patent_detail
(
    name character varying COLLATE pg_catalog."default" NOT NULL,
    -- Schema-qualified so the default does not depend on the session search_path.
    id bigint NOT NULL DEFAULT nextval('public.patent_detail_id_seq'::regclass),
    apply_number character varying COLLATE pg_catalog."default" NOT NULL,
    -- NOTE(review): apply_date is a timestamptz while public_date is a plain
    -- date; confirm whether the difference is intentional.
    apply_date timestamp with time zone,
    public_number character varying COLLATE pg_catalog."default",
    public_date date,
    apply_person character varying COLLATE pg_catalog."default",
    inventor character varying COLLATE pg_catalog."default",
    agency character varying COLLATE pg_catalog."default",
    ipc character varying COLLATE pg_catalog."default",
    cpc character varying COLLATE pg_catalog."default",
    abstract text COLLATE pg_catalog."default",
    status character varying COLLATE pg_catalog."default",
    type character varying COLLATE pg_catalog."default",
    source character varying COLLATE pg_catalog."default",
    patent_full_text_url character varying COLLATE pg_catalog."default",
    created_at timestamp with time zone NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at timestamp with time zone NOT NULL DEFAULT CURRENT_TIMESTAMP,
    full_text text COLLATE pg_catalog."default",
    CONSTRAINT patent_detail_pk PRIMARY KEY (id),
    CONSTRAINT patent_detail_unique_apply_number UNIQUE (apply_number),
    -- public_number is nullable; a UNIQUE constraint still allows any number
    -- of rows whose public_number is NULL.
    CONSTRAINT patent_detail_unique_public_number UNIQUE (public_number)
)
TABLESPACE pg_default;

-- Tie the sequence's lifetime to the column so that dropping the table (or the
-- column) also drops the sequence, as a serial column would.
ALTER SEQUENCE public.patent_detail_id_seq
    OWNED BY public.patent_detail.id;
-- Transfer ownership of public.patent_detail to the role user_xxxxx.
-- NOTE(review): user_xxxxx looks like a redacted placeholder; substitute the
-- real role name before running.
ALTER TABLE IF EXISTS public.patent_detail OWNER TO user_xxxxx;
-- Column descriptions for public.patent_detail, stored in the catalog.
-- Every column of the table has exactly one entry below.
COMMENT ON COLUMN public.patent_detail.name IS '专利名称';
COMMENT ON COLUMN public.patent_detail.id IS '自增长ID';
COMMENT ON COLUMN public.patent_detail.apply_number IS '专利申请号';
COMMENT ON COLUMN public.patent_detail.apply_date IS '专利申请日期';
COMMENT ON COLUMN public.patent_detail.public_number IS '专利公开号';
-- public_date was the only column without a description.
COMMENT ON COLUMN public.patent_detail.public_date IS '专利公开日期';
COMMENT ON COLUMN public.patent_detail.apply_person IS '专利申请人';
COMMENT ON COLUMN public.patent_detail.inventor IS '专利发明人';
COMMENT ON COLUMN public.patent_detail.agency IS '代理机构';
-- IPC and CPC are classification schemes, not publication numbers, so these
-- two are described as classification numbers rather than "公开号".
COMMENT ON COLUMN public.patent_detail.ipc IS 'IPC分类号';
COMMENT ON COLUMN public.patent_detail.cpc IS 'CPC分类号';
COMMENT ON COLUMN public.patent_detail.abstract IS '专利摘要';
COMMENT ON COLUMN public.patent_detail.status IS '专利状态';
COMMENT ON COLUMN public.patent_detail.type IS '专利类型';
COMMENT ON COLUMN public.patent_detail.source IS '专利数据源';
COMMENT ON COLUMN public.patent_detail.patent_full_text_url IS '专利全文URL';
COMMENT ON COLUMN public.patent_detail.created_at IS '数据创建时间';
COMMENT ON COLUMN public.patent_detail.updated_at IS '数据更新时间';
COMMENT ON COLUMN public.patent_detail.full_text IS '专利正文';
-- Trigger: updated_at_update
-- Calls public.auto_update() once per row before every UPDATE on
-- public.patent_detail.
-- NOTE(review): public.auto_update() is not defined in this section; presumably
-- it refreshes updated_at. Confirm it exists before running this statement.
-- CREATE OR REPLACE TRIGGER requires PostgreSQL 14 or later.
-- DROP TRIGGER IF EXISTS updated_at_update ON public.patent_detail;
CREATE OR REPLACE TRIGGER updated_at_update
    BEFORE UPDATE ON public.patent_detail
    FOR EACH ROW
    EXECUTE FUNCTION public.auto_update();