import logging

import pymysql
import scrapy
from scrapy.utils.project import get_project_settings

from demo1.custom_settings_conf import custom_settings_conf_fagaiwei
from demo1.items import Shouyelianjie
from demo1.Util import Util_WANG


class fagaiweiSpider(scrapy.Spider, Util_WANG):
    """Spider for NDRC (发改委) policy notices at https://www.ndrc.gov.cn/xxgk/zcfb/tz/."""

    name = 'fagaiweiSpider'
    settings = get_project_settings()
    allowed_domains = ['ndrc.gov.cn']
    custom_settings = custom_settings_conf_fagaiwei
    start_urls = ['https://www.ndrc.gov.cn/xxgk/zcfb/tz/index.html']

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.db = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            database=self.settings['MYSQL_DATABASE'],
            user=self.settings['MYSQL_USER'],
            password=self.settings['MYSQL_PASSWORD'],
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        self.cursor = self.db.cursor()

    def parse(self, response):
        # Each notice is an <li> in the index list; skip the "empty" spacer items.
        lis = response.xpath('//*[@class="list"]/ul/li[not(@class="empty")]')
        for li in lis:
            item = Shouyelianjie()
            item['biaoti'] = li.xpath('./a/@title').extract_first()  # title
            item['shijian'] = li.xpath('.//span/text()').extract_first().replace('/', '-')  # date
            # Hrefs are relative ("./202004/t..."): strip the leading dot and
            # prepend the section root.
            item['lianjie'] = ('https://www.ndrc.gov.cn/xxgk/zcfb/tz'
                               + li.xpath('.//a/@href').extract_first().strip('.'))
            item['laiyuan'] = '发改委'  # source: NDRC
            if not self.settings.get("ISQUANPA"):
                # Incremental mode: skip URLs already stored in t_policy.
                # Use a parameterized query to avoid SQL injection.
                self.cursor.execute(
                    'SELECT count(title_url) AS nums FROM t_policy WHERE title_url = %s',
                    (item['lianjie'],))
                res = self.cursor.fetchall()[0].get('nums')
                if res == 0:
                    yield scrapy.Request(url=item['lianjie'], meta={'item': item},
                                         callback=self.page_url)
                else:
                    logging.info('This link has already been crawled: ' + item['lianjie'])
            else:
                yield scrapy.Request(url=item['lianjie'], meta={'item': item},
                                     callback=self.page_url)

        if self.settings.get("ISQUANPA"):
            # Full-crawl mode: follow pagination. The pager numbers are embedded
            # in an inline script; the first number is the total page count and
            # the last is the current (zero-based) page index.
            try:
                next_page = response.xpath('//*[@class="page"]//script').re(
                    r'\d{1,2}.*?,.*?\d{1,2}')[0].split(',')
                count_page = int(next_page[0].strip())
                curry_page = int(next_page[-1].strip()) + 1
                if curry_page < count_page:
                    urls = ('https://www.ndrc.gov.cn/xxgk/zcfb/tz/index_'
                            + str(curry_page) + '.html')
                    yield scrapy.Request(url=urls, callback=self.parse)
            except Exception as e:
                logging.error(e)
                logging.info('Stopped by exception: assuming the crawl is finished')
        else:
            logging.info('Crawl finished')

        # For testing a single detail page directly:
        # urls = 'https://www.ndrc.gov.cn/xxgk/zcfb/tz/202004/t20200414_1225669.html'
        # yield scrapy.Request(url=urls, callback=self.page_url, meta={'item': Shouyelianjie()})

    def page_url(self, response):
        item = response.meta['item']
        # Everything under .article_l except the "shezhi" toolbar is article body.
        txts = response.xpath('//*[@class="article_l"]/*[not(contains(@class,"shezhi"))]')
        item['xiangqing'] = ''.join(txt.extract() for txt in txts)
        # Util_WANG hooks: rewrite <a> and <img> targets inside the body.
        self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
        self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
        yield item

    def a_fun(self, href):
        pass

    def img_fun(self, src):
        pass
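
# ---------------------------------------------------------------------------
# For reference, a minimal sketch of the Shouyelianjie item imported from
# demo1.items, inferred from the fields this spider populates (the real
# definition may contain more fields):
#
#     class Shouyelianjie(scrapy.Item):
#         biaoti    = scrapy.Field()  # title
#         shijian   = scrapy.Field()  # publication date
#         lianjie   = scrapy.Field()  # detail-page URL
#         laiyuan   = scrapy.Field()  # source
#         xiangqing = scrapy.Field()  # article body HTML
#
# Hedged usage sketch (not part of the original project): the spider is
# normally launched with the Scrapy CLI from the project root, e.g.
#     scrapy crawl fagaiweiSpider
# The programmatic entry point below uses Scrapy's CrawlerProcess API with
# the project settings loaded above; it assumes the demo1 project modules
# (custom_settings_conf, items, Util) are importable.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(get_project_settings())
    process.crawl(fagaiweiSpider)
    process.start()  # blocks until the crawl finishes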