import datetime
import logging

import pymysql
import scrapy
from scrapy.utils.project import get_project_settings

from demo1.custom_settings_conf import *  # provides custom_settings_conf_sxgongxintingSpider
from demo1.items import Shouyelianjie
from demo1.Util import Util_WANG


# Shanxi Provincial Department of Industry and Information Technology (山西省工业和信息厅)
class sxsshangwutingSpider(scrapy.Spider, Util_WANG):
    name = 'sxgongxintingSpider'
    settings = get_project_settings()
    allowed_domains = ['gxt.shanxi.gov.cn']
    custom_settings = custom_settings_conf_sxgongxintingSpider
    start_urls = ['http://gxt.shanxi.gov.cn/web/cateList.html?id=26&word=&pageIndex=1']

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.db = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            database=self.settings['MYSQL_DATABASE'],
            user=self.settings['MYSQL_USER'],
            password=self.settings['MYSQL_PASSWORD'],
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True
        )
        self.cursor = self.db.cursor()

    def parse(self, response):
        # Each article is an <li> holding a link plus a timestamp in an <i> tag;
        # <li> elements carrying a style attribute are skipped.
        lis = response.xpath('//*[@class="zwgk-ul"]/li[not(@style)]')
        for li in lis:
            item = Shouyelianjie()
            item['laiyuan'] = '山西省工业和信息厅'
            item['lianjie'] = response.urljoin(li.xpath('./a/@href').extract_first())
            shijian = li.xpath('./i/text()').extract_first()
            item['shijian'] = datetime.datetime.strptime(
                shijian, '%Y-%m-%d %H:%M:%S').strftime('%Y-%m-%d')
            item['biaoti'] = li.xpath('./a/text()').extract_first()
            # For testing a single detail page:
            # item['lianjie'] = 'http://gxt.shanxi.gov.cn/web/content.html?id=1511'
            if not self.settings.get("ISQUANPA"):
                # Incremental crawl: skip links already stored in t_policy.
                # The parameterized query avoids SQL injection via the URL.
                self.cursor.execute(
                    'SELECT COUNT(title_url) AS nums FROM t_policy WHERE title_url = %s',
                    (item['lianjie'],))
                res = self.cursor.fetchone().get('nums')
                if res == 0:
                    if Util_WANG.pos_url(item, self.settings):
                        yield item
                    else:
                        yield scrapy.Request(url=item['lianjie'], meta={'item': item},
                                             callback=self.page_url)
                else:
                    logging.info('Link already crawled, skipping: ' + item['lianjie'])
            else:
                # Full crawl (ISQUANPA): process every link regardless of history.
                if Util_WANG.pos_url(item, self.settings):
                    yield item
                else:
                    yield scrapy.Request(url=item['lianjie'], meta={'item': item},
                                         callback=self.page_url)
        if self.settings.get("ISQUANPA"):
            try:
                # "下一页" is the site's "next page" link text; when no such link
                # exists, urljoin falls back to the current URL and the
                # inequality check below stops pagination.
                next_page = response.urljoin(
                    response.xpath('//a[contains(text(),"下一页")]/@href').extract_first())
                if next_page != response.url:
                    yield scrapy.Request(url=next_page, callback=self.parse)
            except Exception as e:
                logging.exception(e)
                logging.info('Pagination stopped by exception; crawl finished')
        else:
            logging.info('Crawl finished')

    def page_url(self, response):
        item = response.meta['item']
        item['xiangqing'] = response.xpath('//*[contains(@class,"textbody")]').extract_first()
        # Append the attachment block, if the page has one, to the detail HTML.
        wenjiande = response.xpath('//*[@class="attachment"]').extract_first()
        if wenjiande is not None:
            item['xiangqing'] += wenjiande
        # Rewrite <a href> and <img src> targets via the Util_WANG mixin helpers.
        self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
        self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
        yield item

    def a_fun(self, href):
        pass

    def img_fun(self, src):
        pass
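
    # A minimal cleanup sketch, assuming no other component still uses self.db
    # at shutdown: Scrapy calls closed(reason) when a spider finishes, so the
    # MySQL connection opened in __init__ can be released here.
    def closed(self, reason):
        self.cursor.close()
        self.db.close()


# Usage sketch, assuming this file lives in the demo1 Scrapy project:
#   scrapy crawl sxgongxintingSpider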