# NOTE: import order is kept exactly as in the original file because the
# star import below may export names that later imports are meant to override.
import scrapy
from uuid import uuid4
import re
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG


# Shanxi Provincial Bureau for the Promotion of Small and Medium Enterprise
# Development
class cujinjuSpider(scrapy.Spider, Util_WANG):
    """Crawl the notice list on xqyj.shanxi.gov.cn and emit one item per notice.

    ``parse`` walks the ``page_list`` entries of the index page and schedules a
    detail request for each one; ``page_item`` fills in the detail HTML and
    yields the finished item.

    The ``ISQUANPA`` setting switches between two modes:

    * truthy  -- every listed link is requested and pagination is followed;
    * falsy   -- only the current page is processed and links whose URL is
      already present in ``t_policy.title_url`` are skipped.
    """

    name = 'cujinjuSpider'
    settings = get_project_settings()
    allowed_domains = ['xqyj.shanxi.gov.cn']
    custom_settings = custom_settings_conf_cujinjuSpider
    start_urls = ['http://xqyj.shanxi.gov.cn/v2/html/tzgg/index.html']

    def __init__(self, name=None, **kwargs):
        """Initialise the spider and open the MySQL connection used for dedup.

        Args:
            name: Optional spider name override, forwarded to ``scrapy.Spider``.
            **kwargs: Spider arguments, forwarded to ``scrapy.Spider``.
        """
        # The base initialiser validates ``name`` and copies spider arguments
        # onto the instance; it was previously skipped entirely.
        super().__init__(name=name, **kwargs)
        self.db = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            database=self.settings['MYSQL_DATABASE'],
            user=self.settings['MYSQL_USER'],
            password=self.settings['MYSQL_PASSWORD'],
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        self.cursor = self.db.cursor()

    def closed(self, reason):
        """Release the MySQL cursor and connection when the spider finishes.

        Args:
            reason: Close reason string supplied by Scrapy (unused).
        """
        for resource in (getattr(self, 'cursor', None), getattr(self, 'db', None)):
            if resource is None:
                continue
            try:
                resource.close()
            except pymysql.Error:
                # Best-effort cleanup: an already-closed handle must not turn
                # spider shutdown into a failure.
                logging.exception('failed to close MySQL resource')

    def _already_crawled(self, url):
        """Return True when ``url`` is already stored in ``t_policy.title_url``."""
        # Parameterized query: the URL comes from scraped HTML and must never
        # be spliced into the SQL text.
        self.cursor.execute(
            'select count(title_url) as nums FROM t_policy where title_url = %s',
            (url,),
        )
        return self.cursor.fetchall()[0].get('nums') != 0

    def parse(self, response):
        """Yield a detail request per list entry and, in full-crawl mode, the next page.

        Args:
            response: Response for one index page of the notice list.

        Yields:
            ``scrapy.Request`` objects for detail pages (callback
            ``page_item``) and, when ``ISQUANPA`` is truthy, for the next
            index page (callback ``parse``).
        """
        full_crawl = self.settings.get("ISQUANPA")

        for li in response.xpath('//*[@class="page_list"]//li'):
            href = li.xpath('./a/@href').extract_first()
            if not href:
                # Without an href, urljoin() would return the index URL itself
                # and the index page would be fed to page_item.
                continue

            item = Shouyelianjie()
            item['biaoti'] = li.xpath('./a/@title').extract_first()
            item['lianjie'] = response.urljoin(href)
            item['laiyuan'] = '山西省小企业促进局'
            # NOTE(review): the original selector was './sapn/text()', which
            # looks like a typo for 'span'. Both spellings are matched so the
            # date is found either way -- confirm against the live markup.
            item['shijian'] = li.xpath('./span/text() | ./sapn/text()').extract_first()

            if not full_crawl and self._already_crawled(item['lianjie']):
                logging.info('这个链接已经爬过了-----:%s', item['lianjie'])
                continue

            yield scrapy.Request(
                url=item['lianjie'],
                meta={'item': item},
                callback=self.page_item,
            )

        if full_crawl:
            # The next-page anchor carries its target inside an onclick
            # handler; take the quoted 'index...' fragment from it.
            next_page = response.xpath('//a[@class="next-page"]/@onclick').re(r"'index.*")
            if next_page:
                yield scrapy.Request(
                    url='http://xqyj.shanxi.gov.cn/v2/html/tzgg/' + next_page[0].strip("'"),
                    callback=self.parse,
                )
            else:
                # No next-page link means the last index page was reached.
                logging.info('全部爬取完毕')

    def page_item(self, response):
        """Fill in the detail HTML for an item and yield it.

        Args:
            response: Detail-page response; ``response.meta['item']`` holds the
                item created in ``parse``.

        Yields:
            The item with ``xiangqing`` set. Nothing is yielded when the page
            has no ``doc_content`` element.
        """
        item = response.meta['item']
        content = response.xpath('//*[@class="doc_content"]').extract_first()
        if content is None:
            # Previously this raised AttributeError on None.replace(...).
            logging.warning('no doc_content element found, skipping: %s', response.url)
            return

        # Rewrite the internal host that appears in the page markup to the
        # public domain.
        item['xiangqing'] = content.replace('192.168.143.1', 'xqyj.shanxi.gov.cn')

        # Helpers are not defined in this file (presumably provided by
        # Util_WANG); they receive the item, a settings value and the response.
        self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
        self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)

        # Disabled attachment de-duplication, kept for reference:
        # if item.get('wenjian') is not None:
        #     for wenjians in item['wenjian'][:]:
        #         if '_' in wenjians['file_name']:
        #             self.cursor.execute(
        #                 'select count(file_url) as nums FROM t_policy_file_crawl where file_url ="{}"'.format(wenjians['file_url']))
        #             res = self.cursor.fetchall()[0].get('nums')
        #             if res != 0:
        #                 item['file_name'].remove(wenjians)
        # logging.info(item)
        yield item

    def a_fun(self, href):
        """Stub that only prints an empty line.

        NOTE(review): presumably a hook called by the Util_WANG link helpers --
        confirm against that class.
        """
        print()

    def img_fun(self, src):
        """Stub that only prints an empty line.

        NOTE(review): presumably a hook called by the Util_WANG image helpers --
        confirm against that class.
        """
        print()