import scrapy
from uuid import uuid4
import re
import datetime
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG


# Shanxi Provincial Department of Commerce (山西省商务厅)
class sxsshangwutingSpider(scrapy.Spider, Util_WANG):
    name = 'sxsshangwutingSpider'
    settings = get_project_settings()
    allowed_domains = ['swt.shanxi.gov.cn']
    custom_settings = custom_settings_conf_sxShangwutingSpider
    start_urls = ['http://swt.shanxi.gov.cn/Main/list.action?channelId=27']

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.db = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            database=self.settings['MYSQL_DATABASE'],
            user=self.settings['MYSQL_USER'],
            password=self.settings['MYSQL_PASSWORD'],
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True
        )
        self.cursor = self.db.cursor()

    def parse(self, response):
        # Paging parameters are read from the inline script that configures
        # listTable.filter on the list page.
        pages = response.xpath('//*[@class="pgTotalPage"]/text()').extract_first()
        script = response.xpath(
            '//script[contains(text(),"listTable.filter.channelId")]/text()').extract_first().lower()
        size = re.search(r'pagesize.*?=.*?\d+', script).group().replace('pagesize', '').replace('=', '').strip()
        pageCount = re.search(r'pagecount.*?=.*?\d+', script).group().replace('pagecount', '').replace('=', '').strip()
        lis = response.xpath('//*[@class="t_text"]//li')
        for li in lis:
            item = Shouyelianjie()
            item['lianjie'] = response.urljoin(li.xpath('.//a/@href').extract_first())
            item['laiyuan'] = '山西省商务厅'
            item['biaoti'] = li.xpath('.//a/@title').extract_first()
            item['shijian'] = li.xpath('.//span/text()').extract_first()
            # For testing a single article link:
            # item['lianjie'] = 'http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=369d459b-a799-4e8a-87b7-8cd6c5cfc371'
            if not self.settings.get("ISQUANPA"):
                # Incremental mode: skip links that are already in the database.
                self.cursor.execute(
                    'select count(title_url) as nums FROM t_policy where title_url = %s', (item['lianjie'],))
                res = self.cursor.fetchall()[0].get('nums')
                if res == 0:
                    if Util_WANG.pos_url(item, self.settings):
                        yield item
                    else:
                        yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
                else:
                    logging.info('This link has already been crawled-----:' + item['lianjie'])
            else:
                if Util_WANG.pos_url(item, self.settings):
                    yield item
                else:
                    yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
        if self.settings.get("ISQUANPA"):
            # Full-crawl mode: request the remaining pages through the ajax list endpoint.
            try:
                pageCount = int(pageCount)
                for page_next in range(2, pageCount + 1):
                    url_next = ('http://swt.shanxi.gov.cn/Main/list.action?ajax=true&pageCount=' + str(pageCount)
                                + '&pageSize=' + size + '&page=' + str(page_next) + '&channelId=27')
                    yield scrapy.Request(url=url_next, callback=self.page_next_url)
            except Exception as e:
                logging.error(e)
                logging.info('Finished crawling because of an exception')
        else:
            logging.info('Finished crawling')

    def page_next_url(self, response):
        # The ajax list endpoint returns JSON whose 'content' field is an HTML fragment.
        context_json = json.loads(response.text)
        context = context_json['content']
        context_html = etree.HTML(context)
        lis = context_html.xpath('//ul/li')
        for li in lis:
            item = Shouyelianjie()
            item['lianjie'] = response.urljoin(li.xpath('.//a/@href')[0])
            item['laiyuan'] = '山西省商务厅'
            item['biaoti'] = li.xpath('.//a/@title')[0]
            item['shijian'] = li.xpath('.//span/text()')[0]
            if not self.settings.get("ISQUANPA"):
                self.cursor.execute(
                    'select count(title_url) as nums FROM t_policy where title_url = %s', (item['lianjie'],))
                res = self.cursor.fetchall()[0].get('nums')
                if res == 0:
                    if Util_WANG.pos_url(item, self.settings):
                        yield item
                    else:
                        yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
                else:
                    logging.info('This link has already been crawled-----:' + item['lianjie'])
            else:
                if Util_WANG.pos_url(item, self.settings):
                    yield item
                else:
                    yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)

    def page_url(self, response):
        item = response.meta['item']
        item['xiangqing'] = response.xpath('//div[@id="zoom"]').extract_first()
        self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
        self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
        # This site needs a custom attachment-link rewrite in addition to the generic ones.
        self.dingzhi_tihuan_a(item, self.settings.get('FILE_PATH'), response)
        yield item

    def dingzhi_tihuan_a(self, item, tihuanlujing, response=None):
        # Rewrite attachment links (getFile.action?fileId=...) in the article body to
        # local paths and record the original download URLs in item['wenjian'].
        txt = item['xiangqing']
        year = datetime.datetime.now().strftime('%Y')
        month = datetime.datetime.now().strftime('%m')
        panDuanNone = lambda x: '_' if x is None else x
        html = etree.HTML(txt)
        alis = html.xpath('//a[@href and contains(@href,"getFile.action?fileId")]')
        for alis_single in alis:
            single_a_file = {}
            href = str(alis_single.xpath('@href')[0])
            content = str(panDuanNone(alis_single.xpath('string(.)')))
            if content.strip() == '':
                content = '_'
            single_a_file['file_name'] = content
            # Only this part needs changing per site: resolving the actual download link.
            old_url = href
            if href.lower().startswith('http'):
                single_a_file['file_url'] = old_url
            elif response is not None and (old_url.startswith('./') or old_url.startswith('../')):
                single_a_file['file_url'] = response.urljoin(old_url)
            elif response is not None and old_url.startswith('/'):
                single_a_file['file_url'] = response.urljoin(old_url)
            else:
                # Fallback for bare relative links such as 'getFile.action?fileId=...'.
                single_a_file['file_url'] = response.urljoin(old_url) if response is not None else old_url
            # houzui = single_a_file['file_url'][single_a_file['file_url'].rfind('/') + 1:]
            houzui = single_a_file['file_url'][single_a_file['file_url'].rfind('=') + 1:]
            new_url = '/' + year + '/' + month + '/' + self.short_uuid() + '_' + houzui + '.'
            txt = txt.replace(old_url, tihuanlujing + new_url)
            single_a_file['new_file'] = new_url
            try:
                item['wenjian'].append(single_a_file)
            except KeyError:
                item['wenjian'] = [single_a_file]
        item['xiangqing'] = txt

    def a_fun(self, href):
        pass

    def img_fun(self, src):
        pass

    def return_url(self, size=10, curr_page=1):
        # Build the paginated search URL used by www.miit.gov.cn: the JSON params are
        # double URL-encoded and wrapped in a jsonp callback with a timestamp.
        start_url = 'http://www.miit.gov.cn/gdnps/searchIndex.jsp'
        params = {
            "goPage": curr_page,
            "orderBy": [
                {"orderBy": "publishTime", "reverse": "true"},
                {"orderBy": "orderTime", "reverse": "true"}
            ],
            "pageSize": size,
            "queryParam": [
                {},
                {},
                {"shortName": "fbjg", "value": "/1/29/1146295/1652858/1652930"}
            ]
        }
        d = time.time()
        d_int = int(round(d * 1000))
        jquery = 'jQuery111108461701558527148_' + str(d_int)
        params = json.dumps(params).replace(' ', '').replace('"true"', 'true')
        url = (start_url + "?params=" + parse.quote(parse.quote(params)).replace('/', '%252F')
               + '&callback=' + jquery + '&_=' + str(d_int + 1))
        return url
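

# --- Local-run sketch (not part of the original spider) ---------------------
# A minimal way to exercise this spider outside of `scrapy crawl`, assuming the
# demo1 project settings (including the MYSQL_* values read by __init__) are
# importable from the current working directory. Purely illustrative.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(get_project_settings())
    process.crawl(sxsshangwutingSpider)
    process.start()  # blocks until the crawl finishes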