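# -*- coding: utf-8 -*-
# Spider for the notice/announcement listing (/tztg) of the Ministry of Science and
# Technology (most.gov.cn). With the ISQUANPA setting enabled it performs a full crawl and
# follows the listing's pagination; otherwise it crawls incrementally and skips links that
# are already stored in the t_policy MySQL table.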
import datetime
import logging
import re
from uuid import uuid4

import pymysql
import scrapy
from scrapy.spiders import CrawlSpider
from scrapy.utils.project import get_project_settings

from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
class kexujishubuSpider(CrawlSpider):

    name = 'kexujishubuSpider'
    settings = get_project_settings()
    allowed_domains = ['most.gov.cn']
    custom_settings = custom_settings_conf_kexujishubu
    def __init__(self, name=None, **kwargs):
        # Initialise the CrawlSpider machinery before opening the database connection.
        super(kexujishubuSpider, self).__init__(name=name, **kwargs)
        self.db = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            database=self.settings['MYSQL_DATABASE'],
            user=self.settings['MYSQL_USER'],
            password=self.settings['MYSQL_PASSWORD'],
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True
        )
        self.cursor = self.db.cursor()
        self.settings = get_project_settings()
    def start_requests(self):
        start_url = 'http://www.most.gov.cn/tztg/index.htm'
        yield scrapy.Request(url=start_url, callback=self.parse)

        # Test page:
        # ceshiwenzhang = 'http://www.most.gov.cn/tztg/201901/t20190107_144549.htm'
        # item = Shouyelianjie()
        # item['biaoti'] = '国家遥感中心2018年面向社会公开招聘拟聘用人员公示'
        # item['lianjie'] = ceshiwenzhang
        # item['shijian'] = '2019-01-04'
        # yield scrapy.Request(url=ceshiwenzhang, callback=self.parse_url, meta={'item': item})
    def parse(self, response):
        text = response.text
        # Helper that substitutes '_' for None values (currently unused).
        panDuanNone = lambda x: '_' if x is None else x
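        # The listing page embeds its pagination state in inline JavaScript, which the two
        # regexes below pick up. The exact markup is an assumption based on typical
        # most.gov.cn listing pages, roughly:
        #   var currentPage = 0;
        #   var countPage = 12;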
        currentPage_var = re.search(r'var.*?currentPage.*?=.*?\d+', text).group(0)
        currentPage = int(currentPage_var[currentPage_var.find('=') + 1:].strip())
        countPage_var = re.search(r'var.*?countPage.*?=.*?\d+', text).group(0)
        countPage = int(countPage_var[countPage_var.find('=') + 1:].strip())
        tables = response.xpath('//td[@class="STYLE30"]')
        for table in tables:
            item = Shouyelianjie()
            item['biaoti'] = table.xpath('.//a/text()').extract_first()
            item['lianjie'] = 'http://www.most.gov.cn/tztg' + table.xpath('.//a/@href').extract_first().strip('.')
            item['shijian'] = re.findall(r'(\d{4}-\d{1,2}-\d{1,2})', table.xpath('string(.)').extract_first())[-1]
            if not self.settings.get('ISQUANPA'):
                # Incremental crawl: only request detail pages that are not yet in the database.
                self.cursor.execute('select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
                res = self.cursor.fetchall()[0].get('nums')
                if res == 0:
                    yield scrapy.FormRequest(url=item['lianjie'],
                                             meta={'item': item},
                                             callback=self.parse_url,
                                             method='GET'
                                             )
                else:
                    logging.info('This link has already been crawled-----:' + item['lianjie'])
            else:
                # Full crawl: request every detail page on the listing.
                yield scrapy.FormRequest(url=item['lianjie'],
                                         meta={'item': item},
                                         callback=self.parse_url,
                                         method='GET'
                                         )

        if self.settings.get('ISQUANPA'):
            # Full crawl: follow the next page of the listing.
            if currentPage + 1 < countPage:
                new_url = 'http://www.most.gov.cn/tztg/index_' + str(currentPage + 1) + '.htm'
                yield scrapy.FormRequest(url=new_url,
                                         callback=self.parse
                                         )

        # The incremental crawl does not need pagination: the listing updates slowly, so
        # checking the first page is enough.
    def parse_url(self, response):
        year = datetime.datetime.now().strftime('%Y')
        month = datetime.datetime.now().strftime('%m')
        current_url = response.url
        item = response.meta['item']
        item['laiyuan'] = '科技部'
        # () | (//meta[@name="ContentEnd"]/preceding-sibling::*)
        a1 = response.xpath('//meta[@name="ContentStart"]/following-sibling::*')
        for a_i, a_value in enumerate(a1):
            c = a_value.xpath('.//@name')
            if len(c) > 0 and str(c.extract_first()).lower() == 'contentend':
                # Slice between the ContentStart and ContentEnd markers
                # (not used below; the body text is re-extracted from #Zoom instead).
                b = a_i
                a2 = a1[0:a_i - 1:]
        # Attachment links (.doc, .xls, ...) inside the #Zoom article body.
        a_suoyou = response.xpath('//*[@id="Zoom"]//a[@href and (' + self.jiewei_contains() + ')]')
        als = response.xpath('//*[@id="Zoom"]')[0].re(r'<meta.*name="ContentStart".*[\s\S]*<meta.*name="ContentEnd">')[0]
        als = str(als)
        txt = als[als.find('name="ContentStart"') + len('name="ContentStart">'):als.rfind('<meta')]
        for a_suoyou_i, a_suoyou_value in enumerate(a_suoyou):
            single_a_file = {}
            single_a_file['file_name'] = a_suoyou_value.xpath('string(.)').extract_first()
            old_url = a_suoyou_value.xpath('@href').extract_first()

            single_a_file['file_url'] = current_url[0:current_url.rfind('/')] + old_url.strip('.')
            houzui = single_a_file['file_url'][single_a_file['file_url'].rfind('/') + 1:]
            new_url = '/' + year + '/' + month + '/' + self.short_uuid() + '_' + houzui
            txt = txt.replace(old_url, self.settings.get('FILE_PATH') + new_url)
            single_a_file['new_file'] = new_url
            try:
                item['wenjian'].append(single_a_file)
            except KeyError:
                item['wenjian'] = [single_a_file]
        item['xiangqing'] = txt.strip('\n').strip().replace('\u3000', ' ').replace('\xa0', ' ')
        # context_all = etree.HTML(response.text).xpath('//meta[@name="ContentStart"]/following-sibling::*[name(.)!="table" and name(.)!="meta"]')
        yield item
    def jiewei_contains(self):
        """Build an XPath predicate matching attachment links, e.g.
        'contains(@href,".doc") or contains(@href,".xls") or ... or contains(@href,".zip")'."""
        s = ''
        jiewei = ['.doc', '.xls', '.docx', '.xlsx', '.txt', '.rar', '.zip']
        for j in jiewei:
            s += 'contains(@href,"' + j + '")' + ' or '
        s = s.strip().strip('or').strip()
        return s
    def short_uuid(self):
        """Derive an 8-character id from a UUID4: each group of 4 hex digits is mapped
        onto one of the 62 alphanumeric characters below."""
        uuidChars = ("a", "b", "c", "d", "e", "f",
                     "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s",
                     "t", "u", "v", "w", "x", "y", "z", "0", "1", "2", "3", "4", "5",
                     "6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "I",
                     "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V",
                     "W", "X", "Y", "Z")
        uuid = str(uuid4()).replace('-', '')
        result = ''
        for i in range(0, 8):
            sub = uuid[i * 4: i * 4 + 4]
            x = int(sub, 16)
            result += uuidChars[x % 0x3E]
        return result
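
# A minimal sketch for running the spider outside `scrapy crawl kexujishubuSpider`
# (assumes the demo1 project settings are importable): launch it programmatically
# via Scrapy's CrawlerProcess.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(get_project_settings())
    process.crawl(kexujishubuSpider)
    process.start()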