import datetime
import logging
import re
from uuid import uuid4

import pymysql
import scrapy
from scrapy.spiders import CrawlSpider
from scrapy.utils.project import get_project_settings

from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie


class kexujishubuSpider(CrawlSpider):
    name = 'kexujishubuSpider'
    settings = get_project_settings()
    allowed_domains = ['most.gov.cn']
    custom_settings = custom_settings_conf_kexujishubu

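    # A single MySQL connection is opened at spider start-up; parse() uses it to
    # skip links whose title_url is already stored in the t_policy table.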
    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        self.settings = get_project_settings()
        self.db = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            database=self.settings['MYSQL_DATABASE'],
            user=self.settings['MYSQL_USER'],
            password=self.settings['MYSQL_PASSWORD'],
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True
        )
        self.cursor = self.db.cursor()

    def start_requests(self):
        start_url = 'http://www.most.gov.cn/tztg/index.htm'
        yield scrapy.Request(url=start_url, callback=self.parse)
        # Test article:
        # ceshiwenzhang = 'http://www.most.gov.cn/tztg/201901/t20190107_144549.htm'
        # item = Shouyelianjie()
        # item['biaoti'] = '国家遥感中心2018年面向社会公开招聘拟聘用人员公示'
        # item['lianjie'] = ceshiwenzhang
        # item['shijian'] = '2019-01-04'
        # yield scrapy.Request(url=ceshiwenzhang, callback=self.parse_url, meta={'item': item})

    def parse(self, response):
        text = response.text
        panDuanNone = lambda x: '_' if x is None else x  # placeholder helper (currently unused)
        # The current page index and total page count are exposed as inline JS variables.
        currentPage_var = re.search(r'var.*?currentPage.*?=.*?\d+', text).group(0)
        currentPage = int(currentPage_var[currentPage_var.find('=') + 1:].strip())
        countPage_var = re.search(r'var.*?countPage.*?=.*?\d+', text).group(0)
        countPage = int(countPage_var[countPage_var.find('=') + 1:].strip())
        tables = response.xpath('//td[@class="STYLE30"]')
        for table in tables:
            item = Shouyelianjie()
            item['biaoti'] = table.xpath('.//a/text()').extract_first()
            item['lianjie'] = 'http://www.most.gov.cn/tztg' + table.xpath('.//a/@href').extract_first().strip('.')
            item['shijian'] = re.findall(r'(\d{4}-\d{1,2}-\d{1,2})', table.xpath('string(.)').extract_first())[-1]
            if not self.settings.get('ISQUANPA'):
                # Incremental crawl: only request links that are not yet in the database.
                self.cursor.execute('select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
                res = self.cursor.fetchall()[0].get('nums')
                if res == 0:
                    yield scrapy.FormRequest(url=item['lianjie'],
                                             meta={'item': item},
                                             callback=self.parse_url,
                                             method='GET')
                else:
                    logging.info('Link already crawled-----:' + item['lianjie'])
            else:
                # Full crawl: request every link.
                yield scrapy.FormRequest(url=item['lianjie'],
                                         meta={'item': item},
                                         callback=self.parse_url,
                                         method='GET')
        if self.settings.get('ISQUANPA'):
            # Full crawl: follow the pagination.
            if currentPage + 1 < countPage:
                new_url = 'http://www.most.gov.cn/tztg/index_' + str(currentPage + 1) + '.htm'
                yield scrapy.FormRequest(url=new_url,
                                         callback=self.parse)
        # Incremental crawls skip pagination: the list updates slowly, so checking the first page is enough.

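    # Detail-page callback: extracts the article body between the ContentStart/ContentEnd
    # markers and collects attachment links, rewriting them to local file paths.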
    def parse_url(self, response):
        year = datetime.datetime.now().strftime('%Y')
        month = datetime.datetime.now().strftime('%m')
        current_url = response.url
        item = response.meta['item']
        item['laiyuan'] = '科技部'
        # () | (//meta[@name="ContentEnd"]/preceding-sibling::*)
        a1 = response.xpath('//meta[@name="ContentStart"]/following-sibling::*')
        for a_i, a_value in enumerate(a1):
            c = a_value.xpath('.//@name')
            if len(c) > 0 and str(c.extract_first()).lower() == 'contentend':
                b = a_i
                a2 = a1[0:a_i - 1:]  # nodes before the ContentEnd marker (currently unused)
        # Attachment links inside the article body whose href contains a known file extension.
        a_suoyou = response.xpath('//*[@id="Zoom"]//a[@href and (' + self.jiewei_contains() + ')]')
        # Raw HTML between the ContentStart and ContentEnd markers.
        als = response.xpath('//*[@id="Zoom"]')[0].re(r'<meta.*name="ContentStart".*[\s\S]*<meta.*name="ContentEnd">')[0]
        als = str(als)
        txt = als[als.find('name="ContentStart"') + len('name="ContentStart">'):als.rfind('<meta')]
        for a_suoyou_i, a_suoyou_value in enumerate(a_suoyou):
            single_a_file = {}
            single_a_file['file_name'] = a_suoyou_value.xpath('string(.)').extract_first()
            old_url = a_suoyou_value.xpath('@href').extract_first()
            single_a_file['file_url'] = current_url[0:current_url.rfind('/')] + old_url.strip('.')
            houzui = single_a_file['file_url'][single_a_file['file_url'].rfind('/') + 1:]
            new_url = '/' + year + '/' + month + '/' + self.short_uuid() + '_' + houzui
            # Point the in-body link at the local path the file will be saved under.
            txt = txt.replace(old_url, self.settings.get('FILE_PATH') + new_url)
            single_a_file['new_file'] = new_url
            try:
                item['wenjian'].append(single_a_file)
            except KeyError:
                item['wenjian'] = [single_a_file]
        item['xiangqing'] = txt.strip('\n').strip().replace('\u3000', ' ').replace('\xa0', ' ')
        # context_all = etree.HTML(response.text).xpath('//meta[@name="ContentStart"]/following-sibling::*[name(.)!="table" and name(.)!="meta"]')
        yield item

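    # Builds the XPath predicate used in parse_url() to match attachment links by file extension.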
    def jiewei_contains(self):
        jiewei = ['.doc', '.xls', '.docx', '.xlsx', '.txt', '.rar', '.zip']
        return ' or '.join('contains(@href,"' + j + '")' for j in jiewei)

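    # Derives a short 8-character id from a UUID, used to build unique local file names.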
    def short_uuid(self):
        uuidChars = ("a", "b", "c", "d", "e", "f",
                     "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s",
                     "t", "u", "v", "w", "x", "y", "z", "0", "1", "2", "3", "4", "5",
                     "6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "I",
                     "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V",
                     "W", "X", "Y", "Z")
        uuid = str(uuid4()).replace('-', '')
        result = ''
        for i in range(0, 8):
            # Fold each 4-hex-digit chunk into one character of the 62-symbol alphabet.
            sub = uuid[i * 4: i * 4 + 4]
            x = int(sub, 16)
            result += uuidChars[x % 0x3E]
        return result