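# -*- coding: utf-8 -*-
# Spider for the notice/announcement listing (/tztg) of the Ministry of Science and
# Technology (most.gov.cn). With the ISQUANPA setting enabled it performs a full crawl and
# follows the listing's pagination; otherwise it crawls incrementally and skips links that
# are already stored in the t_policy MySQL table.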
import datetime
import logging
import re
from uuid import uuid4

import pymysql
import scrapy
from scrapy.spiders import CrawlSpider
from scrapy.utils.project import get_project_settings

from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
class kexujishubuSpider(CrawlSpider):

    name = 'kexujishubuSpider'
    settings = get_project_settings()
    allowed_domains = ['most.gov.cn']
    custom_settings = custom_settings_conf_kexujishubu
    def __init__(self, name=None, **kwargs):
        # Initialise the CrawlSpider machinery before opening the database connection.
        super(kexujishubuSpider, self).__init__(name=name, **kwargs)
        self.db = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            database=self.settings['MYSQL_DATABASE'],
            user=self.settings['MYSQL_USER'],
            password=self.settings['MYSQL_PASSWORD'],
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True
        )
        self.cursor = self.db.cursor()
        self.settings = get_project_settings()
    def start_requests(self):
        start_url = 'http://www.most.gov.cn/tztg/index.htm'
        yield scrapy.Request(url=start_url, callback=self.parse)

        # Test page:
        # ceshiwenzhang = 'http://www.most.gov.cn/tztg/201901/t20190107_144549.htm'
        # item = Shouyelianjie()
        # item['biaoti'] = '国家遥感中心2018年面向社会公开招聘拟聘用人员公示'
        # item['lianjie'] = ceshiwenzhang
        # item['shijian'] = '2019-01-04'
        # yield scrapy.Request(url=ceshiwenzhang, callback=self.parse_url, meta={'item': item})
    def parse(self, response):
        text = response.text
        # Helper that substitutes '_' for None values (currently unused).
        panDuanNone = lambda x: '_' if x is None else x
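        # The listing page embeds its pagination state in inline JavaScript, which the two
        # regexes below pick up. The exact markup is an assumption based on typical
        # most.gov.cn listing pages, roughly:
        #   var currentPage = 0;
        #   var countPage = 12;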
        currentPage_var = re.search(r'var.*?currentPage.*?=.*?\d+', text).group(0)
        currentPage = int(currentPage_var[currentPage_var.find('=') + 1:].strip())
        countPage_var = re.search(r'var.*?countPage.*?=.*?\d+', text).group(0)
        countPage = int(countPage_var[countPage_var.find('=') + 1:].strip())
        tables = response.xpath('//td[@class="STYLE30"]')
        for table in tables:
            item = Shouyelianjie()
            item['biaoti'] = table.xpath('.//a/text()').extract_first()
            item['lianjie'] = 'http://www.most.gov.cn/tztg' + table.xpath('.//a/@href').extract_first().strip('.')
            item['shijian'] = re.findall(r'(\d{4}-\d{1,2}-\d{1,2})', table.xpath('string(.)').extract_first())[-1]
            if not self.settings.get('ISQUANPA'):
                # Incremental crawl: only request detail pages that are not yet in the database.
                self.cursor.execute('select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
                res = self.cursor.fetchall()[0].get('nums')
                if res == 0:
                    yield scrapy.FormRequest(url=item['lianjie'],
                                             meta={'item': item},
                                             callback=self.parse_url,
                                             method='GET'
                                             )
                else:
                    logging.info('This link has already been crawled-----:' + item['lianjie'])
            else:
                # Full crawl: request every detail page on the listing.
                yield scrapy.FormRequest(url=item['lianjie'],
                                         meta={'item': item},
                                         callback=self.parse_url,
                                         method='GET'
                                         )

        if self.settings.get('ISQUANPA'):
            # Full crawl: follow the next page of the listing.
            if currentPage + 1 < countPage:
                new_url = 'http://www.most.gov.cn/tztg/index_' + str(currentPage + 1) + '.htm'
                yield scrapy.FormRequest(url=new_url,
                                         callback=self.parse
                                         )

        # The incremental crawl does not need pagination: the listing updates slowly, so
        # checking the first page is enough.
    def parse_url(self, response):
        year = datetime.datetime.now().strftime('%Y')
        month = datetime.datetime.now().strftime('%m')
        current_url = response.url
        item = response.meta['item']
        item['laiyuan'] = '科技部'
        # () | (//meta[@name="ContentEnd"]/preceding-sibling::*)
        a1 = response.xpath('//meta[@name="ContentStart"]/following-sibling::*')
        for a_i, a_value in enumerate(a1):
            c = a_value.xpath('.//@name')
            if len(c) > 0 and str(c.extract_first()).lower() == 'contentend':
                # Slice between the ContentStart and ContentEnd markers
                # (not used below; the body text is re-extracted from #Zoom instead).
                b = a_i
                a2 = a1[0:a_i - 1:]
        # Attachment links (.doc, .xls, ...) inside the #Zoom article body.
        a_suoyou = response.xpath('//*[@id="Zoom"]//a[@href and (' + self.jiewei_contains() + ')]')
        als = response.xpath('//*[@id="Zoom"]')[0].re(r'<meta.*name="ContentStart".*[\s\S]*<meta.*name="ContentEnd">')[0]
        als = str(als)
        txt = als[als.find('name="ContentStart"') + len('name="ContentStart">'):als.rfind('<meta')]
        for a_suoyou_i, a_suoyou_value in enumerate(a_suoyou):
            single_a_file = {}
            single_a_file['file_name'] = a_suoyou_value.xpath('string(.)').extract_first()
            old_url = a_suoyou_value.xpath('@href').extract_first()

            single_a_file['file_url'] = current_url[0:current_url.rfind('/')] + old_url.strip('.')
            houzui = single_a_file['file_url'][single_a_file['file_url'].rfind('/') + 1:]
            new_url = '/' + year + '/' + month + '/' + self.short_uuid() + '_' + houzui
            txt = txt.replace(old_url, self.settings.get('FILE_PATH') + new_url)
            single_a_file['new_file'] = new_url
            try:
                item['wenjian'].append(single_a_file)
            except KeyError:
                item['wenjian'] = [single_a_file]
        item['xiangqing'] = txt.strip('\n').strip().replace('\u3000', ' ').replace('\xa0', ' ')
        # context_all = etree.HTML(response.text).xpath('//meta[@name="ContentStart"]/following-sibling::*[name(.)!="table" and name(.)!="meta"]')
        yield item
    def jiewei_contains(self):
        """Build an XPath predicate matching attachment links, e.g.
        'contains(@href,".doc") or contains(@href,".xls") or ... or contains(@href,".zip")'."""
        s = ''
        jiewei = ['.doc', '.xls', '.docx', '.xlsx', '.txt', '.rar', '.zip']
        for j in jiewei:
            s += 'contains(@href,"' + j + '")' + ' or '
        s = s.strip().strip('or').strip()
        return s
    def short_uuid(self):
        """Derive an 8-character id from a UUID4: each group of 4 hex digits is mapped
        onto one of the 62 alphanumeric characters below."""
        uuidChars = ("a", "b", "c", "d", "e", "f",
                     "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s",
                     "t", "u", "v", "w", "x", "y", "z", "0", "1", "2", "3", "4", "5",
                     "6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "I",
                     "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V",
                     "W", "X", "Y", "Z")
        uuid = str(uuid4()).replace('-', '')
        result = ''
        for i in range(0, 8):
            sub = uuid[i * 4: i * 4 + 4]
            x = int(sub, 16)
            result += uuidChars[x % 0x3E]
        return result
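
# A minimal sketch for running the spider outside `scrapy crawl kexujishubuSpider`
# (assumes the demo1 project settings are importable): launch it programmatically
# via Scrapy's CrawlerProcess.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(get_project_settings())
    process.crawl(kexujishubuSpider)
    process.start()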