import scrapy
import datetime  # needed for the year/month path segments built in parse()
import logging
import pymysql
from scrapy.utils.project import get_project_settings
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from demo1.Util import Util_WANG

# Shanxi Provincial Department of Science and Technology (山西省科技厅)
class kejitingSpider(scrapy.Spider, Util_WANG):
    name = 'shanxishengkejitingSpider'
    settings = get_project_settings()
    allowed_domains = ['kjt.shanxi.gov.cn']
    custom_settings = custom_settings_conf_sxkejitingSpider
    start_urls = ['http://kjt.shanxi.gov.cn/tzgg/index.jhtml']
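
    # The settings this spider reads (MYSQL_HOST, MYSQL_DATABASE, MYSQL_USER,
    # MYSQL_PASSWORD, ISQUANPA, FILE_PATH, MESSAGE) must be resolvable through
    # the project settings. Illustrative values only (guesses, not taken from
    # this repo):
    #   ISQUANPA = False  # True: full crawl with pagination; False: incremental
    #   FILE_PATH = 'http://files.example.com'  # base URL for re-hosted attachments
    #   MESSAGE = 'http://img.example.com'      # base URL for re-hosted images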

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        # One MySQL connection per spider run; parse() uses it to dedup
        # against the t_policy table.
        self.db = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            database=self.settings['MYSQL_DATABASE'],
            user=self.settings['MYSQL_USER'],
            password=self.settings['MYSQL_PASSWORD'],
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True
        )
        self.cursor = self.db.cursor()
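
    # Not in the original file: Scrapy calls a spider's closed() when the
    # crawl ends, which is a standard place to release the connection opened
    # in __init__. Sketch only; assumes nothing else reuses self.db.
    def closed(self, reason):
        self.cursor.close()
        self.db.close()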

    def parse(self, response):
        lis = response.xpath('//*[@align="center"]//tr[not(@class)]')
        year = datetime.datetime.now().strftime('%Y')
        month = datetime.datetime.now().strftime('%m')
        for li in lis:
            item = Shouyelianjie()
            item['lianjie'] = response.urljoin(li.xpath('.//a/@href').extract_first())
            item['shijian'] = li.xpath('.//td')[-3].xpath('./text()').extract_first().replace('.', '-')
            item['biaoti'] = li.xpath('.//a/text()').extract_first()
            item['laiyuan'] = '山西省科技厅' + li.xpath('.//td')[-2].xpath('./text()').extract_first()
            # In incremental mode (ISQUANPA falsy), skip links already stored.
            if not self.settings.get('ISQUANPA'):
                self.cursor.execute(
                    'SELECT COUNT(title_url) AS nums FROM t_policy WHERE title_url = %s',
                    (item['lianjie'],))
                if self.cursor.fetchall()[0].get('nums') != 0:
                    logging.info('Link already crawled: ' + item['lianjie'])
                    continue
            if not (item['lianjie'].endswith('.jhtml') or item['lianjie'].endswith('.html')):
                # The link points straight at an attachment: record the file and
                # build a detail body linking to the re-hosted copy ("原文件"
                # means "original file").
                houzui = item['lianjie'][item['lianjie'].rfind('/') + 1:]
                new_url = '/' + year + '/' + month + '/' + self.short_uuid() + '_' + houzui
                item['wenjian'] = [{
                    'file_name': '原文件',
                    'file_url': item['lianjie'],
                    'new_file': new_url,
                }]
                item['xiangqing'] = ('<div><p>请查看原文附件:<a href="'
                                     + self.settings.get('FILE_PATH') + new_url
                                     + '">原文件</a></p></div>')
                yield item
            else:
                # HTML detail page: fetch it and extract the body in page_item().
                yield scrapy.Request(url=item['lianjie'], meta={'item': item},
                                     callback=self.page_item)
        # In full-crawl mode, follow the "下一页" (next page) link until it is gone.
        if self.settings.get('ISQUANPA'):
            next_page = response.xpath('//a[text()="下一页"]/@href').extract_first()
            if next_page is not None:
                yield scrapy.Request(url='http://kjt.shanxi.gov.cn/tzgg/' + next_page,
                                     callback=self.parse)
            else:
                logging.info('All pages crawled.')

    def page_item(self, response):
        item = response.meta['item']
        item['xiangqing'] = response.xpath('//*[@id="zoom"]').extract_first()
        # Util_WANG helpers rewrite attachment links and inline images inside
        # item['xiangqing'] so they point at re-hosted copies.
        self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
        self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
        yield item

    def a_fun(self, href):
        # Per-link hook presumably invoked by Util_WANG's rewriting helpers;
        # this spider needs no extra handling, so it is a no-op.
        pass

    def img_fun(self, src):
        # Per-image hook, same idea as a_fun; intentionally a no-op here.
        pass
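
# ---------------------------------------------------------------------------
# For reference: Util_WANG (demo1/Util.py) is not shown in this file. The
# sketch below is only a guess at the contract implied by the calls above;
# it is illustrative, not the project's actual helper.
#
# class Util_WANG:
#     def short_uuid(self):
#         # short random id used to de-collide stored file names,
#         # e.g. uuid4().hex[:8]
#         ...
#     def tihuan_a_return(self, item, file_base, response):
#         # rewrite <a href> attachment links in item['xiangqing'] to point at
#         # file_base, record them in item['wenjian'], and call
#         # self.a_fun(href) for each link
#         ...
#     def tihuan_img_return(self, item, img_base, response):
#         # same idea for <img src>, calling self.img_fun(src) per image
#         ...
# ---------------------------------------------------------------------------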