You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
75 lines | 3.4 KiB | 5 years ago
|
import scrapy
|
||
|
from uuid import uuid4
|
||
|
import re
|
||
|
from scrapy.spiders import CrawlSpider
|
||
|
from demo1.custom_settings_conf import *
|
||
|
from demo1.items import Shouyelianjie
|
||
|
from scrapy.utils.project import get_project_settings
|
||
|
import pymysql
|
||
|
import logging
|
||
|
import json
|
||
|
import time
|
||
|
from lxml import etree
|
||
|
from urllib import parse
|
||
|
from demo1.Util import Util_WANG
|
||
|
class huojuzhognxinSpider(scrapy.Spider, Util_WANG):
    """Spider for notice listings on the China Torch Center site
    (chinatorch.gov.cn).

    Crawls the announcement list pages, de-duplicates against the
    ``t_policy`` MySQL table (unless full-crawl mode is on), follows each
    entry to its detail page and yields ``Shouyelianjie`` items carrying
    date, title, source, link and the detail HTML.
    """

    name = 'huojuzhognxinSpider'
    # Project-wide settings: MySQL credentials, ISQUANPA (full-crawl) flag,
    # FILE_PATH / MESSAGE used by the Util_WANG rewrite helpers.
    settings = get_project_settings()
    allowed_domains = ['chinatorch.gov.cn']
    custom_settings = custom_settings_conf_huojuzhognxin
    start_urls = ['http://www.chinatorch.gov.cn/kjb/tzgg/list.shtml']

    def __init__(self, name=None, **kwargs):
        # Bug fix: the original never called scrapy.Spider.__init__, so the
        # base class's handling of `name` and **kwargs was silently skipped.
        super().__init__(name, **kwargs)
        # Connection used only for duplicate-URL lookups in parse().
        self.db = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            database=self.settings['MYSQL_DATABASE'],
            user=self.settings['MYSQL_USER'],
            password=self.settings['MYSQL_PASSWORD'],
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True
        )
        self.cursor = self.db.cursor()

    def parse(self, response):
        """Parse one listing page.

        Yields a Request per entry (skipping links already stored in
        ``t_policy`` when not in full-crawl mode), then, in full-crawl mode,
        follows the next listing page derived from the inline pager script.
        """
        lis = response.xpath('//*[@class="list_con"]/li')
        for li in lis:
            item = Shouyelianjie()
            item['shijian'] = li.xpath('.//*[@class="list_time"]/text()').extract_first()
            item['biaoti'] = li.xpath('.//a/@title').extract_first()
            item['laiyuan'] = '科技部火炬中心'
            item['lianjie'] = 'http://www.chinatorch.gov.cn' + li.xpath('.//a/@href').extract_first()
            if not self.settings.get("ISQUANPA"):
                # Bug fix: the original interpolated the crawled URL into the
                # SQL text with str.format (SQL-injection prone and breaks on
                # quotes); use a parameterized query instead.
                self.cursor.execute(
                    'select count(title_url) as nums FROM t_policy where title_url = %s',
                    (item['lianjie'],))
                res = self.cursor.fetchall()[0].get('nums')
                if res == 0:
                    yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
                else:
                    logging.info('这个链接已经爬过了-----:' + item['lianjie'])
            else:
                yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
        if self.settings.get("ISQUANPA"):
            try:
                # The pager is emitted by an inline <script>; pull the
                # "total pages ... current page" numbers out of its arguments.
                # Bug fix: raw string for the regex (bare '\d' in a normal
                # string is an invalid escape sequence).
                next_page = response.xpath(
                    '//ul[@class="list_con"]/script[not(@src and @type)]'
                ).re(r',.*?\d+.*?,.*?\d.*?,')[0].strip(',').split(',')
                count_page = int(next_page[0].strip())
                curry_page = int(next_page[-1].strip())
                if curry_page < count_page:
                    urls = 'http://www.chinatorch.gov.cn/kjb/tzgg/list_' + str(curry_page + 1) + '.shtml'
                    yield scrapy.Request(url=urls, callback=self.parse)
            except Exception as e:
                # Best effort: a pager-parse failure only stops pagination,
                # items already yielded above are unaffected.
                logging.info(e)
            else:
                logging.info('全部爬取完毕')

    def page_url(self, response):
        """Parse a detail page and yield the completed item."""
        item = response.meta['item']
        txt = response.xpath('//div[contains(@class,"pages_content") and contains(@id,"content")]').extract_first()
        # NOTE(review): extract_first() returns None when the content div is
        # absent, which would raise AttributeError here — presumably every
        # detail page has it; confirm against the live site.
        item['xiangqing'] = txt.replace('\u3000', '')
        # Rewrite relative <a>/<img> references via the Util_WANG mixin,
        # which calls back into a_fun / img_fun below.
        self.tihuan_a_return(item, self.settings.get('FILE_PATH'))
        self.tihuan_img_return(item, self.settings.get('MESSAGE'))
        yield item

    def a_fun(self, href):
        # Util_WANG callback: absolutize an anchor href against the site root.
        return 'http://www.chinatorch.gov.cn' + href

    def img_fun(self, src):
        # Util_WANG callback: absolutize an image src against the site root.
        return 'http://www.chinatorch.gov.cn' + src
|
||
|
|