# -*- coding: utf-8 -*-
# Spider for NSFC (National Natural Science Foundation of China, nsfc.gov.cn)
# announcement pages.
# NOTE: lines removed here were Git web-UI residue ("You can not select more
# than 25 topics…", "73 lines / 3.4 KiB"), not part of the program.
import scrapy
from uuid import uuid4
import re
from scrapy.spiders import CrawlSpider
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
class ziranweiyuanhuiSpider(scrapy.Spider, Util_WANG):
    """Crawl announcement listings and detail pages from nsfc.gov.cn.

    De-duplicates already-crawled URLs against the ``t_policy`` MySQL table
    unless the ``ISQUANPA`` setting (full-crawl mode) is enabled.
    """

    name = 'ziranweiyuanhuiSpider'
    settings = get_project_settings()
    allowed_domains = ['nsfc.gov.cn']
    custom_settings = custom_settings_conf_ziranweiyuanhui

    def __init__(self, name=None, **kwargs):
        # FIX: the original never called the base Spider initializer, which
        # Scrapy relies on to set up the spider's name and attributes.
        super().__init__(name, **kwargs)
        # MySQL connection used only for URL de-duplication (ISQUANPA off).
        self.db = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            database=self.settings['MYSQL_DATABASE'],
            user=self.settings['MYSQL_USER'],
            password=self.settings['MYSQL_PASSWORD'],
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        self.cursor = self.db.cursor()

    def start_requests(self):
        """Start crawling at page 1 of the announcement list."""
        yield scrapy.Request(url=self.return_start_url(), callback=self.parse)
        # Single-page debug entry point, kept for reference:
        # yield scrapy.Request(url='http://www.nsfc.gov.cn/publish/portal0/tab442/info76617.htm',
        #                      callback=self.page_url, meta={'item': Shouyelianjie()})

    def return_start_url(self, page=1):
        """Return the listing-page URL for the given 1-based page number."""
        return ('http://www.nsfc.gov.cn/publish/portal0/tab442/module1178/page'
                + str(page) + '.htm')

    def parse(self, response):
        """Parse a listing page: yield detail requests, then follow pagination."""
        for news_list in response.xpath('//*[@class="clearfix"]'):
            item = Shouyelianjie()
            item['lianjie'] = ('http://www.nsfc.gov.cn'
                               + news_list.xpath('.//*[@class="fl"]/a/@href').extract_first())
            item['laiyuan'] = '国家自然科学基金委员会'
            item['shijian'] = news_list.xpath('.//*[@class="fr"]/text()').extract_first()
            item['biaoti'] = news_list.xpath('.//a/text()').extract_first()
            if not self.settings.get("ISQUANPA"):
                # FIX: parameterized query instead of str.format() — the URL is
                # crawled data, so interpolating it was an SQL-injection /
                # quoting bug.
                self.cursor.execute(
                    'select count(title_url) as nums FROM t_policy where title_url = %s',
                    (item['lianjie'],))
                res = self.cursor.fetchall()[0].get('nums')
                if res == 0:
                    yield scrapy.Request(url=item['lianjie'], meta={'item': item},
                                         callback=self.page_url)
                else:
                    logging.info('这个链接已经爬过了-----:' + item['lianjie'])
            else:
                yield scrapy.Request(url=item['lianjie'], meta={'item': item},
                                     callback=self.page_url)
        if self.settings.get("ISQUANPA"):
            # The second-to-last "Normal" anchor is the next-page link; when it
            # is absent we have reached the final page.
            try:
                next_page = ('http://www.nsfc.gov.cn'
                             + response.xpath('//a[@class="Normal"]')[-2]
                               .xpath('@href').extract_first())
                yield scrapy.Request(url=next_page, callback=self.parse)
            except (IndexError, TypeError):
                # FIX: was a bare except; only the missing-link cases are expected.
                logging.info('全部爬取完毕')

    def page_url(self, response):
        """Parse a detail page: fill in the article body and yield the item."""
        item = response.meta['item']
        txt = response.xpath('//*[@class="content_xilan"]').extract_first()
        # FIX: guard against a missing content node (extract_first() -> None),
        # which previously raised AttributeError on .replace().
        item['xiangqing'] = (txt or '').replace('\u3000', '')
        self.tihuan_a_return(item, self.settings.get('FILE_PATH'))
        self.tihuan_img_return(item, self.settings.get('MESSAGE'))
        yield item

    def a_fun(self, href):
        """Resolve a relative <a href> against the site root."""
        return 'http://www.nsfc.gov.cn' + href

    def img_fun(self, src):
        """Resolve a relative <img src> against the site root."""
        return 'http://www.nsfc.gov.cn' + src