You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 

86 lines
3.8 KiB

import scrapy
from uuid import uuid4
import re
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from scrapy.utils.project import get_project_settings
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
import pymysql
import logging
import json
import time
from lxml import etree
from urllib import parse
from demo1.Util import Util_WANG
class wenhuahelvyoubuSpider(scrapy.Spider, Util_WANG):
    """Spider for the notice/announcement list of the Ministry of Culture
    and Tourism (mct.gov.cn).

    Crawls the list pages under /whzx/ggtz/, emits one ``Shouyelianjie``
    item per table row (link, title, date, source), optionally follows
    each link to collect the detail HTML, and paginates while the
    ``ISQUANPA`` ("full crawl") setting is truthy.
    """

    name = 'wenhuahelvyoubuSpider'
    settings = get_project_settings()
    allowed_domains = ['mct.gov.cn']
    custom_settings = custom_settings_conf_wenhuahelvyoubu
    start_urls = ['https://www.mct.gov.cn/whzx/ggtz/index.htm']

    def __init__(self, name=None, **kwargs):
        """Open the MySQL connection used for duplicate-URL checks.

        Bug fix: the original never called ``super().__init__``, so
        Scrapy's base-class initialization was skipped.
        """
        super().__init__(name, **kwargs)
        self.db = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            database=self.settings['MYSQL_DATABASE'],
            user=self.settings['MYSQL_USER'],
            password=self.settings['MYSQL_PASSWORD'],
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        self.cursor = self.db.cursor()

    def parse(self, response):
        """Parse one list page: yield items for new links and, when doing
        a full crawl, queue the next list page.

        :param response: list-page response whose rows live in ``table tr``.
        """
        for row in response.css('table tr'):
            item = Shouyelianjie()
            item['lianjie'] = row.css('td a::attr(href)').extract_first()
            item['biaoti'] = row.css('td a::text').extract_first()
            item['shijian'] = row.css('td[class$="time"]::text').extract_first()
            item['laiyuan'] = '文化和旅游部'
            if not self.settings.get("ISQUANPA"):
                # Incremental mode: skip links already stored in t_policy.
                # Parameterized query — the original used str.format on a
                # scraped URL, which is an SQL-injection vector.
                self.cursor.execute(
                    'select count(title_url) as nums FROM t_policy where title_url = %s',
                    (item['lianjie'],))
                res = self.cursor.fetchall()[0].get('nums')
                if res == 0:
                    # Util_WANG.pos_url decides whether the item is complete
                    # as-is; otherwise fetch the detail page.
                    if Util_WANG.pos_url(item, self.settings):
                        yield item
                    else:
                        yield scrapy.Request(url=item['lianjie'],
                                             meta={'item': item},
                                             callback=self.page_url)
                else:
                    logging.info('这个链接已经爬过了-----:' + item['lianjie'])
            else:
                if Util_WANG.pos_url(item, self.settings):
                    yield item
                else:
                    yield scrapy.Request(url=item['lianjie'],
                                         meta={'item': item},
                                         callback=self.page_url)
        if self.settings.get("ISQUANPA"):
            try:
                # The page count is embedded in a createPageHTML(...) JS
                # call; the regex pulls "total, current"-style numbers.
                # (Original had an accidental chained assignment
                # `next_page = ts = ...`; the stray name is dropped.)
                next_page = response.xpath(
                    '//body//*[contains(text(),"createPageHTML")]'
                ).re(r'\d{1,2}.*?,.*?\d{1,2}')[0].split(',')
                count_page = int(next_page[0].strip())
                curry_page = int(next_page[-1].strip()) + 1
                if curry_page < count_page:
                    urls = ('https://www.mct.gov.cn/whzx/ggtz/index_'
                            + str(curry_page) + '.htm')
                    yield scrapy.Request(url=urls, callback=self.parse)
            except Exception as e:
                # Bug fix: the original called `logging(e)` — the module is
                # not callable and raised TypeError, hiding the real error.
                logging.error(e)
                logging.info('因为异常:全部爬取完毕')
        else:
            logging.info('全部爬取完毕')
        # 测试页面的直接 (detail-page smoke test, kept for reference)
        # urls='http://zwgk.mcprc.gov.cn/auto255/201612/t20161206_30535.html'
        # yield scrapy.Request(url=urls,callback=self.page_url,meta={'item':Shouyelianjie()})

    def page_url(self, response):
        """Parse a detail page: grab the article body (#ContentRegion,
        falling back to #zoom), rewrite embedded links/images via the
        Util_WANG helpers, then yield the completed item.
        """
        item = response.meta['item']
        item['xiangqing'] = response.css('#ContentRegion').extract_first()
        if item['xiangqing'] is None:
            item['xiangqing'] = response.css('#zoom').extract_first()
        self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
        self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
        yield item

    def a_fun(self, href):
        # Hook required by Util_WANG's link rewriting; intentionally a no-op here.
        pass

    def img_fun(self, src):
        # Hook required by Util_WANG's image rewriting; intentionally a no-op here.
        pass