# -*- coding: utf-8 -*-
# Scrapy spider for announcements of 太原市工业和信息化局 (jxw.taiyuan.gov.cn).
import scrapy
|
|
from uuid import uuid4
|
|
import re
|
|
from demo1.custom_settings_conf import *
|
|
from demo1.items import Shouyelianjie
|
|
from scrapy.utils.project import get_project_settings
|
|
from scrapy.spiders import CrawlSpider, Rule
|
|
from scrapy.linkextractors import LinkExtractor
|
|
import pymysql
|
|
import logging
|
|
import json
|
|
import time
|
|
from lxml import etree
|
|
from urllib import parse
|
|
from demo1.Util import Util_WANG
|
|
#太原市工业和信息话局
|
|
# Spider for the Taiyuan Municipal Bureau of Industry and Information
# Technology (太原市工业和信息化局) announcement section.
class taiyuangongyehexinxihuajuSpider(scrapy.Spider, Util_WANG):
    """Crawl announcement list pages on jxw.taiyuan.gov.cn.

    Yields ``Shouyelianjie`` items (link, title, date, source). When the
    ``ISQUANPA`` setting is falsy, runs incrementally: links already present
    in the MySQL ``t_policy`` table are skipped. When truthy, follows the
    site's pagination until the last page.
    """

    name = 'taiyuangongyehexinxihuajuSpider'
    settings = get_project_settings()
    allowed_domains = ['jxw.taiyuan.gov.cn']
    custom_settings = custom_settings_conf_taiyuangongyehexinxihuajuSpider
    start_urls = ['http://jxw.taiyuan.gov.cn/zfxxgk/gggs/index.shtml']

    def __init__(self, name=None, **kwargs):
        # FIX: call the parent Spider initializer so scrapy performs its
        # normal spider setup (the original skipped this entirely).
        super().__init__(name=name, **kwargs)
        # Connection parameters come from project settings; DictCursor so
        # rows are dicts (parse() reads res[0]['nums']).
        self.db = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            database=self.settings['MYSQL_DATABASE'],
            user=self.settings['MYSQL_USER'],
            password=self.settings['MYSQL_PASSWORD'],
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        self.cursor = self.db.cursor()

    def parse(self, response):
        """Parse one list page: emit an item per <li>, then paginate.

        Items whose detail body is not resolvable inline (``Util_WANG.pos_url``
        returns falsy) are re-requested and finished in :meth:`page_url`.
        """
        for li in response.xpath('//ul[@class="List_list"]/li'):
            item = Shouyelianjie()
            item['lianjie'] = response.urljoin(li.xpath('.//a/@href').extract_first())
            item['biaoti'] = li.xpath('.//a/@title').extract_first()
            if item['biaoti'] is None:
                # Fall back to the anchor text when the title attribute is absent.
                item['biaoti'] = li.xpath('.//a/text()').extract_first()
            item['shijian'] = li.xpath('.//span/text()').extract_first()
            item['laiyuan'] = '太原市工业和信息化局'
            if not self.settings.get("ISQUANPA"):
                # Incremental mode: skip URLs already recorded in t_policy.
                # FIX: parameterized query instead of str.format interpolation
                # (the scraped URL was concatenated straight into SQL).
                self.cursor.execute(
                    'select count(title_url) as nums FROM t_policy where title_url =%s',
                    (item['lianjie'],))
                res = self.cursor.fetchall()[0].get('nums')
                if res == 0:
                    if Util_WANG.pos_url(item, self.settings):
                        yield item
                    else:
                        yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
                else:
                    logging.info('这个链接已经爬过了-----:' + item['lianjie'])
            else:
                if Util_WANG.pos_url(item, self.settings):
                    yield item
                else:
                    yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)

        if self.settings.get("ISQUANPA"):
            # Full-crawl mode: read the pager state embedded in the page's
            # #pages element and request the next index page if any remain.
            try:
                nums = response.xpath('//*[@id="pages"]').re('{.*?pageIndex\":.*?pageCount\":.*?pageSize\":.*?}')[0]
                # FIX: the captured text is JSON-shaped (quoted keys, per the
                # regex above); parse it with json.loads instead of eval()
                # to avoid executing scraped content.
                nums = json.loads(nums)
                pageIndex = int(nums['pageIndex'])
                pageCount = int(nums['pageCount'])
                next_page = pageIndex + 1
                if next_page <= pageCount:
                    yield scrapy.Request(
                        url='http://jxw.taiyuan.gov.cn/zfxxgk/gggs/index_' + str(next_page) + '.shtml',
                        callback=self.parse)
                else:
                    logging.info('全部爬完了')
            except Exception as e:
                # FIX: the original called logging(e) — the logging module is
                # not callable and would raise TypeError inside the handler.
                logging.exception(e)
                logging.info('因为异常:全部爬取完毕')
        else:
            logging.info('全部爬取完毕')

    def page_url(self, response):
        """Fill in the detail body for an item and resolve embedded links.

        ``tihuan_a_return`` / ``tihuan_img_return`` (from Util_WANG) rewrite
        anchors and images in the detail HTML before the item is yielded.
        """
        item = response.meta['item']
        item['xiangqing'] = response.xpath('//*[@id="Zoom"]').extract_first()
        self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
        self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
        yield item

    def a_fun(self, href):
        # Hook required by Util_WANG's link rewriting; intentionally a no-op here.
        pass

    def img_fun(self, src):
        # Hook required by Util_WANG's image rewriting; intentionally a no-op here.
        pass