import scrapy
import re
import pymysql
import logging
import json
import time
import datetime

from uuid import uuid4
from lxml import etree
from urllib import parse

from scrapy.utils.project import get_project_settings
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from demo1.Util import Util_WANG
# Shanxi Provincial Department of Commerce (山西省商务厅)
class sxsshangwutingSpider(scrapy.Spider, Util_WANG):
    name = 'sxsshangwutingSpider'
    settings = get_project_settings()
    allowed_domains = ['swt.shanxi.gov.cn']
    custom_settings = custom_settings_conf_sxShangwutingSpider
    start_urls = ['http://swt.shanxi.gov.cn/Main/list.action?channelId=27']

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        # MySQL connection used to check whether a link has already been crawled
        self.db = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            database=self.settings['MYSQL_DATABASE'],
            user=self.settings['MYSQL_USER'],
            password=self.settings['MYSQL_PASSWORD'],
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True
        )
        self.cursor = self.db.cursor()
    def parse(self, response):
        # Paging parameters are read from the inline script that configures the list table
        pages = response.xpath('//*[@class="pgTotalPage"]/text()').extract_first()
        script = response.xpath('//script[contains(text(),"listTable.filter.channelId")]/text()').extract_first().lower()
        size = re.search(r'pagesize.*?=.*?\d+', script).group().replace('pagesize', '').replace('=', '').strip()
        pageCount = re.search(r'pagecount.*?=.*?\d+', script).group().replace('pagecount', '').replace('=', '').strip()

        lis = response.xpath('//*[@class="t_text"]//li')
        for li in lis:
            item = Shouyelianjie()
            item['lianjie'] = response.urljoin(li.xpath('.//a/@href').extract_first())
            item['laiyuan'] = '山西省商务厅'
            item['biaoti'] = li.xpath('.//a/@title').extract_first()
            item['shijian'] = li.xpath('.//span/text()').extract_first()
            # For testing a single detail link:
            # item['lianjie'] = 'http://swt.shanxi.gov.cn/Main/cmsContent.action?articleId=369d459b-a799-4e8a-87b7-8cd6c5cfc371'
            if not self.settings.get("ISQUANPA"):
                # Incremental crawl: skip links that are already stored in t_policy
                self.cursor.execute(
                    'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
                res = self.cursor.fetchall()[0].get('nums')
                if res == 0:
                    if Util_WANG.pos_url(item, self.settings):
                        yield item
                    else:
                        yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
                else:
                    logging.info('This link has already been crawled-----:' + item['lianjie'])
            else:
                if Util_WANG.pos_url(item, self.settings):
                    yield item
                else:
                    yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)

        if self.settings.get("ISQUANPA"):
            # Full crawl: request the remaining list pages through the ajax endpoint
            try:
                pageCount = int(pageCount)
                for page_next in range(2, pageCount + 1):
                    url_next = ('http://swt.shanxi.gov.cn/Main/list.action?ajax=true&pageCount=' + str(pageCount)
                                + '&pageSize=' + size + '&page=' + str(page_next) + '&channelId=27')
                    yield scrapy.Request(url=url_next, callback=self.page_next_url)
            except Exception as e:
                logging.error(e)
                logging.info('Stopped by an exception: crawl finished')
        else:
            logging.info('Crawl finished')
    def page_next_url(self, response):
        # The ajax list endpoint returns JSON whose 'content' field is an HTML fragment
        context_json = json.loads(response.text)
        context = context_json['content']
        context_html = etree.HTML(context)
        lis = context_html.xpath('//ul/li')
        for li in lis:
            item = Shouyelianjie()
            item['lianjie'] = response.urljoin(li.xpath('.//a/@href')[0])
            item['laiyuan'] = '山西省商务厅'
            item['biaoti'] = li.xpath('.//a/@title')[0]
            item['shijian'] = li.xpath('.//span/text()')[0]
            if not self.settings.get("ISQUANPA"):
                self.cursor.execute(
                    'select count(title_url) as nums FROM t_policy where title_url ="{}"'.format(item['lianjie']))
                res = self.cursor.fetchall()[0].get('nums')
                if res == 0:
                    if Util_WANG.pos_url(item, self.settings):
                        yield item
                    else:
                        yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
                else:
                    logging.info('This link has already been crawled-----:' + item['lianjie'])
            else:
                if Util_WANG.pos_url(item, self.settings):
                    yield item
                else:
                    yield scrapy.Request(url=item['lianjie'], meta={'item': item}, callback=self.page_url)
    def page_url(self, response):
        item = response.meta['item']
        item['xiangqing'] = response.xpath('//div[@id="zoom"]').extract_first()
        self.tihuan_a_return(item, self.settings.get('FILE_PATH'), response)
        self.tihuan_img_return(item, self.settings.get('MESSAGE'), response)
        # This site needs a custom attachment-link rewrite in addition to the shared helpers
        self.dingzhi_tihuan_a(item, self.settings.get('FILE_PATH'), response)
        yield item
    def dingzhi_tihuan_a(self, item, tihuanlujing, response=None):
        # Rewrite attachment links (getFile.action?fileId=...) in the detail HTML so they
        # point at locally stored files, and record the old/new URL pairs on the item.
        txt = item['xiangqing']
        year = datetime.datetime.now().strftime('%Y')
        month = datetime.datetime.now().strftime('%m')
        panDuanNone = lambda x: '_' if x is None else x
        html = etree.HTML(txt)
        alis = html.xpath('//a[@href and contains(@href,"getFile.action?fileId")]')
        for alis_single in alis:
            single_a_file = {}
            href = str(alis_single.xpath('@href')[0])
            content = str(panDuanNone(alis_single.xpath('string(.)')))
            if not content.strip():
                content = '_'
            single_a_file['file_name'] = content
            # Only the actual download link address needs to be adjusted here
            old_url = href
            if href.lower().startswith('http'):
                single_a_file['file_url'] = old_url
            elif response is not None and (old_url.lower().startswith('./') or old_url.lower().startswith('../')):
                single_a_file['file_url'] = response.urljoin(old_url)
            elif response is not None and old_url.startswith('/'):
                single_a_file['file_url'] = response.urljoin(old_url)
            else:
                # Fallback so file_url is always set for the suffix extraction below
                single_a_file['file_url'] = response.urljoin(old_url) if response is not None else old_url

            # houzui = single_a_file['file_url'][single_a_file['file_url'].rfind('/') + 1:]
            houzui = single_a_file['file_url'][single_a_file['file_url'].rfind('=') + 1:]
            new_url = '/' + year + '/' + month + '/' + self.short_uuid() + '_' + houzui + '.'
            txt = txt.replace(old_url, tihuanlujing + new_url)
            single_a_file['new_file'] = new_url
            try:
                item['wenjian'].append(single_a_file)
            except KeyError:
                item['wenjian'] = [single_a_file]
        item['xiangqing'] = txt
    def a_fun(self, href):
        pass

    def img_fun(self, src):
        pass
    def return_url(self, size=10, curr_page=1):
        # Build the paged JSONP search URL for www.miit.gov.cn (gdnps/searchIndex.jsp).
        # The params JSON is double URL-encoded and wrapped in a jQuery callback name.
        start_url = 'http://www.miit.gov.cn/gdnps/searchIndex.jsp'
        params = {
            "goPage": curr_page,
            "orderBy": [
                {"orderBy": "publishTime", "reverse": "true"},
                {"orderBy": "orderTime", "reverse": "true"}
            ],
            "pageSize": size,
            "queryParam": [
                {},
                {},
                {"shortName": "fbjg", "value": "/1/29/1146295/1652858/1652930"}
            ]
        }
        d = time.time()
        d_int = int(round(d * 1000))
        jquery = 'jQuery111108461701558527148_' + str(d_int)
        params = json.dumps(params).replace(' ', '').replace('"true"', 'true')
        url = (start_url + "?params=" + parse.quote(parse.quote(params)).replace('/', '%252F')
               + '&callback=' + jquery + '&_=' + str(d_int + 1))
        return url
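# Minimal local sketch (an assumption, not part of the spider's crawl flow): print the
# URL that return_url() assembles. object.__new__ skips __init__ so no MySQL connection
# is opened; return_url() does not read any instance state.
if __name__ == '__main__':
    demo = object.__new__(sxsshangwutingSpider)
    print(demo.return_url(size=10, curr_page=1))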