You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
112 lines
4.4 KiB
import datetime
import json
import logging
import re
import time
from urllib import parse
from uuid import uuid4

import pymysql
import scrapy
from scrapy.spiders import CrawlSpider
from scrapy.utils.project import get_project_settings

from demo1.Util import Util_WANG
from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
class gongyehexinxihuabuSpider(scrapy.Spider, Util_WANG):
    """Spider for the MIIT (miit.gov.cn) document-publication index.

    Pages through the site's JSONP search endpoint, converts each result
    row into a ``Shouyelianjie`` item, and — when ``ISQUANPA`` is falsy
    (resume mode) — skips links already recorded in the ``t_policy``
    MySQL table.
    """

    name = 'gongyehexinxihuabuSpider'
    settings = get_project_settings()
    allowed_domains = ['miit.gov.cn']
    custom_settings = custom_settings_conf_gongyehexinxihuabu

    def __init__(self, name=None, **kwargs):
        # Preserve scrapy.Spider's initialisation (sets self.name, start_urls,
        # and applies crawler kwargs) — the original skipped it.
        super().__init__(name, **kwargs)
        # Dedicated MySQL connection, used only for duplicate-link checks
        # in resume mode.
        self.db = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            database=self.settings['MYSQL_DATABASE'],
            user=self.settings['MYSQL_USER'],
            password=self.settings['MYSQL_PASSWORD'],
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True
        )
        self.cursor = self.db.cursor()

    def closed(self, reason):
        """Scrapy shutdown hook: release the MySQL cursor and connection."""
        try:
            self.cursor.close()
            self.db.close()
        except Exception:
            # Best-effort cleanup; never mask the spider's shutdown reason.
            logging.exception('failed to close MySQL connection')

    def start_requests(self):
        # Hit the publication index page first; the JSONP API is requested
        # from the follow-up callback.
        url = 'http://www.miit.gov.cn/gdnps/wjfbindex.jsp'
        yield scrapy.Request(url=url, callback=self.dierci_requests)

    def dierci_requests(self, response):
        # Second hop: request page 1 of the JSONP search API.
        url = self.return_url()
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Parse one JSONP page of search results.

        Strips the jQuery callback wrapper, yields one item per result
        row, and in full-crawl mode (``ISQUANPA`` truthy) follows the
        next page.
        """
        txt = response.text
        # Strip the JSONP wrapper: keep only the outermost {...} payload.
        txt = txt[txt.find('{'):txt.rfind('}') + 1]
        data = json.loads(txt)
        cur_page = data['curPage']
        total_page_num = data['totalPageNum']
        for ac in data['resultMap']:
            item = Shouyelianjie()
            item['biaoti'] = ac['title']
            # publishTime arrives as 'YYYYMMDDHHMMSS'; keep the date only.
            res_time = datetime.datetime.strptime(
                ac['publishTime'], '%Y%m%d%H%M%S').strftime('%Y-%m-%d')
            item['shijian'] = res_time
            # 'fbjgmc' (publishing body name) is optional in the payload;
            # catch only the expected KeyError instead of a bare except.
            buafter = ''
            try:
                buafter = ac['fbjgmc']
            except KeyError:
                logging.info('没有这个字段')
            item['laiyuan'] = '工信部' + buafter
            # Rebuild the public content URL from the subject DN and id.
            chushi_url = ("http://www.miit.gov.cn"
                          + ac['ownSubjectDn'].replace("/1/29/", "/").replace('/', "/n")
                          + "/c" + ac['id'] + "/content.html")
            item['lianjie'] = chushi_url
            item['xiangqing'] = ac['htmlContent']
            # Util_WANG hooks: rewrite <a>/<img> references in stored HTML.
            self.tihuan_a_return(item, self.settings.get('FILE_PATH'))
            self.tihuan_img_return(item, self.settings.get('MESSAGE'))
            if not self.settings.get('ISQUANPA'):
                # Resume mode: skip links already stored in t_policy.
                # Parameterized query — the URL comes from crawled data,
                # so it must never be interpolated into the SQL string.
                self.cursor.execute(
                    'select count(title_url) as nums FROM t_policy where title_url = %s',
                    (item['lianjie'],))
                res = self.cursor.fetchall()[0].get('nums')
                if res == 0:
                    yield item
                else:
                    logging.info('这个链接已经爬过了-----:' + item['lianjie'])
            else:
                yield item
        if self.settings.get('ISQUANPA'):
            # Full crawl: walk every page of the result set.
            if cur_page < total_page_num:
                yield scrapy.Request(url=self.return_url(curr_page=cur_page + 1),
                                     callback=self.parse)

    def a_fun(self, href):
        # Util_WANG hook: anchor hrefs pass through unchanged.
        return href

    def img_fun(self, src):
        # Util_WANG hook: image srcs pass through unchanged.
        return src

    def return_url(self, size=10, curr_page=1):
        """Build the JSONP search URL for one result page.

        :param size: results per page (``pageSize``).
        :param curr_page: 1-based page number (``goPage``).
        :return: fully encoded request URL.
        """
        start_url = 'http://www.miit.gov.cn/gdnps/searchIndex.jsp'
        params = {
            "goPage": curr_page,
            "orderBy": [
                {"orderBy": "publishTime", "reverse": "true"},
                {"orderBy": "orderTime", "reverse": "true"}
            ],
            "pageSize": size,
            "queryParam": [
                {},
                {},
                {"shortName": "fbjg", "value": "/1/29/1146295/1652858/1652930"}
            ]
        }
        # Millisecond timestamp: used in the JSONP callback name, and +1 as
        # the cache-busting '_' query argument.
        d_int = int(round(time.time() * 1000))
        jquery = 'jQuery111108461701558527148_' + str(d_int)
        # The server expects bare booleans and no whitespace in the blob.
        params = json.dumps(params).replace(' ', '').replace('"true"', 'true')
        # Double URL-encode, then escape the remaining '/' characters.
        url = (start_url + "?params="
               + parse.quote(parse.quote(params)).replace('/', '%252F')
               + '&callback=' + jquery + '&_=' + str(d_int + 1))
        return url
|