import scrapy
import pymysql
import logging
import json
import time
import datetime
from urllib import parse

from scrapy.utils.project import get_project_settings

from demo1.custom_settings_conf import *
from demo1.items import Shouyelianjie
from demo1.Util import Util_WANG
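

# Spider for the MIIT (工信部, Ministry of Industry and Information
# Technology) document-publication index: it queries the site's JSONP
# search API page by page, builds one Shouyelianjie item per result, and,
# when the ISQUANPA (full-crawl) setting is off, deduplicates against the
# t_policy MySQL table before yielding.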
class gongyehexinxihuabuSpider(scrapy.Spider, Util_WANG):
    name = 'gongyehexinxihuabuSpider'
    settings = get_project_settings()
    allowed_domains = ['miit.gov.cn']
    custom_settings = custom_settings_conf_gongyehexinxihuabu

    def __init__(self, name=None, **kwargs):
        super().__init__(name, **kwargs)
        # MySQL connection used to check which links were already crawled.
        self.db = pymysql.connect(
            host=self.settings['MYSQL_HOST'],
            database=self.settings['MYSQL_DATABASE'],
            user=self.settings['MYSQL_USER'],
            password=self.settings['MYSQL_PASSWORD'],
            port=3306,
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True
        )
        self.cursor = self.db.cursor()

    def start_requests(self):
        # Load the publication index page first (presumably to pick up the
        # session cookies the search API expects), then fire the real query.
        url = 'http://www.miit.gov.cn/gdnps/wjfbindex.jsp'
        yield scrapy.Request(url=url, callback=self.dierci_requests)

    def dierci_requests(self, response):
        url = self.return_url()
        yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # The response is JSONP; strip the callback wrapper down to the JSON body.
        txt = response.text
        txt = txt[txt.find('{'):txt.rfind('}') + 1]
        txt = json.loads(txt)
        curPage = txt['curPage']
        totalpagenum = txt['totalPageNum']
        for ac in txt['resultMap']:
            item = Shouyelianjie()
            item['biaoti'] = ac['title']
            res_time = datetime.datetime.strptime(ac['publishTime'], '%Y%m%d%H%M%S').strftime('%Y-%m-%d')
            item['shijian'] = res_time
            buafter = ''
            try:
                buafter = ac['fbjgmc']
            except KeyError:
                logging.info('result has no fbjgmc field')
            item['laiyuan'] = '工信部' + buafter
            chushi_url = ("http://www.miit.gov.cn"
                          + ac['ownSubjectDn'].replace("/1/29/", "/").replace('/', "/n")
                          + "/c" + ac['id'] + "/content.html")
            item['lianjie'] = chushi_url
            item['xiangqing'] = ac['htmlContent']
            # Rewrite embedded links and images via the Util_WANG helpers.
            self.tihuan_a_return(item, self.settings.get('FILE_PATH'))
            self.tihuan_img_return(item, self.settings.get('MESSAGE'))
            if not self.settings.get('ISQUANPA'):
                # Incremental crawl: skip links already stored in t_policy.
                self.cursor.execute(
                    'select count(title_url) as nums FROM t_policy where title_url = %s',
                    (item['lianjie'],))
                res = self.cursor.fetchall()[0].get('nums')
                if res == 0:
                    yield item
                else:
                    logging.info('link already crawled: ' + item['lianjie'])
            else:
                yield item
        if self.settings.get('ISQUANPA'):
            # Full crawl: follow the pagination until the last page.
            if curPage < totalpagenum:
                yield scrapy.Request(url=self.return_url(curr_page=curPage + 1), callback=self.parse)
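
    # Illustrative (assumed) shape of the unwrapped search payload, inferred
    # from the field accesses in parse(); values here are placeholders only:
    #   {"curPage": 1, "totalPageNum": 99,
    #    "resultMap": [{"title": "...", "publishTime": "20200101120000",
    #                   "fbjgmc": "...", "ownSubjectDn": "/1/29/.../...",
    #                   "id": "...", "htmlContent": "<p>...</p>"}]}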

    def a_fun(self, href):
        # Per-link hook (presumably invoked by tihuan_a_return); keep href as-is.
        return href

    def img_fun(self, src):
        # Per-image hook (presumably invoked by tihuan_img_return); keep src as-is.
        return src

    def return_url(self, size=10, curr_page=1):
        # Build the JSONP search URL used by the publication index frontend.
        start_url = 'http://www.miit.gov.cn/gdnps/searchIndex.jsp'
        params = {
            "goPage": curr_page,
            "orderBy": [
                {"orderBy": "publishTime", "reverse": "true"},
                {"orderBy": "orderTime", "reverse": "true"}
            ],
            "pageSize": size,
            "queryParam": [
                {},
                {},
                # fbjg appears to select the publishing agency (发布机构) by
                # its subject DN.
                {"shortName": "fbjg", "value": "/1/29/1146295/1652858/1652930"}
            ]
        }
        d_int = int(round(time.time() * 1000))
        # Mimic the jQuery JSONP callback name the site's frontend generates.
        jquery = 'jQuery111108461701558527148_' + str(d_int)
        # The API expects booleans unquoted and no whitespace in the blob.
        params = json.dumps(params).replace(' ', '').replace('"true"', 'true')
        # quote() leaves '/' untouched, so it is swapped for its double-encoded
        # form '%252F' by hand to match what the frontend sends.
        url = (start_url + "?params=" + parse.quote(parse.quote(params)).replace('/', '%252F')
               + '&callback=' + jquery + '&_=' + str(d_int + 1))
        return url
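

# A minimal standalone runner, included as an illustrative sketch (in the
# original setup the spider is presumably launched with
# `scrapy crawl gongyehexinxihuabuSpider`); it assumes the demo1 settings
# (MySQL credentials, FILE_PATH, MESSAGE, ISQUANPA) are configured.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(get_project_settings())
    process.crawl(gongyehexinxihuabuSpider)
    process.start()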