You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
223 lines
9.6 KiB
223 lines
9.6 KiB
from uuid import uuid4
|
|
import datetime
|
|
from lxml import etree
|
|
import logging
|
|
import pymysql
|
|
from abc import ABCMeta,abstractmethod
|
|
from twisted.enterprise import adbapi
|
|
import copy
|
|
class Util_WANG(metaclass=ABCMeta):
    """Helpers that turn file/image links of a scraped item into download records.

    Every public helper is a classmethod working on a dict-like ``item``:

    * ``item['lianjie']``   - source URL of the page (read by :meth:`pos_url`).
    * ``item['xiangqing']`` - detail HTML; links inside it are rewritten.
    * ``item['wenjian']``   - list of file records, each a dict with the keys
      ``file_name``, ``file_url`` (where to download from) and ``new_file``
      (the generated storage path).

    Subclasses must implement :meth:`a_fun` and :meth:`img_fun`.  They are
    invoked on the class itself as ``cls.a_fun(cls, href)``, i.e. the class
    object is passed in the ``self`` position.
    """

    # Suffixes treated as downloadable documents / archives.
    _DOC_SUFFIXES = ('.doc', '.xls', '.docx', '.xlsx', '.txt', '.rar', '.zip', '.wps', '.pdf')
    # Suffixes treated as images.
    _IMG_SUFFIXES = ('.jpg', '.png', '.jpeg', '.gif', '.svg')
    # Alphabet used by short_uuid(): a-z, 0-9, A-Z (62 symbols).
    _UUID_CHARS = 'abcdefghijklmnopqrstuvwxyz0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    @classmethod
    def pos_url(cls, item, settings, response=None):
        """Detect whether the item's link points directly at a file (pdf, image, ...).

        When it does, ``item['wenjian']`` is replaced by a single record for
        that file and ``item['xiangqing']`` by a small HTML snippet linking to
        the stored copy.

        :param item: the scraped item; ``item['lianjie']`` is inspected
        :param settings: settings object; ``settings.get('FILE_PATH')`` is used
            as the URL prefix of the stored file
        :param response: unused, kept for interface compatibility
        :return: True if the link is a direct file that needs downloading,
            otherwise False (the item is left untouched)
        """
        link = item['lianjie']
        # str.endswith accepts a tuple, so one call covers every suffix.
        if not link.endswith(cls._DOC_SUFFIXES + cls._IMG_SUFFIXES):
            return False

        new_url = cls._dated_path(link)
        item['wenjian'] = [{'file_name': '原文件', 'file_url': link, 'new_file': new_url}]
        item['xiangqing'] = ('<div><p>请查看原文附件:<a href="' + settings.get('FILE_PATH')
                             + new_url + '">原文件</a></p></div>')
        return True

    @classmethod
    def jiewei_href_contains(cls):
        """Return an XPath predicate matching ``@href`` values that contain a document suffix.

        :return: ``contains(@href,".doc") or contains(@href,".xls") or ...``
        """
        return ' or '.join('contains(@href,"' + suffix + '")' for suffix in cls._DOC_SUFFIXES)

    @classmethod
    def jiewei_src_contains(cls):
        """Return an XPath predicate matching ``@src`` values that contain an image suffix.

        :return: ``contains(@src,".jpg") or contains(@src,".png") or ...``
        """
        return ' or '.join('contains(@src,"' + suffix + '")' for suffix in cls._IMG_SUFFIXES)

    @classmethod
    def short_uuid(cls):
        """Return an 8-character identifier derived from a random UUID4.

        The 32 hex digits of the UUID are split into eight 4-digit groups and
        each group is mapped onto the 62-symbol alphabet.
        """
        hex_digits = uuid4().hex
        alphabet = cls._UUID_CHARS
        return ''.join(
            alphabet[int(hex_digits[start:start + 4], 16) % len(alphabet)]
            for start in range(0, 32, 4)
        )

    @classmethod
    def _dated_path(cls, file_url):
        """Build ``/<year>/<month>/<short uuid>_<last path segment of file_url>``."""
        # A single now() call keeps year and month consistent with each other.
        now = datetime.datetime.now()
        basename = file_url[file_url.rfind('/') + 1:]
        return now.strftime('/%Y/%m/') + cls.short_uuid() + '_' + basename

    @classmethod
    def _rewrite_links(cls, item, tihuanlujing, response, xpath, attr, fallback):
        """Shared implementation of tihuan_a_return / tihuan_img_return.

        :param item: item whose ``xiangqing`` HTML is rewritten and whose
            ``wenjian`` list receives one record per matched element
        :param tihuanlujing: prefix put in front of every generated path
        :param response: optional response used to resolve relative URLs
        :param xpath: XPath expression selecting the elements to process
        :param attr: attribute holding the URL (``href`` or ``src``)
        :param fallback: callable invoked as ``fallback(cls, url)`` when the
            URL can be resolved neither as absolute nor via ``response``
        """
        txt = item['xiangqing']
        html = etree.HTML(txt)
        if html is None:
            # Defensive: without a root element there is nothing to rewrite.
            return

        for element in html.xpath(xpath):
            old_url = str(element.get(attr))
            label = str(element.xpath('string(.)'))
            if not label.strip():
                label = '_'

            if old_url.lower().startswith('http'):
                file_url = old_url
            elif response is not None and old_url.startswith(('./', '../', '/')):
                file_url = response.urljoin(old_url)
            else:
                # Site-specific resolution supplied by the subclass.
                file_url = fallback(cls, old_url)

            new_url = cls._dated_path(file_url)
            # NOTE: str.replace substitutes every occurrence of old_url in the HTML.
            txt = txt.replace(old_url, tihuanlujing + new_url)

            record = {'file_name': label, 'file_url': file_url, 'new_file': new_url}
            try:
                item['wenjian'].append(record)
            except (KeyError, AttributeError):
                # No usable list yet: start one with this record.
                item['wenjian'] = [record]

        item['xiangqing'] = txt

    @classmethod
    def tihuan_a_return(cls, item, tihuanlujing, response=None):
        """Rewrite document links (``<a href>``) in ``item['xiangqing']``.

        Every ``<a>`` whose ``href`` contains a document suffix is recorded in
        ``item['wenjian']`` and its URL in the HTML is replaced by
        ``tihuanlujing`` plus a generated path.  Subclasses must implement
        :meth:`a_fun`, which resolves URLs that cannot be resolved otherwise.

        :param item: the item to update
        :param tihuanlujing: prefix of the replacement file path
        :param response: optional response used to resolve relative URLs
        :return: None
        """
        cls._rewrite_links(
            item, tihuanlujing, response,
            '//a[@href and (' + cls.jiewei_href_contains() + ')]',
            'href', cls.a_fun,
        )

    @classmethod
    def tihuan_img_return(cls, item, tihuanlujing, response=None):
        """Rewrite image sources (``<img src>``) in ``item['xiangqing']``.

        Every ``<img>`` whose ``src`` contains an image suffix is recorded in
        ``item['wenjian']`` and its URL in the HTML is replaced by
        ``tihuanlujing`` plus a generated path.  Subclasses must implement
        :meth:`img_fun`, which resolves URLs that cannot be resolved otherwise.

        :param item: the item to update
        :param tihuanlujing: prefix of the replacement image path
        :param response: optional response used to resolve relative URLs
        :return: None
        """
        cls._rewrite_links(
            item, tihuanlujing, response,
            '//img[@src and (' + cls.jiewei_src_contains() + ')]',
            'src', cls.img_fun,
        )

    @abstractmethod
    def a_fun(self, href):
        """Return the download URL derived from ``href``.

        :param href: the raw ``href`` value found in the page
        :return: the URL to download the file from
        """
        pass

    @abstractmethod
    def img_fun(self, src):
        """Return the download URL derived from ``src``.

        :param src: the raw ``src`` value found in the page
        :return: the URL to download the image from
        """
        pass
|
|
|
|
|
|
# Shared base pipeline for asynchronous database inserts
|
|
class Asyninser(object):
    """Base item pipeline that inserts items into MySQL asynchronously.

    Inserts are dispatched through a Twisted ``adbapi`` connection pool.
    Subclasses provide the actual SQL by implementing::

        def do_insert(self, cursor, item):
            ...

    A subclass that overrides ``__init__`` must keep storing the pool::

        def __init__(self, dbpool):
            self.dbpool = dbpool
    """

    def __init__(self, dbpool):
        """Store the connection pool used by :meth:`process_item`."""
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):  # fixed name: called by Scrapy with the project settings
        """Create the pipeline with a database connection pool built from settings.

        :param settings: configuration; reads ``MYSQL_HOST``, ``MYSQL_DATABASE``,
            ``MYSQL_USER`` and ``MYSQL_PASSWORD``
        :return: a pipeline instance bound to the new pool
        """
        adbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DATABASE'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            cursorclass=pymysql.cursors.DictCursor,  # rows come back as dicts
        )
        # Connection pool backed by the pymysql driver.
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        return cls(dbpool)

    def close_spider(self, spider):
        """Log that the spider has finished."""
        logging.info('爬虫运行完毕了')

    def process_item(self, item, spider):
        """Schedule an asynchronous insert for ``item`` and return a copy of it.

        The item is deep-copied first so the insert works on its own snapshot
        of the data; ``do_insert`` runs through the pool's ``runInteraction``
        and failures are routed to :meth:`handle_error`.

        :return: the deep copy of ``item`` that was handed to the insert
        """
        item_copy = copy.deepcopy(item)
        query = self.dbpool.runInteraction(self.do_insert, item_copy)
        query.addErrback(self.handle_error, item_copy, spider)
        return item_copy

    def handle_error(self, failure, asynItem, spider):
        """Log a failed insert.

        Logged at ERROR level so that insert failures stay visible when the
        log level is raised above INFO.

        :param failure: the failure passed to the errback
        :param asynItem: the item whose insert failed (not used here)
        :param spider: the spider that produced the item (not used here)
        """
        if not failure:
            return
        logging.error('----------数据库插入异常信息--------')
        logging.error(failure)
        logging.error('---------异常信息结束--------')
|