from uuid import uuid4
import datetime
from lxml import etree
import logging
import pymysql
from abc import ABCMeta, abstractmethod
from twisted.enterprise import adbapi
import copy


class Util_WANG(metaclass=ABCMeta):
    @classmethod
    def pos_url(cls, item, settings, response=None):
        """
        Decide whether the link points to a file that should be downloaded directly (pdf, image, etc.).
        :param item: the item built from the source page
        :param settings: the crawler settings object
        :param response: the request response
        :return: True if the link needs to be downloaded, False otherwise
        """
        houzui = item['lianjie'][item['lianjie'].rfind('.'):].strip()
        jiewei = ['.doc', '.xls', '.docx', '.xlsx', '.txt', '.rar', '.zip', '.wps', '.pdf',
                  '.jpg', '.png', '.jpeg', '.gif', '.svg']
        s = False
        for jiewei_sign in jiewei:
            if item['lianjie'].endswith(jiewei_sign):
                s = True
                break
        if s:
            year = datetime.datetime.now().strftime('%Y')
            month = datetime.datetime.now().strftime('%m')
            # Record the original link as the only attachment and rewrite the detail
            # HTML so it points at the relocated copy under FILE_PATH.
            item['wenjian'] = [{'file_name': '原文件'}]
            item['wenjian'][0]['file_url'] = item['lianjie']
            houzui = item['wenjian'][0]['file_url'][item['wenjian'][0]['file_url'].rfind('/') + 1:]
            new_url = '/' + year + '/' + month + '/' + cls.short_uuid() + '_' + houzui
            item['wenjian'][0]['new_file'] = new_url
            item['xiangqing'] = ('<div><p>请查看原文附件:<a href="' + settings.get('FILE_PATH')
                                 + new_url + '">原文件</a></p></div>')
        return s
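
    # Illustrative sketch (not from the original source): after pos_url() returns True for an
    # item whose 'lianjie' ends in '.pdf', the item is expected to look roughly like this
    # (URL, date and short id below are hypothetical):
    #   item['wenjian'] == [{'file_name': '原文件',
    #                        'file_url': 'http://example.com/a/report.pdf',
    #                        'new_file': '/2024/05/a1B2c3D4_report.pdf'}]
    #   item['xiangqing'] == '<div><p>请查看原文附件:<a href="<FILE_PATH>/2024/05/a1B2c3D4_report.pdf">原文件</a></p></div>'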

    @classmethod
    def jiewei_href_contains(cls):
        """
        Build the XPath predicate matching document-file extensions in @href.
        :return: the contains() conditions joined with ' or '
        """
        jiewei = ['.doc', '.xls', '.docx', '.xlsx', '.txt', '.rar', '.zip', '.wps', '.pdf']
        return ' or '.join('contains(@href,"' + j + '")' for j in jiewei)
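
    # For reference, the predicate produced above expands to (abbreviated):
    #   contains(@href,".doc") or contains(@href,".xls") or ... or contains(@href,".pdf")
    # and is meant to be dropped into an XPath such as //a[@href and ( ... )],
    # which is exactly how tihuan_a_return() uses it below.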

    @classmethod
    def jiewei_src_contains(cls):
        """
        Build the XPath predicate matching image-file extensions in @src.
        :return: the contains() conditions joined with ' or '
        """
        jiewei = ['.jpg', '.png', '.jpeg', '.gif', '.svg']
        return ' or '.join('contains(@src,"' + j + '")' for j in jiewei)

    @classmethod
    def short_uuid(cls):
        # Shorten a uuid4 to 8 characters: split its 32-hex-digit form into eight
        # 4-digit chunks and map each chunk onto the 62-character alphabet below.
        uuidChars = ("a", "b", "c", "d", "e", "f",
                     "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s",
                     "t", "u", "v", "w", "x", "y", "z", "0", "1", "2", "3", "4", "5",
                     "6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "I",
                     "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V",
                     "W", "X", "Y", "Z")
        uuid = str(uuid4()).replace('-', '')
        result = ''
        for i in range(0, 8):
            sub = uuid[i * 4: i * 4 + 4]
            x = int(sub, 16)
            result += uuidChars[x % 0x3E]
        return result
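
    # Illustrative only: short_uuid() yields an 8-character id such as 'k3F9qZ1a'
    # (characters drawn from [a-z0-9A-Z]); the exact value is random on each call.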

    @classmethod
    def tihuan_a_return(cls, item, tihuanlujing, response=None):
        """
        Rewrite the <a> tags in the page HTML, collect the referenced files into the item,
        and replace each href with the new download path. Subclasses must implement a_fun().
        :param item: the item the results are stored on
        :param tihuanlujing: prefix of the replacement path written into the HTML
        :param response: the request response, used to resolve relative links
        :return:
        """
        txt = item['xiangqing']
        year = datetime.datetime.now().strftime('%Y')
        month = datetime.datetime.now().strftime('%m')
        panDuanNone = lambda x: '_' if x is None else x
        html = etree.HTML(txt)
        alis = html.xpath('//a[@href and (' + cls.jiewei_href_contains() + ')]')
        for alis_single in alis:
            single_a_file = {}
            href = str(alis_single.xpath('@href')[0])
            content = str(panDuanNone(alis_single.xpath('string(.)')))
            if content.strip() == '':
                content = '_'
            single_a_file['file_name'] = content
            # Resolve the actual download URL; this is usually the only part that
            # needs adjusting per site.
            old_url = href
            if href.lower().startswith('http'):
                single_a_file['file_url'] = old_url
            elif response is not None and (old_url.startswith('./') or old_url.startswith('../')
                                           or old_url.startswith('/')):
                single_a_file['file_url'] = response.urljoin(old_url)
            else:
                # a_fun is supplied by the subclass; note it is called with the class as `self`.
                single_a_file['file_url'] = cls.a_fun(cls, href)
            houzui = single_a_file['file_url'][single_a_file['file_url'].rfind('/') + 1:]
            new_url = '/' + year + '/' + month + '/' + cls.short_uuid() + '_' + houzui
            txt = txt.replace(old_url, tihuanlujing + new_url)
            single_a_file['new_file'] = new_url
            try:
                item['wenjian'].append(single_a_file)
            except Exception:
                item['wenjian'] = [single_a_file]
        item['xiangqing'] = txt
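
    # Illustrative sketch: each entry appended to item['wenjian'] above has the shape
    #   {'file_name': '<link text>', 'file_url': '<absolute source URL>',
    #    'new_file': '/<year>/<month>/<short_uuid>_<original filename>'}
    # and the matching href in item['xiangqing'] is rewritten to tihuanlujing + new_file.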

    @classmethod
    def tihuan_img_return(cls, item, tihuanlujing, response=None):
        """
        Rewrite the <img> tags in the page HTML, collect the referenced images into the item,
        and replace each src with the new download path. Subclasses must implement img_fun().
        :param item: the item the results are stored on
        :param tihuanlujing: prefix of the replacement path written into the HTML
        :param response: the request response, used to resolve relative links
        :return:
        """
        txt = item['xiangqing']
        year = datetime.datetime.now().strftime('%Y')
        month = datetime.datetime.now().strftime('%m')
        panDuanNone = lambda x: '_' if x is None else x
        html = etree.HTML(txt)
        imglis = html.xpath('//img[@src and (' + cls.jiewei_src_contains() + ')]')
        for imglis_single in imglis:
            single_src_file = {}
            src = str(imglis_single.xpath('@src')[0])
            content = str(panDuanNone(imglis_single.xpath('string(.)')))
            if content.strip() == '':
                content = '_'
            single_src_file['file_name'] = content
            old_url = src
            # Resolve the actual download URL; this is usually the only part that
            # needs adjusting per site.
            if old_url.lower().startswith('http'):
                single_src_file['file_url'] = old_url
            elif response is not None and (old_url.startswith('./') or old_url.startswith('../')
                                           or old_url.startswith('/')):
                single_src_file['file_url'] = response.urljoin(old_url)
            else:
                # img_fun is supplied by the subclass; note it is called with the class as `self`.
                single_src_file['file_url'] = cls.img_fun(cls, src)
            houzui = single_src_file['file_url'][single_src_file['file_url'].rfind('/') + 1:]
            new_url = '/' + year + '/' + month + '/' + cls.short_uuid() + '_' + houzui
            txt = txt.replace(old_url, tihuanlujing + new_url)
            single_src_file['new_file'] = new_url
            try:
                item['wenjian'].append(single_src_file)
            except Exception:
                item['wenjian'] = [single_src_file]
        item['xiangqing'] = txt

    @abstractmethod
    def a_fun(self, href):
        """
        Return the real download URL derived from the given href
        (used when the href is neither absolute nor resolvable against the response).
        :param href: the raw href taken from the page
        :return: the resolved URL
        """
        pass

    @abstractmethod
    def img_fun(self, src):
        """
        Return the real download URL derived from the given src
        (used when the src is neither absolute nor resolvable against the response).
        :param src: the raw src taken from the page
        :return: the resolved URL
        """
        pass
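

# Illustrative sketch only (not part of the original utilities): a minimal subclass showing
# how a_fun/img_fun might be implemented for a site whose raw links are paths relative to a
# known site root. The class name, BASE_URL and joining logic are assumptions for the example;
# remember that the base-class classmethods invoke these hooks with the class itself as `self`.
class _ExampleSiteUtil(Util_WANG):
    BASE_URL = 'http://www.example.com'  # hypothetical site root

    def a_fun(self, href):
        # Fall back to prefixing the (hypothetical) site root onto the raw href.
        return _ExampleSiteUtil.BASE_URL + '/' + href.lstrip('/')

    def img_fun(self, src):
        return _ExampleSiteUtil.BASE_URL + '/' + src.lstrip('/')

# Typical call sites inside a spider's parse callback (prefix value is an assumption):
#   _ExampleSiteUtil.tihuan_a_return(item, settings.get('FILE_PATH'), response)
#   _ExampleSiteUtil.tihuan_img_return(item, settings.get('FILE_PATH'), response)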


# Shared asynchronous-insert base pipeline.
class Asyninser(object):
    """
    Subclasses keep the default constructor:
        def __init__(self, dbpool):
            self.dbpool = dbpool
    and implement the actual insert:
        def do_insert(self, cursor, item):
    """

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):  # fixed method name, called by Scrapy with the project settings
        """
        Create the database connection pool.
        :param settings: the Scrapy settings
        :return: an instance of the pipeline
        """
        adbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DATABASE'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            cursorclass=pymysql.cursors.DictCursor  # return rows as dicts
        )
        # Twisted connection pool; pymysql (or MySQLdb) does the actual connecting.
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        # Return the pipeline instance built around the pool.
        return cls(dbpool)
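
    # The keys above are read from the Scrapy project settings; illustrative values
    # (hypothetical) would look like:
    #   MYSQL_HOST = 'localhost'
    #   MYSQL_DATABASE = 'spider_db'
    #   MYSQL_USER = 'root'
    #   MYSQL_PASSWORD = 'secret'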

    def close_spider(self, spider):
        logging.info('Spider finished running')

    def process_item(self, item, spider):
        """
        Run the MySQL insert asynchronously through Twisted: the connection pool executes
        the concrete SQL in do_insert and returns a Deferred.
        """
        asynItem = copy.deepcopy(item)
        query = self.dbpool.runInteraction(self.do_insert, asynItem)  # which method to run and with what data
        query.addErrback(self.handle_error, asynItem, spider)  # attach error handling
        return asynItem

    def handle_error(self, failure, asynItem, spider):
        if failure:
            # Log the database insert failure.
            logging.info('---------- database insert error ----------')
            logging.info(failure)
            logging.info('---------- end of error ----------')
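

# Illustrative sketch only (not part of the original pipeline): a minimal concrete pipeline
# built on Asyninser. The table and column names below are hypothetical; do_insert runs inside
# the Twisted connection pool's interaction, so `cursor` behaves like a DB-API cursor and the
# commit is handled by adbapi after the interaction completes.
class _ExampleInsertPipeline(Asyninser):
    def do_insert(self, cursor, item):
        # Hypothetical schema: articles(lianjie, xiangqing)
        sql = 'INSERT INTO articles (lianjie, xiangqing) VALUES (%s, %s)'
        cursor.execute(sql, (item.get('lianjie'), item.get('xiangqing')))

# To enable it in a project (module path and priority are assumptions):
#   ITEM_PIPELINES = {'myproject.pipelines._ExampleInsertPipeline': 300}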