import copy
import datetime
import logging
from abc import ABCMeta, abstractmethod
from uuid import uuid4

import pymysql
from lxml import etree
from twisted.enterprise import adbapi

class Util_WANG(metaclass=ABCMeta):
@classmethod
    def pos_url(cls, item, settings, response=None):
        """
        Decide whether the link points to a file that must be downloaded directly
        (pdf, image, office document, etc.).
        :param item: the item holding the original link
        :param settings: the crawler settings object
        :param response: the request response, if available
        :return: True if the link needs to be downloaded, False otherwise
        """
        jiewei = ['.doc', '.xls', '.docx', '.xlsx', '.txt', '.rar', '.zip', '.wps', '.pdf',
                  '.jpg', '.png', '.jpeg', '.gif', '.svg']
        # True when the link itself ends with one of the downloadable suffixes
        s = item['lianjie'].endswith(tuple(jiewei))
        if s:
            year = datetime.datetime.now().strftime('%Y')
            month = datetime.datetime.now().strftime('%m')
            item['wenjian'] = [{'file_name': '原文件'}]
            item['wenjian'][0]['file_url'] = item['lianjie']
            # keep the original file name as the tail of the new storage path
            houzui = item['wenjian'][0]['file_url'][item['wenjian'][0]['file_url'].rfind('/') + 1:]
            new_url = '/' + year + '/' + month + '/' + cls.short_uuid() + '_' + houzui
            item['wenjian'][0]['new_file'] = new_url
            item['xiangqing'] = ''  # a direct file download keeps no HTML detail
        return s
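
    # For illustration (hypothetical values): with item['lianjie'] ending in '.pdf',
    # pos_url() returns True and leaves the item looking roughly like
    #   item['wenjian'] == [{'file_name': '原文件',
    #                        'file_url': 'http://example.com/files/report.pdf',
    #                        'new_file': '/2024/05/aB3dE9xZ_report.pdf'}]
    # where the year/month/short-id parts depend on when pos_url() runs.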
@classmethod
    def jiewei_href_contains(cls):
        """
        Build the XPath predicate matching hrefs that end with a downloadable
        document suffix.
        :return: a string of the form 'contains(@href,".doc") or contains(@href,".xls") or ...'
        """
        jiewei = ['.doc', '.xls', '.docx', '.xlsx', '.txt', '.rar', '.zip', '.wps', '.pdf']
        return ' or '.join('contains(@href,"' + j + '")' for j in jiewei)
@classmethod
    def jiewei_src_contains(cls):
        """
        Build the XPath predicate matching img srcs that end with an image suffix.
        :return: a string of the form 'contains(@src,".jpg") or contains(@src,".png") or ...'
        """
        jiewei = ['.jpg', '.png', '.jpeg', '.gif', '.svg']
        return ' or '.join('contains(@src,"' + j + '")' for j in jiewei)
@classmethod
def short_uuid(cls):
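        """
        Derive an 8-character id from a uuid4: the 32 hex digits are consumed in
        groups of 4, and each group is mapped to one character of the
        62-character alphabet below.
        """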
uuidChars = ("a", "b", "c", "d", "e", "f",
"g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s",
"t", "u", "v", "w", "x", "y", "z", "0", "1", "2", "3", "4", "5",
"6", "7", "8", "9", "A", "B", "C", "D", "E", "F", "G", "H", "I",
"J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V",
"W", "X", "Y", "Z")
uuid = str(uuid4()).replace('-', '')
result = ''
for i in range(0, 8):
sub = uuid[i * 4: i * 4 + 4]
x = int(sub, 16)
result += uuidChars[x % 0x3E]
return result
@classmethod
    def tihuan_a_return(cls, item, tihuanlujing, response=None):
        """
        Rewrite the <a> tags inside the page detail: every matching href is replaced
        with a new storage path, and the file info is appended to item['wenjian'].
        Subclasses must implement a_fun() for hrefs that cannot be resolved here.
        :param item: the item whose 'xiangqing' field holds the detail HTML
        :param tihuanlujing: path prefix used when rewriting the replaced file links
        :param response: the request response, used to resolve relative links
        :return:
        """
        txt = item['xiangqing']
        year = datetime.datetime.now().strftime('%Y')
        month = datetime.datetime.now().strftime('%m')
        panDuanNone = lambda x: '_' if x is None else x
        html = etree.HTML(txt)
        alis = html.xpath('//a[@href and (' + cls.jiewei_href_contains() + ')]')
        for alis_single in alis:
            single_a_file = {}
            href = str(alis_single.xpath('@href')[0])
            content = str(panDuanNone(alis_single.xpath('string(.)')))
            if content.strip() == '':
                content = '_'
            single_a_file['file_name'] = content
            # resolve the real download address of this link
            old_url = href
            if old_url.lower().startswith('http'):
                single_a_file['file_url'] = old_url
            elif response is not None and old_url.startswith(('./', '../', '/')):
                # relative link: resolve it against the current response URL
                single_a_file['file_url'] = response.urljoin(old_url)
            else:
                single_a_file['file_url'] = cls.a_fun(cls, href)
            houzui = single_a_file['file_url'][single_a_file['file_url'].rfind('/') + 1:]
            new_url = '/' + year + '/' + month + '/' + cls.short_uuid() + '_' + houzui
            txt = txt.replace(old_url, tihuanlujing + new_url)
            single_a_file['new_file'] = new_url
            try:
                item['wenjian'].append(single_a_file)
            except KeyError:
                item['wenjian'] = [single_a_file]
        item['xiangqing'] = txt
@classmethod
    def tihuan_img_return(cls, item, tihuanlujing, response=None):
        """
        Rewrite the <img> tags inside the page detail: every matching src is replaced
        with a new storage path, and the image info is appended to item['wenjian'].
        Subclasses must implement img_fun() for srcs that cannot be resolved here.
        :param item: the item whose 'xiangqing' field holds the detail HTML
        :param tihuanlujing: path prefix used when rewriting the replaced image links
        :param response: the request response, used to resolve relative links
        :return:
        """
        txt = item['xiangqing']
        year = datetime.datetime.now().strftime('%Y')
        month = datetime.datetime.now().strftime('%m')
        panDuanNone = lambda x: '_' if x is None else x
        html = etree.HTML(txt)
        imglis = html.xpath('//img[@src and (' + cls.jiewei_src_contains() + ')]')
        for imglis_single in imglis:
            single_src_file = {}
            src = str(imglis_single.xpath('@src')[0])
            content = str(panDuanNone(imglis_single.xpath('string(.)')))
            if content.strip() == '':
                content = '_'
            single_src_file['file_name'] = content
            # resolve the real download address of this image
            old_url = src
            if old_url.lower().startswith('http'):
                single_src_file['file_url'] = old_url
            elif response is not None and old_url.startswith(('./', '../', '/')):
                # relative src: resolve it against the current response URL
                single_src_file['file_url'] = response.urljoin(old_url)
            else:
                single_src_file['file_url'] = cls.img_fun(cls, src)
            houzui = single_src_file['file_url'][single_src_file['file_url'].rfind('/') + 1:]
            new_url = '/' + year + '/' + month + '/' + cls.short_uuid() + '_' + houzui
            txt = txt.replace(old_url, tihuanlujing + new_url)
            single_src_file['new_file'] = new_url
            try:
                item['wenjian'].append(single_src_file)
            except KeyError:
                item['wenjian'] = [single_src_file]
        item['xiangqing'] = txt
@abstractmethod
    def a_fun(self, href):
        """
        Turn an href that could not be resolved automatically into the real
        download URL and return it.
        :param href: the raw href found in the detail HTML
        :return: the absolute download URL
        """
        pass
@abstractmethod
    def img_fun(self, src):
        """
        Turn a src that could not be resolved automatically into the real
        download URL and return it.
        :param src: the raw src found in the detail HTML
        :return: the absolute download URL
        """
        pass
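

# A minimal sketch of a concrete subclass, assuming a hypothetical site whose bare
# relative links hang off http://www.example.com; the class name and base URL are
# illustrative only, the two hooks are the ones declared abstract above.
class ExampleSiteUtil(Util_WANG):
    def a_fun(self, href):
        # fall back to prefixing the (assumed) site root for hrefs the base class
        # could not resolve on its own
        return 'http://www.example.com/' + href.lstrip('/')

    def img_fun(self, src):
        # image srcs resolve the same way in this sketch
        return 'http://www.example.com/' + src.lstrip('/')

# Typical call site (item and response are whatever the spider already has):
#   ExampleSiteUtil.tihuan_a_return(item, '/files', response=response)
#   ExampleSiteUtil.tihuan_img_return(item, '/images', response=response)
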
# Shared asynchronous insert pipeline
class Asyninser(object):
    '''
    Base class for asynchronous MySQL inserts. A subclass keeps this __init__:
        def __init__(self, dbpool):
            self.dbpool = dbpool
    and implements the actual insert:
        def do_insert(self, cursor, item):
            ...
    '''
    def __init__(self, dbpool):
        self.dbpool = dbpool
@classmethod
    def from_settings(cls, settings):  # fixed method name: Scrapy calls it and passes in the project settings
        """
        Open the database connection pool.
        :param settings: the Scrapy settings
        :return: a pipeline instance holding the pool
        """
        adbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DATABASE'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            cursorclass=pymysql.cursors.DictCursor  # return rows as dicts
        )
        # twisted ConnectionPool on top of pymysql (MySQLdb would also work)
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        # hand the pool to the pipeline instance
        return cls(dbpool)
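
    # The pool above expects these keys in settings.py; the values shown are
    # placeholders, not real credentials:
    #   MYSQL_HOST = 'localhost'
    #   MYSQL_DATABASE = 'spider_db'
    #   MYSQL_USER = 'root'
    #   MYSQL_PASSWORD = '******'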
    def close_spider(self, spider):
        logging.info('spider finished running')
def process_item(self, item, spider):
"""
使用twisted将MySQL插入变成异步执行。通过连接池执行具体的sql操作,返回一个对象
"""
asynItem = copy.deepcopy(item)
query = self.dbpool.runInteraction(self.do_insert, asynItem) # 指定操作方法和操作数据
# 添加异常处理
query.addErrback(self.handle_error,asynItem,spider) # 处理异常
return asynItem
    def handle_error(self, failure, asynItem, spider):
        if failure:
            # log the failed insert
            logging.error('---------- database insert error ----------')
            logging.error(failure)
            logging.error('---------- end of error ----------')
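

# A minimal sketch of a concrete pipeline built on Asyninser, assuming a hypothetical
# table `news` with columns (id, biaoti, xiangqing); the table name, columns and item
# fields are illustrative, only the do_insert hook itself is part of the contract above.
class ExampleNewsPipeline(Asyninser):
    def do_insert(self, cursor, item):
        # runs inside the twisted connection pool with a pymysql DictCursor
        sql = 'insert into news(id, biaoti, xiangqing) values (%s, %s, %s)'
        cursor.execute(sql, (item.get('id'), item.get('biaoti'), item.get('xiangqing')))

# Enable it in settings.py (module path is hypothetical):
#   ITEM_PIPELINES = {'myproject.pipelines.ExampleNewsPipeline': 300}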