from uuid import uuid4
import datetime
import logging
import copy
from abc import ABCMeta, abstractmethod

from lxml import etree
import pymysql
from twisted.enterprise import adbapi


class Util_WANG(metaclass=ABCMeta):

    @classmethod
    def pos_url(cls, item, settings, response=None):
        """
        Decide whether the link points directly at a downloadable resource
        (pdf, image, archive, etc.).
        :param item: the source item
        :param settings: the Scrapy settings object
        :param response: the request response
        :return: True if the link itself must be downloaded, False otherwise
        """
        jiewei = ['.doc', '.xls', '.docx', '.xlsx', '.txt', '.rar', '.zip',
                  '.wps', '.pdf', '.jpg', '.png', '.jpeg', '.gif', '.svg']
        s = any(item['lianjie'].endswith(ext) for ext in jiewei)
        if s:
            year = datetime.datetime.now().strftime('%Y')
            month = datetime.datetime.now().strftime('%m')
            item['wenjian'] = [{'file_name': '原文件'}]
            item['wenjian'][0]['file_url'] = item['lianjie']
            houzui = item['wenjian'][0]['file_url'][
                item['wenjian'][0]['file_url'].rfind('/') + 1:]
            new_url = '/' + year + '/' + month + '/' + cls.short_uuid() + '_' + houzui
            item['wenjian'][0]['new_file'] = new_url
            # The original HTML literal was garbled in extraction; rebuilt
            # here as a minimal anchor around the surviving visible text.
            item['xiangqing'] = ('<p>请查看原文附件:<a href="' + new_url
                                 + '">原文件</a></p>')
        return s

    @classmethod
    def jiewei_href_contains(cls):
        """
        Return an XPath predicate matching document-file suffixes in @href.
        """
        jiewei = ['.doc', '.xls', '.docx', '.xlsx', '.txt', '.rar', '.zip',
                  '.wps', '.pdf']
        return ' or '.join('contains(@href,"' + j + '")' for j in jiewei)

    @classmethod
    def jiewei_src_contains(cls):
        """
        Return an XPath predicate matching image-file suffixes in @src.
        """
        jiewei = ['.jpg', '.png', '.jpeg', '.gif', '.svg']
        return ' or '.join('contains(@src,"' + j + '")' for j in jiewei)

    @classmethod
    def short_uuid(cls):
        """Derive a short 8-character id from a UUID4."""
        uuid_chars = ('abcdefghijklmnopqrstuvwxyz'
                      '0123456789'
                      'ABCDEFGHIJKLMNOPQRSTUVWXYZ')
        uuid = str(uuid4()).replace('-', '')
        result = ''
        for i in range(8):
            sub = uuid[i * 4: i * 4 + 4]
            x = int(sub, 16)
            result += uuid_chars[x % 0x3E]  # 0x3E == 62 == len(uuid_chars)
        return result

    @classmethod
    def tihuan_a_return(cls, item, tihuanlujing, response=None):
        """
        Rewrite the <a> tags in the page detail: each matching href is
        replaced with a local path and recorded in item['wenjian'].
        Subclasses must implement a_fun.
        :param item: the item to populate
        :param tihuanlujing: path prefix for the rewritten file links
        :param response: optional response, used to resolve relative URLs
        """
        txt = item['xiangqing']
        year = datetime.datetime.now().strftime('%Y')
        month = datetime.datetime.now().strftime('%m')
        html = etree.HTML(txt)
        alis = html.xpath('//a[@href and (' + cls.jiewei_href_contains() + ')]')
        for alis_single in alis:
            single_a_file = {}
            href = str(alis_single.xpath('@href')[0])
            content = str(alis_single.xpath('string(.)'))
            if not content.strip():
                content = '_'
            single_a_file['file_name'] = content
            # Resolve the actual download URL for this link.
            old_url = href
            if old_url.lower().startswith('http'):
                single_a_file['file_url'] = old_url
            elif response is not None and old_url.startswith(('./', '../', '/')):
                single_a_file['file_url'] = response.urljoin(old_url)
            else:
                # a_fun is an instance-style hook; the class object is passed
                # as self, matching the original call convention.
                single_a_file['file_url'] = cls.a_fun(cls, href)
            houzui = single_a_file['file_url'][single_a_file['file_url'].rfind('/') + 1:]
            new_url = '/' + year + '/' + month + '/' + cls.short_uuid() + '_' + houzui
            txt = txt.replace(old_url, tihuanlujing + new_url)
            single_a_file['new_file'] = new_url
            try:
                item['wenjian'].append(single_a_file)
            except KeyError:
                item['wenjian'] = [single_a_file]
        item['xiangqing'] = txt

    @classmethod
    def tihuan_img_return(cls, item, tihuanlujing, response=None):
        """
        Rewrite the <img> tags in the page detail: each matching src is
        replaced with a local path and recorded in item['wenjian'].
        Subclasses must implement img_fun.
        :param item: the item to populate
        :param tihuanlujing: path prefix for the rewritten image links
        :param response: optional response, used to resolve relative URLs
        """
        txt = item['xiangqing']
        year = datetime.datetime.now().strftime('%Y')
        month = datetime.datetime.now().strftime('%m')
        html = etree.HTML(txt)
        imglis = html.xpath('//img[@src and (' + cls.jiewei_src_contains() + ')]')
        for imglis_single in imglis:
            single_src_file = {}
            src = str(imglis_single.xpath('@src')[0])
            content = str(imglis_single.xpath('string(.)'))
            if not content.strip():
                content = '_'
            single_src_file['file_name'] = content
            # Resolve the actual download URL for this image.
            old_url = src
            if old_url.lower().startswith('http'):
                single_src_file['file_url'] = old_url
            elif response is not None and old_url.startswith(('./', '../', '/')):
                single_src_file['file_url'] = response.urljoin(old_url)
            else:
                single_src_file['file_url'] = cls.img_fun(cls, src)
            houzui = single_src_file['file_url'][single_src_file['file_url'].rfind('/') + 1:]
            new_url = '/' + year + '/' + month + '/' + cls.short_uuid() + '_' + houzui
            txt = txt.replace(old_url, tihuanlujing + new_url)
            single_src_file['new_file'] = new_url
            try:
                item['wenjian'].append(single_src_file)
            except KeyError:
                item['wenjian'] = [single_src_file]
        item['xiangqing'] = txt

    @abstractmethod
    def a_fun(self, href):
        """
        Return the processed href, i.e. resolve it to an absolute
        download URL.
        """
        pass

    @abstractmethod
    def img_fun(self, src):
        """
        Return the processed src, i.e. resolve it to an absolute
        download URL.
        """
        pass
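
# A minimal sketch of a concrete subclass, showing how the two abstract
# hooks are meant to be implemented. The domain "http://www.example.com"
# and the call sites below are assumptions for illustration only.
class ExampleSiteUtil(Util_WANG):

    def a_fun(self, href):
        # Resolve a bare relative document link to an absolute download URL
        # (hypothetical site root).
        return 'http://www.example.com/' + href.lstrip('/')

    def img_fun(self, src):
        # Resolve a bare relative image link the same way.
        return 'http://www.example.com/' + src.lstrip('/')

# Typical call sites inside a spider callback (hypothetical):
#   if not ExampleSiteUtil.pos_url(item, self.settings, response):
#       ExampleSiteUtil.tihuan_a_return(item, '/files', response=response)
#       ExampleSiteUtil.tihuan_img_return(item, '/images', response=response)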

# Shared asynchronous-insert pipeline base class.
class Asyninser(object):
    """
    Subclasses inherit this __init__:
        def __init__(self, dbpool):
            self.dbpool = dbpool
    and must implement do_insert:
        def do_insert(self, cursor, item):
    """

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """
        Build the database connection pool. The method name is fixed:
        Scrapy calls it and passes in the settings.
        :param settings: the Scrapy settings
        :return: a pipeline instance
        """
        adbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DATABASE'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            cursorclass=pymysql.cursors.DictCursor  # cursor type
        )
        # Create the adbapi ConnectionPool; pymysql (or MySQLdb) connects.
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        return cls(dbpool)

    def close_spider(self, spider):
        # Shut the pool down so its worker threads do not outlive the spider.
        self.dbpool.close()
        logging.info('Spider finished.')

    def process_item(self, item, spider):
        """
        Run the MySQL insert asynchronously with Twisted: the connection
        pool executes the concrete SQL and returns a Deferred.
        """
        asynItem = copy.deepcopy(item)
        query = self.dbpool.runInteraction(self.do_insert, asynItem)
        query.addErrback(self.handle_error, asynItem, spider)  # handle failures
        return asynItem

    def handle_error(self, failure, asynItem, spider):
        if failure:
            # Log the insert failure.
            logging.error('---------- database insert error ----------')
            logging.error(failure)
            logging.error('---------- end of error ----------')
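
# A minimal sketch of a concrete pipeline built on Asyninser. The table name
# "articles" and its columns are assumptions for illustration; do_insert runs
# in a pool thread with the DictCursor that runInteraction supplies, and the
# pool commits the interaction on success.
class ExampleInsertPipeline(Asyninser):

    def do_insert(self, cursor, item):
        # Persist the fields this module populates (hypothetical schema).
        sql = 'INSERT INTO articles (lianjie, xiangqing) VALUES (%s, %s)'
        cursor.execute(sql, (item.get('lianjie'), item.get('xiangqing')))

# Enabled in settings.py (hypothetical project path):
# ITEM_PIPELINES = {'myproject.pipelines.ExampleInsertPipeline': 300}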