wiki_enterprise_python/demo1/Util.py


								from uuid import uuid4

								import datetime

								from lxml import etree

								import logging

								import pymysql

								from abc import ABCMeta,abstractmethod

								from twisted.enterprise import adbapi

								import copy

								class Util_WANG(metaclass=ABCMeta):
								    """Class-level helpers that locate downloadable files in a scraped item.

								    The helpers operate on a mapping-like ``item`` through these keys (all
								    of them are read or written by the code below):

								    * ``lianjie``   - the item's link (a URL string).
								    * ``xiangqing`` - the item's HTML body.
								    * ``wenjian``   - list of ``{'file_name', 'file_url', 'new_file'}``
								      dicts, one per file found.

								    Subclasses implement ``a_fun`` and ``img_fun``; they convert an
								    ``href``/``src`` value that could not be resolved here into a URL.
								    """

								    # Document/archive extensions looked for in <a href="..."> values.
								    _FILE_SUFFIXES = ('.doc', '.xls', '.docx', '.xlsx', '.txt', '.rar',
								                      '.zip', '.wps', '.pdf')
								    # Image extensions looked for in <img src="..."> values.
								    _IMAGE_SUFFIXES = ('.jpg', '.png', '.jpeg', '.gif', '.svg')
								    # 62-symbol alphabet used by short_uuid(): a-z, 0-9, A-Z (in that order).
								    _UUID_CHARS = ('abcdefghijklmnopqrstuvwxyz'
								                   '0123456789'
								                   'ABCDEFGHIJKLMNOPQRSTUVWXYZ')

								    @classmethod
								    def pos_url(cls, item, settings, response=None):
								        """Detect whether the item's link points directly at a file.

								        When ``item['lianjie']`` ends with one of the known document or image
								        extensions (case-sensitive), ``item['wenjian']`` is replaced by a
								        single entry describing that file and ``item['xiangqing']`` by a
								        short HTML snippet that links to the file's new path.

								        :param item: mapping with a ``lianjie`` key; mutated in place on a
								            match, left untouched otherwise.
								        :param settings: object whose ``get('FILE_PATH')`` returns the string
								            prefix put in front of the new file path in the generated link.
								        :param response: unused; kept for signature compatibility.
								        :return: ``True`` if the link is a direct file link, else ``False``.
								        """
								        link = item['lianjie']
								        # str.endswith accepts a tuple, so one call covers every extension.
								        if not link.endswith(cls._FILE_SUFFIXES + cls._IMAGE_SUFFIXES):
								            return False

								        new_url = cls._storage_path(link)
								        item['wenjian'] = [{
								            'file_name': '原文件',
								            'file_url': link,
								            'new_file': new_url,
								        }]
								        item['xiangqing'] = ('<div><p>请查看原文附件：<a href="'
								                             + settings.get('FILE_PATH') + new_url
								                             + '">原文件</a></p></div>')
								        return True

								    @classmethod
								    def jiewei_href_contains(cls):
								        """Return the XPath predicate matching ``@href`` against file extensions.

								        :return: ``contains(@href,".doc") or contains(@href,".xls") or ...``
								        """
								        return ' or '.join(
								            'contains(@href,"' + suffix + '")' for suffix in cls._FILE_SUFFIXES
								        )

								    @classmethod
								    def jiewei_src_contains(cls):
								        """Return the XPath predicate matching ``@src`` against image extensions.

								        :return: ``contains(@src,".jpg") or contains(@src,".png") or ...``
								        """
								        return ' or '.join(
								            'contains(@src,"' + suffix + '")' for suffix in cls._IMAGE_SUFFIXES
								        )

								    @classmethod
								    def short_uuid(cls):
								        """Return an 8-character alphanumeric token derived from a random UUID.

								        The 32 hex digits of a ``uuid4`` are cut into eight 4-digit groups;
								        each group, taken modulo 62, selects one character of the alphabet.
								        """
								        hex_digits = uuid4().hex
								        alphabet = cls._UUID_CHARS
								        return ''.join(
								            alphabet[int(hex_digits[start:start + 4], 16) % len(alphabet)]
								            for start in range(0, 32, 4)
								        )

								    @classmethod
								    def _storage_path(cls, file_url, now=None):
								        """Return ``/<year>/<month>/<short uuid>_<basename>`` for *file_url*.

								        :param file_url: URL whose last ``/``-separated segment becomes the
								            file name part.
								        :param now: ``datetime`` used for the year/month directories;
								            defaults to the current local time.
								        """
								        if now is None:
								            now = datetime.datetime.now()
								        base_name = file_url[file_url.rfind('/') + 1:]
								        return now.strftime('/%Y/%m/') + cls.short_uuid() + '_' + base_name

								    @classmethod
								    def _rewrite_links(cls, item, tihuanlujing, response, xpath_expr,
								                       attr, resolve_fallback):
								        """Shared implementation of tihuan_a_return / tihuan_img_return.

								        :param item: mapping whose ``xiangqing`` HTML is rewritten in place and
								            whose ``wenjian`` list receives one entry per matched element.
								        :param tihuanlujing: prefix put in front of each new file path.
								        :param response: optional object with ``urljoin``; used to resolve
								            links that start with ``./``, ``../`` or ``/``.
								        :param xpath_expr: XPath selecting the elements to process.
								        :param attr: name of the attribute holding the link.
								        :param resolve_fallback: ``a_fun`` or ``img_fun``; called for links
								            that are neither absolute nor resolvable through *response*.
								        """
								        txt = item['xiangqing']
								        if not txt:
								            # Nothing to parse, so there is nothing to rewrite.
								            return
								        html = etree.HTML(txt)
								        if html is None:
								            # NOTE(review): lxml appears to return None for input without
								            # any markup (e.g. whitespace only) - confirm against lxml docs.
								            return

								        now = datetime.datetime.now()
								        for node in html.xpath(xpath_expr):
								            old_url = str(node.xpath('@' + attr)[0])
								            label = str(node.xpath('string(.)'))
								            if not label.strip():
								                # Elements without text (always the case for <img>) get a
								                # placeholder name.
								                label = '_'

								            if old_url.lower().startswith('http'):
								                file_url = old_url
								            elif response is not None and old_url.startswith(('./', '../', '/')):
								                file_url = response.urljoin(old_url)
								            else:
								                # The hook is declared as an instance method but no instance
								                # exists here, so the class fills the ``self`` slot.
								                file_url = resolve_fallback(cls, old_url)

								            new_url = cls._storage_path(file_url, now)
								            # NOTE: str.replace rewrites every occurrence of old_url in the
								            # markup, not only the attribute of the current element.
								            txt = txt.replace(old_url, tihuanlujing + new_url)

								            entry = {
								                'file_name': label,
								                'file_url': file_url,
								                'new_file': new_url,
								            }
								            try:
								                item['wenjian'].append(entry)
								            except (KeyError, AttributeError):
								                # No usable list stored under 'wenjian' yet: start one.
								                item['wenjian'] = [entry]

								        item['xiangqing'] = txt

								    @classmethod
								    def tihuan_a_return(cls, item, tihuanlujing, response=None):
								        """Rewrite file links in ``<a>`` tags and record them on the item.

								        Every ``<a href>`` in ``item['xiangqing']`` whose href contains a known
								        file extension is appended to ``item['wenjian']``, and the href text
								        in the HTML is replaced by ``tihuanlujing`` + the new file path.
								        Subclasses must implement ``a_fun`` for links that cannot be resolved
								        here.

								        :param item: mapping that is updated in place.
								        :param tihuanlujing: prefix of the replacement file path.
								        :param response: optional object with ``urljoin`` for relative links.
								        :return: ``None``
								        """
								        cls._rewrite_links(
								            item, tihuanlujing, response,
								            '//a[@href and (' + cls.jiewei_href_contains() + ')]',
								            'href', cls.a_fun,
								        )

								    @classmethod
								    def tihuan_img_return(cls, item, tihuanlujing, response=None):
								        """Rewrite image sources in ``<img>`` tags and record them on the item.

								        Every ``<img src>`` in ``item['xiangqing']`` whose src contains a known
								        image extension is appended to ``item['wenjian']``, and the src text
								        in the HTML is replaced by ``tihuanlujing`` + the new file path.
								        Subclasses must implement ``img_fun`` for sources that cannot be
								        resolved here.

								        :param item: mapping that is updated in place.
								        :param tihuanlujing: prefix of the replacement image path.
								        :param response: optional object with ``urljoin`` for relative links.
								        :return: ``None``
								        """
								        cls._rewrite_links(
								            item, tihuanlujing, response,
								            '//img[@src and (' + cls.jiewei_src_contains() + ')]',
								            'src', cls.img_fun,
								        )

								    @abstractmethod
								    def a_fun(self, href):
								        """Return the processed form of *href* (the URL to download).

								        Called as ``cls.a_fun(cls, href)``, i.e. ``self`` is the class.

								        :param href: raw ``href`` value taken from an ``<a>`` tag.
								        :return: the URL stored as ``file_url``.
								        """
								        pass

								    @abstractmethod
								    def img_fun(self, src):
								        """Return the processed form of *src* (the URL to download).

								        Called as ``cls.img_fun(cls, src)``, i.e. ``self`` is the class.

								        :param src: raw ``src`` value taken from an ``<img>`` tag.
								        :return: the URL stored as ``file_url``.
								        """
								        pass


								# 公共的异步插入

								class Asyninser(object):
								    """Base item pipeline that runs database inserts through a connection pool.

								    To use it, subclass and implement::

								        def do_insert(self, cursor, item):
								            ...

								    ``do_insert`` is handed to ``dbpool.runInteraction`` by ``process_item``.
								    A subclass that overrides ``__init__`` must still store the pool as
								    ``self.dbpool``.
								    """

								    def __init__(self, dbpool):
								        """Store the connection pool used by ``process_item``.

								        :param dbpool: pool object exposing ``runInteraction``.
								        """
								        self.dbpool = dbpool

								    @classmethod
								    def from_settings(cls, settings):
								        """Build the pipeline from settings (fixed name, called by Scrapy).

								        :param settings: mapping providing ``MYSQL_HOST``, ``MYSQL_DATABASE``,
								            ``MYSQL_USER`` and ``MYSQL_PASSWORD``.
								        :return: an instance of ``cls`` wrapping a new connection pool.
								        """
								        adbparams = dict(
								            host=settings['MYSQL_HOST'],
								            db=settings['MYSQL_DATABASE'],
								            user=settings['MYSQL_USER'],
								            password=settings['MYSQL_PASSWORD'],
								            cursorclass=pymysql.cursors.DictCursor,  # selects the cursor type
								        )
								        # Connection pool backed by the pymysql driver.
								        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
								        return cls(dbpool)

								    def close_spider(self, spider):
								        """Log that the spider has finished.

								        :param spider: unused.
								        """
								        logging.info('爬虫运行完毕了')

								    def process_item(self, item, spider):
								        """Schedule the insert of *item* on the pool and return a copy of it.

								        :param item: item to store.
								        :param spider: passed through to ``handle_error``.
								        :return: the deep copy of *item* that was handed to ``do_insert``.
								        """
								        # Work on a deep copy; presumably so that later changes to the
								        # original item cannot affect the pending insert - TODO confirm.
								        asynItem = copy.deepcopy(item)
								        query = self.dbpool.runInteraction(self.do_insert, asynItem)
								        # Route failures of the interaction to handle_error.
								        query.addErrback(self.handle_error, asynItem, spider)
								        return asynItem

								    def handle_error(self, failure, asynItem, spider):
								        """Log a failed insert.

								        Logged at ERROR level so that failures stay visible when the log
								        threshold is above INFO.

								        :param failure: the error delivered to the errback.
								        :param asynItem: the item whose insert failed (unused).
								        :param spider: unused.
								        """
								        if failure:
								            logging.error('----------数据库插入异常信息--------')
								            logging.error(failure)
								            logging.error('---------异常信息结束--------')