#!/usr/bin/env python
"""
Cache-backed fetch helpers: chapter content, chapter lists, search rankings
and latest-chapter updates.
Created by howie.hu.
"""

import re

import async_timeout

from bs4 import BeautifulSoup
from aiocache.serializers import PickleSerializer, JsonSerializer

from urllib.parse import urlparse, parse_qs, urljoin

from soulbook.database.mongodb import MotorBase
from soulbook.fetcher.decorators import cached
from soulbook.fetcher.function import target_fetch, get_time, get_html_by_requests, \
    get_random_user_agent
from soulbook.fetcher.extract_novels import extract_pre_next_chapter
from soulbook.config import RULES, LATEST_RULES, LOGGER

# Candidate class/id attribute values used to locate chapter text when no site rule matches
list_class_content = ("content", "content_read", "txt")
list_id_content = ("chaptercontent", "content", "txt")


@cached(ttl=300, key_from_attr='url', serializer=PickleSerializer(), namespace="main")
async def cache_owllook_novels_content(url, chapter_url, netloc):
    """Fetch a chapter page and extract its title, body and next-chapter link (cached for 300s)."""
    headers = {
        'user-agent': await get_random_user_agent()
    }
    html = await target_fetch(headers=headers, url=url)
    if not html:
        # Fall back to a blocking requests fetch when the async fetch fails
        html = get_html_by_requests(url=url, headers=headers)
    if html:
        soup = BeautifulSoup(html, 'html5lib')
        # Chapter content
        content = None
        if RULES.get(netloc):
            # A site-specific parse rule exists: apply its content selector
            selector = RULES[netloc].content_selector
            if selector.get('id', None):
                content = soup.find_all(id=selector['id'])
            elif selector.get('class', None):
                content = soup.find_all(class_=selector['class'])
            else:
                content = soup.find_all(selector.get('tag'))
        if not content:
            # No rule matched: probe the common id values, then the common class values
            for i in list_id_content:
                content = soup.find_all(id=i)
                if content:
                    break
            else:
                for i in list_class_content:
                    content = soup.find_all(class_=i)
                    if content:
                        break
        if not content:
            # Last resort: extract the whole <body>
            content = soup.find_all("body")
        if content:
            # Extract the real chapter title, e.g. "第三章 ..." cut before a "_", "," or "-"
            title_reg = r'(第?\s*[一二两三四五六七八九十○零百千万亿0-9１２３４５６７８９０]{1,6}\s*[章回卷节折篇幕集]\s*.*?)[_,-]'
            title = soup.title.string
            extract_title = re.findall(title_reg, title, re.I)
            if extract_title:
                title = extract_title[0]
            else:
                try:
                    title = soup.select('h1')[0].get_text()
                except Exception as e:
                    LOGGER.exception(e)
            if not title:
                title = soup.title.string
            next_chapter = extract_pre_next_chapter(url=url, chapter_url=chapter_url,
                                                    html=str(soup))
            content = [str(i) for i in content]
            data = {
                'content': str(''.join(content)),
                'next_chapter': next_chapter,
                'title': title
            }
        else:
            data = None
        return data
    return None


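# Usage sketch (illustrative only): how a view elsewhere in soulbook might consume the
# cached content fetcher. `_read_chapter_demo` and its arguments are hypothetical; the
# real handlers live outside this module.
async def _read_chapter_demo(url, chapter_url):
    netloc = urlparse(url).netloc
    data = await cache_owllook_novels_content(url=url, chapter_url=chapter_url, netloc=netloc)
    if data:
        # `data` carries the extracted HTML body, the title and the next-chapter link
        return data['title'], data['next_chapter']
    return None

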
# Candidate id/class attribute values used to locate a chapter list (table of contents)
ListIdTab = ("box_con", 'box_chap', 'layout layout-col1', 'listmain', "content", "list-chapterAll")
ListClassTab = (
    "box_con", 'box_chap', 'layout layout-col1', 'listmain', 'book chapterlist', 'book_list',
    'TabCss', 'inner', 'mulu', 'bg')


@cached(ttl=300, key_from_attr='url', serializer=PickleSerializer(), namespace="main")
async def cache_owllook_novels_chapter(url, netloc):
    """Fetch a book's chapter-list page and return its raw HTML block (cached for 300s)."""
    headers = {
        'user-agent': await get_random_user_agent()
    }
    html = await target_fetch(headers=headers, url=url)
    if not html:
        html = get_html_by_requests(url=url, headers=headers)
    if html:
        soup = BeautifulSoup(html, 'html5lib')
        # Extracted content
        content = None
        if RULES.get(netloc):
            # A site-specific parse rule exists: apply its chapter selector
            selector = RULES[netloc].chapter_selector
            if selector.get('id', None):
                content = soup.find_all(id=selector['id'])
            elif selector.get('class', None):
                content = soup.find_all(class_=selector['class'])
            else:
                content = soup.find_all(selector.get('tag'))
        else:
            # No rule: probe the common chapter-list id values, then the class values
            data = str(html)
            for i in ListIdTab:
                if 'id="%s"' % i in data:
                    content = soup.find_all(id=i)
                    break
            if not content:
                for i in ListClassTab:
                    if 'class="%s"' % i in data:
                        content = soup.find_all(class_=i)
                        break
        if not content:
            # Last resort: extract the whole <body>
            content = soup.find_all("body")
        if content:
            # Strip "style" so chapter lists hidden via display:none stay visible
            return str(content).replace('style', '')
    return None


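# Note on the blunt `replace('style', '')` above: it defeats inline `display:none`
# hiding, but it also rewrites ordinary text that merely contains the word "style".
# A narrower alternative (an illustrative sketch, not what the module uses) removes
# only inline style attributes:
def _strip_inline_styles_sketch(html_text):
    # Drop style="..." / style='...' attributes while leaving other text untouched
    return re.sub(r'\sstyle\s*=\s*(\'[^\']*\'|"[^"]*")', ' ', html_text)

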
@cached(ttl=10800, key_from_attr='search_ranking', serializer=JsonSerializer(), namespace="ranking")
async def cache_owllook_search_ranking():
    """Return the top 35 search keywords (count >= 50), cached for three hours."""
    motor_db = MotorBase().get_db()
    keyword_cursor = motor_db.search_records.find(
        {'count': {'$gte': 50}},
        {'keyword': 1, 'count': 1, '_id': 0}
    ).sort('count', -1).limit(35)
    result = []
    index = 1
    async for document in keyword_cursor:
        result.append({'keyword': document['keyword'], 'count': document['count'], 'index': index})
        index += 1
    return result


@cached(ttl=3600, key_from_attr='search_ranking', serializer=JsonSerializer(), namespace="ranking")
async def cache_others_search_ranking(spider='qidian', novel_type='全部类别'):
    """Return a third-party site's novel ranking document, cached for one hour."""
    motor_db = MotorBase().get_db()
    item_data = await motor_db.novels_ranking.find_one({'spider': spider, 'type': novel_type},
                                                       {'data': 1, '_id': 0})
    return item_data


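# Usage sketch for the two ranking caches (hypothetical call site): thanks to the
# `cached` decorator, repeated calls within the TTL (3h and 1h respectively) never
# hit Mongo again.
async def _ranking_demo():
    hot_keywords = await cache_owllook_search_ranking()  # top-35 search keywords
    qidian_ranking = await cache_others_search_ranking(spider='qidian', novel_type='全部类别')
    return hot_keywords, qidian_ranking

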
async def get_the_latest_chapter(chapter_url, timeout=15):
    """Resolve a book's latest chapter (name + URL) and persist it to Mongo."""
    try:
        async with async_timeout.timeout(timeout):
            url = parse_qs(urlparse(chapter_url).query).get('url', '')
            novels_name = parse_qs(urlparse(chapter_url).query).get('novels_name', '')
            data = None
            if url and novels_name:
                url = url[0]
                novels_name = novels_name[0]
                netloc = urlparse(url).netloc
                if netloc in LATEST_RULES.keys():
                    headers = {
                        'user-agent': await get_random_user_agent()
                    }
                    try:
                        html = await target_fetch(url=url, headers=headers, timeout=timeout)
                        if html is None:
                            html = get_html_by_requests(url=url, headers=headers, timeout=timeout)
                    except TypeError:
                        # Older target_fetch signatures do not accept timeout
                        html = get_html_by_requests(url=url, headers=headers, timeout=timeout)
                    except Exception as e:
                        LOGGER.exception(e)
                        return None
                    try:
                        soup = BeautifulSoup(html, 'html5lib')
                    except Exception as e:
                        LOGGER.exception(e)
                        return None
                    latest_chapter_name, latest_chapter_url = None, None
                    if LATEST_RULES[netloc].plan:
                        # Plan A: read the latest chapter from <meta> tags
                        meta_value = LATEST_RULES[netloc].meta_value
                        latest_chapter_name = soup.select(
                            'meta[property="{0}"]'.format(
                                meta_value["latest_chapter_name"])) or soup.select(
                            'meta[name="{0}"]'.format(meta_value["latest_chapter_name"]))
                        latest_chapter_name = latest_chapter_name[0].get('content',
                                                                         None) if latest_chapter_name else None
                        latest_chapter_url = soup.select(
                            'meta[property="{0}"]'.format(
                                meta_value["latest_chapter_url"])) or soup.select(
                            'meta[name="{0}"]'.format(meta_value["latest_chapter_url"]))
                        latest_chapter_url = urljoin(chapter_url,
                                                     latest_chapter_url[0].get('content',
                                                                               None)) if latest_chapter_url else None
                    else:
                        # Plan B: apply the site's CSS selector rule
                        selector = LATEST_RULES[netloc].selector
                        content_url = selector.get('content_url')
                        if selector.get('id', None):
                            latest_chapter_soup = soup.find_all(id=selector['id'])
                        elif selector.get('class', None):
                            latest_chapter_soup = soup.find_all(class_=selector['class'])
                        else:
                            latest_chapter_soup = soup.select(selector.get('tag'))
                        if latest_chapter_soup:
                            if content_url == '1':
                                # TODO
                                pass
                            elif content_url == '0':
                                # TODO
                                pass
                            else:
                                latest_chapter_url = content_url + latest_chapter_soup[0].get(
                                    'href', '')
                                latest_chapter_name = latest_chapter_soup[0].get('title', None)
                    if latest_chapter_name and latest_chapter_url:
                        time_current = get_time()
                        data = {
                            "latest_chapter_name": latest_chapter_name,
                            "latest_chapter_url": latest_chapter_url,
                            "owllook_chapter_url": chapter_url,
                            "owllook_content_url": "/owllook_content?url={latest_chapter_url}&name={name}&chapter_url={chapter_url}&novels_name={novels_name}".format(
                                latest_chapter_url=latest_chapter_url,
                                name=latest_chapter_name,
                                chapter_url=url,
                                novels_name=novels_name,
                            ),
                        }
                        # Persist the latest chapter
                        motor_db = MotorBase().get_db()
                        await motor_db.latest_chapter.update_one(
                            {"novels_name": novels_name, 'owllook_chapter_url': chapter_url},
                            {'$set': {'data': data, "finished_at": time_current}}, upsert=True)
            return data
    except Exception as e:
        LOGGER.exception(e)
        return None


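# `get_the_latest_chapter` expects an owllook-style URL whose query string carries the
# source-site URL and the book name, e.g. (illustrative values, not a real route):
#   /chapter?url=https%3A%2F%2Fexample.com%2Fbook%2F1%2F&novels_name=demo
# A minimal call sketch with an assumed URL:
async def _latest_chapter_demo():
    chapter_url = '/chapter?url=https%3A%2F%2Fexample.com%2Fbook%2F1%2F&novels_name=demo'
    return await get_the_latest_chapter(chapter_url, timeout=10)

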
async def update_all_books(loop, timeout=15):
    """Walk every user's bookshelf and refresh the latest chapter of each unique book URL."""
    try:
        motor_db = MotorBase().get_db()
        # Cursor over all bookshelf links
        books_url_cursor = motor_db.user_message.find({}, {'books_url.book_url': 1, '_id': 0})
        already_urls = set()
        async for document in books_url_cursor:
            if document:
                books_url = document['books_url']
                for book_url in books_url:
                    chapter_url = book_url['book_url']
                    # Refresh each bookshelf URL once, sequentially
                    if chapter_url not in already_urls:
                        try:
                            await get_the_latest_chapter(chapter_url, timeout)
                        except Exception as e:
                            LOGGER.exception(e)
                        already_urls.add(chapter_url)
    except Exception as e:
        LOGGER.exception(e)
        return False
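

# A concurrent variant (an illustrative sketch, not the module's current behavior):
# collect all unique bookshelf URLs first, then refresh them in parallel with a
# bounded fan-out. `sem_limit` is an assumed throttle, not an existing config value.
async def _update_all_books_concurrently_sketch(timeout=15, sem_limit=10):
    import asyncio

    motor_db = MotorBase().get_db()
    cursor = motor_db.user_message.find({}, {'books_url.book_url': 1, '_id': 0})
    urls = set()
    async for document in cursor:
        for book_url in document.get('books_url', []):
            urls.add(book_url['book_url'])
    sem = asyncio.Semaphore(sem_limit)

    async def _refresh(url):
        # Limit concurrent fetches so target sites are not hammered
        async with sem:
            try:
                await get_the_latest_chapter(url, timeout)
            except Exception as e:
                LOGGER.exception(e)

    # Fan out all refreshes and wait for the whole batch to finish
    await asyncio.gather(*[_refresh(u) for u in urls])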