SoulBook/soulbook/fetcher/cache.py
#!/usr/bin/env python
"""
Created by howie.hu.
"""
import re
from urllib.parse import parse_qs, urljoin, urlparse

import async_timeout
from aiocache.serializers import JsonSerializer, PickleSerializer
from bs4 import BeautifulSoup

from soulbook.config import LATEST_RULES, LOGGER, RULES
from soulbook.database.mongodb import MotorBase
from soulbook.fetcher.decorators import cached
from soulbook.fetcher.extract_novels import extract_pre_next_chapter
from soulbook.fetcher.function import (
    get_html_by_requests,
    get_random_user_agent,
    get_time,
    target_fetch,
)

# Fallback class/id attributes commonly used by chapter-content containers.
list_class_content = ("content", "content_read", "txt")
list_id_content = ("chaptercontent", "content", "txt")


@cached(ttl=300, key_from_attr='url', serializer=PickleSerializer(), namespace="main")
async def cache_owllook_novels_content(url, chapter_url, netloc):
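    """Fetch a chapter page and cache the parsed result for five minutes.

    Args:
        url: URL of the chapter page to parse.
        chapter_url: URL of the novel's chapter list, used to resolve
            previous/next chapter links.
        netloc: domain of the source site, used to look up parsing rules.

    Returns:
        dict with ``content``, ``next_chapter`` and ``title``, or None if the
        page could not be fetched or no content was found.
    """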
headers = {
'user-agent': await get_random_user_agent()
}
html = await target_fetch(headers=headers, url=url)
if not html:
html = get_html_by_requests(url=url, headers=headers)
if html:
soup = BeautifulSoup(html, 'html5lib')
        # Chapter content
content = None
if RULES.get(netloc):
selector = RULES[netloc].content_selector
if selector.get('id', None):
                content = soup.find_all(id=selector['id'])
elif selector.get('class', None):
content = soup.find_all(class_=selector['class'])
else:
content = soup.find_all(selector.get('tag'))
if not content:
# print("novels_content: body")
# content = soup.find_all("body")
# for i in list_class_content:
# content = soup.find_all(class_=i)
# if content:
# break
# else:
# for i in list_id_content:
# content = soup.find_all(id=i)
# if content:
# break
for i in list_id_content:
content = soup.find_all(id=i)
if content:
break
else:
for i in list_class_content:
content = soup.find_all(class_=i)
if content:
break
        if not content:
            # Fall back to extracting the whole page body
            content = soup.find_all("body")
if content:
            # Extract the real chapter title from the page <title>,
            # e.g. a heading like "第一章 ..." before a "_", "," or "-" separator
            title_reg = r'(第?\s*[一二两三四五六七八九十○零百千万亿0-9]{1,6}\s*[章回卷节折篇幕集]\s*.*?)[_,-]'
            title = soup.title.string if soup.title else ''
            extract_title = re.findall(title_reg, title, re.I)
            if extract_title:
                title = extract_title[0]
            else:
                # Fall back to the first <h1>, then to the raw page title
                try:
                    title = soup.select('h1')[0].get_text()
                except Exception as e:
                    LOGGER.exception(e)
                if not title:
                    title = soup.title.string if soup.title else ''
next_chapter = extract_pre_next_chapter(url=url, chapter_url=chapter_url,
html=str(soup))
content = [str(i) for i in content]
data = {
                'content': ''.join(content),
'next_chapter': next_chapter,
'title': title
}
else:
data = None
        return data
    return None


# Container id/class attributes commonly used by chapter-list pages.
ListIdTab = ("box_con", 'box_chap', 'layout layout-col1', 'listmain', "content", "list-chapterAll")
ListClassTab = (
    "box_con", 'box_chap', 'layout layout-col1', 'listmain', 'book chapterlist', 'book_list',
    'TabCss', 'inner', 'mulu', 'bg')


@cached(ttl=300, key_from_attr='url', serializer=PickleSerializer(), namespace="main")
async def cache_owllook_novels_chapter(url, netloc):
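    """Fetch a novel's chapter-list page and cache the extracted HTML.

    Args:
        url: URL of the chapter-list page.
        netloc: domain of the source site, used to look up parsing rules.

    Returns:
        str of the matched container HTML (with "style" stripped), or None.
    """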
headers = {
'user-agent': await get_random_user_agent()
}
html = await target_fetch(headers=headers, url=url)
if not html:
html = get_html_by_requests(url=url, headers=headers)
if html:
soup = BeautifulSoup(html, 'html5lib')
        # Extracted chapter-list container
        content = None
        # A parsing rule exists for this site
        if RULES.get(netloc):
            # Apply the configured chapter selector
selector = RULES[netloc].chapter_selector
if selector.get('id', None):
content = soup.find_all(id=selector['id'])
elif selector.get('class', None):
content = soup.find_all(class_=selector['class'])
else:
content = soup.find_all(selector.get('tag'))
else:
data = str(html)
for i in ListIdTab:
if 'id="%s"' % i in data:
content = soup.find_all(id=i)
break
            # Only fall back to class matching if no id container matched
            if not content:
                for i in ListClassTab:
                    if 'class="%s"' % i in data:
                        content = soup.find_all(class_=i)
                        break
        if not content:
            # Fall back to extracting the whole page body
            content = soup.find_all("body")
        if content:
            # Strip "style" so chapters hidden via display:none remain visible
            return str(content).replace('style', '')
    return None


@cached(ttl=10800, key_from_attr='search_ranking', serializer=JsonSerializer(), namespace="ranking")
async def cache_owllook_search_ranking():
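    """Return the top 35 search keywords with at least 50 searches.

    The result is cached for three hours and each entry carries its
    1-based ranking ``index``.
    """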
motor_db = MotorBase().get_db()
keyword_cursor = motor_db.search_records.find(
{'count': {'$gte': 50}},
{'keyword': 1, 'count': 1, '_id': 0}
).sort('count', -1).limit(35)
result = []
index = 1
async for document in keyword_cursor:
result.append({'keyword': document['keyword'], 'count': document['count'], 'index': index})
index += 1
    return result


@cached(ttl=3600, key_from_attr='search_ranking', serializer=JsonSerializer(), namespace="ranking")
async def cache_others_search_ranking(spider='qidian', novel_type='全部类别'):
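    """Return the cached ranking data for one spider and ranking category.

    Cached for an hour. ``spider`` is the source site (e.g. 'qidian') and
    ``novel_type`` is the ranking category; the default '全部类别' means
    "all categories".
    """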
motor_db = MotorBase().get_db()
item_data = await motor_db.novels_ranking.find_one({'spider': spider, 'type': novel_type},
{'data': 1, '_id': 0})
    return item_data


async def get_the_latest_chapter(chapter_url, timeout=15):
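    """Resolve and persist the latest chapter for one bookshelf entry.

    ``chapter_url`` is an owllook chapter page URL whose query string carries
    the source ``url`` and ``novels_name``. On success the result dict is
    upserted into the ``latest_chapter`` collection and returned; on any
    failure None is returned.
    """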
try:
        async with async_timeout.timeout(timeout):
            query = parse_qs(urlparse(chapter_url).query)
            url = query.get('url', '')
            novels_name = query.get('novels_name', '')
data = None
if url and novels_name:
url = url[0]
novels_name = novels_name[0]
netloc = urlparse(url).netloc
if netloc in LATEST_RULES.keys():
headers = {
'user-agent': await get_random_user_agent()
}
try:
html = await target_fetch(url=url, headers=headers, timeout=timeout)
if html is None:
html = get_html_by_requests(url=url, headers=headers, timeout=timeout)
except TypeError:
html = get_html_by_requests(url=url, headers=headers, timeout=timeout)
except Exception as e:
LOGGER.exception(e)
return None
try:
soup = BeautifulSoup(html, 'html5lib')
except Exception as e:
LOGGER.exception(e)
return None
latest_chapter_name, latest_chapter_url = None, None
if LATEST_RULES[netloc].plan:
meta_value = LATEST_RULES[netloc].meta_value
                        latest_chapter_name = (
                            soup.select('meta[property="{0}"]'.format(meta_value["latest_chapter_name"]))
                            or soup.select('meta[name="{0}"]'.format(meta_value["latest_chapter_name"]))
                        )
                        latest_chapter_name = latest_chapter_name[0].get('content', None) if latest_chapter_name else None
                        latest_chapter_url = (
                            soup.select('meta[property="{0}"]'.format(meta_value["latest_chapter_url"]))
                            or soup.select('meta[name="{0}"]'.format(meta_value["latest_chapter_url"]))
                        )
                        latest_chapter_url = (
                            urljoin(chapter_url, latest_chapter_url[0].get('content', None))
                            if latest_chapter_url else None
                        )
else:
selector = LATEST_RULES[netloc].selector
content_url = selector.get('content_url')
if selector.get('id', None):
latest_chapter_soup = soup.find_all(id=selector['id'])
elif selector.get('class', None):
latest_chapter_soup = soup.find_all(class_=selector['class'])
else:
latest_chapter_soup = soup.select(selector.get('tag'))
if latest_chapter_soup:
if content_url == '1':
# TODO
pass
elif content_url == '0':
# TODO
pass
                            else:
                                # Guard: a missing href would otherwise make
                                # string concatenation raise TypeError
                                href = latest_chapter_soup[0].get('href', '')
                                latest_chapter_url = content_url + href if href else None
                                latest_chapter_name = latest_chapter_soup[0].get('title', None)
if latest_chapter_name and latest_chapter_url:
time_current = get_time()
data = {
"latest_chapter_name": latest_chapter_name,
"latest_chapter_url": latest_chapter_url,
"owllook_chapter_url": chapter_url,
"owllook_content_url": "/owllook_content?url={latest_chapter_url}&name={name}&chapter_url={chapter_url}&novels_name={novels_name}".format(
latest_chapter_url=latest_chapter_url,
name=latest_chapter_name,
chapter_url=url,
novels_name=novels_name,
),
}
                    # Persist the latest chapter info
motor_db = MotorBase().get_db()
await motor_db.latest_chapter.update_one(
{"novels_name": novels_name, 'owllook_chapter_url': chapter_url},
{'$set': {'data': data, "finished_at": time_current}}, upsert=True)
return data
except Exception as e:
LOGGER.exception(e)
        return None


async def update_all_books(loop, timeout=15):
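    """Iterate every user's bookshelf and refresh each book's latest chapter.

    ``loop`` is kept for interface compatibility but unused here; duplicate
    bookshelf URLs are fetched only once per run. Returns False on error.
    """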
try:
motor_db = MotorBase().get_db()
        # Cursor over every user's bookshelf URLs
        books_url_cursor = motor_db.user_message.find({}, {'books_url.book_url': 1, '_id': 0})
        already_urls = set()
already_urls = set()
async for document in books_url_cursor:
if document:
books_url = document['books_url']
for book_url in books_url:
chapter_url = book_url['book_url']
if chapter_url not in already_urls:
try:
await get_the_latest_chapter(chapter_url, timeout)
except Exception as e:
LOGGER.exception(e)
already_urls.add(chapter_url)
except Exception as e:
LOGGER.exception(e)
return False