#!/usr/bin/env python
"""
Cache-backed fetch helpers: chapter content, chapter lists, search rankings
and latest-chapter updates.
Created by howie.hu.
"""

import re

import async_timeout

from bs4 import BeautifulSoup
from aiocache.serializers import PickleSerializer, JsonSerializer

from urllib.parse import urlparse, parse_qs, urljoin

from soulbook.database.mongodb import MotorBase
from soulbook.fetcher.decorators import cached
from soulbook.fetcher.function import target_fetch, get_time, get_html_by_requests, \
    get_random_user_agent
from soulbook.fetcher.extract_novels import extract_pre_next_chapter
from soulbook.config import RULES, LATEST_RULES, LOGGER

# Candidate class/id attribute values used to locate chapter text when no site rule matches
list_class_content = ("content", "content_read", "txt")
list_id_content = ("chaptercontent", "content", "txt")


@cached(ttl=300, key_from_attr='url', serializer=PickleSerializer(), namespace="main")
async def cache_owllook_novels_content(url, chapter_url, netloc):
    """Fetch a chapter page and extract its title, body and next-chapter link (cached for 300s)."""
    headers = {
        'user-agent': await get_random_user_agent()
    }
    html = await target_fetch(headers=headers, url=url)
    if not html:
        # Fall back to a blocking requests fetch when the async fetch fails
        html = get_html_by_requests(url=url, headers=headers)
    if html:
        soup = BeautifulSoup(html, 'html5lib')
        # Chapter content
        content = None
        if RULES.get(netloc):
            # A site-specific parse rule exists: apply its content selector
            selector = RULES[netloc].content_selector
            if selector.get('id', None):
                content = soup.find_all(id=selector['id'])
            elif selector.get('class', None):
                content = soup.find_all(class_=selector['class'])
            else:
                content = soup.find_all(selector.get('tag'))
        if not content:
            # No rule matched: probe the common id values, then the common class values
            for i in list_id_content:
                content = soup.find_all(id=i)
                if content:
                    break
            else:
                for i in list_class_content:
                    content = soup.find_all(class_=i)
                    if content:
                        break
        if not content:
            # Last resort: extract the whole <body>
            content = soup.find_all("body")
        if content:
            # Extract the real chapter title, e.g. "第三章 ..." cut before a "_", "," or "-"
            title_reg = r'(第?\s*[一二两三四五六七八九十○零百千万亿0-9１２３４５６７８９０]{1,6}\s*[章回卷节折篇幕集]\s*.*?)[_,-]'
            title = soup.title.string
            extract_title = re.findall(title_reg, title, re.I)
            if extract_title:
                title = extract_title[0]
            else:
                try:
                    title = soup.select('h1')[0].get_text()
                except Exception as e:
                    LOGGER.exception(e)
            if not title:
                title = soup.title.string
            next_chapter = extract_pre_next_chapter(url=url, chapter_url=chapter_url,
                                                    html=str(soup))
            content = [str(i) for i in content]
            data = {
                'content': str(''.join(content)),
                'next_chapter': next_chapter,
                'title': title
            }
        else:
            data = None
        return data
    return None


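# Usage sketch (illustrative only): how a view elsewhere in soulbook might consume the
# cached content fetcher. `_read_chapter_demo` and its arguments are hypothetical; the
# real handlers live outside this module.
async def _read_chapter_demo(url, chapter_url):
    netloc = urlparse(url).netloc
    data = await cache_owllook_novels_content(url=url, chapter_url=chapter_url, netloc=netloc)
    if data:
        # `data` carries the extracted HTML body, the title and the next-chapter link
        return data['title'], data['next_chapter']
    return None

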
# Candidate id/class attribute values used to locate a chapter list (table of contents)
ListIdTab = ("box_con", 'box_chap', 'layout layout-col1', 'listmain', "content", "list-chapterAll")
ListClassTab = (
    "box_con", 'box_chap', 'layout layout-col1', 'listmain', 'book chapterlist', 'book_list',
    'TabCss', 'inner', 'mulu', 'bg')


@cached(ttl=300, key_from_attr='url', serializer=PickleSerializer(), namespace="main")
async def cache_owllook_novels_chapter(url, netloc):
    """Fetch a book's chapter-list page and return its raw HTML block (cached for 300s)."""
    headers = {
        'user-agent': await get_random_user_agent()
    }
    html = await target_fetch(headers=headers, url=url)
    if not html:
        html = get_html_by_requests(url=url, headers=headers)
    if html:
        soup = BeautifulSoup(html, 'html5lib')
        # Extracted content
        content = None
        if RULES.get(netloc):
            # A site-specific parse rule exists: apply its chapter selector
            selector = RULES[netloc].chapter_selector
            if selector.get('id', None):
                content = soup.find_all(id=selector['id'])
            elif selector.get('class', None):
                content = soup.find_all(class_=selector['class'])
            else:
                content = soup.find_all(selector.get('tag'))
        else:
            # No rule: probe the common chapter-list id values, then the class values
            data = str(html)
            for i in ListIdTab:
                if 'id="%s"' % i in data:
                    content = soup.find_all(id=i)
                    break
            if not content:
                for i in ListClassTab:
                    if 'class="%s"' % i in data:
                        content = soup.find_all(class_=i)
                        break
        if not content:
            # Last resort: extract the whole <body>
            content = soup.find_all("body")
        if content:
            # Strip "style" so chapter lists hidden via display:none stay visible
            return str(content).replace('style', '')
    return None


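# Note on the blunt `replace('style', '')` above: it defeats inline `display:none`
# hiding, but it also rewrites ordinary text that merely contains the word "style".
# A narrower alternative (an illustrative sketch, not what the module uses) removes
# only inline style attributes:
def _strip_inline_styles_sketch(html_text):
    # Drop style="..." / style='...' attributes while leaving other text untouched
    return re.sub(r'\sstyle\s*=\s*(\'[^\']*\'|"[^"]*")', ' ', html_text)

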
@cached(ttl=10800, key_from_attr='search_ranking', serializer=JsonSerializer(), namespace="ranking")
async def cache_owllook_search_ranking():
    """Return the top 35 search keywords (count >= 50), cached for three hours."""
    motor_db = MotorBase().get_db()
    keyword_cursor = motor_db.search_records.find(
        {'count': {'$gte': 50}},
        {'keyword': 1, 'count': 1, '_id': 0}
    ).sort('count', -1).limit(35)
    result = []
    index = 1
    async for document in keyword_cursor:
        result.append({'keyword': document['keyword'], 'count': document['count'], 'index': index})
        index += 1
    return result


@cached(ttl=3600, key_from_attr='search_ranking', serializer=JsonSerializer(), namespace="ranking")
async def cache_others_search_ranking(spider='qidian', novel_type='全部类别'):
    """Return a third-party site's novel ranking document, cached for one hour."""
    motor_db = MotorBase().get_db()
    item_data = await motor_db.novels_ranking.find_one({'spider': spider, 'type': novel_type},
                                                       {'data': 1, '_id': 0})
    return item_data


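# Usage sketch for the two ranking caches (hypothetical call site): thanks to the
# `cached` decorator, repeated calls within the TTL (3h and 1h respectively) never
# hit Mongo again.
async def _ranking_demo():
    hot_keywords = await cache_owllook_search_ranking()  # top-35 search keywords
    qidian_ranking = await cache_others_search_ranking(spider='qidian', novel_type='全部类别')
    return hot_keywords, qidian_ranking

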
async def get_the_latest_chapter(chapter_url, timeout=15):
    """Resolve a book's latest chapter (name + URL) and persist it to Mongo."""
    try:
        async with async_timeout.timeout(timeout):
            url = parse_qs(urlparse(chapter_url).query).get('url', '')
            novels_name = parse_qs(urlparse(chapter_url).query).get('novels_name', '')
            data = None
            if url and novels_name:
                url = url[0]
                novels_name = novels_name[0]
                netloc = urlparse(url).netloc
                if netloc in LATEST_RULES.keys():
                    headers = {
                        'user-agent': await get_random_user_agent()
                    }
                    try:
                        html = await target_fetch(url=url, headers=headers, timeout=timeout)
                        if html is None:
                            html = get_html_by_requests(url=url, headers=headers, timeout=timeout)
                    except TypeError:
                        # Older target_fetch signatures do not accept timeout
                        html = get_html_by_requests(url=url, headers=headers, timeout=timeout)
                    except Exception as e:
                        LOGGER.exception(e)
                        return None
                    try:
                        soup = BeautifulSoup(html, 'html5lib')
                    except Exception as e:
                        LOGGER.exception(e)
                        return None
                    latest_chapter_name, latest_chapter_url = None, None
                    if LATEST_RULES[netloc].plan:
                        # Plan A: read the latest chapter from <meta> tags
                        meta_value = LATEST_RULES[netloc].meta_value
                        latest_chapter_name = soup.select(
                            'meta[property="{0}"]'.format(
                                meta_value["latest_chapter_name"])) or soup.select(
                            'meta[name="{0}"]'.format(meta_value["latest_chapter_name"]))
                        latest_chapter_name = latest_chapter_name[0].get('content',
                                                                         None) if latest_chapter_name else None
                        latest_chapter_url = soup.select(
                            'meta[property="{0}"]'.format(
                                meta_value["latest_chapter_url"])) or soup.select(
                            'meta[name="{0}"]'.format(meta_value["latest_chapter_url"]))
                        latest_chapter_url = urljoin(chapter_url,
                                                     latest_chapter_url[0].get('content',
                                                                               None)) if latest_chapter_url else None
                    else:
                        # Plan B: apply the site's CSS selector rule
                        selector = LATEST_RULES[netloc].selector
                        content_url = selector.get('content_url')
                        if selector.get('id', None):
                            latest_chapter_soup = soup.find_all(id=selector['id'])
                        elif selector.get('class', None):
                            latest_chapter_soup = soup.find_all(class_=selector['class'])
                        else:
                            latest_chapter_soup = soup.select(selector.get('tag'))
                        if latest_chapter_soup:
                            if content_url == '1':
                                # TODO
                                pass
                            elif content_url == '0':
                                # TODO
                                pass
                            else:
                                latest_chapter_url = content_url + latest_chapter_soup[0].get(
                                    'href', '')
                                latest_chapter_name = latest_chapter_soup[0].get('title', None)
                    if latest_chapter_name and latest_chapter_url:
                        time_current = get_time()
                        data = {
                            "latest_chapter_name": latest_chapter_name,
                            "latest_chapter_url": latest_chapter_url,
                            "owllook_chapter_url": chapter_url,
                            "owllook_content_url": "/owllook_content?url={latest_chapter_url}&name={name}&chapter_url={chapter_url}&novels_name={novels_name}".format(
                                latest_chapter_url=latest_chapter_url,
                                name=latest_chapter_name,
                                chapter_url=url,
                                novels_name=novels_name,
                            ),
                        }
                        # Persist the latest chapter
                        motor_db = MotorBase().get_db()
                        await motor_db.latest_chapter.update_one(
                            {"novels_name": novels_name, 'owllook_chapter_url': chapter_url},
                            {'$set': {'data': data, "finished_at": time_current}}, upsert=True)
            return data
    except Exception as e:
        LOGGER.exception(e)
        return None


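# `get_the_latest_chapter` expects an owllook-style URL whose query string carries the
# source-site URL and the book name, e.g. (illustrative values, not a real route):
#   /chapter?url=https%3A%2F%2Fexample.com%2Fbook%2F1%2F&novels_name=demo
# A minimal call sketch with an assumed URL:
async def _latest_chapter_demo():
    chapter_url = '/chapter?url=https%3A%2F%2Fexample.com%2Fbook%2F1%2F&novels_name=demo'
    return await get_the_latest_chapter(chapter_url, timeout=10)

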
async def update_all_books(loop, timeout=15):
    """Walk every user's bookshelf and refresh the latest chapter of each unique book URL."""
    try:
        motor_db = MotorBase().get_db()
        # Cursor over all bookshelf links
        books_url_cursor = motor_db.user_message.find({}, {'books_url.book_url': 1, '_id': 0})
        already_urls = set()
        async for document in books_url_cursor:
            if document:
                books_url = document['books_url']
                for book_url in books_url:
                    chapter_url = book_url['book_url']
                    # Refresh each bookshelf URL once, sequentially
                    if chapter_url not in already_urls:
                        try:
                            await get_the_latest_chapter(chapter_url, timeout)
                        except Exception as e:
                            LOGGER.exception(e)
                        already_urls.add(chapter_url)
    except Exception as e:
        LOGGER.exception(e)
        return False
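

# A concurrent variant (an illustrative sketch, not the module's current behavior):
# collect all unique bookshelf URLs first, then refresh them in parallel with a
# bounded fan-out. `sem_limit` is an assumed throttle, not an existing config value.
async def _update_all_books_concurrently_sketch(timeout=15, sem_limit=10):
    import asyncio

    motor_db = MotorBase().get_db()
    cursor = motor_db.user_message.find({}, {'books_url.book_url': 1, '_id': 0})
    urls = set()
    async for document in cursor:
        for book_url in document.get('books_url', []):
            urls.add(book_url['book_url'])
    sem = asyncio.Semaphore(sem_limit)

    async def _refresh(url):
        # Limit concurrent fetches so target sites are not hammered
        async with sem:
            try:
                await get_the_latest_chapter(url, timeout)
            except Exception as e:
                LOGGER.exception(e)

    # Fan out all refreshes and wait for the whole batch to finish
    await asyncio.gather(*[_refresh(u) for u in urls])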