#!/usr/bin/env python
"""
Created by howie.hu at 2018/5/28.
"""
import re
from bs4 import BeautifulSoup
from collections import OrderedDict
from operator import itemgetter
from urllib.parse import urljoin, urlparse
from soulbook.config import LOGGER
def extract_chapters(chapters_url, html):
    """
    Generic parser for a novel's table-of-contents page.

    Lines of the page that look like chapter headings are selected with a
    regular expression, re-parsed with BeautifulSoup, and every ``<a>`` found
    in them becomes one chapter entry.

    :param chapters_url: URL of the table-of-contents page; relative hrefs
        are resolved against it
    :param html: HTML of that page (passed through ``str()`` before matching)
    :return: list of dicts with the keys ``chapter_url``, ``chapter_name``
        and ``index``, sorted by ``index`` in descending order. Links whose
        URL does not yield an integer index are left out.
    """
    # Reference: https://greasyfork.org/zh-CN/scripts/292-my-novel-reader
    chapters_reg = r'(.*第?\s*[一二两三四五六七八九十○零百千万亿0-91234567890]{1,6}\s*[章回卷节折篇幕集].*?)'
    # The matched lines are not guaranteed to separate chapters cleanly, but
    # they do belong to the chapter listing, so bs can safely pull the <a>
    # tags out of them.
    chapters_res = re.findall(chapters_reg, str(html), re.I)
    chapters_res_soup = BeautifulSoup('\n'.join(chapters_res), 'html5lib')

    all_chapters = []
    for link in chapters_res_soup.find_all('a'):
        url = urljoin(chapters_url, link.get('href')) or ''
        name = link.text or ''
        # Index = the URL path cut at its first '.', then the last
        # '/'-separated piece of what remains (".../123.html" -> "123").
        index_text = urlparse(url).path.split('.')[0].split('/')[-1]
        try:
            index = int(index_text)
        except ValueError:
            # The link has no integer index (e.g. "index.html" or a missing
            # href), so it cannot be sorted with the others. Skip it instead
            # of letting one stray link abort the whole chapter list.
            continue
        all_chapters.append({
            'chapter_url': url,
            'chapter_name': name,
            'index': index,
        })

    return sorted(all_chapters, key=itemgetter('index'), reverse=True)
def extract_pre_next_chapter(url, chapter_url, html):
    """
    Collect the previous-page / next-page links of a single chapter page.

    :param url: not read by this function; kept for the existing call signature
    :param chapter_url: URL of the current chapter page; relative hrefs are
        resolved against it
    :param html: HTML of the current chapter page
    :return: OrderedDict mapping the first five characters of each matching
        link text (spaces removed) to the resolved URL, or to ``False`` when
        the link points back at ``chapter_url`` (http/https scheme ignored).
        If an exception occurs it is logged and whatever was collected so far
        is returned.
    """
    next_chapter = OrderedDict()
    try:
        # Reference: https://greasyfork.org/zh-CN/scripts/292-my-novel-reader
        next_reg = r'(.*[第上前下后][一]?[0-9]{0,6}?[页张个篇章节步].*?)'
        judge_reg = r'[第上前下后][一]?[0-9]{0,6}?[页张个篇章节步]'
        scheme_reg = re.compile("^http://|^https://")

        # Drop the "<<" / ">>" decorations, keep only the lines that look
        # like navigation, then let bs parse those lines again.
        cleaned_html = html.replace('<<', '').replace('>>', '')
        candidate_lines = re.findall(next_reg, cleaned_html, re.I)
        next_res_soup = BeautifulSoup('\n'.join(candidate_lines), 'html5lib')

        for anchor in next_res_soup.find_all('a'):
            label = (anchor.text or '').replace(' ', '')
            if not novels_list(label):
                continue
            if not re.search(judge_reg, label):
                continue
            target = urljoin(chapter_url, anchor.get('href')) or ''
            # A link that resolves to the current page itself is stored as False.
            if scheme_reg.sub('', chapter_url) == scheme_reg.sub('', target):
                target = False
            next_chapter[label[:5]] = target
    except Exception as e:
        LOGGER.exception(e)
    return next_chapter
def novels_list(text):
    """
    Tell whether a link text is free of the known false-positive phrases.

    :param text: link text to check
    :return: False if the text contains any excluded phrase, True otherwise
    """
    excluded_phrases = ('后一个', '天上掉下个')
    return not any(phrase in text for phrase in excluded_phrases)