#!/usr/bin/env python
"""Search helper that queries the engine configured as ``CONFIG.SO_URL``.

The result page is fetched with aiohttp, parsed with BeautifulSoup, and each
``res-list`` element is reduced to a small dict describing the hit.
NOTE(review): the Referer header and the URL patterns handled below suggest
the target is so.com -- confirm against ``CONFIG.SO_URL``.
"""
import aiohttp
import asyncio
import async_timeout

from bs4 import BeautifulSoup
from urllib.parse import parse_qs, urlparse

from soulbook.fetcher.function import get_random_user_agent
from soulbook.config import CONFIG, LOGGER, BLACK_DOMAIN, RULES, LATEST_RULES


async def fetch(client, url, novels_name):
    """Request the search result page for ``novels_name``.

    :param client: an open ``aiohttp.ClientSession`` used to issue the GET.
    :param url: search endpoint to query.
    :param novels_name: search keywords, sent as the ``q`` query parameter.
    :return: the response body as ``str``; raw ``bytes`` when the body cannot
        be decoded as text; ``None`` on a non-200 status, on timeout (20 s)
        or on any other request error.
    """
    # The try block encloses the timeout context on purpose: the timeout error
    # is raised when the context exits, so a try nested *inside* it would let
    # the timeout escape to the caller instead of yielding None.
    try:
        # async-timeout 4.0 removed the synchronous ``with`` form; the
        # asynchronous form is the supported one.
        async with async_timeout.timeout(20):
            headers = {
                'User-Agent': await get_random_user_agent(),
                'Referer': "http://www.so.com/haosou.html?src=home"
            }
            params = {
                'ie': 'utf-8',
                'src': 'noscript_home',
                'shb': 1,
                'q': novels_name,
            }
            async with client.get(url, params=params, headers=headers) as response:
                # Explicit check instead of ``assert``: assertions are
                # stripped when Python runs with -O.
                if response.status != 200:
                    LOGGER.error('Unexpected status {} for url: {}'.format(
                        response.status, response.url))
                    return None
                LOGGER.info('Task url: {}'.format(response.url))
                try:
                    return await response.text()
                except Exception:
                    # Best-effort fallback to the undecoded body. Unlike a
                    # bare ``except:`` this does not swallow task
                    # cancellation or KeyboardInterrupt.
                    return await response.read()
    except Exception as e:
        LOGGER.exception(e)
        return None


async def data_extraction_for_web_so(client, html):
    """Turn one search-result element into a result dict.

    :param client: unused; kept so existing callers keep working.
    :param html: a BeautifulSoup element for a single search hit.
    :return: dict with keys ``title``, ``url``, ``time``, ``is_parse``,
        ``is_recommend``, ``timestamp`` and ``netloc``; ``None`` when no
        usable URL is found, the URL is filtered out, or extraction fails.
    """
    # No timeout context here: the body never awaits, so a timeout could
    # never fire (the original wrapped this code in a 15 s timeout).
    try:
        # 2017-09-09: use the generic 'h3 a' selector so title and url are
        # picked up across result layouts.
        try:
            anchor = html.select('h3 a')[0]
            title = anchor.get_text()
            url = anchor.get('href', None)
        except Exception as e:
            LOGGER.exception(e)
            return None

        # The real target is stored differently depending on the link form.
        # (2017-07-09: the page started exposing it via ``data-url`` for
        # redirect links of the ``link?m=`` kind.)
        if url and "www.so.com/link?m=" in url:
            url = anchor.get('data-url', None)
        if url and "www.so.com/link?url=" in url:
            targets = parse_qs(urlparse(url).query).get('url', None)
            url = targets[0] if targets else None

        # Guard before parsing so a missing URL never reaches urlparse().
        if not url:
            return None
        netloc = urlparse(url).netloc
        if 'baidu' in url or 'baike.so.com' in url or netloc in BLACK_DOMAIN:
            return None

        is_parse = 1 if netloc in RULES.keys() else 0
        is_recommend = 1 if netloc in LATEST_RULES.keys() else 0
        return {
            'title': title,
            'url': url.replace('index.html', '').replace('Index.html', ''),
            # No publication time is extracted from this page; placeholders.
            'time': '',
            'is_parse': is_parse,
            'is_recommend': is_recommend,
            'timestamp': 0,
            'netloc': netloc,
        }
    except Exception as e:
        LOGGER.exception(e)
        return None


async def so_search(novels_name):
    """Search for ``novels_name`` and extract every hit on the result page.

    :param novels_name: search keywords.
    :return: list with one entry per ``res-list`` element, each entry being
        the dict (or ``None``) produced by ``data_extraction_for_web_so``;
        an empty list when the page could not be fetched.
    """
    url = CONFIG.SO_URL
    async with aiohttp.ClientSession() as client:
        html = await fetch(client=client, url=url, novels_name=novels_name)
        if not html:
            return []
        soup = BeautifulSoup(html, 'html5lib')
        hits = soup.find_all(class_='res-list')
        # gather() schedules the coroutines itself and keeps input order.
        return await asyncio.gather(
            *(data_extraction_for_web_so(client=client, html=hit) for hit in hits)
        )


if __name__ == '__main__':
    # Manual smoke test: run one search and print the results and elapsed time.
    import uvloop
    import time
    from pprint import pprint

    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

    def novel_task(name):
        """Run ``so_search(name)`` to completion and return its result."""
        loop = asyncio.get_event_loop()
        return loop.run_until_complete(so_search(name))

    start = time.time()
    result = novel_task('圣墟 小说 阅读 最新章节')
    pprint(result)
    print(time.time() - start)