SoulBook/soulbook/spiders/zongheng_all_novels.py
2024-08-01 19:38:07 +08:00

128 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
"""
Created by howie.hu at 14/03/2018.
Extracts novel information from Zongheng (纵横) store listings: http://book.zongheng.com/store/c0/c0/b9/u0/p1/v9/s9/t0/ALL.html
"""
import asyncio
import os
import time
from ruia import Spider, Item, TextField, AttrField, Request
from ruia_ua import middleware as ua_middleware
# os.environ['MODE'] = 'PRO'
from soulbook.database.mongodb import MotorBase
from soulbook.spiders.middlewares import owl_middleware
# Use uvloop's event loop implementation when it is installed; otherwise keep
# the default asyncio policy.
try:
    import uvloop
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    pass

# Module-level loop shared by the spider and the database client below.
# asyncio.get_event_loop() raises RuntimeError on newer Python versions when no
# current loop has been set, so fall back to creating one explicitly.
try:
    loop = asyncio.get_event_loop()
except RuntimeError:
    loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
class ZHNovelsItem(Item):
    """Field selectors for one novel entry on a Zongheng store listing page.

    ``target_item`` selects each ``div.bookbox`` inside ``div.store_collist``;
    the other fields are CSS selectors for the data of a single entry.
    """

    target_item = TextField(css_select='div.store_collist div.bookbox')
    novel_url = AttrField(css_select='div.bookinfo div.bookname a', attr='href')
    novel_name = TextField(css_select='div.bookinfo div.bookname a')
    novel_author = TextField(css_select='div.bookilnk a:nth-child(1)')
    novel_author_home_url = AttrField(css_select='div.bookilnk a:nth-child(1)', attr='href')
    novel_type = TextField(css_select='div.bookilnk a:nth-child(2)')
    novel_cover = AttrField(css_select='div.bookimg img', attr='src')
    novel_abstract = TextField(css_select='div.bookintro')
    novel_latest_chapter = TextField(css_select='div.bookupdate a')

    # Disabled legacy hooks (kept as a note only): earlier versions prefixed
    # 'http:' to novel_url and to novel_author_home_url.

    async def clean_novel_author(self, novel_author):
        """Normalise the extracted author value.

        Returns an empty string for a falsy value, the ``text`` of the first
        element when a list is given, and the value unchanged otherwise.
        """
        if not novel_author:
            return ''
        if isinstance(novel_author, list):
            return novel_author[0].text
        return novel_author
class ZHNovelsSpider(Spider):
    """Crawl Zongheng store listing pages and upsert each novel into MongoDB."""

    start_urls = ['http://book.zongheng.com/store/c0/c0/b9/u0/p1/v9/s9/t0/ALL.html']
    request_config = {
        'RETRIES': 8,
        'DELAY': 0,
        'TIMEOUT': 3
    }
    concurrency = 60
    # Database handle created once at class-definition time on the module loop.
    motor_db = MotorBase(loop=loop).get_db()

    async def parse(self, res):
        """Extract every novel entry from a listing page and save them concurrently.

        Entries without a ``novel_url`` are skipped. After all save tasks
        finish, prints how many entries were scheduled and how many saved.
        """
        entries = await ZHNovelsItem.get_items(html=res.html)
        tasks = []
        for entry in entries:
            if not entry.novel_url:
                continue
            record = {
                'novel_url': entry.novel_url,
                'novel_name': entry.novel_name,
                'novel_author': entry.novel_author,
                'novel_author_home_url': entry.novel_author_home_url,
                'novel_type': entry.novel_type,
                'novel_cover': entry.novel_cover,
                'novel_abstract': entry.novel_abstract,
                'novel_latest_chapter': entry.novel_latest_chapter,
                'spider': 'zongheng',
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            }
            tasks.append(asyncio.ensure_future(self.save(record)))

        good_nums = 0
        if tasks:
            # asyncio.wait rejects an empty collection, hence the guard above.
            finished, _unfinished = await asyncio.wait(tasks)
            good_nums = sum(1 for job in finished if job.result())
        print(f"{len(tasks)}本小说,抓取成功{good_nums}")

    async def save(self, res_dic):
        """Upsert one novel record into the ``all_novels`` collection.

        The record is matched on ``novel_url`` plus ``novel_name``. Returns
        True on success; any exception is logged and False is returned.
        """
        try:
            # Building the filter stays inside the try so that a missing key is
            # logged and reported as a failed save rather than propagating.
            match_on = {
                'novel_url': res_dic['novel_url'],
                'novel_name': res_dic['novel_name'],
            }
            await self.motor_db.all_novels.update_one(
                match_on,
                {'$set': res_dic},
                upsert=True)
            print(res_dic['novel_name'] + ' - 抓取成功')
            return True
        except Exception as e:
            self.logger.exception(e)
            return False
if __name__ == '__main__':
    # More multi-item examples: https://gist.github.com/howie6879/3ef4168159e5047d42d86cb7fb706a2f
    # 51793  (bare number kept from the original; its meaning is not stated here)
    url_template = 'http://book.zongheng.com/store/c0/c0/b9/u0/p{i}/v9/s9/t0/ALL.html'
    # Crawl the listing in 10 batches of up to 100 pages, reusing the same
    # event loop for every batch (close_event_loop=False).
    # NOTE(review): the first batch starts at p0 while the class default URL
    # uses p1, and the 999 cap makes the last batch stop at p998 - confirm
    # both are intended.
    for page in range(0, 10):
        print(f"正在爬取第{page}")
        first_page = page * 100
        last_page = min(first_page + 100, 999)
        ZHNovelsSpider.start_urls = [
            url_template.format(i=i) for i in range(first_page, last_page)
        ]
        ZHNovelsSpider.start(
            loop=loop,
            middleware=[ua_middleware, owl_middleware],
            close_event_loop=False,
        )