SoulBook/soulbook/spiders/qidian_all_novels.py
2024-08-01 19:38:07 +08:00

121 lines
4.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
"""
Created by howie.hu at 25/02/2018.
Target URI: https://www.qidian.com/all
Param:?page=1
"""
import asyncio
import os
import time
from ruia import Spider, Item, TextField, AttrField
from ruia_ua import middleware as ua_middleware
# os.environ['MODE'] = 'PRO'
from soulbook.database.mongodb import MotorBase
from soulbook.spiders.middlewares import owl_middleware
# Prefer uvloop's event loop implementation when the package is installed.
# The stock asyncio loop is a valid fallback, so a missing uvloop is
# deliberately ignored rather than treated as an error.
try:
    import uvloop
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    pass

# Module-level loop shared by the database client and the spider runner.
# Newer Python releases raise RuntimeError from asyncio.get_event_loop() when
# no current loop exists, so fall back to creating one explicitly instead of
# failing at import time.
try:
    loop = asyncio.get_event_loop()
except RuntimeError:
    loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
def _with_https_scheme(url):
    """Return *url* with an ``https:`` scheme prepended when it lacks one.

    The cleaners below receive scheme-less values and simply prepend
    ``https:``. A value that already starts with ``http://`` or ``https://``
    is returned unchanged so it is not turned into ``https:https://...``.
    Every other input behaves exactly like the plain ``'https:' + url``
    concatenation.
    """
    if isinstance(url, str) and url.startswith(('http://', 'https://')):
        return url
    return 'https:' + url


class QidianNovelsItem(Item):
    """Fields extracted for each novel entry on a Qidian listing page.

    ``target_item`` selects the ``<li>`` entries of ``ul.all-img-list``; the
    remaining selectors are written relative to one such entry. The
    ``clean_<field>`` coroutines post-process the matching field value.
    """
    target_item = TextField(css_select='ul.all-img-list>li')
    novel_url = AttrField(css_select='div.book-img-box>a', attr='href')
    novel_name = TextField(css_select='div.book-mid-info>h4')
    novel_author = TextField(css_select='div.book-mid-info>p.author>a.name')
    novel_author_home_url = AttrField(css_select='div.book-mid-info>p.author>a.name', attr='href')
    novel_type = TextField(css_select='div.book-mid-info > p.author > a:nth-child(4)')
    novel_cover = AttrField(css_select='div.book-img-box img', attr='src')
    novel_abstract = TextField(css_select='div.book-mid-info p.intro')
    # Disabled field, kept for reference:
    # novel_latest_chapter = TextField(css_select='div.bookupdate a')

    async def clean_novel_url(self, novel_url):
        """Return the novel URL with an ``https:`` scheme."""
        return _with_https_scheme(novel_url)

    async def clean_novel_author(self, novel_author):
        """Return the author name; a list value is reduced to its first element's text."""
        if isinstance(novel_author, list):
            novel_author = novel_author[0].text
        return novel_author

    async def clean_novel_author_home_url(self, novel_author_home_url):
        """Return the author's home page URL with an ``https:`` scheme.

        A list value is reduced to the stripped ``href`` of its first element
        before the scheme is applied.
        """
        if isinstance(novel_author_home_url, list):
            novel_author_home_url = novel_author_home_url[0].get('href').strip()
        return _with_https_scheme(novel_author_home_url)

    async def clean_novel_cover(self, novel_cover):
        """Return the cover image URL with an ``https:`` scheme."""
        return _with_https_scheme(novel_cover)
class QidianNovelsSpider(Spider):
    """Spider that parses Qidian listing pages and upserts each novel into MongoDB.

    ``start_urls`` is not declared here; the ``__main__`` driver assigns it
    on the class before each run.
    """
    # start_urls = ['https://www.qidian.com/all?page=1']
    request_config = {
        'RETRIES': 15,
        'DELAY': 0,
        'TIMEOUT': 3
    }
    concurrency = 20
    # Database handle created once at class-definition time on the shared loop.
    motor_db = MotorBase(loop=loop).get_db()

    async def parse(self, res):
        """Extract every novel from the response and save all of them concurrently.

        One ``save`` task is scheduled per extracted item. Once all tasks have
        finished, a summary line with the total and the number of successful
        saves is printed.
        """
        novels = await QidianNovelsItem.get_items(html=res.html)
        copied_fields = (
            'novel_url',
            'novel_name',
            'novel_author',
            'novel_author_home_url',
            'novel_type',
            'novel_cover',
            'novel_abstract',
        )
        save_jobs = []
        for novel in novels:
            record = {field: getattr(novel, field) for field in copied_fields}
            record['spider'] = 'qidian'
            record['updated_at'] = time.strftime("%Y-%m-%d %X", time.localtime())
            save_jobs.append(asyncio.ensure_future(self.save(record)))

        saved_count = 0
        if save_jobs:
            # asyncio.wait rejects an empty collection, hence the guard.
            finished, _unfinished = await asyncio.wait(save_jobs)
            saved_count = sum(1 for job in finished if job.result())
        print(f"{len(save_jobs)}本小说,抓取成功{saved_count}")

    async def save(self, res_dic):
        """Upsert one novel record into the ``all_novels`` collection.

        The record is matched on ``novel_url`` plus ``novel_name``.

        Returns:
            True when the upsert and the progress print both succeed, False
            when any exception was raised (the exception is logged).
        """
        # Store in the database
        try:
            match_on = {'novel_url': res_dic['novel_url'], 'novel_name': res_dic['novel_name']}
            await self.motor_db.all_novels.update_one(match_on, {'$set': res_dic}, upsert=True)
            print(res_dic['novel_name'] + ' - 抓取成功')
        except Exception as e:
            self.logger.exception(e)
            return False
        return True
def build_batch_urls(batch_index, batch_size=100, page_limit=51793):
    """Return the listing-page URLs that make up one crawl batch.

    Batch ``batch_index`` starts at page ``batch_index * batch_size`` and
    holds at most ``batch_size`` consecutive pages. The page range is cut off
    at ``page_limit``, which is treated as an exclusive upper bound.

    NOTE(review): if 51793 is the number of the last existing page, that page
    is never requested because the bound is exclusive -- confirm whether the
    limit should be inclusive.

    Args:
        batch_index: Zero-based index of the batch.
        batch_size: Number of pages per batch.
        page_limit: Exclusive upper bound for page numbers.

    Returns:
        A list of URLs; empty when the batch lies entirely at or beyond
        ``page_limit``.
    """
    first_page = batch_index * batch_size
    stop_page = min(first_page + batch_size, page_limit)
    return ['https://www.qidian.com/all?page={i}'.format(i=i) for i in range(first_page, stop_page)]


if __name__ == '__main__':
    # 51793
    for page in range(248, 519):
        batch_urls = build_batch_urls(page)
        if not batch_urls:
            # Batches are visited in increasing order, so once one is empty
            # every later one is empty too. Starting the spider without any
            # start URL is pointless (ruia is expected to reject it), so stop.
            break
        print(f"正在爬取第{page}")
        QidianNovelsSpider.start_urls = batch_urls
        # Other multi-item examples: https://gist.github.com/howie6879/3ef4168159e5047d42d86cb7fb706a2f
        QidianNovelsSpider.start(loop=loop, middleware=[ua_middleware, owl_middleware], close_event_loop=False)