121 lines
4.1 KiB
Python
121 lines
4.1 KiB
Python
#!/usr/bin/env python
|
||
"""
|
||
Created by howie.hu at 25/02/2018.
|
||
Target URI: https://www.qidian.com/all
|
||
Param:?page=1
|
||
"""
|
||
import asyncio
|
||
import os
|
||
import time
|
||
|
||
from ruia import Spider, Item, TextField, AttrField
|
||
from ruia_ua import middleware as ua_middleware
|
||
|
||
# os.environ['MODE'] = 'PRO'
|
||
|
||
from soulbook.database.mongodb import MotorBase
|
||
from soulbook.spiders.middlewares import owl_middleware
|
||
|
||
# Switch asyncio to uvloop's event loop policy when uvloop can be imported;
# if it is not installed the ImportError is ignored and the default policy
# stays in effect.
try:
    import uvloop

    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    pass

# Module-wide event loop. It is fetched only after the policy has been
# configured above, registered as the current loop, and then handed to
# MotorBase and to QidianNovelsSpider.start() further down in this file.
loop = asyncio.get_event_loop()
asyncio.set_event_loop(loop)
|
||
|
||
|
||
def _ensure_https(url):
    """Return *url* with an ``https:`` prefix unless it is already absolute.

    The cleaners below originally prepended ``'https:'`` unconditionally,
    which turns an already-absolute ``http://`` / ``https://`` value into a
    malformed ``https:https://...`` string. Such values are now returned
    unchanged; every other input is handled exactly as before.
    """
    if isinstance(url, str) and url.startswith(('http://', 'https://')):
        return url
    return 'https:' + url


class QidianNovelsItem(Item):
    """Fields extracted for one novel entry of the Qidian listing page.

    ``target_item`` selects the repeated ``<li>`` container; the remaining
    selectors are evaluated relative to it (per ruia's ``Item`` convention).
    The ``clean_<field>`` coroutines post-process the matching field value.
    """

    target_item = TextField(css_select='ul.all-img-list>li')
    novel_url = AttrField(css_select='div.book-img-box>a', attr='href')
    novel_name = TextField(css_select='div.book-mid-info>h4')
    novel_author = TextField(css_select='div.book-mid-info>p.author>a.name')
    novel_author_home_url = AttrField(css_select='div.book-mid-info>p.author>a.name', attr='href')
    novel_type = TextField(css_select='div.book-mid-info > p.author > a:nth-child(4)')
    novel_cover = AttrField(css_select='div.book-img-box img', attr='src')
    novel_abstract = TextField(css_select='div.book-mid-info p.intro')

    # novel_latest_chapter = TextField(css_select='div.bookupdate a')

    async def clean_novel_url(self, novel_url):
        """Return the novel URL with an ``https:`` scheme."""
        return _ensure_https(novel_url)

    async def clean_novel_author(self, novel_author):
        """Return the author name; a list of elements is reduced to the first one's text."""
        if isinstance(novel_author, list):
            novel_author = novel_author[0].text
        return novel_author

    async def clean_novel_author_home_url(self, novel_author_home_url):
        """Return the author's home page URL with an ``https:`` scheme.

        When a list of elements is received, the stripped ``href`` of the
        first element is used.
        """
        if isinstance(novel_author_home_url, list):
            novel_author_home_url = novel_author_home_url[0].get('href').strip()
        return _ensure_https(novel_author_home_url)

    async def clean_novel_cover(self, novel_cover):
        """Return the cover image URL with an ``https:`` scheme."""
        return _ensure_https(novel_cover)
|
||
|
||
|
||
class QidianNovelsSpider(Spider):
    """Spider that upserts every novel found on a listing page into MongoDB.

    ``start_urls`` is assigned by the ``__main__`` section of this file
    before each run.
    """
    # start_urls = ['https://www.qidian.com/all?page=1']

    request_config = {
        'RETRIES': 15,
        'DELAY': 0,
        'TIMEOUT': 3
    }
    concurrency = 20
    motor_db = MotorBase(loop=loop).get_db()

    async def parse(self, res):
        """Extract all novels from ``res.html``, save them concurrently and print a summary."""
        items_data = await QidianNovelsItem.get_items(html=res.html)

        # One save task per extracted novel; each record is stamped when built.
        save_jobs = []
        for novel in items_data:
            record = dict(
                novel_url=novel.novel_url,
                novel_name=novel.novel_name,
                novel_author=novel.novel_author,
                novel_author_home_url=novel.novel_author_home_url,
                novel_type=novel.novel_type,
                novel_cover=novel.novel_cover,
                novel_abstract=novel.novel_abstract,
                spider='qidian',
                updated_at=time.strftime("%Y-%m-%d %X", time.localtime()),
            )
            save_jobs.append(asyncio.ensure_future(self.save(record)))

        good_nums = 0
        if save_jobs:
            # asyncio.wait is only called with a non-empty collection of tasks.
            finished, _ = await asyncio.wait(save_jobs)
            good_nums = sum(1 for job in finished if job.result())
        print(f"共{len(save_jobs)}本小说,抓取成功{good_nums}本")

    async def save(self, res_dic):
        """Upsert ``res_dic`` into ``all_novels``, keyed by URL and name.

        Returns ``True`` when the write succeeded; any exception is logged
        through ``self.logger`` and reported as ``False``.
        """
        # Persist to the database
        try:
            selector = {
                'novel_url': res_dic['novel_url'],
                'novel_name': res_dic['novel_name'],
            }
            await self.motor_db.all_novels.update_one(selector, {'$set': res_dic}, upsert=True)
            print(res_dic['novel_name'] + ' - 抓取成功')
        except Exception as e:
            self.logger.exception(e)
            return False
        return True
|
||
|
||
|
||
def build_start_urls(batch, batch_size=100, page_limit=51793):
    """Return the listing-page URLs crawled in one batch.

    Batch ``batch`` covers page numbers from ``batch * batch_size`` up to, but
    not including, ``(batch + 1) * batch_size``. The upper bound is clamped to
    ``page_limit``, so the result is empty once a batch starts at or beyond
    the limit.

    NOTE(review): ``page_limit`` itself is excluded, exactly as in the
    original clamping logic; 51793 is presumably the site's page count when
    this was written -- confirm whether that last page should be crawled.
    """
    first = batch * batch_size
    stop = min(first + batch_size, page_limit)
    return [f'https://www.qidian.com/all?page={i}' for i in range(first, stop)]


if __name__ == '__main__':
    # 51793
    for page in range(248, 519):
        urls = build_start_urls(page)
        if not urls:
            # The batch starts past the page limit (this happens for page 518):
            # there is nothing left to crawl, so do not start a spider with an
            # empty start_urls list.
            break
        print(f"正在爬取第{page}页")
        QidianNovelsSpider.start_urls = urls
        # More multi-item examples: https://gist.github.com/howie6879/3ef4168159e5047d42d86cb7fb706a2f
        QidianNovelsSpider.start(loop=loop, middleware=[ua_middleware, owl_middleware], close_event_loop=False)
|