SoulBook/soulbook/spiders/zh_ranking.py
2024-08-01 19:38:07 +08:00

76 lines
2.4 KiB
Python

#!/usr/bin/env python
"""
Created by howie.hu at 29/11/2017.
"""
import time
from ruia import Spider, Item, AttrField, HtmlField, TextField
from ruia_ua import middleware
from soulbook.database.mongodb import MotorBaseOld
class RankingItem(Item):
    """Fields extracted for one ranking panel of the Zongheng rank page.

    Each field is located with a CSS selector. ``book_list`` is declared with
    ``many=True`` and holds the raw HTML of every book row of the panel.
    """

    # NOTE(review): ruia presumably treats ``target_item`` as the node that
    # delimits one item when ``get_items`` is used -- confirm in ruia's docs.
    target_item = TextField(
        css_select='div.rank_i_p_list',
    )
    # Text of the panel's title node.
    ranking_title = TextField(
        css_select='div.rank_i_p_tit',
    )
    # ``href`` attribute of the link inside the panel's "more" node.
    more = AttrField(
        css_select='div.rank_i_more a',
        attr='href',
    )
    # HTML of each ``div.rank_i_li`` row directly under the list container.
    book_list = HtmlField(
        css_select='div.rank_i_p_list>div.rank_i_li',
        many=True,
    )
class NameItem(Item):
    """Book title extracted from the HTML of a single ranking row.

    Two selectors are declared; both fall back to an empty string when nothing
    matches, so a caller can pick the first non-empty one.
    """

    # Title link carrying the ``rank_i_l_a_book`` class.
    top_name = TextField(
        css_select='div.rank_i_bname a.rank_i_l_a_book',
        default='',
    )
    # Any link inside the name container.
    other_name = TextField(
        css_select='div.rank_i_bname a',
        default='',
    )
class ZHRankingSpider(Spider):
    """Spider that scrapes the Zongheng ranking page and stores the result.

    ``parse`` turns the page into a list of rankings (title, "more" link and
    the first ten book names of each) and hands the assembled document to
    ``save``, which upserts it into the ``novels_ranking`` collection.
    """

    start_urls = ['http://book.zongheng.com/rank.html']
    concurrency = 3

    async def parse(self, res):
        """Extract every ranking from the response and persist the result.

        :param res: response object; its ``html`` and ``url`` attributes are
            read.
        """
        rankings = []

        async for ranking in RankingItem.get_items(html=res.html):
            top_books = []
            # Only keep the data of the ten highest-ranked books.
            for position, book_html in enumerate(ranking.book_list[:10], start=1):
                names = await NameItem.get_item(html=book_html)
                top_books.append({
                    'num': position,
                    # Prefer the dedicated title link, else any link found.
                    'name': names.top_name or names.other_name,
                })
            rankings.append({
                'title': ranking.ranking_title,
                'more': ranking.more,
                'book_list': top_books,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            })

        payload = {
            'data': rankings,
            'target_url': res.url,
            'type': "人气榜单",
            'spider': "zongheng",
        }
        await self.save(payload)

    async def save(self, res_dic):
        """Upsert the scraped rankings, keyed by their target URL.

        Any exception raised while building or running the update is logged
        through ``self.logger`` and not re-raised.

        :param res_dic: dict with the keys ``target_url``, ``data``,
            ``spider`` and ``type``.
        """
        try:
            collection = MotorBaseOld().db.novels_ranking
            selector = {'target_url': res_dic['target_url']}
            changes = {
                '$set': {
                    'data': res_dic['data'],
                    'spider': res_dic['spider'],
                    'type': res_dic['type'],
                    'finished_at': time.strftime("%Y-%m-%d %X", time.localtime()),
                },
            }
            # upsert=True creates the document when no match exists yet.
            await collection.update_one(selector, changes, upsert=True)
        except Exception as err:
            self.logger.exception(err)
if __name__ == "__main__":
    # Script entry point: start the spider with the imported middleware.
    ZHRankingSpider.start(middleware=middleware)