76 lines
2.4 KiB
Python
76 lines
2.4 KiB
Python
#!/usr/bin/env python
|
|
"""
|
|
Created by howie.hu at 29/11/2017.
|
|
"""
|
|
import time
|
|
|
|
from ruia import Spider, Item, AttrField, HtmlField, TextField
|
|
from ruia_ua import middleware
|
|
|
|
from soulbook.database.mongodb import MotorBaseOld
|
|
|
|
|
|
class RankingItem(Item):
    """Declarative ruia item describing one ranking panel on the rank page.

    Each attribute is a ruia field bound to a CSS selector:

    * ``ranking_title`` -- text of ``div.rank_i_p_tit``.
    * ``more`` -- ``href`` attribute of the link under ``div.rank_i_more``.
    * ``book_list`` -- HTML of every ``div.rank_i_li`` entry (``many=True``).
    """

    # NOTE(review): presumably ruia uses ``target_item`` as the repeated
    # container when ``get_items`` is called -- confirm against the ruia docs.
    target_item = TextField(css_select="div.rank_i_p_list")

    ranking_title = TextField(css_select="div.rank_i_p_tit")

    more = AttrField(
        css_select="div.rank_i_more a",
        attr="href",
    )

    book_list = HtmlField(
        css_select="div.rank_i_p_list>div.rank_i_li",
        many=True,
    )
|
|
|
|
|
|
class NameItem(Item):
    """Declarative ruia item holding the book name of one ranking entry.

    Two selectors are declared, both defaulting to an empty string:

    * ``top_name`` -- text of ``div.rank_i_bname a.rank_i_l_a_book``.
    * ``other_name`` -- text of ``div.rank_i_bname a``.
    """

    top_name = TextField(
        css_select="div.rank_i_bname a.rank_i_l_a_book",
        default="",
    )

    other_name = TextField(
        css_select="div.rank_i_bname a",
        default="",
    )
|
|
|
|
|
|
class ZHRankingSpider(Spider):
    """Spider that collects the zongheng ranking lists and stores them.

    ``parse`` turns the response HTML into a list of ranking dicts and hands
    the assembled payload to ``save``, which upserts it into the
    ``novels_ranking`` collection keyed by ``target_url``.
    """

    start_urls = ['http://book.zongheng.com/rank.html']

    concurrency = 3

    async def parse(self, res):
        """Extract every ranking from ``res`` and persist the result.

        :param res: response object; ``res.html`` and ``res.url`` are read.
        """
        rankings = []

        async for ranking in RankingItem.get_items(html=res.html):
            top_books = []
            # Only keep the data of the top ten ranked books.
            for position, book_html in enumerate(ranking.book_list[:10], start=1):
                book = await NameItem.get_item(html=book_html)
                # Fall back to the generic link text when the dedicated
                # selector produced an empty value.
                book_name = book.top_name or book.other_name
                top_books.append({'num': position, 'name': book_name})

            rankings.append({
                'title': ranking.ranking_title,
                'more': ranking.more,
                'book_list': top_books,
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            })

        payload = {
            'data': rankings,
            'target_url': res.url,
            'type': "人气榜单",
            'spider': "zongheng",
        }
        await self.save(payload)

    async def save(self, res_dic):
        """Upsert the ranking payload into ``novels_ranking``.

        :param res_dic: dict with the keys ``target_url``, ``data``,
            ``spider`` and ``type``.

        Any exception raised while building or running the update is logged
        through ``self.logger`` and not re-raised.
        """
        try:
            database = MotorBaseOld().db
            update_one = database.novels_ranking.update_one
            # The document is identified by the page it was scraped from.
            selector = {'target_url': res_dic['target_url']}
            changes = {
                '$set': {
                    'data': res_dic['data'],
                    'spider': res_dic['spider'],
                    'type': res_dic['type'],
                    'finished_at': time.strftime("%Y-%m-%d %X", time.localtime()),
                },
            }
            await update_one(selector, changes, upsert=True)
        except Exception as e:
            self.logger.exception(e)
|
|
|
|
|
|
# Script entry point: run the spider with the ruia_ua middleware attached.
if __name__ == "__main__":
    ZHRankingSpider.start(
        middleware=middleware,
    )
|