SoulBook/soulbook/spiders/heiyan_novel_info.py
2024-08-01 19:38:07 +08:00

103 lines
4.0 KiB
Python

#!/usr/bin/env python
"""
Created by howie.hu at 30/03/2018.
"""
import time
from pprint import pprint
from ruia import Spider, Item, TextField, AttrField
from ruia_ua import middleware as ua_middleware
from soulbook.database.mongodb import MotorBase
class HYNovelInfoItem(Item):
    """
    Ruia item describing a single heiyan novel detail page.

    Every field except ``novel_chapter_url`` is read from the ``content``
    attribute of an Open Graph style ``<meta property="og:...">`` tag;
    ``novel_chapter_url`` is the ``href`` of the ``a.index`` anchor inside
    ``div#voteList``.
    """
    novel_name = AttrField(css_select="meta[property='og:title']", attr='content')
    author = AttrField(css_select="meta[property='og:novel:author']", attr='content')
    cover = AttrField(css_select="meta[property='og:image']", attr='content')
    abstract = AttrField(css_select="meta[property='og:description']", attr='content')
    status = AttrField(css_select="meta[property='og:novel:status']", attr='content')
    novels_type = AttrField(css_select="meta[property='og:novel:category']", attr='content')
    novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')
    latest_chapter = AttrField(css_select="meta[property='og:novel:latest_chapter_name']", attr='content')
    latest_chapter_url = AttrField(css_select="meta[property='og:novel:latest_chapter_url']", attr='content')
    latest_chapter_time = AttrField(css_select="meta[property='og:novel:update_time']", attr='content')

    # Alternative DOM-based selectors for the same fields, kept for reference:
    # novel_name = TextField(css_select='div.c-left>div.mod>div.hd>h2')
    # author = TextField(css_select='div.author-zone div.right a.name strong')
    # cover = AttrField(css_select='img.book-cover', attr='src')
    # abstract = TextField(css_select='pre.note')
    # status = ''
    # novels_type = TextField(css_select='div.c-left>div.mod>div.hd>p.infos>span.cate>a')
    # latest_chapter = ''
    # novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')

    async def clean_cover(self, cover):
        """
        Upgrade a plain-HTTP cover URL to HTTPS.

        Only a leading ``http://`` scheme is rewritten. Any other value
        (already ``https://``, protocol-relative, etc.) is returned unchanged,
        and an ``http`` substring later in the URL is never touched.

        :param cover: cover image URL extracted from the page
        :return: the cover URL, using ``https://`` if it started with ``http://``
        """
        insecure_scheme = 'http://'
        if cover.startswith(insecure_scheme):
            return 'https://' + cover[len(insecure_scheme):]
        return cover

    async def clean_novels_type(self, novels_type):
        """
        Map a site-specific category name onto the name used by this project.

        The lookup key is the stripped string form of ``novels_type``; when no
        mapping exists the original value is returned as-is (not stripped).

        :param novels_type: category extracted from the page
        :return: the mapped category, or ``novels_type`` unchanged
        """
        types_dict = {
            '社会': '都市'
        }
        return types_dict.get(str(novels_type).strip(), novels_type)

    async def clean_latest_chapter_time(self, latest_chapter_time):
        """
        Replace the relative day words in the update time with concrete dates.

        ``今天`` becomes the current local date and ``昨日`` the local date
        24 hours earlier, each formatted as ``"%Y-%m-%d "`` (note the trailing
        space, which separates the date from whatever text follows the word).

        :param latest_chapter_time: update-time text extracted from the page
        :return: the text with both relative day words substituted
        """
        # Read the clock once so both dates derive from the same instant.
        now = time.time()
        today = time.strftime("%Y-%m-%d ", time.localtime(now))
        yesterday = time.strftime("%Y-%m-%d ", time.localtime(now - 24 * 60 * 60))
        return latest_chapter_time.replace(u'今天', today).replace(u'昨日', yesterday)
class HYNovelInfoSpider(Spider):
    """
    Ruia spider that extracts novel metadata from heiyan pages and upserts it
    into the ``all_novels_info`` collection.

    ``start_urls`` is empty by default and is expected to be assigned by the
    caller before ``start()`` is invoked.
    """
    start_urls = []
    request_config = {
        'RETRIES': 3,
        'TIMEOUT': 10
    }

    def _get_motor_db(self):
        """
        Return the database handle for this spider, creating it on first use.

        The handle is cached on ``self.motor_db`` so that ``parse`` and
        ``save`` share one object instead of each building their own.
        NOTE(review): assumes the object returned by ``MotorBase.get_db()`` is
        safe to reuse across calls -- confirm against ``MotorBase``.
        """
        if getattr(self, 'motor_db', None) is None:
            self.motor_db = MotorBase(loop=self.loop).get_db()
        return self.motor_db

    async def parse(self, res):
        """
        Build a novel-info record from one response and persist it.

        :param res: response object; ``res.html`` is parsed and ``res.url`` is
            stored as ``target_url``
        """
        # Initialise the handle here (outside save's try/except) so that a
        # failure to obtain it still propagates from parse.
        self._get_motor_db()
        item = await HYNovelInfoItem.get_item(html=res.html)
        item_data = {
            'novel_name': item.novel_name,
            'author': item.author,
            'cover': item.cover,
            'abstract': item.abstract,
            'status': item.status,
            'novels_type': item.novels_type,
            'novel_chapter_url': item.novel_chapter_url,
            'latest_chapter': item.latest_chapter,
            'latest_chapter_time': item.latest_chapter_time,
            'spider': 'heiyan',
            'target_url': res.url,
            'updated_at': time.strftime("%Y-%m-%d %X", time.localtime())
        }
        print('获取 {} 小说信息成功'.format(item_data['novel_name']))
        await self.save(res_dic=item_data)

    async def save(self, res_dic):
        """
        Upsert one novel-info record, keyed by novel name and spider name.

        Any exception raised while obtaining the handle or writing is logged
        through ``self.logger`` and not re-raised.

        :param res_dic: record to store; must contain ``novel_name``
        """
        try:
            motor_db = self._get_motor_db()
            await motor_db.all_novels_info.update_one(
                {'novel_name': res_dic['novel_name'], 'spider': 'heiyan'},
                {'$set': res_dic},
                upsert=True)
        except Exception as e:
            self.logger.exception(e)
if __name__ == '__main__':
    # Manual run: crawl a single hard-coded book page.
    seed_urls = ['http://www.heiyan.com/book/62599']
    HYNovelInfoSpider.start_urls = seed_urls
    # HYNovelInfoSpider.start_urls = [each.get('novel_url', '') for each in search_author('火星引力', 'qidian')]
    print(HYNovelInfoSpider.start_urls)
    HYNovelInfoSpider.start(middleware=ua_middleware)