#!/usr/bin/env python
"""
Created by howie.hu at 30/03/2018.
"""
|
|
import time
|
|
|
|
from pprint import pprint
|
|
|
|
from ruia import Spider, Item, TextField, AttrField
|
|
from ruia_ua import middleware as ua_middleware
|
|
|
|
from soulbook.database.mongodb import MotorBase
|
|
|
|
|
|
class HYNovelInfoItem(Item):
    """
    Item holding the metadata scraped from a single novel detail page.

    Every field except ``novel_chapter_url`` is taken from the ``content``
    attribute of an ``og:*`` ``<meta>`` tag; ``novel_chapter_url`` is the
    ``href`` of the ``div#voteList a.index`` link.

    The ``clean_<field>`` coroutines post-process the raw extracted value of
    the field they are named after.
    NOTE(review): they are presumably invoked by ruia's ``Item`` as per-field
    clean hooks -- confirm against the ruia version in use.
    """
    novel_name = AttrField(css_select="meta[property='og:title']", attr='content')
    author = AttrField(css_select="meta[property='og:novel:author']", attr='content')
    cover = AttrField(css_select="meta[property='og:image']", attr='content')
    abstract = AttrField(css_select="meta[property='og:description']", attr='content')
    status = AttrField(css_select="meta[property='og:novel:status']", attr='content')
    novels_type = AttrField(css_select="meta[property='og:novel:category']", attr='content')
    novel_chapter_url = AttrField(css_select='div#voteList a.index', attr='href')
    latest_chapter = AttrField(css_select="meta[property='og:novel:latest_chapter_name']", attr='content')
    latest_chapter_url = AttrField(css_select="meta[property='og:novel:latest_chapter_url']", attr='content')
    latest_chapter_time = AttrField(css_select="meta[property='og:novel:update_time']", attr='content')

    async def clean_cover(self, cover):
        """
        Upgrade a cover URL that starts with ``http://`` to ``https://``.

        Only the leading scheme is rewritten. A blanket
        ``replace('http', 'https')`` would also mangle any ``http`` that
        appears later in the path or query string, and a substring test for
        ``'https'`` would skip plain-http URLs that merely mention ``https``
        somewhere else.

        :param cover: raw cover image URL extracted from the page
        :return: the URL with an ``https://`` scheme when it was ``http://``,
                 otherwise the value unchanged
        """
        insecure_scheme = 'http://'
        if cover.startswith(insecure_scheme):
            return 'https://' + cover[len(insecure_scheme):]
        return cover

    async def clean_novels_type(self, novels_type):
        """
        Map the site's category name onto the category name stored locally.

        :param novels_type: raw category text extracted from the page
        :return: the mapped category when the stripped text has a mapping,
                 otherwise the original value unchanged
        """
        types_dict = {
            '社会': '都市',
        }
        return types_dict.get(str(novels_type).strip(), novels_type)

    async def clean_latest_chapter_time(self, latest_chapter_time):
        """
        Replace the relative day words '今天' / '昨日' with concrete dates.

        Both dates are derived from a single clock reading so that "today"
        and "yesterday" stay exactly one day apart even if the call happens
        right at midnight.

        :param latest_chapter_time: raw update-time text extracted from the page
        :return: the text with each relative day word replaced by a local
                 ``YYYY-MM-DD `` date (trailing space included)
        """
        day_format = "%Y-%m-%d "
        now = time.time()
        today = time.strftime(day_format, time.localtime(now))
        yesterday = time.strftime(day_format, time.localtime(now - 24 * 60 * 60))
        return latest_chapter_time.replace('今天', today).replace('昨日', yesterday)
|
|
|
|
|
|
class HYNovelInfoSpider(Spider):
    """
    Spider that scrapes novel detail pages and upserts the result into the
    ``all_novels_info`` collection.

    ``start_urls`` is empty by default; callers assign it before starting
    the spider.
    """
    start_urls = []
    request_config = {
        'RETRIES': 3,
        'TIMEOUT': 10,
    }

    async def parse(self, res):
        """
        Extract a ``HYNovelInfoItem`` from the response and persist it.

        :param res: response object; ``res.html`` and ``res.url`` are read
        """
        # Bind the database handle to this spider's event loop; save() reuses it.
        self.motor_db = MotorBase(loop=self.loop).get_db()
        item = await HYNovelInfoItem.get_item(html=res.html)

        item_data = {
            'novel_name': item.novel_name,
            'author': item.author,
            'cover': item.cover,
            'abstract': item.abstract,
            'status': item.status,
            'novels_type': item.novels_type,
            'novel_chapter_url': item.novel_chapter_url,
            'latest_chapter': item.latest_chapter,
            'latest_chapter_time': item.latest_chapter_time,
            'spider': 'heiyan',
            'target_url': res.url,
            'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
        }

        print('获取 {} 小说信息成功'.format(item_data['novel_name']))
        await self.save(res_dic=item_data)

    async def save(self, res_dic):
        """
        Upsert one novel record, keyed by novel name and spider name.

        Failures are logged through ``self.logger`` and not re-raised, so a
        failed write does not abort the crawl.

        :param res_dic: document to store; must contain ``'novel_name'``
        """
        try:
            # Reuse the loop-bound handle created in parse() instead of
            # building a second one that is not tied to self.loop. Compare
            # with None explicitly rather than relying on truthiness.
            motor_db = getattr(self, 'motor_db', None)
            if motor_db is None:
                motor_db = MotorBase(loop=self.loop).get_db()
                self.motor_db = motor_db
            await motor_db.all_novels_info.update_one(
                {
                    'novel_name': res_dic['novel_name'],
                    'spider': res_dic.get('spider', 'heiyan'),
                },
                {'$set': res_dic},
                upsert=True,
            )
        except Exception as e:
            self.logger.exception(e)
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual run: crawl a single known book page.
    # An earlier variant seeded the URLs from search_author('火星引力', 'qidian');
    # that helper is not among the imports visible in this file.
    seed_urls = ['http://www.heiyan.com/book/62599']
    HYNovelInfoSpider.start_urls = seed_urls
    print(HYNovelInfoSpider.start_urls)
    HYNovelInfoSpider.start(middleware=ua_middleware)
|