86 lines
3.1 KiB
Python
86 lines
3.1 KiB
Python
# -*- coding:utf-8 -*-
|
||
# !/usr/bin/env python
|
||
import os
|
||
import time
|
||
|
||
from ruia import Spider, Item, TextField, AttrField
|
||
from ruia_ua import middleware
|
||
|
||
# Set MODE before the soulbook import that follows; presumably soulbook reads
# this variable at import time to select its production config — TODO confirm.
os.environ['MODE'] = 'PRO'
|
||
|
||
from soulbook.database.mongodb import MotorBaseOld
|
||
|
||
|
||
class QidianNovelInfoItem(Item):
    """Item describing the fields scraped from a Qidian novel info page.

    Every field is located with a CSS selector. The ``clean_<field>``
    coroutines below post-process the raw extracted value of the field
    with the matching name.
    """

    novel_name = TextField(css_select='.book-info>h1>em')
    author = TextField(css_select='a.writer')
    # When the value to extract is an element attribute, use AttrField.
    cover = AttrField(css_select='a#bookImg>img', attr='src')
    abstract = TextField(css_select='div.book-intro>p')
    status = TextField(css_select='p.tag>span.blue')
    novels_type = TextField(css_select='p.tag>a.red')
    latest_chapter = TextField(css_select='li.update>div.detail>p.cf>a')
    latest_chapter_time = TextField(css_select='div.detail>p.cf>em')

    @staticmethod
    def _join_texts(value):
        """Collapse an extracted field value into a single '#'-joined string.

        :param value: either an already-extracted string (single match) or
            an iterable of matches, each a string or an element exposing
            ``.text``.
        :return: the string itself, or the matches' texts joined with '#'.
        """
        if isinstance(value, str):
            # Single match: the field already holds plain text. Iterating it
            # would walk its characters, which have no ``.text`` attribute.
            return value
        return '#'.join(
            # ``.text`` is None for an element without leading text; join()
            # only accepts strings, so fall back to ''.
            node if isinstance(node, str) else (node.text or '')
            for node in value
        )

    async def clean_cover(self, cover):
        """Make a protocol-relative cover URL absolute.

        :param cover: the ``src`` attribute of the cover image.
        :return: ``cover`` prefixed with ``http:`` when it starts with
            ``//``; otherwise ``cover`` unchanged, so a URL that already
            carries a scheme is not corrupted.
        """
        if cover.startswith('//'):
            return 'http:' + cover
        return cover

    async def clean_status(self, status):
        """Normalise the status tag(s) into one '#'-separated string.

        Per the original author's note, a single matching element is
        delivered as an extracted value while several matches are delivered
        as a list; both shapes are accepted here.

        :param status: extracted string, or list of matched elements.
        :return: status text, multiple values joined with '#'.
        """
        return self._join_texts(status)

    async def clean_novels_type(self, novels_type):
        """Normalise the novel type tag(s) into one '#'-separated string.

        :param novels_type: extracted string, or list of matched elements.
        :return: type text, multiple values joined with '#'.
        """
        return self._join_texts(novels_type)

    async def clean_latest_chapter_time(self, latest_chapter_time):
        """Replace relative day words in the update time with real dates.

        '今天' (today) and '昨日' (yesterday) are replaced by the matching
        local date formatted as ``YYYY-MM-DD `` (note the trailing space).

        :param latest_chapter_time: raw update-time text from the page.
        :return: the text with relative day words substituted.
        """
        # Read the clock once so both dates derive from the same instant.
        now = time.time()
        today = time.strftime("%Y-%m-%d ", time.localtime(now))
        yesterday = time.strftime("%Y-%m-%d ", time.localtime(now - 24 * 60 * 60))
        return latest_chapter_time.replace(u'今天', today).replace(u'昨日', yesterday)
|
||
|
||
|
||
class QidianNovelInfoSpider(Spider):
    """Spider that scrapes Qidian novel info pages and upserts the result
    into the ``all_novels_info`` collection."""

    request_config = {'RETRIES': 3, 'TIMEOUT': 10}

    async def parse(self, res):
        """Extract one novel's info from a response and store it.

        :param res: response object; its ``html`` is parsed and its ``url``
            is recorded as ``target_url``.
        :return: None. The document is upserted, matched on
            ``novel_name`` + ``spider``.
        """
        database = MotorBaseOld().db
        novel = await QidianNovelInfoItem.get_item(html=res.html)

        # Fields copied verbatim from the parsed item, in storage order.
        scraped_fields = (
            'novel_name',
            'author',
            'cover',
            'abstract',
            'status',
            'novels_type',
            'latest_chapter',
            'latest_chapter_time',
        )
        document = {field: getattr(novel, field) for field in scraped_fields}
        # Bookkeeping values that do not come from the page itself.
        document['spider'] = 'qidian'
        document['target_url'] = res.url
        document['updated_at'] = time.strftime("%Y-%m-%d %X", time.localtime())

        print('获取 {} 小说信息成功'.format(novel.novel_name))

        selector = {
            'novel_name': document['novel_name'],
            'spider': document['spider'],
        }
        await database.all_novels_info.update_one(
            selector,
            {'$set': document},
            upsert=True,
        )
|
||
|
||
|
||
if __name__ == '__main__':
    # Script entry point: crawl a fixed set of Qidian novel pages.
    # More multi-item examples:
    # https://gist.github.com/howie6879/3ef4168159e5047d42d86cb7fb706a2f
    QidianNovelInfoSpider.start_urls = [
        'https://book.qidian.com/info/1004608738',
        'https://book.qidian.com/info/3602691',
        'https://book.qidian.com/info/3347595',
        'https://book.qidian.com/info/1887208',
    ]
    QidianNovelInfoSpider.start(middleware=middleware)
|