SoulBook/soulbook/spiders/zongheng_novel_info.py
2024-08-01 19:38:07 +08:00

119 lines
3.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# -*- coding:utf-8 -*-
# !/usr/bin/env python
import asyncio
import time
from pprint import pprint
from ruia import Spider, Item, TextField, AttrField
from ruia_ua import middleware as ua_middleware
from soulbook.database.mongodb import MotorBase
from soulbook.spiders.middlewares import owl_middleware
# Switch asyncio to uvloop's event loop policy when uvloop is installed;
# without it the default asyncio policy stays in effect.
try:
    import uvloop
except ImportError:
    pass
else:
    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

# Module-wide event loop shared by the spider's database handle.
try:
    loop = asyncio.get_event_loop()
except RuntimeError:
    # No current loop is available (e.g. imported from a non-main thread, or
    # a Python version that no longer creates one implicitly): make our own.
    loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
class ZHNovelInfoItem(Item):
    """
    Item subclass declaring the fields extracted from a novel info page.

    Every field is located with a CSS selector. The ``clean_<field>``
    coroutines normalise the extracted value of the matching field
    (presumably invoked by ruia during extraction -- confirm against the
    installed ruia version).
    """
    novel_name = TextField(css_select='div.main div.status h1 a')
    author = TextField(css_select='div.main div.status div.booksub a')
    # Use AttrField when the wanted value is an element attribute, not its text.
    cover = AttrField(css_select='div.main div.book_cover img', attr='src')
    abstract = TextField(css_select='div.main div.status div.info_con p')
    status = AttrField(css_select='div.main div.status h1 em', attr='title')
    # Same selector as ``author``; clean_novels_type picks the second match.
    novels_type = TextField(css_select='div.main div.status div.booksub a')
    novel_chapter_url = AttrField(css_select='div.main div.status div.book_btn span.list a', attr='href')

    async def clean_author(self, author):
        """
        Reduce the extracted author value to a single value.

        :param author: extracted value, either a list of matched elements
            or an already-extracted value
        :return: ``text`` of the first element when a list is given,
            otherwise ``author`` unchanged
        """
        if isinstance(author, list):
            return author[0].text
        return author

    async def clean_status(self, status):
        """
        When only one element matches, its value is extracted by default;
        otherwise a list is returned, and a function like this one can loop
        over the list to extract each value.

        :param status: extracted value, either a list of matched elements
            or an already-extracted value
        :return: for a list, the ``title`` of every element (stripped, with
            '作品' removed) joined by '#'; otherwise ``status`` unchanged
        """
        if isinstance(status, list):
            return '#'.join([i.get('title').strip().replace('作品', '') for i in status])
        return status

    async def clean_novels_type(self, novels_type):
        """
        Pick the novel type out of the matched elements.

        :param novels_type: extracted value, expected to be a list of
            matched elements
        :return: ``text`` of the second element, or '' when the value is not
            a list or the second element cannot be read
        """
        if not isinstance(novels_type, list):
            return ''
        try:
            return novels_type[1].text
        except (IndexError, AttributeError):
            # Only the expected lookup failures (fewer than two matches, or an
            # entry without ``text``) mean "no type"; anything else propagates
            # instead of being silently swallowed by a bare ``except``.
            return ''
class ZHNovelInfoSpider(Spider):
    """
    Spider that reads novel info pages and upserts one document per novel
    into the ``all_novels_info`` collection.
    """
    # Filled in by the caller before the spider is started.
    start_urls = []
    request_config = {'RETRIES': 3, 'DELAY': 2, 'TIMEOUT': 10}
    # Database handle created once, bound to the module-level event loop.
    motor_db = MotorBase(loop=loop).get_db()

    async def parse(self, res):
        """
        Extract the novel info from a response and store it.

        :param res: response object; its ``html`` and ``url`` are read
        :return: None
        """
        novel = await ZHNovelInfoItem.get_item(html=res.html)

        # Copy the scraped fields first so the stored key order is stable.
        scraped_fields = (
            'novel_name',
            'author',
            'cover',
            'abstract',
            'status',
            'novels_type',
            'novel_chapter_url',
        )
        record = {field: getattr(novel, field) for field in scraped_fields}
        record['target_url'] = res.url
        record['spider'] = 'zongheng'
        record['updated_at'] = time.strftime("%Y-%m-%d %X", time.localtime())

        print('获取 {} 小说信息成功'.format(record['novel_name']))
        print(record)

        # One document per (novel_name, spider): update it, or insert if absent.
        lookup = {'novel_name': record['novel_name'], 'spider': 'zongheng'}
        await self.motor_db.all_novels_info.update_one(
            lookup,
            {'$set': record},
            upsert=True,
        )
if __name__ == '__main__':
    # Only referenced by the commented-out batch runner further down.
    import random

    # More multi-Item examples: https://gist.github.com/howie6879/3ef4168159e5047d42d86cb7fb706a2f
    seed_urls = ['http://book.zongheng.com/book/672340.html']
    active_middleware = [ua_middleware, owl_middleware]

    # Crawl the single sample page with both middlewares enabled.
    ZHNovelInfoSpider.start_urls = seed_urls
    ZHNovelInfoSpider.start(middleware=active_middleware)
# def all_novels_info():
# all_urls = []
#
# for each in ZHNovelInfoSpider.all_novels_col.find({'spider': 'zongheng'}):
# if 'zongheng' in each['novel_url']:
# all_urls.append(each['novel_url'])
# random.shuffle(all_urls)
#
# ZHNovelInfoSpider.start_urls = all_urls
# ZHNovelInfoSpider.start()
#
#
# all_novels_info()