# Listing metadata from the code viewer: 119 lines, 3.6 KiB, Python
# -*- coding:utf-8 -*-
|
||
# !/usr/bin/env python
|
||
import asyncio
|
||
import time
|
||
|
||
from pprint import pprint
|
||
|
||
from ruia import Spider, Item, TextField, AttrField
|
||
|
||
from ruia_ua import middleware as ua_middleware
|
||
|
||
from soulbook.database.mongodb import MotorBase
|
||
from soulbook.spiders.middlewares import owl_middleware
|
||
|
||
# Prefer uvloop's event loop implementation when it is installed; otherwise
# keep asyncio's default policy.
try:
    import uvloop

    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    pass

# Module-level loop shared with the database handle created further down.
# asyncio.get_event_loop() is deprecated when no loop is current and raises
# RuntimeError on newer Python versions, so create a fresh loop in that case.
try:
    loop = asyncio.get_event_loop()
except RuntimeError:
    loop = asyncio.new_event_loop()
# Make sure the chosen loop is the current one for this thread.
asyncio.set_event_loop(loop)
|
||
|
||
|
||
class ZHNovelInfoItem(Item):
    """
    Item describing the fields extracted from one novel detail page.

    Every field is declared with a CSS selector. The ``clean_<field>``
    coroutines normalise an extracted value; presumably ruia's Item
    machinery calls them after extraction (confirm against the ruia
    version in use).
    """
    novel_name = TextField(css_select='div.main div.status h1 a')
    author = TextField(css_select='div.main div.status div.booksub a')
    # Use AttrField when the wanted value is an element attribute, not its text.
    cover = AttrField(css_select='div.main div.book_cover img', attr='src')
    abstract = TextField(css_select='div.main div.status div.info_con p')
    status = AttrField(css_select='div.main div.status h1 em', attr='title')
    novels_type = TextField(css_select='div.main div.status div.booksub a')
    novel_chapter_url = AttrField(css_select='div.main div.status div.book_btn span.list a', attr='href')

    async def clean_author(self, author):
        """
        Return the author name.

        :param author: the extracted value; either an already-extracted
            value or a list of matched elements.
        :return: the ``text`` of the first element when a list is given,
            otherwise ``author`` unchanged.
        """
        if isinstance(author, list):
            return author[0].text
        return author

    async def clean_status(self, status):
        """
        Normalise the publication status.

        When only one element matches, the value is extracted by default;
        otherwise a list is returned and is reduced to one string here.

        :param status: the extracted value; either a single value or a list
            of matched elements.
        :return: for a list, the ``title`` attribute of every element
            (stripped, with '作品' removed) joined with '#'; otherwise
            ``status`` unchanged.
        """
        if isinstance(status, list):
            return '#'.join([i.get('title').strip().replace('作品', '') for i in status])
        return status

    async def clean_novels_type(self, novels_type):
        """
        Return the novel category.

        The selector is the same one used for ``author``; the category is
        taken from the second matched element.

        :param novels_type: the extracted value; a list of matched elements
            is expected.
        :return: the ``text`` of the second element, or '' when the value is
            not a list or the second element is unavailable.
        """
        if not isinstance(novels_type, list):
            return ''
        try:
            return novels_type[1].text
        except (IndexError, AttributeError):
            # Fewer than two matches, or the match carries no ``text``:
            # keep the best-effort empty category without hiding
            # unrelated errors as a bare ``except`` would.
            return ''
|
||
|
||
|
||
class ZHNovelInfoSpider(Spider):
    """
    Spider that reads a novel detail page and stores its metadata.

    ``start_urls`` is left empty here and is filled in by the caller before
    the spider is started.
    """
    start_urls = []
    # Request settings handed to ruia (presumably retry count, delay and
    # timeout in seconds; confirm against the ruia documentation).
    request_config = {'RETRIES': 3, 'DELAY': 2, 'TIMEOUT': 10}
    # Database handle bound to the module-level event loop.
    motor_db = MotorBase(loop=loop).get_db()

    async def parse(self, res):
        """
        Extract the novel fields from ``res`` and upsert them.

        :param res: the response object; ``res.html`` and ``res.url`` are read.
        """
        item = await ZHNovelInfoItem.get_item(html=res.html)

        # Fields copied verbatim from the extracted item, in storage order.
        copied_fields = (
            'novel_name',
            'author',
            'cover',
            'abstract',
            'status',
            'novels_type',
            'novel_chapter_url',
        )
        item_data = {field: getattr(item, field) for field in copied_fields}
        item_data.update(
            target_url=res.url,
            spider='zongheng',
            updated_at=time.strftime("%Y-%m-%d %X", time.localtime()),
        )

        novel_name = item_data['novel_name']
        print('获取 {} 小说信息成功'.format(novel_name))
        print(item_data)

        # One document per (novel_name, spider); insert it when missing.
        lookup = {'novel_name': novel_name, 'spider': 'zongheng'}
        await self.motor_db.all_novels_info.update_one(
            lookup,
            {'$set': item_data},
            upsert=True)
|
||
|
||
|
||
if __name__ == '__main__':
    # Only the commented-out batch example further down uses `random`.
    import random

    # More multi-item examples: https://gist.github.com/howie6879/3ef4168159e5047d42d86cb7fb706a2f
    seed_urls = ['http://book.zongheng.com/book/672340.html']
    spider_middleware = [ua_middleware, owl_middleware]

    ZHNovelInfoSpider.start_urls = seed_urls
    ZHNovelInfoSpider.start(middleware=spider_middleware)
|
||
|
||
# def all_novels_info():
|
||
# all_urls = []
|
||
#
|
||
# for each in ZHNovelInfoSpider.all_novels_col.find({'spider': 'zongheng'}):
|
||
# if 'zongheng' in each['novel_url']:
|
||
# all_urls.append(each['novel_url'])
|
||
# random.shuffle(all_urls)
|
||
#
|
||
# ZHNovelInfoSpider.start_urls = all_urls
|
||
# ZHNovelInfoSpider.start()
|
||
#
|
||
#
|
||
# all_novels_info()
|