128 lines
4.8 KiB
Python
128 lines
4.8 KiB
Python
#!/usr/bin/env python
|
||
"""
|
||
 Created by howie.hu at 14/03/2018.
 Zongheng novel information extraction: http://book.zongheng.com/store/c0/c0/b9/u0/p1/v9/s9/t0/ALL.html
|
||
"""
|
||
import asyncio
|
||
import os
|
||
import time
|
||
|
||
from ruia import Spider, Item, TextField, AttrField, Request
|
||
from ruia_ua import middleware as ua_middleware
|
||
|
||
# os.environ['MODE'] = 'PRO'
|
||
from soulbook.database.mongodb import MotorBase
|
||
from soulbook.spiders.middlewares import owl_middleware
|
||
|
||
# Install the uvloop event-loop policy when uvloop is available; when it is
# not installed the default asyncio policy is kept.
try:
    import uvloop

    asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
except ImportError:
    pass

# Module-level event loop used by the rest of this file.
# asyncio.get_event_loop() raises RuntimeError on newer Python versions when
# no current loop exists, so fall back to creating one explicitly instead of
# failing at import time.
try:
    loop = asyncio.get_event_loop()
except RuntimeError:
    loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
|
||
|
||
|
||
class ZHNovelsItem(Item):
    """Field definitions for one novel entry on a Zongheng store listing page.

    Each attribute is a ruia field bound to a CSS selector; ``target_item``
    is the selector for a single book box (presumably the repeating element
    that ``get_items`` iterates over -- confirm against the ruia docs).
    """

    target_item = TextField(css_select='div.store_collist div.bookbox')

    # Book title link: its href and its text.
    novel_url = AttrField(css_select='div.bookinfo div.bookname a', attr='href')
    novel_name = TextField(css_select='div.bookinfo div.bookname a')

    # First link inside div.bookilnk: author text and the author's page href.
    novel_author = TextField(css_select='div.bookilnk a:nth-child(1)')
    novel_author_home_url = AttrField(css_select='div.bookilnk a:nth-child(1)', attr='href')

    # Second link inside div.bookilnk.
    novel_type = TextField(css_select='div.bookilnk a:nth-child(2)')

    novel_cover = AttrField(css_select='div.bookimg img', attr='src')
    novel_abstract = TextField(css_select='div.bookintro')
    novel_latest_chapter = TextField(css_select='div.bookupdate a')

    # NOTE: novel_url and novel_author_home_url are kept exactly as extracted.
    # An earlier, disabled version of this class prefixed both with 'http:'.

    async def clean_novel_author(self, novel_author):
        """Normalise the extracted author value.

        :param novel_author: the raw extracted value.
        :return: ``''`` when the value is falsy, the ``.text`` of the first
            element when the value is a list, otherwise the value unchanged.
        """
        if not novel_author:
            return ''
        if isinstance(novel_author, list):
            return novel_author[0].text
        return novel_author
|
||
|
||
|
||
class ZHNovelsSpider(Spider):
    """Spider that extracts novels from Zongheng store pages and upserts them.

    Every fetched page is parsed into ``ZHNovelsItem`` objects; each item
    that has a ``novel_url`` is written to the ``all_novels`` collection of
    ``motor_db``.
    """

    start_urls = ['http://book.zongheng.com/store/c0/c0/b9/u0/p1/v9/s9/t0/ALL.html']

    # Request settings handed to ruia (see the ruia docs for exact semantics).
    request_config = {
        'RETRIES': 8,
        'DELAY': 0,
        'TIMEOUT': 3,
    }
    concurrency = 60

    # Database handle created once at class-definition time on the module loop.
    motor_db = MotorBase(loop=loop).get_db()

    async def parse(self, res):
        """Extract all novels from one response and save them concurrently.

        :param res: response object; its ``html`` attribute is parsed.
        Prints a summary with the number of scheduled and successful saves.
        """
        items_data = await ZHNovelsItem.get_items(html=res.html)

        save_tasks = []
        for item in items_data:
            # Entries without a URL are skipped entirely.
            if not item.novel_url:
                continue
            record = {
                'novel_url': item.novel_url,
                'novel_name': item.novel_name,
                'novel_author': item.novel_author,
                'novel_author_home_url': item.novel_author_home_url,
                'novel_type': item.novel_type,
                'novel_cover': item.novel_cover,
                'novel_abstract': item.novel_abstract,
                'novel_latest_chapter': item.novel_latest_chapter,
                'spider': 'zongheng',
                'updated_at': time.strftime("%Y-%m-%d %X", time.localtime()),
            }
            save_tasks.append(asyncio.ensure_future(self.save(record)))

        saved_count = 0
        if save_tasks:
            # asyncio.wait rejects an empty collection, hence the guard.
            finished, _unfinished = await asyncio.wait(save_tasks)
            saved_count = sum(1 for task in finished if task.result())
        print(f"共{len(save_tasks)}本小说,抓取成功{saved_count}本")

    async def save(self, res_dic):
        """Upsert one novel record into the ``all_novels`` collection.

        The document is matched on ``novel_url`` and ``novel_name`` and all
        fields of ``res_dic`` are applied with ``$set``.

        :param res_dic: dict describing the novel.
        :return: ``True`` on success; ``False`` if any exception was raised
            (the exception is logged, not propagated).
        """
        try:
            await self.motor_db.all_novels.update_one(
                {'novel_url': res_dic['novel_url'], 'novel_name': res_dic['novel_name']},
                {'$set': res_dic},
                upsert=True,
            )
            print(res_dic['novel_name'] + ' - 抓取成功')
            return True
        except Exception as e:
            self.logger.exception(e)
            return False
|
||
|
||
|
||
if __name__ == '__main__':
    # Other multi-item example: https://gist.github.com/howie6879/3ef4168159e5047d42d86cb7fb706a2f
    # 51793  (unexplained figure kept from the original -- TODO confirm meaning)
    url_template = 'http://book.zongheng.com/store/c0/c0/b9/u0/p{i}/v9/s9/t0/ALL.html'

    # Crawl in 10 batches of up to 100 listing pages, reusing the module
    # event loop across batches (close_event_loop=False).
    # NOTE(review): the first batch requests p0 while the spider's default
    # start URL uses p1, and page 999 is never requested -- confirm intended.
    for page in range(10):
        print(f"正在爬取第{page}页")
        first_page = page * 100
        last_page = min(first_page + 100, 999)
        ZHNovelsSpider.start_urls = [
            url_template.format(i=page_no) for page_no in range(first_page, last_page)
        ]
        ZHNovelsSpider.start(
            loop=loop,
            middleware=[ua_middleware, owl_middleware],
            close_event_loop=False,
        )
|