Crawling 奇书网 (qisuu.la) with lxml's CSS selectors and saving the results to MongoDB
Posted: 2021-07-01 10:21:17
import requests
from lxml import etree
from fake_useragent import UserAgent
import pymongo


class QiShuSpider(object):
    def __init__(self):
        self.base_url = "https://www.qisuu.la/soft/sort01/"
        self.headers = {
            "User-Agent": UserAgent().random,
            "Host": "www.qisuu.la",
            "Referer": "https://www.qisuu.la",
        }
    def get_index_code(self):
        # Counter for the number of reconnection attempts
        retry_link_count = 0
        while True:
            try:
                response = requests.get(self.base_url, headers=self.headers)
            except Exception as e:
                print("Failed to connect to qisuu.la:", e)
                retry_link_count += 1
                print("Retrying, attempt {}...".format(retry_link_count))
                if retry_link_count >= 5:
                    print("Five connection attempts reached, giving up")
                    break
            else:
                html_obj = etree.HTML(response.text)
                # Grab the <option> tags that list every page of the category
                option_list = html_obj.cssselect("select>option")
                return option_list
        return []  # all retries failed; return an empty list so callers can still iterate
    def get_every_page_code(self):
        option_list = self.get_index_code()
        for option in option_list:
            value = option.get("value")
            # Build the full URL of each listing page
            base_url = "https://www.qisuu.la" + value
            print("Crawling page {}".format(base_url))
            response = requests.get(base_url, headers=self.headers).text
            html_obj = etree.HTML(response)
            # Grab the <a> tag of every novel on this page
            a_list = html_obj.cssselect(".listBox li>a")
            for a in a_list:
                novel_href = a.get("href")
                # Build the full URL of each novel's detail page
                novel_url = "https://www.qisuu.la" + novel_href
                print("Crawling the novel at {}".format(novel_url))
                self.parse_every_novel(novel_url)
    def parse_every_novel(self, novel_url):
        response = requests.get(novel_url, headers=self.headers)
        response.encoding = "utf-8"
        html_obj = etree.HTML(response.text)
        # Each detail field sits at a fixed position inside .detail_right
        novel_name = html_obj.cssselect(".detail_right>h1")[0].text
        click_num = html_obj.cssselect(".detail_right>ul>li:nth-child(1)")[0].text
        novel_size = html_obj.cssselect(".detail_right>ul>li:nth-child(2)")[0].text
        novel_type = html_obj.cssselect(".detail_right>ul>li:nth-child(3)")[0].text
        update_time = html_obj.cssselect(".detail_right>ul>li:nth-child(4)")[0].text
        novel_status = html_obj.cssselect(".detail_right>ul>li:nth-child(5)")[0].text
        novel_author = html_obj.cssselect(".detail_right>ul>li:nth-child(6)")[0].text
        novel_run_envir = html_obj.cssselect(".detail_right>ul>li:nth-child(7)")[0].text
        novel_lasted_chapter = html_obj.cssselect(".detail_right>ul>li:nth-child(8)>a")[0].text
        dict_novel = {
            "小说名称": novel_name,              # novel title
            "点击次数": click_num,               # click count
            "小说大小": novel_size,              # file size
            "小说类型": novel_type,              # genre
            "更新时间": update_time,             # last updated
            "小说状态": novel_status,            # status
            "小说作者": novel_author,            # author
            "小说运行环境": novel_run_envir,     # runtime environment
            "小说最新章节": novel_lasted_chapter # latest chapter
        }
        collection.insert_one(dict_novel)
    def start_spider(self):
        self.get_every_page_code()


if __name__ == '__main__':
    client = pymongo.MongoClient(host="localhost", port=27017)
    db = client.novel
    collection = db.novel  # module-level name referenced inside parse_every_novel
    spider = QiShuSpider()
    spider.start_spider()
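For reference, lxml's Element.cssselect() (which needs the separate cssselect package installed, e.g. pip install cssselect) compiles a CSS selector into XPath and returns a list of matching elements. A minimal, self-contained sketch of the pattern this spider relies on, using an inline HTML string instead of a live page:

from lxml import etree

snippet = "<select><option value='/soft/sort01/index_2.html'>2</option></select>"
html_obj = etree.HTML(snippet)
# cssselect() returns all elements matching the CSS selector
for option in html_obj.cssselect("select>option"):
    print(option.get("value"))  # -> /soft/sort01/index_2.html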
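Once the crawl finishes, the stored documents can be read back with pymongo. A quick sketch, assuming the same localhost instance and the novel.novel collection used above:

import pymongo

client = pymongo.MongoClient(host="localhost", port=27017)
collection = client.novel.novel
# Count the stored novels and print the first few titles and authors
print(collection.count_documents({}))
for doc in collection.find().limit(5):
    print(doc["小说名称"], doc["小说作者"])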
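One caveat: because insert_one() is called unconditionally, re-running the spider stores duplicate documents. A possible fix, not part of the original code, is a unique index on the title plus a guarded insert; this sketch would replace the insert_one() call in parse_every_novel:

from pymongo.errors import DuplicateKeyError

# One-time setup: at most one document per novel title (hypothetical dedup scheme)
collection.create_index("小说名称", unique=True)

try:
    collection.insert_one(dict_novel)
except DuplicateKeyError:
    pass  # this novel was already stored on a previous run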