当前位置:Gxlcms > 数据库问题 > bokeyuan_python文章爬取入mongodb读取--LOWBIPROGRAMMER

bokeyuan_python文章爬取入mongodb读取--LOWBIPROGRAMMER

时间:2021-07-01 10:21:17 帮助过:3人阅读

# -*- coding: utf-8 -*-
import os

import requests
from lxml import etree
from pymongo import *
class Boke(object):
    """Crawl the cnblogs Python category listing and store it in MongoDB.

    The listing page at ``self.url`` is parsed for post links. Each link is
    stored in the ``zhu`` collection, and the linked page's title and body
    text are stored in the ``info`` collection, both in the ``boke``
    database on a MongoDB server at 127.0.0.1:27017.
    """

    # Seconds to wait for the HTTP server before giving up. Without a
    # timeout ``requests.get`` can block indefinitely on a stalled peer.
    timeout = 10

    def __init__(self):
        self.url = "https://www.cnblogs.com/cate/python/"
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/56.0.2924.90 Safari/537.36 '
                          '2345Explorer/9.3.2.17331'
        }
        # Created lazily by dbs() and then reused for every insert.
        self._client = None

    def get_data(self, url):
        """Fetch *url* with the configured headers and return the raw body.

        Args:
            url: Address of the page to download.

        Returns:
            The response body as bytes (``response.content``).

        Raises:
            requests.exceptions.RequestException: On connection problems or
                when the server does not answer within ``self.timeout``.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.timeout)
        return response.content

    def xml_data(self, data):
        """Parse the listing page and process every post link found in it.

        For each ``div.post_item`` the post URL is extracted, the post page
        is crawled through :meth:`info_data`, and then ``{'url': ...}`` is
        stored through :meth:`write_dbs`.

        Args:
            data: HTML of the listing page, as returned by :meth:`get_data`.
        """
        html = etree.HTML(data)
        if html is None:
            # NOTE(review): lxml appears to return None for an empty
            # document; nothing to crawl in that case.
            return
        for item in html.xpath("//div[@class='post_item']"):
            hrefs = item.xpath("./div[@class='post_item_body']/h3/a/@href")
            if not hrefs:
                # Item without the expected link markup: skip it instead of
                # aborting the whole crawl with an IndexError.
                continue
            info_url = hrefs[0]
            self.info_data(info_url)
            self.write_dbs({'url': info_url})

    def info_data(self, data):
        """Download one post page and store its title and body text.

        Args:
            data: URL of the post page (the name is kept for backward
                compatibility with existing callers).
        """
        # The directory is created but nothing in this class writes to it;
        # the side effect is preserved for backward compatibility.
        path = "f:/woc/"
        os.makedirs(path, exist_ok=True)

        html = etree.HTML(self.get_data(data))
        if html is None:
            return
        posts = html.xpath("//div[@id='topics']/div[@class='post']")
        for post in posts:
            titles = post.xpath("./h1[@class='postTitle']/a/text()")
            if not titles:
                # No title node under this post: skip rather than crash.
                continue
            record = {
                'title': titles[0],
                # Every text node under the post body, as a list of strings.
                'info': post.xpath("./div[@class='postBody']//text()"),
            }
            self.write1_dbs(record)

    def dbs(self):
        """Return the ``(zhu, info)`` collections of the ``boke`` database.

        The MongoClient is created on first use and cached on the instance,
        so repeated inserts share one client instead of opening a new one
        per document.
        """
        client = getattr(self, "_client", None)
        if client is None:
            client = MongoClient('127.0.0.1', 27017)
            self._client = client
        database = client['boke']
        return database['zhu'], database['info']

    def _insert_and_dump(self, collection, data):
        """Insert *data* into *collection*, then print every stored document."""
        collection.insert_one(data)
        for document in collection.find():
            print(document)

    def write_dbs(self, data):
        """Store a ``{'url': ...}`` record in ``zhu`` and print the collection."""
        links, _ = self.dbs()
        self._insert_and_dump(links, data)

    def write1_dbs(self, data):
        """Store a title/body record in ``info`` and print the collection."""
        _, articles = self.dbs()
        self._insert_and_dump(articles, data)

    def run(self):
        """Fetch the listing page at ``self.url`` and crawl every post on it."""
        self.xml_data(self.get_data(self.url))


if __name__ == '__main__':
    boke = Boke()
    boke.run()

bokeyuan_python文章爬取入mongodb读取--LOWBIPROGRAMMER

标签:header   gen   window   safari   ret   exp   from   god   __init__   

人气教程排行