时间:2021-07-01 10:21:17 帮助过:14人阅读
# Improved version of the pipeline above: save items asynchronously.
# The synchronous pipeline is slow, so the insert is handed to Twisted's
# database connection pool (adbapi.ConnectionPool) instead.
class JianshuTwistedPipeline(object):
    """Scrapy item pipeline that stores articles in MySQL through adbapi.

    Each item is handed to ``ConnectionPool.runInteraction`` so the insert is
    executed by the pool instead of inline in ``process_item``.
    """

    # Item fields bound to the placeholders of ``_INSERT_SQL``, in order.
    _FIELDS = (
        'title', 'content', 'author', 'avatar',
        'pub_time', 'origin_url', 'article_id',
    )

    _INSERT_SQL = '''
        insert into article2(id,title,content,author,avatar,pub_time,
        origin_url,article_id) values(null,%s,%s,%s,%s,%s,%s,%s)
    '''

    def __init__(self):
        # Create the connection pool.
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor,
        }
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)

    @property
    def sql(self):
        """Return the parameterized INSERT statement for one article."""
        return self._INSERT_SQL

    def process_item(self, item, spider):
        """Schedule the insert of *item* and pass the item on.

        Returns:
            The unchanged item, so that any later pipeline still receives it.
        """
        # runInteraction performs the insert asynchronously.
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        """Insert one item using the cursor supplied by runInteraction.

        Raises:
            KeyError: if *item* lacks one of the fields in ``_FIELDS``.
        """
        cursor.execute(self.sql, tuple(item[field] for field in self._FIELDS))

    def handle_error(self, error, item, spider):
        """Errback attached in process_item: print the failed insert's error."""
        print('='*20)
        print("error:", error)
        print('='*20)


# settings.py: switch the active pipeline to the asynchronous one.
ITEM_PIPELINES = {
    # 'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
    'jianshu_spider.pipelines.JianshuTwistedPipeline': 300,  # asynchronous save
}
# Handling dynamic data, i.e. values that are loaded into the page via ajax.
# The page is rendered with selenium + chromedriver.

# Spider file: also collect the read count, like count, word count,
# collection (subject) names and comment count, and store them on the item.
def parse_detail(self, response):
    """Parse one article detail page and yield an ``ArticleItem``.

    This is a method of the spider class; ``response`` is the rendered
    detail page. Text nodes that are missing from the page produce empty
    strings instead of raising ``AttributeError``.
    """

    def last_token(query):
        # The counters are the last space-separated token of the node text;
        # a missing node yields "".
        return response.xpath(query).get(default="").split(" ")[-1]

    title = response.xpath(
        "//div[@class='note']/div[@class='post']/div[@class='article']"
        "/h1[@class='title']/text()"
    ).get()
    print(title)
    avatar = response.xpath("//a[@class='avatar']/img/@src").get()
    author = response.xpath("//span[@class='name']/a/text()").get()
    pub_time = response.xpath(
        "//span[@class='publish-time']/text()"
    ).get(default="").replace("*", "")

    # Normally the url contains at most one "?"; the article id is the last
    # path segment before it.
    article_id = response.url.split("?")[0].split("/")[-1]

    # Keep the html tags in the content so it can be displayed later.
    content = response.xpath("//div[@class='show-content']").get()

    # Dynamically loaded values.
    word_count = last_token("//span[@class='wordage']/text()")
    read_count = last_token("//span[@class='views-count']/text()")
    comment_count = last_token("//span[@class='comments-count']/text()")
    like_count = last_token("//span[@class='likes-count']/text()")

    # getall() returns a list, which cannot be stored in the mysql column
    # directly, so it is joined into one string.
    subject = ",".join(
        response.xpath("//div[@class='include-collection']/a/div/text()").getall()
    )

    yield ArticleItem(
        title=title,
        avatar=avatar,
        author=author,
        pub_time=pub_time,
        origin_url=response.url,
        article_id=article_id,
        content=content,
        word_count=word_count,
        read_count=read_count,
        comment_count=comment_count,
        like_count=like_count,
        subject=subject,
    )


# Pipeline file.
# The synchronous storage above is slow, so it is made asynchronous here.
class JianshuTwistedPipeline(object):
    """Scrapy item pipeline that stores articles in MySQL through adbapi.

    Each item is handed to ``ConnectionPool.runInteraction`` so the insert is
    executed by the pool instead of inline in ``process_item``.
    """

    # Item fields bound to the placeholders of ``_INSERT_SQL``, in order.
    _FIELDS = (
        'title', 'content', 'author', 'avatar',
        'pub_time', 'origin_url', 'article_id',
        'read_count', 'word_count', 'like_count', 'comment_count', 'subject',
    )

    _INSERT_SQL = '''
        insert into
        article2(id,title,content,author,avatar,pub_time,
        origin_url,article_id,read_count, word_count, like_count, comment_count,subject)
        values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
    '''

    def __init__(self):
        # Create the connection pool.
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': '',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor,
        }
        self.dbpool = adbapi.ConnectionPool('pymysql', **dbparams)

    @property
    def sql(self):
        """Return the parameterized INSERT statement for one article."""
        return self._INSERT_SQL

    def process_item(self, item, spider):
        """Schedule the insert of *item* and pass the item on.

        Returns:
            The unchanged item, so that any later pipeline still receives it.
        """
        # runInteraction performs the insert asynchronously.
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        """Insert one item using the cursor supplied by runInteraction.

        Raises:
            KeyError: if *item* lacks one of the fields in ``_FIELDS``.
        """
        cursor.execute(self.sql, tuple(item[field] for field in self._FIELDS))

    def handle_error(self, error, item, spider):
        """Errback attached in process_item: print the failed insert's error."""
        print('='*20+'error'+'='*20)
        print("error:", error)
        print('='*20+'error'+'='*20)
简书全站爬取 mysql异步保存
标签:self exe 开始 comment ext clu tar tor 加载