
Using Scrapy with MongoDB --- Crawling



  settings.py

BOT_NAME = 'novelspider'

SPIDER_MODULES = ['novelspider.spiders']
NEWSPIDER_MODULE = 'novelspider.spiders'

ITEM_PIPELINES = ['novelspider.pipelines.NovelspiderPipeline']  # register the pipeline defined in pipelines.py

USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0'
COOKIES_ENABLED = True

MONGODB_HOST = '127.0.0.1'
MONGODB_PORT = 27017
MONGODB_DBNAME = 'zzl'    # database name
MONGODB_DOCNAME = 'Book'  # collection name
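
  Note: the list form of ITEM_PIPELINES above matches the early Scrapy releases this tutorial was written against; since Scrapy 1.0 the setting must be a dict mapping each pipeline path to an order value. A minimal sketch of the modern equivalent:

ITEM_PIPELINES = {
    'novelspider.pipelines.NovelspiderPipeline': 300,  # order value 0-1000, lower runs first
}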

  pipelines.py

from scrapy.conf import settings  # removed in newer Scrapy; see the sketch below
import pymongo

class NovelspiderPipeline(object):
    def __init__(self):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbName = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        tdb = client[dbName]
        self.post = tdb[settings['MONGODB_DOCNAME']]

    def process_item(self, item, spider):
        bookInfo = dict(item)
        self.post.insert_one(bookInfo)  # insert() was removed in pymongo 4
        return item
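
  The scrapy.conf module was removed in later Scrapy versions; the supported way to read settings is through the crawler object. A minimal sketch of the same pipeline using the from_crawler hook (only the settings access changes; process_item stays as above):

import pymongo

class NovelspiderPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy calls this hook with the running crawler; its .settings
        # attribute replaces the old scrapy.conf module
        return cls(crawler.settings)

    def __init__(self, settings):
        client = pymongo.MongoClient(settings['MONGODB_HOST'], settings['MONGODB_PORT'])
        self.post = client[settings['MONGODB_DBNAME']][settings['MONGODB_DOCNAME']]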

  items.py

from scrapy import Item, Field


class NovelspiderItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    bookName = Field()     # book name, taken from the table's <h2> heading
    bookTitle = Field()    # first token of the chapter link text
    chapterNum = Field()   # second token, e.g. '第一章'
    chapterName = Field()  # remaining chapter title
    chapterURL = Field()   # href of the chapter link
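
  Item instances behave like dicts restricted to the declared fields, which is why the pipeline can call dict(item) directly; assigning an undeclared key raises KeyError. A quick illustration:

item = NovelspiderItem()
item['bookName'] = '盗墓笔记'  # illustrative value
print(dict(item))             # {'bookName': '盗墓笔记'}
item['author'] = 'x'          # raises KeyError: undeclared field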

  Create novspider.py under the spiders directory:

from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from novelspider.items import NovelspiderItem

class novSpider(CrawlSpider):
    name = "novspider"
    redis_key = 'novspider:start_urls'  # only used when running with scrapy-redis
    start_urls = ['http://www.daomubiji.com/']

    def parse(self, response):
        selector = Selector(response)
        table = selector.xpath('//table')
        for each in table:
            bookName = each.xpath('tr/td[@colspan="3"]/center/h2/text()').extract()[0]
            content = each.xpath('tr/td/a/text()').extract()
            url = each.xpath('tr/td/a/@href').extract()
            for i in range(len(url)):
                item = NovelspiderItem()
                item['bookName'] = bookName
                item['chapterURL'] = url[i]
                try:
                    item['bookTitle'] = content[i].split(' ')[0]
                    item['chapterNum'] = content[i].split(' ')[1]
                except Exception:
                    continue

                try:
                    item['chapterName'] = content[i].split(' ')[2]
                except Exception:
                    # two-token link text: fall back to the last three
                    # characters of the second token
                    item['chapterName'] = content[i].split(' ')[1][-3:]
                yield item
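
  The two try/except blocks handle chapter link texts of different shapes. Assuming (hypothetically; the live site's markup may differ) an anchor text of the form '<volume> <chapter number> <chapter title>' separated by spaces, the splits work like this:

text = '七星鲁王 第一章 血尸'  # hypothetical link text
parts = text.split(' ')
# parts[0] -> bookTitle   ('七星鲁王')
# parts[1] -> chapterNum  ('第一章')
# parts[2] -> chapterName ('血尸')
# A two-token text raises IndexError on parts[2], so the spider falls back
# to the last three characters of parts[1].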

  Start the crawl with the command: scrapy crawl novspider
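
  After the crawl finishes, you can verify that the documents reached MongoDB with a quick pymongo check (assuming the host, database, and collection names from settings.py above):

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
coll = client['zzl']['Book']
print(coll.count_documents({}))  # number of chapter records stored
print(coll.find_one())           # inspect one sample document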

  Crawl results (screenshot not preserved).

  
