19 03 13 Crawling huanqiu.com site-wide with the Scrapy framework (storing the results in MongoDB)
The spider itself is written the same way whether or not a database is used; only the pipeline changes.
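For orientation, the files below assume the standard layout generated by scrapy startproject yang_guan (the spider filename yg.py is inferred from name = 'yg'; the original post does not show the tree):

yang_guan/
    scrapy.cfg
    yang_guan/
        items.py
        pipelines.py
        settings.py
        spiders/
            yg.py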
# -*- coding: utf-8 -*-
import scrapy
from yang_guan.items import YangGuanItem
from copy import deepcopy


class YgSpider(scrapy.Spider):
    name = 'yg'
    allowed_domains = ['huanqiu.com']
    start_urls = ['http://www.huanqiu.com/']

    def parse(self, response):  # top-level page; the entry callback must be named parse, it receives the start_urls responses
        item = YangGuanItem()
        # item = {}  # a plain dict would also work here
        class_news_urls_li = response.xpath(".//div[@class='navCon']/ul/li/a")
        print(class_news_urls_li)
        for class_news_url in class_news_urls_li:
            item["class_tittle"] = class_news_url.xpath("./text()").extract_first()
            print(item)
            new_url = class_news_url.xpath("./@href").extract_first()
            print(new_url)
            yield scrapy.Request(
                new_url,
                callback=self.second_class,
                meta={"item": deepcopy(item)},  # requests are handled concurrently, so pass a deep copy of the item
            )

    def second_class(self, response):  # second-level (category) page
        item = response.meta["item"]
        print(response.url)
        second_urls = response.xpath(".//div/h2/em")
        for second_url in second_urls:
            second_news_url = second_url.xpath("./a/@href").extract_first()
            yield scrapy.Request(
                second_news_url,
                callback=self.parse_detail_analyze,
                meta={"item": deepcopy(item)},
            )

    def parse_detail_analyze(self, response):  # third level: scrape the article details, e.g. http://china.huanqiu.com/leaders/
        item = response.meta["item"]
        li_list = response.xpath("//ul[@class='listPicBox']/li")
        for li in li_list:
            item["title"] = li.xpath("./h3/a/text()").extract_first()
            item["img_url"] = li.xpath("./a/img/@src").extract_first()
            item["detail"] = li.xpath("./h5/text()").extract_first()
            yield item
        # standard pagination pattern: request the next page with the same callback
        next_url = response.xpath(".//div[@class='pageBox']/div/a[last()]/@href").extract_first()
        if next_url:  # guard against the last page, where no next link exists
            yield scrapy.Request(next_url, callback=self.parse_detail_analyze, meta={"item": response.meta["item"]})
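The deepcopy in meta is easy to gloss over. Scrapy schedules all the category requests before their callbacks run, so without it every request would share one mutable item and later category titles would overwrite earlier ones. A minimal standalone sketch of the effect, using plain dicts (not part of the project):

from copy import deepcopy

item = {}
shared, copied = [], []
for tittle in ["china", "world", "mil"]:  # stand-ins for the nav categories
    item["class_tittle"] = tittle
    shared.append(item)            # every entry references the same dict
    copied.append(deepcopy(item))  # each entry gets its own snapshot

print([d["class_tittle"] for d in shared])  # ['mil', 'mil', 'mil']
print([d["class_tittle"] for d in copied])  # ['china', 'world', 'mil']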
Setting up the pipeline (pipelines.py)
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo


class YangGuanPipeline(object):

    def __init__(self):
        # open the MongoDB connection
        client = pymongo.MongoClient('127.0.0.1', 27017)
        # select the 'scrapy_huan_qiu' database
        db = client['scrapy_huan_qiu']
        # select the collection the items will be written to
        self.post = db['zong_huan_qiu']
        print("*" * 100)

    def process_item(self, item, spider):
        post_item = dict(item)
        self.post.insert_one(post_item)  # insert() is deprecated in pymongo; insert_one() is the current API
        return item
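After a crawl, a quick standalone pymongo session (a hypothetical helper script, assuming mongod is running on the default local port) confirms that the items reached the collection opened above:

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
collection = client['scrapy_huan_qiu']['zong_huan_qiu']

print(collection.count_documents({}))   # total number of stored items
for doc in collection.find().limit(3):  # peek at the first few documents
    print(doc.get('class_tittle'), '|', doc.get('title'))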
Settings (settings.py)
# -*- coding: utf-8 -*-

# Scrapy settings for yang_guan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# a pool of IP proxies, if I remember correctly (see the middleware sketch after this file)
PROXIES = [
    {'ip_port': '111.11.228.75:80', 'user_pass': ''},
    {'ip_port': '120.198.243.22:80', 'user_pass': ''},
    {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
    {'ip_port': '101.71.27.120:80', 'user_pass': ''},
    {'ip_port': '122.96.59.104:80', 'user_pass': ''},
    {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
]

BOT_NAME = 'yang_guan'

SPIDER_MODULES = ['yang_guan.spiders']
NEWSPIDER_MODULE = 'yang_guan.spiders'

# LOG_LEVEL = "WARNING"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# impersonate a desktop browser to get past naive anti-crawler checks
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'

# Obey robots.txt rules
# deliberately ignore robots.txt
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'yang_guan.middlewares.YangGuanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'yang_guan.middlewares.YangGuanDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# enable the pipeline; without an items definition the data could not be saved to the database
ITEM_PIPELINES = {
    'yang_guan.pipelines.YangGuanPipeline': 300,
}

# log level and writing a log file
# LOG_FILE = "dg.log"
# LOG_LEVEL = "DEBUG"

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
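Note that nothing in the posted code actually consumes PROXIES. If the list were to be used, a small downloader middleware along these lines is the usual pattern; RandomProxyMiddleware is a sketch, not part of the original project:

# middlewares.py (sketch): attach a random proxy to every outgoing request
import base64
import random

from yang_guan.settings import PROXIES


class RandomProxyMiddleware(object):
    def process_request(self, request, spider):
        proxy = random.choice(PROXIES)
        request.meta['proxy'] = 'http://' + proxy['ip_port']
        if proxy['user_pass']:
            # authenticated proxies expect a Basic Proxy-Authorization header
            encoded = base64.b64encode(proxy['user_pass'].encode()).decode()
            request.headers['Proxy-Authorization'] = 'Basic ' + encoded

It would then be registered by uncommenting DOWNLOADER_MIDDLEWARES above and pointing it at yang_guan.middlewares.RandomProxyMiddleware.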
The items definition (items.py). This file is required: the spider's yield passes the item along like a dict.
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class YangGuanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    class_tittle = scrapy.Field()
    img_url = scrapy.Field()
    detail = scrapy.Field()
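One behavior worth knowing: a scrapy.Item acts like a dict but rejects keys that were never declared as fields, which catches misspellings early. A two-line check (not from the original post):

item = YangGuanItem()
item['title'] = 'some headline'  # fine: 'title' is a declared field
item['author'] = 'someone'       # raises KeyError: YangGuanItem does not support field 'author'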