关于 spider:在这个框架里面的写法,和不用数据库时相同

# -*- coding: utf-8 -*-
import scrapy
from yang_guan.items import YangGuanItem
from copy import deepcopy
from scrapy.spiders import CrawlSpider


class YgSpider(scrapy.Spider):
    """Three-level crawl of huanqiu.com.

    Flow: ``parse`` (navigation categories) -> ``second_class`` (section
    links) -> ``parse_detail_analyze`` (article listings, with pagination).
    The partially filled item travels between levels in ``Request.meta``.
    """

    name = 'yg'
    allowed_domains = ['huanqiu.com']
    start_urls = ['http://www.huanqiu.com/']

    def parse(self, response):
        """Handle the home page and follow every navigation category.

        The first callback must be named ``parse``: it is the default
        callback Scrapy uses for the responses of ``start_urls``.

        Yields:
            scrapy.Request: one request per category link, carrying an item
            whose ``class_tittle`` is set to the link text.
        """
        item = YangGuanItem()
        class_news_urls_li = response.xpath(".//div[@class='navCon']/ul/li/a")
        for class_news_url in class_news_urls_li:
            item["class_tittle"] = class_news_url.xpath("./text()").extract_first()
            new_url = class_news_url.xpath("./@href").extract_first()
            if not new_url:
                # An anchor without href cannot be followed.
                continue
            yield scrapy.Request(
                # urljoin leaves absolute links untouched and resolves relative ones.
                response.urljoin(new_url),
                callback=self.second_class,
                # Requests are processed concurrently, so each one gets its
                # own deep copy instead of sharing the item mutated by the loop.
                meta={"item": deepcopy(item)},
            )

    def second_class(self, response):
        """Handle a category (second-level) page and follow its section links.

        NOTE(review): the URL and callback arguments were missing from the
        request in the original source; ``parse_detail_analyze`` is assumed
        to be the intended callback because it is the only third-level handler.

        Yields:
            scrapy.Request: one request per section link found under
            ``div/h2/em``, carrying a copy of the incoming item.
        """
        item = response.meta["item"]
        print(response.url)
        second_urls = response.xpath(".//div/h2/em")
        for second_url in second_urls:
            secoond_news_url = second_url.xpath("./a/@href").extract_first()
            if not secoond_news_url:
                continue
            yield scrapy.Request(
                response.urljoin(secoond_news_url),
                callback=self.parse_detail_analyze,
                meta={"item": deepcopy(item)},
            )

    def parse_detail_analyze(self, response):
        """Scrape a third-level listing page (e.g. http://china.huanqiu.com/leaders/).

        Yields:
            YangGuanItem: one item per ``<li>`` of the ``listPicBox`` list,
            with ``title``, ``img_url`` and ``detail`` filled in.
            scrapy.Request: a request for the next page, when a link exists.
        """
        base_item = response.meta["item"]
        li_list = response.xpath("//ul[@class='listPicBox']/li")
        for li in li_list:
            # A fresh copy per row: yielding one shared object would let a
            # later iteration overwrite an item that is still in the pipeline.
            item = deepcopy(base_item)
            item["title"] = li.xpath("./h3/a/text()").extract_first()
            item["img_url"] = li.xpath("./a/img/@src").extract_first()
            item["detail"] = li.xpath("./h5/text()").extract_first()
            yield item

        # Pagination: the last anchor of the page box is taken as the next-page link.
        next_url = response.xpath(".//div[@class='pageBox']/div/a[last()]/@href").extract_first()
        if next_url:
            # Request(None) raises, so only follow a link that actually exists.
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.parse_detail_analyze,
                meta={"item": deepcopy(base_item)},
            )

关于  pipelines  的 管道设定

# -*- coding: utf-8 -*-

# Define your item pipelines here
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo


class YangGuanPipeline(object):
    """Item pipeline that stores every scraped item in a MongoDB collection."""

    def __init__(self, host='localhost', port=27017):
        """Open the MongoDB connection and select the target collection.

        Args:
            host: MongoDB host name or address.
                NOTE(review): the original source passed an empty string
                here (apparently redacted); 'localhost' is a placeholder
                default - confirm the real server address.
            port: MongoDB port.
        """
        # Keep the client so it can be closed when the spider finishes.
        self.client = pymongo.MongoClient(host, port)
        # Database: 'scrapy_huan_qiu'
        db = self.client['scrapy_huan_qiu']
        # Collection that receives the scraped items
        self.post = db['zong_huan_qiu']
        print("*" * 100)

    def process_item(self, item, spider):
        """Insert the item into MongoDB and pass it on unchanged.

        Args:
            item: the scraped item; converted to a plain dict for storage.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so later pipelines still receive it.
        """
        postItem = dict(item)
        # The original built this dict but never wrote it to the collection.
        self.post.insert_one(postItem)
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider is closed."""
        self.client.close()

关于 settings 的设置

# -*- coding: utf-8 -*-

# Scrapy settings for yang_guan project
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# Proxy pool (the author's note reads: "I seem to remember this is the IP proxy").
# NOTE(review): the assignment target was missing from the source, which left
# the entries below as a syntax error; PROXIES is an assumed name - confirm it
# against whatever middleware reads this list. Credentials are blank in the source.
PROXIES = [
    {'ip_port': '', 'user_pass': ''},
    {'ip_port': '', 'user_pass': ''},
    {'ip_port': '', 'user_pass': ''},
    {'ip_port': '', 'user_pass': ''},
    {'ip_port': '', 'user_pass': ''},
    {'ip_port': '', 'user_pass': ''},
]

BOT_NAME = 'yang_guan'

SPIDER_MODULES = ['yang_guan.spiders']
NEWSPIDER_MODULE = 'yang_guan.spiders'

# LOG_LEVEL = "WARNING"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# A desktop-browser user agent is sent to avoid anti-crawler blocking.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'

# Obey robots.txt rules
# robots.txt is deliberately not honoured by this project.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32 # Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_IP = 16 # Disable cookies (enabled by default)
# COOKIES_ENABLED = False # Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False # Override the default request headers:
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#} # Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# 'yang_guan.middlewares.YangGuanSpiderMiddleware': 543,
#} # Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# 'yang_guan.middlewares.YangGuanDownloaderMiddleware': 543,
#} # Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# 'scrapy.extensions.telnet.TelnetConsole': None,
#} # Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# Enable the pipeline. (Author's note: without an items definition nothing
# can be saved to the database.)
ITEM_PIPELINES = {
    'yang_guan.pipelines.YangGuanPipeline': 300,
}

# Log level and log-file output
# LOG_FILE = "dg.log"
# LOG_LEVEL = "DEBUG" # Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# The initial download delay
# The maximum download delay to be set in case of high latencies
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False # Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

关于 items 的设置:这个一定要有,spider 里面用 yield 把它(类似字典)传递给管道

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class YangGuanItem(scrapy.Item):
    """Container for one scraped news entry.

    The field names are the keys the spider assigns; ``class_tittle`` keeps
    its original spelling because the spider writes to that exact key.
    """

    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()         # headline text of the entry
    class_tittle = scrapy.Field()  # text of the navigation category link
    img_url = scrapy.Field()       # src attribute of the entry's image
    detail = scrapy.Field()        # text of the entry's <h5> element
