About the spider in this framework: it is written the same way whether or not the results are saved to a database.

# -*- coding: utf-8 -*-
import scrapy
from yang_guan.items import YangGuanItem
from copy import deepcopy


class YgSpider(scrapy.Spider):
    name = 'yg'
    allowed_domains = ['huanqiu.com']
    start_urls = ['http://www.huanqiu.com/']

    def parse(self, response):
        # top-level page; the first callback must be named parse, it handles the start_urls responses
        item = YangGuanItem()
        # item = {}
        class_news_urls_li = response.xpath(".//div[@class='navCon']/ul/li/a")
        print(class_news_urls_li)
        for class_news_url in class_news_urls_li:
            item["class_tittle"] = class_news_url.xpath("./text()").extract_first()
            print(item)
            new_url = class_news_url.xpath("./@href").extract_first()
            print(new_url)
            yield scrapy.Request(
                new_url,
                callback=self.second_class,
                # requests are handled concurrently, so pass a deep copy of the item
                meta={"item": deepcopy(item)},
            )

    def second_class(self, response):
        # second-level (category) page
        item = response.meta["item"]
        print(response.url)
        second_urls = response.xpath(".//div/h2/em")
        for second_url in second_urls:
            secoond_news_url = second_url.xpath("./a/@href").extract_first()
            yield scrapy.Request(
                secoond_news_url,
                callback=self.parse_detail_analyze,
                meta={"item": deepcopy(item)}
            )

    def parse_detail_analyze(self, response):
        # third level: scrape the article list details, e.g. http://china.huanqiu.com/leaders/
        item = response.meta["item"]
        li_list = response.xpath("//ul[@class='listPicBox']/li")
        for li in li_list:
            # item = YangGuanItem()
            item["title"] = li.xpath("./h3/a/text()").extract_first()
            item["img_url"] = li.xpath("./a/img/@src").extract_first()
            item["detail"] = li.xpath("./h5/text()").extract_first()
            yield item
        # pagination: follow the "next page" link back into this same callback
        next_url = response.xpath(".//div[@class='pageBox']/div/a[last()]/@href").extract_first()
        if next_url:  # guard against the last page, which has no next link
            yield scrapy.Request(next_url, callback=self.parse_detail_analyze,
                                 meta={"item": response.meta["item"]})
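The usual way to run this is scrapy crawl yg from the project root. As a minimal sketch, it can also be launched from a plain script; this assumes the spider file is yang_guan/spiders/yg.py (a guess at the file name) and that the script is run from the directory containing scrapy.cfg so the project settings are found:

# run_yg.py -- minimal sketch, assumes the standard yang_guan project layout
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from yang_guan.spiders.yg import YgSpider

if __name__ == "__main__":
    # load settings.py (USER_AGENT, ITEM_PIPELINES, ...) and start the crawl
    process = CrawlerProcess(get_project_settings())
    process.crawl(YgSpider)
    process.start()  # blocks until the spider finishes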

The pipeline setup in pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo


class YangGuanPipeline(object):

    def __init__(self):
        # open the MongoDB connection
        client = pymongo.MongoClient('127.0.0.1', 27017)
        # select the 'scrapy_huan_qiu' database
        db = client['scrapy_huan_qiu']
        # the collection used for storage
        self.post = db['zong_huan_qiu']
        print("*" * 100)

    def process_item(self, item, spider):
        postItem = dict(item)
        self.post.insert(postItem)
        return item
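Two small caveats with the version above: the client is opened in __init__ and never closed, and insert() is deprecated in recent pymongo (removed in 4.x) in favour of insert_one(). A sketch of a more idiomatic variant, assuming the same local MongoDB and hypothetical MONGO_URI / MONGO_DB setting names that would live in settings.py:

import pymongo


class YangGuanMongoPipeline(object):
    # MONGO_URI and MONGO_DB are assumed setting names, not part of the original project
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # read connection info from settings.py instead of hard-coding it
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://127.0.0.1:27017'),
            mongo_db=crawler.settings.get('MONGO_DB', 'scrapy_huan_qiu'),
        )

    def open_spider(self, spider):
        # called once when the spider starts
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.post = self.client[self.mongo_db]['zong_huan_qiu']

    def close_spider(self, spider):
        # called once when the spider finishes
        self.client.close()

    def process_item(self, item, spider):
        self.post.insert_one(dict(item))
        return item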

The settings.py configuration

# -*- coding: utf-8 -*-

# Scrapy settings for yang_guan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

# as I recall, this is a pool of proxy IPs
PROXIES = [
    {'ip_port': '111.11.228.75:80', 'user_pass': ''},
    {'ip_port': '120.198.243.22:80', 'user_pass': ''},
    {'ip_port': '111.8.60.9:8123', 'user_pass': ''},
    {'ip_port': '101.71.27.120:80', 'user_pass': ''},
    {'ip_port': '122.96.59.104:80', 'user_pass': ''},
    {'ip_port': '122.224.249.122:8088', 'user_pass': ''},
]

BOT_NAME = 'yang_guan'

SPIDER_MODULES = ['yang_guan.spiders']
NEWSPIDER_MODULE = 'yang_guan.spiders'

# LOG_LEVEL = "WARNING"

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# a real browser User-Agent, to get past simple anti-crawler checks
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'

# Obey robots.txt rules
# do not obey the robots.txt protocol
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'yang_guan.middlewares.YangGuanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'yang_guan.middlewares.YangGuanDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# enable the pipeline; without an items definition nothing could be saved into the database
ITEM_PIPELINES = {
    'yang_guan.pipelines.YangGuanPipeline': 300,
}

# debug level and log file generation
# LOG_FILE = "dg.log"
# LOG_LEVEL = "DEBUG"

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
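Note that the PROXIES list above is only data; nothing consumes it until a downloader middleware is enabled. A minimal sketch of such a middleware (the class name RandomProxyMiddleware and placing it in yang_guan/middlewares.py are assumptions, and the free proxies listed above may well be dead by now):

# yang_guan/middlewares.py -- sketch of a middleware that consumes PROXIES
import random


class RandomProxyMiddleware(object):
    def __init__(self, proxies):
        self.proxies = proxies

    @classmethod
    def from_crawler(cls, crawler):
        # read the PROXIES list defined in settings.py
        return cls(crawler.settings.get('PROXIES', []))

    def process_request(self, request, spider):
        if not self.proxies:
            return
        proxy = random.choice(self.proxies)
        # Scrapy's built-in HttpProxyMiddleware honours request.meta['proxy']
        request.meta['proxy'] = 'http://' + proxy['ip_port']

It would then be switched on in settings.py, e.g. DOWNLOADER_MIDDLEWARES = {'yang_guan.middlewares.RandomProxyMiddleware': 543}.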

The items definition. This must exist: the spider yields the item, and it is passed between callbacks like a dict.

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class YangGuanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    class_tittle = scrapy.Field()
    img_url = scrapy.Field()
    detail = scrapy.Field()
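As noted above, a scrapy.Item behaves like a dict, which is why the spider can assign item["title"] = ... and the pipeline can call dict(item). A quick illustration of that behaviour (assuming the yang_guan package is importable):

from yang_guan.items import YangGuanItem

item = YangGuanItem()
item["class_tittle"] = "china"   # only declared Field names are accepted
item["title"] = "some headline"
print(dict(item))                # {'class_tittle': 'china', 'title': 'some headline'}
# item["unknown"] = "x"          # would raise KeyError: 'unknown' is not a declared Field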
