1. Create the Scrapy project

In a command prompt (DOS window), run:

scrapy startproject quote
cd quote
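
startproject generates the project skeleton. The layout looks roughly like this (newer Scrapy versions also add a middlewares.py):

quote/
    scrapy.cfg            # deployment configuration
    quote/
        __init__.py
        items.py          # item definitions (step 2)
        pipelines.py      # item pipelines (step 5)
        settings.py       # project settings (step 6)
        spiders/          # spider modules (steps 3 and 4)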

2. Edit items.py (this is essentially a template; the fields to be scraped are defined here)

import scrapy

class QuoteItem(scrapy.Item):
    # define the fields for your item here like:
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
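
An Item behaves like a dict with a fixed set of keys; a quick sketch of that (the values here are made up):

from quote.items import QuoteItem

item = QuoteItem(text='sample text', author='Somebody')  # made-up values
item['tags'] = 'life/love'
print(dict(item))  # an Item converts cleanly to a plain dict; the pipelines below rely on this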


3. Create the spider file

In a command prompt, run:

scrapy genspider myspider quotes.toscrape.com
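
genspider writes quote/spiders/myspider.py with roughly this skeleton (the exact template varies by Scrapy version); step 4 fills in parse():

# -*- coding: utf-8 -*-
import scrapy

class MyspiderSpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        pass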

4. Edit myspider.py (receive responses and process the data)

# -*- coding: utf-8 -*-
import scrapy
from quote.items import QuoteItem

class MyspiderSpider(scrapy.Spider):
    name = 'myspider'
    allowed_domains = ['quotes.toscrape.com']
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        for each in response.xpath('//div[@class="quote"]'):
            item = QuoteItem()
            item['text'] = each.xpath('./span/text()').extract_first()
            item['author'] = each.xpath('.//small/text()').extract_first()
            # a list cannot be written to a MySQL column directly,
            # so join the tags into a single string
            tags = each.xpath('.//a[@class="tag"]/text()').extract()
            item['tags'] = '/'.join(tags)
            yield item
        # follow the "next page" link; extract_first() returns None on
        # the last page instead of raising IndexError
        next_page = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page:
            url = response.urljoin(next_page)
            yield scrapy.Request(url=url, callback=self.parse)
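
Before running the full crawl, the XPath expressions above can be tested interactively in scrapy shell, e.g.:

scrapy shell 'http://quotes.toscrape.com/'
>>> response.xpath('//div[@class="quote"]/span/text()').extract_first()
# -> the text of the first quote on the page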


5. Edit pipelines.py (store the data)

Storing to MySQL

import pymysql.cursors

class QuotePipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(
            host='localhost',
            user='root',
            password='',
            database='quotes',
            charset='utf8',
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        item = dict(item)
        sql = 'insert into quote(text,author,tags) values(%s,%s,%s)'
        self.cursor.execute(sql, (item['text'], item['author'], item['tags']))
        self.connect.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
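
The pipeline assumes a quotes database with a quote table already exists. A minimal one-time setup sketch (the column types and sizes are my assumptions, adjust as needed):

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='', charset='utf8')
cur = conn.cursor()
cur.execute('CREATE DATABASE IF NOT EXISTS quotes DEFAULT CHARACTER SET utf8')
cur.execute('USE quotes')
cur.execute(
    'CREATE TABLE IF NOT EXISTS quote ('
    '  id INT AUTO_INCREMENT PRIMARY KEY,'
    '  text VARCHAR(1000),'
    '  author VARCHAR(100),'
    '  tags VARCHAR(255))'
)
conn.commit()
conn.close()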

Improved version:

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql.cursors

class QuotePipeline(object):
    def __init__(self):
        self.connect = pymysql.connect(
            host='localhost',
            user='root',
            password='',
            database='quotes',
            charset='utf8',
        )
        self.cursor = self.connect.cursor()

    def process_item(self, item, spider):
        item = dict(item)
        table = 'quote'
        # build the column list and placeholders from the item itself,
        # so the pipeline works for any item without hard-coded fields
        keys = ','.join(item.keys())
        values = ','.join(['%s'] * len(item))
        sql = 'insert into {table}({keys}) values({values})'.format(
            table=table, keys=keys, values=values)
        try:
            if self.cursor.execute(sql, tuple(item.values())):
                self.connect.commit()
                print("Successful!")
        except Exception:
            print("Failed!")
            self.connect.rollback()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.connect.close()
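
For a QuoteItem, the dynamically built statement resolves to the same SQL that was hard-coded above; a quick check with made-up values (field order follows the order the spider assigned them):

item = {'text': '...', 'author': '...', 'tags': '...'}
keys = ','.join(item.keys())
values = ','.join(['%s'] * len(item))
print('insert into quote({}) values({})'.format(keys, values))
# insert into quote(text,author,tags) values(%s,%s,%s)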

Storing to MongoDB

  1. Set two options in settings.py, and register the pipeline:

MONGO_URI = 'localhost'
MONGO_DB = 'study'

# the pipeline to enable
ITEM_PIPELINES = {
    # 'quote.pipelines.QuotePipeline': 300,
    'quote.pipelines.MongoPipeline': 300,
}

  2. pipelines.py

import pymongo

class MongoPipeline(object):
    # collection ("table") name
    collection = 'student'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # pull the connection settings from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # insert the item into MongoDB; insert_one replaces the
        # insert method deprecated in newer pymongo versions
        self.db[self.collection].insert_one(dict(item))
        return item
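
A quick way to confirm the inserts, assuming the MONGO_URI, MONGO_DB, and collection values used above:

import pymongo

client = pymongo.MongoClient('localhost')
for doc in client['study']['student'].find().limit(3):
    print(doc)
client.close()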


6. Edit settings.py (set headers, pipelines, etc.)

robots.txt protocol (disabled here so requests are not filtered by robots.txt)

# Obey robots.txt rules
ROBOTSTXT_OBEY = False  

headers

DEFAULT_REQUEST_HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
}

pipelines

ITEM_PIPELINES = {
    'quote.pipelines.QuotePipeline': 300,
}


7. Run the spider

In a command prompt, run:

scrapy crawl myspider 
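
Independently of the pipelines, Scrapy's built-in feed export can dump the items straight to a file, which is a quick way to sanity-check the spider:

scrapy crawl myspider -o quotes.json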

Result: the scraped quotes are printed to the console and stored in the configured database.
