scrapy文件管道

安装scrapy

pip install scrapy

新建项目

(python36) E:\www>scrapy startproject fileDownload

New Scrapy project 'fileDownload', using template directory 'c:\users\brady\.conda\envs\python36\lib\site-packages\scrapy\templates\project', created in:

    E:\www\fileDownload

You can start your first spider with:

    cd fileDownload

    scrapy genspider example example.com

(python36) E:\www>

(python36) E:\www>scrapy startproject fileDownload

New Scrapy project 'fileDownload', using template directory 'c:\users\brady\.conda\envs\python36\lib\site-packages\scrapy\templates\project', created in:

    E:\www\fileDownload

You can start your first spider with:

    cd fileDownload

    scrapy genspider example example.com

(python36) E:\www>

编辑爬虫提取内容

# -*- coding: utf-8 -*-

import scrapy

from scrapy.linkextractors import LinkExtractor

from scrapy.spiders import CrawlSpider, Rule

from  fileDownload.items import  FiledownloadItem

class PexelsSpider(CrawlSpider):
    """Crawl pexels.com photo pages and emit their image URLs for download.

    Every extracted link matching ``/photo/`` is followed, and each matching
    page is passed to :meth:`parse_item`.
    """

    name = 'pexels'
    allowed_domains = ['www.pexels.com']
    start_urls = ['https://www.pexels.com/photo/white-concrete-building-2559175/']

    rules = (
        Rule(LinkExtractor(allow=r'/photo/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Yield one item whose ``file_urls`` lists the page's photo images.

        Args:
            response: The downloaded photo page.

        Yields:
            FiledownloadItem: ``file_urls`` holds the ``src`` of every
            ``<img>`` whose ``src`` contains ``photos``.
        """
        print(response.url)

        # .extract() returns a list of strings (possibly empty).
        urls = response.xpath("//img[contains(@src,'photos')]/@src").extract()

        item = FiledownloadItem()
        item['file_urls'] = urls

        # The URLs are a list, so convert explicitly before concatenating:
        # ``str + list`` raises TypeError, and when that happened inside a
        # catch-all ``except`` the item was silently never yielded.
        print("爬取到图片列表 " + str(urls))

        yield item

配置item

class FiledownloadItem(scrapy.Item):
    """Item carrying the URLs to download and the resulting download info."""

    # URLs the files pipeline should download.
    file_urls = scrapy.Field()

    # Result field of Scrapy's FilesPipeline (its default result field name
    # is "files"). It must be declared, otherwise the pipeline has nowhere
    # on the item to store the download results.
    files = scrapy.Field()

settings.py

启用文件管道

ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 2}  # 文件管道

FILES_STORE = ''  # 存储路径

item里面

file_urls = scrapy.Field()

files = scrapy.Field()

爬虫里面改为file_urls参数传递到管道

重写文件管道保存文件名为图片原名

pipelines.py 里面新建自己的文件管道，继承 Scrapy 的文件管道 FilesPipeline

# -*- coding: utf-8 -*-

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

from scrapy.pipelines.files import  FilesPipeline

class FiledownloadPipeline(object):
    """Item pipeline that drops the query string from every file URL."""

    def process_item(self, item, spider):
        """Rewrite ``item['file_urls']`` without any ``?...`` suffix.

        Args:
            item: Mapping-like item holding an iterable of URL strings under
                the ``file_urls`` key.
            spider: The spider that produced the item (unused).

        Returns:
            The same item object, with ``file_urls`` replaced by a new list.
        """
        original_urls = item['file_urls']

        # Install the fresh list on the item first, then fill it in order.
        cleaned = item['file_urls'] = []
        cleaned.extend(
            url.split('?')[0] if "?" in url else url
            for url in original_urls
        )

        print(item)
        return item

class MyFilesPipeline(FilesPipeline):
    """Files pipeline that stores each download under its original file name."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path (relative to FILES_STORE) for a request.

        Args:
            request: The download request; only ``request.url`` is read.
            response: Forwarded to the parent implementation on fallback.
            info: Forwarded to the parent implementation on fallback.
            item: Accepted because newer Scrapy releases pass the item as a
                keyword argument to ``file_path``; not used here.

        Returns:
            ``full/<name>``, where ``<name>`` is the last segment of the URL
            path, or the parent pipeline's default path when the URL has no
            usable last segment.
        """
        # Function-scope import keeps this block self-contained.
        from urllib.parse import urlparse

        # Use only the path component so "?query" and "#fragment" never end
        # up in the file name ("?" is not allowed in Windows file names).
        file_name = urlparse(request.url).path.split('/')[-1]

        if not file_name:
            # E.g. a URL ending in "/": defer to the parent's naming scheme
            # instead of returning the bare directory "full/".
            return super().file_path(request, response=response, info=info)

        print("下载图片"+ file_name)

        return 'full/%s' % (file_name)

settings.py 改为启用自己的文件管道

# Item pipelines enabled for the project. The integer is each pipeline's
# order value (per Scrapy's ITEM_PIPELINES convention, lower values run first).
ITEM_PIPELINES = {

    # Cleans query strings out of item['file_urls'] before downloading.
    'fileDownload.pipelines.FiledownloadPipeline': 1,

    # Custom files pipeline that saves files under their original names.
    'fileDownload.pipelines.MyFilesPipeline': 2,

    # Stock files pipeline, disabled in favour of MyFilesPipeline.
    #'scrapy.pipelines.files.FilesPipeline':2

}

获取套图

# -*- coding: utf-8 -*-

from time import sleep

import scrapy

from scrapy.linkextractors import LinkExtractor

from scrapy.spiders import CrawlSpider, Rule

class AngelSpider(CrawlSpider):
    """Collect every image of a multi-page photo set into a single item.

    Set pages matched by the rule are handled by :meth:`parse_item`, which
    follows the "next" link page by page and carries the growing item along
    in ``request.meta`` until the last page is reached.
    """

    name = 'angel'
    allowed_domains = ['angelimg.spbeen.com']
    start_urls = ['http://angelimg.spbeen.com/']

    # No longer needed to build next-page URLs; kept so existing references
    # to the attribute keep working.
    base_url = "http://angelimg.spbeen.com"

    rules = (
        Rule(LinkExtractor(allow=r'^http://angelimg.spbeen.com/ang/\d+$'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        """Add this page's image to the set item and continue to the next page.

        Args:
            response: A page of a photo set. ``response.meta['item']`` holds
                the item accumulated so far, if this is not the first page.

        Yields:
            A request for the next page (carrying the item in ``meta``) while
            a "next" link exists; otherwise the finished item, a dict with
            the keys ``files`` and ``file_urls``.
        """
        item = response.meta.get('item')
        if not item:
            # First page of a set: start a fresh item.
            item = {'files': [], 'file_urls': []}

        print(response.url)

        img_url = response.xpath('.//div[@id="content"]/a/img/@src').extract_first()
        # extract_first() returns None when nothing matches; a None entry in
        # file_urls would break request creation in the files pipeline.
        if img_url:
            item['file_urls'].append(img_url)

        # Request the next page if there is one; otherwise hand the item to
        # the pipelines.
        next_url = response.xpath('.//div[@class="page"]//a[contains(@class,"next")]/@href').extract_first()
        if next_url:
            # urljoin resolves root-relative, page-relative and absolute hrefs
            # alike, unlike plain string concatenation with the site root.
            next_url = response.urljoin(next_url)
            yield scrapy.Request(next_url, callback=self.parse_item, meta={'item': item})
        else:
            print(item)
            yield item

    def parse_next_response(self,response,):
        """Print the item carried in ``response.meta`` and the response URL.

        Debug helper; it is not referenced by the rule or callbacks visible
        in this class.
        """
        item = response.meta.get('item')
        print(item,response.url)

GitHub 地址

https://github.com/brady-wang/spider-fileDownload

巴特西

scrapy文件管道

最新文章

热门文章