scrapy数据写入管道

1 在 settings.py 里面启用管道

# Register the project's item pipeline. Per the Scrapy documentation the
# integer is the pipeline's order value: pipelines with lower values run
# first, and values are customarily chosen from the 0-1000 range.
ITEM_PIPELINES = {"ganji.pipelines.GanjiPipeline": 300}

2 拿到的数据通过yield返回给管道

# -*- coding: utf-8 -*-

import csv

import scrapy

class GjSpider(scrapy.Spider):
    """Spider that scrapes the rental listing page in ``start_urls``.

    Every listing found on the page is yielded as a plain dict with the
    keys ``title``, ``size``, ``chaoxiang``, ``price`` and ``address`` so
    that the enabled item pipelines can process it.
    """

    name = 'gj'
    allowed_domains = ['ganji.com']
    start_urls = ['http://sz.ganji.com/zufang/']

    def optimizeContent(self, res):
        """Strip leftover markup noise from a scraped string.

        Removes ``b'`` prefixes, literal backslash-n sequences, single
        quotes and literal backslash-dot sequences, and renames ``style``
        to ``nouse``.

        Args:
            res: The string to clean.

        Returns:
            The cleaned string.
        """
        res = res.replace('b\'', '')
        res = res.replace('\\n', '')
        res = res.replace('\'', '')
        res = res.replace('style', 'nouse')
        # '\\.' is the same two characters (backslash, dot) the original
        # '\.' produced, but spelled without an invalid escape sequence.
        res = res.replace('\\.', '')
        return res

    def parse(self, response):
        """Extract one item per listing from the response.

        Args:
            response: The downloaded page for one of ``start_urls``.

        Yields:
            dict: One item per listing. A field whose XPath matches nothing
            is ``None``; ``address`` is the available address parts joined
            with ``-`` (an empty string when neither part is found).
        """
        print(response.url)

        houseList = response.xpath('.//div[@class="f-main-list"]/div/div[position()>2]')
        for house in houseList:
            title = house.xpath(".//dl/dd[contains(@class,'title')]/a/@title").extract_first()
            size = house.xpath(".//dl/dd[contains(@class,'size')]/span[3]/text()").extract_first()
            chaoxiang = house.xpath(".//dl/dd[contains(@class,'size')]/span[5]/text()").extract_first()
            price = house.xpath(".//dl/dd[contains(@class,'info')]/div/span[1]/text()").extract_first()
            address1 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[1]/text()").extract_first()
            address2 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[2]/span/text()").extract_first()

            # extract_first() returns None when nothing matches; skip the
            # missing parts instead of writing the text "None" into the data.
            address_parts = [part for part in (address1, address2) if part is not None]

            yield {
                'title': title,
                'size': size,
                'chaoxiang': chaoxiang,
                'price': price,
                'address': '-'.join(address_parts),
            }

3 在 pipelines.py 文件里面把数据写入 CSV 文件

# -*- coding: utf-8 -*-

# Define your item pipelines here

#

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

import csv

class GanjiPipeline(object):
    """Item pipeline that appends every scraped item to ``ganji.csv``.

    The file is opened once in ``open_spider``, reused for every item and
    closed in ``close_spider``. It is encoded as ``utf_8_sig`` (UTF-8 with
    a byte order mark).
    """

    def open_spider(self, spider):
        """Open ``ganji.csv`` for appending and write the header if needed.

        Args:
            spider: The spider being opened (unused).
        """
        # newline='' is what the csv module requires so that it controls
        # line endings itself; without it Windows output gets blank rows.
        self._fp = open('ganji.csv', 'a', encoding='utf_8_sig', newline='')
        self._writer = csv.writer(self._fp)
        # In append mode the position starts at the end of the file, so 0
        # means the file is empty. Writing the header only then avoids a
        # duplicate header row on every subsequent run.
        if self._fp.tell() == 0:
            self._writer.writerow(['标题', '大小', '朝向', '价格', '地址'])
            self._fp.flush()

    def close_spider(self, spider):
        """Close the CSV file if it is open.

        Args:
            spider: The spider being closed (unused).
        """
        fp = getattr(self, '_fp', None)
        if fp is not None:
            fp.close()
            self._fp = None
            self._writer = None

    def process_item(self, item, spider):
        """Clean the item's title and append the item as one CSV row.

        Args:
            item: Mapping with the keys ``title``, ``size``, ``chaoxiang``,
                ``price`` and ``address``.
            spider: The spider that produced the item.

        Returns:
            The same item, with ``title`` cleaned, for later pipelines.
        """
        # Stay usable even when open_spider was never called.
        if getattr(self, '_fp', None) is None:
            self.open_spider(spider)

        item['title'] = self.optimizeContent(item['title'])
        print(item['title'])

        # Write the item as a single row.
        self._writer.writerow([item['title'], item['size'], item['chaoxiang'], item['price'], item['address']])
        # Flush per item so rows reach the file as soon as they are written.
        self._fp.flush()
        return item

    def optimizeContent(self, res):
        """Strip leftover markup noise from a scraped string.

        Removes ``b'`` prefixes, literal backslash-n sequences, single
        quotes and literal backslash-dot sequences, and renames ``style``
        to ``nouse``.

        Args:
            res: The string to clean, or ``None`` when the field was not
                found on the page.

        Returns:
            The cleaned string, or ``None`` unchanged.
        """
        if res is None:
            return res
        res = res.replace('b\'', '')
        res = res.replace('\\n', '')
        res = res.replace('\'', '')
        res = res.replace('style', 'nouse')
        # '\\.' is the same two characters (backslash, dot) the original
        # '\.' produced, but spelled without an invalid escape sequence.
        res = res.replace('\\.', '')
        return res

巴特西

scrapy数据写入管道

最新文章

热门文章