1 在 settings.py 里面启用管道

# Enable the project's item pipeline so that every item yielded by a spider is
# passed to GanjiPipeline.process_item. The integer is the pipeline's order
# value in Scrapy (pipelines with lower values run first).
ITEM_PIPELINES = {
    "ganji.pipelines.GanjiPipeline": 300,
}

2 拿到的数据通过yield返回给管道
# -*- coding: utf-8 -*-
import csv
import scrapy


class GjSpider(scrapy.Spider):
    """Spider that extracts rental listings from the page in ``start_urls``.

    Each listing is yielded as a plain dict with the keys ``title``,
    ``size``, ``chaoxiang``, ``price`` and ``address`` so that an item
    pipeline can persist it.
    """

    name = 'gj'
    allowed_domains = ['ganji.com']
    start_urls = ['http://sz.ganji.com/zufang/']

    def optimizeContent(self, res):
        """Strip bytes-repr residue and unwanted tokens from a scraped string.

        Removes ``b'`` prefixes, literal backslash-n sequences, single
        quotes and literal backslash-dot sequences, and rewrites ``style``
        to ``nouse``.

        Args:
            res: The raw text, or ``None`` when the selector matched nothing.

        Returns:
            The cleaned string; an empty string when ``res`` is ``None``.
        """
        if res is None:
            return ''
        res = res.replace('b\'', '')
        res = res.replace('\\n', '')
        res = res.replace('\'', '')
        res = res.replace('style', 'nouse')
        # '\\.' is the two characters backslash + dot, spelled without relying
        # on the invalid '\.' escape sequence.
        res = res.replace('\\.', '')
        return res

    def parse(self, response):
        """Yield one dict per listing node found in the response.

        Args:
            response: The Scrapy response for a listing page.

        Yields:
            dict: ``title``, ``size``, ``chaoxiang``, ``price`` and
            ``address``. Fields whose XPath matches nothing are ``None``;
            ``address`` joins the available address parts with ``-`` and is
            an empty string when neither part is present.
        """
        print(response.url)
        houseList = response.xpath('.//div[@class="f-main-list"]/div/div[position()>2]')
        for house in houseList:
            title = house.xpath(".//dl/dd[contains(@class,'title')]/a/@title").extract_first()
            size = house.xpath(".//dl/dd[contains(@class,'size')]/span[3]/text()").extract_first()
            chaoxiang = house.xpath(".//dl/dd[contains(@class,'size')]/span[5]/text()").extract_first()
            price = house.xpath(".//dl/dd[contains(@class,'info')]/div/span[1]/text()").extract_first()
            address1 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[1]/text()").extract_first()
            address2 = house.xpath(".//dl/dd[contains(@class,'address')]/span/a[2]/span/text()").extract_first()

            # Join only the parts that were actually found, so a missing node
            # does not put the literal text "None" into the output.
            address = "-".join(part for part in (address1, address2) if part)

            item = {
                'title': title,
                "size": size,
                "chaoxiang": chaoxiang,
                "price": price,
                "address": address,
            }
            yield item

3 在 pipelines.py 文件里面把数据写入 CSV 文件

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import csv


class GanjiPipeline(object):
    """Item pipeline that appends every scraped listing to a CSV file.

    The file is opened once when the spider starts and closed when it
    finishes, instead of being reopened for every item. The header row is
    written only when the file is empty, so repeated runs do not insert
    duplicate header rows in the middle of the data.
    """

    # Output file, relative to the working directory of the crawl.
    csv_path = 'ganji.csv'
    # Column titles, in the same order as the fields written by process_item.
    header = ['标题', '大小', '朝向', '价格', '地址']

    def open_spider(self, spider):
        """Open the CSV file for appending and write the header if needed.

        Args:
            spider: The spider being opened (unused).
        """
        # newline='' lets the csv module control line endings itself, as the
        # csv documentation requires for files passed to csv.writer.
        self._file = open(self.csv_path, 'a+', encoding='utf_8_sig', newline='')
        self._writer = csv.writer(self._file)
        # In append mode the stream starts at the end of the file, so a
        # position of 0 means the file is new or empty.
        if self._file.tell() == 0:
            self._writer.writerow(self.header)

    def close_spider(self, spider):
        """Flush and close the CSV file opened by ``open_spider``.

        Args:
            spider: The spider being closed (unused).
        """
        fp = getattr(self, '_file', None)
        if fp is not None:
            fp.close()
            self._file = None

    def process_item(self, item, spider):
        """Clean the item's title and append the item as one CSV row.

        Args:
            item: Mapping with the keys ``title``, ``size``, ``chaoxiang``,
                ``price`` and ``address``. ``title`` is replaced in place by
                its cleaned value.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so that later pipelines can keep processing it.
        """
        item['title'] = self.optimizeContent(item['title'])
        print(item['title'])
        self._writer.writerow([
            item['title'],
            item['size'],
            item['chaoxiang'],
            item['price'],
            item['address'],
        ])
        return item

    def optimizeContent(self, res):
        """Strip bytes-repr residue and unwanted tokens from a scraped string.

        Removes ``b'`` prefixes, literal backslash-n sequences, single
        quotes and literal backslash-dot sequences, and rewrites ``style``
        to ``nouse``.

        Args:
            res: The raw text, or ``None`` when the field was not scraped.

        Returns:
            The cleaned string; an empty string when ``res`` is ``None``.
        """
        if res is None:
            return ''
        res = res.replace('b\'', '')
        res = res.replace('\\n', '')
        res = res.replace('\'', '')
        res = res.replace('style', 'nouse')
        # '\\.' is the two characters backslash + dot, spelled without relying
        # on the invalid '\.' escape sequence.
        res = res.replace('\\.', '')
        return res

最新文章

  1. Python2.7如何安装numpy(http://www.cnblogs.com/yuanzm/p/4089856.html)
  2. 关于inf的问题
  3. 剑指Offer40 和为s的连续正数序列
  4. js中的FileSystemObject使用(FSO)
  5. Oracle中SQL语句学习五(统计分组语句group by和having)
  6. Java 泛型具体解释
  7. angular学习(六)-- Filter
  8. Akka(37): Http:客户端操作模式
  9. Python 基于pykafka简单实现KAFKA消费者
  10. webpack4.0各个击破(10)—— Integration篇
  11. [译]Ocelot - Load Balancer
  12. ActiveMQ常见消息类型
  13. MySQL基准测试(一)--原因,策略,思路
  14. 学习windows编程 day4 之视口和窗口
  15. 2019-03-08-day007-深浅拷贝
  16. js基于json的级联下拉框
  17. Casperjs循环执行(重复执行不退出)
  18. 使用mothur进行OTU聚类
  19. HQL进阶
  20. TP框架中APP_SUB_DOMAIN_DEPLOY什么意思?

热门文章

  1. 自定义AQS独占模式下的同步器来实现独享锁
  2. Mybatis和Mybatis-Plus时间范围查询,亲测有效
  3. P4742 【[Wind Festival]Running In The Sky】
  4. 快速上手开发——JFinal配置(全步骤图文解析)
  5. Redis中的订阅模式
  6. SQL注入之Mysql报错注入
  7. Python-全局解释器锁GIL原理和多线程产生原因与原理-多线程通信机制
  8. __declspec(dllexport)和__declspec(dllimport) (——declspec方法创建dll的方法已验证ok)
  9. #ifndef, #define, #endif三者的作用
  10. 【学习笔记/题解】树上启发式合并/CF600E Lomsat gelral