scrapy数据写入管道
2024-09-07 21:53:37
1 在 settings.py 里面启用管道
# Register GanjiPipeline for this project with an order value of 300.
ITEM_PIPELINES = {"ganji.pipelines.GanjiPipeline": 300}
2 拿到的数据通过yield返回给管道
# -*- coding: utf-8 -*-
import csv

import scrapy


class GjSpider(scrapy.Spider):
    """Spider that crawls the start URL and yields one dict per listing node.

    Each yielded dict has the keys ``title``, ``size``, ``chaoxiang``,
    ``price`` and ``address``.
    """

    name = 'gj'
    allowed_domains = ['ganji.com']
    start_urls = ['http://sz.ganji.com/zufang/']

    def optimizeContent(self, res):
        """Strip a fixed set of unwanted substrings from ``res``.

        The replacements run in a fixed order, because later ones depend on
        what earlier ones removed.

        Args:
            res: The string to clean.

        Returns:
            The cleaned string.
        """
        # Presumably left-overs of a bytes repr (the b-quote prefix).
        res = res.replace('b\'', '')
        # Removes the two-character sequence backslash + "n", not a newline.
        res = res.replace('\\n', '')
        res = res.replace('\'', '')
        res = res.replace('style', 'nouse')
        # Backslash + dot. The original spelled this with an invalid escape
        # sequence, which newer Python versions warn about; the doubled
        # backslash has the same runtime value.
        res = res.replace('\\.', '')
        return res

    def parse(self, response):
        """Extract the listing fields from the response.

        Args:
            response: The response for one of ``start_urls``.

        Yields:
            dict: One item per listing node; a field is ``None`` when its
            XPath matches nothing.
        """
        print(response.url)
        house_list = response.xpath(
            './/div[@class="f-main-list"]/div/div[position()>2]')
        for house in house_list:
            title = house.xpath(
                ".//dl/dd[contains(@class,'title')]/a/@title").extract_first()
            size = house.xpath(
                ".//dl/dd[contains(@class,'size')]/span[3]/text()").extract_first()
            chaoxiang = house.xpath(
                ".//dl/dd[contains(@class,'size')]/span[5]/text()").extract_first()
            price = house.xpath(
                ".//dl/dd[contains(@class,'info')]/div/span[1]/text()").extract_first()
            address1 = house.xpath(
                ".//dl/dd[contains(@class,'address')]/span/a[1]/text()").extract_first()
            address2 = house.xpath(
                ".//dl/dd[contains(@class,'address')]/span/a[2]/span/text()").extract_first()

            # Join only the parts that were found, so a missing node does not
            # put the literal text "None" into the address.
            address = "-".join(
                part for part in (address1, address2) if part)

            yield {
                'title': title,
                "size": size,
                "chaoxiang": chaoxiang,
                "price": price,
                "address": address,
            }
3 pipeline文件里面写入文件
# -*- coding: utf-8 -*- # Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import csv class GanjiPipeline(object): def open_spider(self,spider):
with open('ganji.csv', 'a+', encoding='utf_8_sig') as fp:
writer = csv.writer(fp) # 先传入文件句柄
writer.writerow(['标题', '大小', '朝向', '价格', '地址']) # 然后写入
fp.close() def process_item(self, item, spider): with open('ganji.csv', 'a+', encoding='utf_8_sig') as fp:
writer = csv.writer(fp) # 先传入文件句柄
item['title'] = self.optimizeContent(item['title'])
print(item['title'])
writer.writerow([item['title'], item['size'], item['chaoxiang'], item['price'], item['address']]) # 按行写入
fp.close()
return item def optimizeContent(self,res):
res = res.replace('b\'', '')
res = res.replace('\\n', '')
res = res.replace('\'', '')
res = res.replace('style', 'nouse')
res = res.replace('\.', '')
return res
最新文章
- Python2.7如何安装numpy(http://www.cnblogs.com/yuanzm/p/4089856.html)
- 关于inf的问题
- 剑指Offer40 和为s的连续正数序列
- js中的FileSystemObject使用(FSO)
- Oracle中SQL语句学习五(统计分组语句group by和having)
- Java 泛型具体解释
- angular学习(六)-- Filter
- Akka(37): Http:客户端操作模式
- Python 基于pykafka简单实现KAFKA消费者
- webpack4.0各个击破(10)—— Integration篇
- [译]Ocelot - Load Balancer
- ActiveMQ常见消息类型
- MySQL基准测试(一)--原因,策略,思路
- 学习windows编程 day4 之视口和窗口
- 2019-03-08-day007-深浅拷贝
- js基于json的级联下拉框
- Casperjs循环执行(重复执行不退出)
- 使用mothur进行OTU聚类
- HQL进阶
- TP框架中APP_SUB_DOMAIN_DEPLOY什么意思?
热门文章
- 自定义AQS独占模式下的同步器来实现独享锁
- Mybatis和Mybatis-Plus时间范围查询,亲测有效
- P4742 【[Wind Festival]Running In The Sky】
- 快速上手开发——JFinal配置(全步骤图文解析)
- Redis中的订阅模式
- SQL注入之Mysql报错注入
- Python-全局解释器锁GIL原理和多线程产生原因与原理-多线程通信机制
- __declspec(dllexport)和__declspec(dllimport) (——declspec方法创建dll的方法已验证ok)
- #ifndef, #define, #endif三者的作用
- 【学习笔记/题解】树上启发式合并/CF600E Lomsat gelral