scrapy 请求传参

1. 定义数据结构:items.py 文件

# -*- coding: utf-8 -*-
"""Item definitions for the movie project (items.py).

See the Scrapy documentation on declaring items:
https://doc.scrapy.org/en/latest/topics/items.html
"""
import scrapy


class MovieprojectItem(scrapy.Item):
    """One movie record, filled in from two pages.

    The first group of fields comes from the first-level (listing) page,
    the second group from the second-level (detail) page of the movie.
    """

    # Content scraped from the first-level (listing) page.
    post = scrapy.Field()   # movie poster
    name = scrapy.Field()   # movie name
    _type = scrapy.Field()  # movie type(s)

    # Content scraped from the second-level (detail) page.
    director = scrapy.Field()  # director(s)
    design = scrapy.Field()    # screenwriter(s)
    actor = scrapy.Field()     # leading actor(s)
    info = scrapy.Field()      # movie introduction

2.爬虫文件

# -*- coding: utf-8 -*-
"""Spider that scrapes movie listings and follows each movie's detail page."""
import scrapy

from movieproject.items import MovieprojectItem


class MovieSpider(scrapy.Spider):
    """Scrape movies from the listing pages and their detail pages.

    ``parse`` handles a listing page: it creates one item per movie, fills
    in the listing-level fields and hands the item to ``parse_detail``
    through the request's ``meta`` dict.  ``parse_detail`` completes the
    item from the detail page and yields it to the pipelines.
    """

    name = 'movie'
    allowed_domains = ['www.id97.com']
    start_urls = ['http://www.id97.com/movie/']
    # Template for the paginated listing URLs.
    url = 'http://www.id97.com/movie/?page={}'
    # Number of the most recently requested listing page.
    page = 1

    # NOTE(review): the author's original notes at this point read
    # "(1) only the page-number links need extracting, from the first page
    # only; (2) two rules are needed, one for detail pages and one for
    # page-number links".  That looks like a description of a CrawlSpider
    # variant; this spider follows links manually instead.

    def parse(self, response):
        """Parse one listing page.

        Yields a detail-page request per movie (carrying the partially
        filled item in ``meta['item']``) and, while the page limit has not
        been reached, a request for the next listing page.
        """
        # Find every movie <div> on the listing page.
        movie_div_list = response.xpath('//div[starts-with(@class,"col-xs-1-5")]')
        for odiv in movie_div_list:
            item = MovieprojectItem()
            # Movie poster.
            item['post'] = odiv.xpath(".//img/@data-original").extract_first()
            # Movie name.
            item['name'] = odiv.xpath("./div/div/h1/a/text()").extract_first()
            # Movie type(s).
            item['_type'] = odiv.xpath("./div/div/div/a/text()").extract()

            # Link to the detail page.
            detail_href = odiv.xpath('./div/a/@href').extract_first()
            if not detail_href:
                # scrapy.Request rejects url=None, which would abort the
                # whole callback; skip just this entry instead.
                self.logger.warning(
                    "No detail link found for movie %r; skipping", item['name'])
                continue

            # Request the detail page and pass the item along via ``meta`` so
            # that parse_detail can keep filling it in.  urljoin leaves an
            # absolute href untouched and resolves a relative one.
            yield scrapy.Request(
                url=response.urljoin(detail_href),
                callback=self.parse_detail,
                meta={'item': item},
            )

        # Crawl the other listing pages.  The counter is bumped before the
        # request is built, so listing pages 2 through 6 are requested.
        if self.page <= 5:
            self.page += 1
            url = self.url.format(self.page)
            self.logger.debug("Requesting listing page: %s", url)
            yield scrapy.Request(url=url, callback=self.parse)

    def parse_detail(self, response):
        """Complete the item passed from ``parse`` using the detail page."""
        # Retrieve the item handed over by the listing-page callback.
        item = response.meta['item']

        # Row N of the info table; the second cell holds the linked names.
        # NOTE(review): <tbody> is frequently inserted by browsers and may be
        # absent from the raw HTML Scrapy receives -- verify against a real
        # response.
        row_xpath = "//div[starts-with(@class,'col-xs-8')]/table/tbody/tr[{}]/td[2]/a/text()"

        # Director(s).  The row index must be explicit: a bare ``tr`` matches
        # every row and would also pull in the screenwriters and actors.
        item['director'] = response.xpath(row_xpath.format(1)).extract()
        # Screenwriter(s).
        item['design'] = response.xpath(row_xpath.format(2)).extract()
        # Leading actor(s).
        item['actor'] = response.xpath(row_xpath.format(3)).extract()
        # Movie introduction.
        item['info'] = response.xpath("//div[@class='col-xs-12 movie-introduce']/p/text()").extract_first()

        # Hand the completed item to the pipelines.
        yield item

3.管道文件

# -*- coding: utf-8 -*-
"""Item pipelines for the movie project (pipelines.py).

Don't forget to add your pipeline to the ITEM_PIPELINES setting.
See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
"""
import json

import pymysql
from scrapy.utils.project import get_project_settings


class MovieprojectPipeline(object):
    """Write every item as one JSON document per line to ``movie.json``."""

    def open_spider(self, spider):
        """Open the output file when the spider starts."""
        self.fp = open("movie.json", "w", encoding="utf8")

    def process_item(self, item, spider):
        """Serialize the item to JSON, append it as a line and pass it on."""
        obj = dict(item)
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        string = json.dumps(obj, ensure_ascii=False)
        self.fp.write(string + '\n')
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.fp.close()


class MovieMysqlPipeline(object):
    """Insert every item as a row of the ``movie`` table in MySQL.

    Connection parameters are read from the project settings DB_HOST,
    DB_PORT, DB_USER, DB_PWD, DB_NAME and DB_CHARSET.
    """

    # Item keys in the order of the columns named in the INSERT statement.
    _ITEM_KEYS = ('post', 'name', '_type', 'director', 'design', 'actor', 'info')

    def open_spider(self, spider):
        """Connect to the database when the spider starts."""
        # Load the project configuration.
        settings = get_project_settings()
        # Connect to the database.
        host = settings['DB_HOST']
        port = settings['DB_PORT']
        user = settings['DB_USER']
        pwd = settings['DB_PWD']
        name = settings['DB_NAME']
        charset = settings['DB_CHARSET']
        self.conn = pymysql.connect(host=host, port=port, user=user, password=pwd, db=name, charset=charset)

    @staticmethod
    def _to_text(value):
        """Return *value* in a form that fits a single text column.

        Lists/tuples of strings are joined with commas; anything else
        (including ``None``, which the driver sends as NULL) is returned
        unchanged.
        """
        if isinstance(value, (list, tuple)):
            return ','.join(value)
        return value

    def process_item(self, item, spider):
        """Insert the item into the ``movie`` table and pass it on.

        A failed insert is rolled back and logged; the item is returned
        either way so that later pipelines still receive it.
        """
        # Placeholders are filled in by the driver, which escapes the scraped
        # text; interpolating it into the statement would break on quotes
        # and allow SQL injection.
        sql = ('insert into movie(post, name, type, director, design, actor, info) '
               'values(%s, %s, %s, %s, %s, %s, %s)')
        params = tuple(self._to_text(item[key]) for key in self._ITEM_KEYS)
        try:
            # The context manager closes the cursor even when execute fails.
            with self.conn.cursor() as cursor:
                cursor.execute(sql, params)
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            # Surface the failure instead of silently dropping the row.
            spider.logger.exception(
                "Failed to insert movie %r into MySQL", item['name'])
        return item

    def close_spider(self, spider):
        """Close the database connection when the spider finishes."""
        self.conn.close()

最新文章

  1. RAM、DRAM、SD卡
  2. GitHub Pages 搭建流程-基于jekyll-bootstrap
  3. Android 网络编程基础之简单聊天程序
  4. quick sort java version
  5. 数据库配置文件 conf.properties
  6. C++语言中cin cin.getline cin.get getline gets getchar 的用法实例
  7. if语句使用
  8. 模板template
  9. 彷徨中的成长-记一个文科生的IT成长过程
  10. Windows下使用NCL(Cygwin模拟Linux环境)
  11. java functional syntax overview
  12. UVA 11549 CALCULATOR CONUNDRUM(Floyd判圈算法)
  13. vue ajax获取数据的时候,如何保证传递参数的安全或者说如何保护api的安全
  14. (八十六)使用系统自带的分享框架Social.framework
  15. python11 函数的定义,调用,分类
  16. VS之设置文件编码格式
  17. 在linux和windows用c++编写c接口的动态库
  18. 【问题解决:启动卡死】Eclipse启动卡死的解决办法
  19. 2、以自定义struct或struct指针作为map的Key
  20. phpize增加php模块

热门文章

  1. SSH原理常见应用升级及端口转发
  2. 一文入门Kafka,必知必会的概念通通搞定
  3. Java8新特性之函数式接口
  4. Windows 程序设计(4) MFC-02 基本控件-上
  5. 手把手教你使用Python生成图灵智能小伙伴,实现工作助手/闲聊功能
  6. Linux Pam后门总结拓展
  7. 必知必会的8个Python列表技巧
  8. 【Spring】@Transactional 闲聊
  9. 设计模式系列之代理模式(Proxy Pattern)——对象的间接访问
  10. Win8.1卸载64位Oracle Database 11g的详细图文步骤记录