scrapy Request方法
2024-08-25 16:17:54
# -*- coding: utf-8 -*-
import scrapy


class TestSpider(scrapy.Spider):
    """Crawl the yeves.cn home page and scrape every article it links to.

    ``parse`` reads the article titles and links from the home page and
    schedules one ``scrapy.Request`` per article. ``parse_detail`` handles
    each article page; the title found on the home page is handed over via
    ``Request.meta``.
    """

    name = 'test'
    allowed_domains = ['yeves.cn']
    start_urls = ['https://yeves.cn/']
    # Base domain template. No longer used by parse() (which now relies on
    # response.urljoin), but kept so existing references keep working.
    base_domain = 'https://yeves.cn{}'

    def parse(self, response):
        """Yield one detail-page request per article listed on the home page.

        Args:
            response: The downloaded home page.

        Yields:
            scrapy.Request: A request for the article page, with
            ``parse_detail`` as callback and the article title stored under
            ``meta["title"]``.
        """
        # Collect the titles and links shown on the home page.
        articles = response.xpath('//*[@id="article"]//div')
        for article in articles:
            title = article.xpath('./div/article/div/header/h2/a/text()').extract_first()
            href = article.xpath('./div/article/div/header/h2/a/@href').extract_first()
            if title is None or href is None:
                # This <div> is not an article entry; skip it.
                continue
            # urljoin resolves relative links against the page URL and leaves
            # absolute links untouched, unlike plain string formatting.
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_detail,
                meta={"title": title},  # carry the title over to the detail callback
            )

    def parse_detail(self, respone):
        """Build the item for a single article page.

        Args:
            respone: The downloaded article page (parameter spelling kept
                as in the original signature). ``respone.meta["title"]``
                holds the title set by ``parse``.

        Yields:
            dict: An item with the keys ``title``, ``created_at``,
            ``category`` and ``content``.
        """
        title = respone.meta.get('title')
        print(respone.url)
        print(title)

        # Extract the detail data from the article page.
        created_at = respone.xpath('/html/body/section/div/div/header/div/span[1]/time/text()').extract_first()
        category = respone.xpath('/html/body/section/div/div/header/div/span[2]/a/text()').extract_first()
        # Join every text node: extract_first() would return only the first
        # node, which is not the whole article body.
        content = ''.join(
            respone.xpath('/html/body/section/div/div/article//text()').extract()
        ).strip()

        detail = {
            'title': title,
            'created_at': created_at,
            'category': category,
            'content': content,  # previously extracted but never stored
        }
        print(detail)
        yield detail
最新文章
- [C/C++] C/C++延伸学习系列之STL及Boost库概述
- SVN批处理
- GridLayout自定义数字键盘(两个EditText)
- Fedora 24 install MySQL
- Python Ogre Blender(转载)
- Entity Framework 学习第一天 续
- PHPStorm 3.0 与服务器端代码同步配置
- SQL 连接 JOIN 例解。(左连接,右连接,全连接,内连接,交叉连接,自连接)
- Qt容器类的对象模型及应用(线性结构篇)(好多图,比较清楚)
- [bzoj3702] 二叉树
- php分页数据最后一页继续追加第一页数据
- SQL Server 2014备份维护计划
- PID控制器开发笔记之三:抗积分饱和PID控制器的实现
- Java 之 JavaScript (二)
- EasyUI datagrid columns 中 field 区分大小写
- Linux安装python3.7
- if else的使用以及如何从键盘获取数值
- Java - 线程让步和休眠
- POJ 2337 Catenyms (欧拉回路)
- 判断UNITY版本号