scrapy Request方法

# -*- coding: utf-8 -*-

import scrapy

class TestSpider(scrapy.Spider):
    """Crawl the yeves.cn home page and scrape every article linked from it.

    ``parse`` reads title/link pairs from the start page and schedules one
    request per article; ``parse_detail`` turns each article page into a dict
    item.
    """

    name = 'test'

    allowed_domains = ['yeves.cn']

    start_urls = ['https://yeves.cn/']

    # Site-root URL template. Links are now resolved with ``response.urljoin``
    # inside ``parse``; the attribute is kept so that any external code that
    # still reads it keeps working.
    base_domain = 'https://yeves.cn{}'

    def parse(self, response):
        """Yield one detail-page request per article found on the home page.

        Args:
            response: The downloaded start page.

        Yields:
            scrapy.Request: A request for the article URL whose callback is
            ``parse_detail`` and whose ``meta`` carries the article title
            under the ``"title"`` key.
        """
        # Candidate containers for the title and link of each listed article.
        articles = response.xpath('//*[@id="article"]//div')

        for article in articles:
            title = article.xpath('./div/article/div/header/h2/a/text()').extract_first()
            href = article.xpath('./div/article/div/header/h2/a/@href').extract_first()

            # Most matched <div>s are not article cards; skip anything that
            # lacks either piece of information.
            if title is None or href is None:
                continue

            # urljoin resolves root-relative, page-relative and already
            # absolute hrefs against the page URL, whereas formatting the
            # href into a fixed prefix only works for root-relative paths.
            url = response.urljoin(href)

            # Carry the title along so the detail callback does not have to
            # extract it again.
            yield scrapy.Request(url, callback=self.parse_detail, meta={"title": title})

    def parse_detail(self, respone):
        """Build the item for a single article page.

        Args:
            respone: The downloaded article page (parameter spelling kept
                as-is for interface compatibility). Its ``meta`` is expected
                to hold the title set by ``parse``.

        Yields:
            dict: An item with the keys ``title``, ``created_at``,
            ``category`` and ``content``. Any value may be ``None`` (or an
            empty string for ``content``) when the page lacks that element.
        """
        title = respone.meta.get('title')
        self.logger.debug('Parsing article %s (title: %s)', respone.url, title)

        created_at = respone.xpath(
            '/html/body/section/div/div/header/div/span[1]/time/text()'
        ).extract_first()
        category = respone.xpath(
            '/html/body/section/div/div/header/div/span[2]/a/text()'
        ).extract_first()

        # Collect every text node of the article body; taking only the first
        # node would usually return just leading whitespace.
        text_nodes = respone.xpath('/html/body/section/div/div/article//text()').extract()
        content = ' '.join(node.strip() for node in text_nodes if node.strip())

        detail = {
            'title': title,
            'created_at': created_at,
            'category': category,
            'content': content,
        }
        self.logger.debug('Scraped item: %s', detail)

        yield detail
巴特西

scrapy Request方法

最新文章

热门文章