Scrapy 框架 CrawlSpider 全站数据爬取

CrawlSpider 全站数据爬取

创建 CrawlSpider 爬虫文件

scrapy genspider -t crawl chouti www.xxx.com

import scrapy

from scrapy.linkextractors import LinkExtractor

from scrapy.spiders import CrawlSpider, Rule

class CrawSpider(CrawlSpider):
    """Demo spider that follows every paginated "hot" listing link.

    Each page reached through the link extractor is handed to
    ``parse_item``, which only prints the response.
    """

    name = 'craw'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://dig.***.com/r/scoff/hot/1']

    # Link extractor: picks out links that match the given pattern.
    link = LinkExtractor(allow=r'/r/scoff/hot/\d+')
    # link1 = LinkExtractor(allow=r'/pic/$')  # for sites whose first-page url has a different form

    # Rule: pages behind the extracted links are parsed by the callback.
    # follow=True re-applies the link extractor to every page reached
    # through an extracted link.
    rules = (Rule(link, callback='parse_item', follow=True),)
    # A second rule for link1 would look like:
    # Rule(link1, callback='parse_item', follow=False),

    def parse_item(self, response):
        """Print the response of a page matched by the rule."""
        print(response)

简介与详情不在同一个 item 中时的存储

# -*- coding: utf-8 -*-

import scrapy

from scrapy.linkextractors import LinkExtractor

from scrapy.spiders import CrawlSpider, Rule

from tenPro.items import TenproItem, TenproItem_detail

class TenSpider(CrawlSpider):
    """Crawl paginated job listings together with their detail pages.

    Listing pages produce one ``TenproItem`` per table row; detail pages
    produce one ``TenproItem_detail`` holding the description text.
    """

    name = 'ten'
    # allowed_domains = ['www.ccc.com']
    start_urls = ['https://hr.****.com/position.php?&start=#a0']

    rules = (
        # Pagination links: parse each listing page and keep following.
        Rule(LinkExtractor(allow=r'&start=\d+#a'), callback='parse_item', follow=True),
        # Detail-page links. The pattern must not contain a space before
        # "=": URLs never contain one, so a pattern with the space matches
        # nothing and parse_detail would never be called. The dot is escaped
        # so it only matches a literal ".".
        Rule(LinkExtractor(allow=r'position_detail\.php\?id='), callback='parse_detail', follow=True),
    )

    def parse_item(self, response):
        """Yield a ``TenproItem`` (title and kind) for every listing row."""
        # Rows alternate between the "odd" and "even" classes; select both.
        tr_list = response.xpath(
            '//table[@class="tablelist"]/tr[@class="odd"] | //table[@class="tablelist"]/tr[@class="even"]')
        for tr in tr_list:
            item = TenproItem()
            # extract_first() returns None when the cell has no text node.
            item['title'] = tr.xpath('./td[1]/a/text()').extract_first()
            item['kind'] = tr.xpath('./td[2]/text()').extract_first()
            yield item

    def parse_detail(self, response):
        """Yield a ``TenproItem_detail`` with the joined description text."""
        # Concatenate every text node under the description lists.
        desc = ''.join(response.xpath('//ul[@class="squareli"]//text()').extract())
        item = TenproItem_detail()
        item['desc'] = desc
        yield item

import scrapy

class TenproItem(scrapy.Item):
    """Summary record built from one row of the listing table."""

    # Text of the link in the row's first cell.
    title = scrapy.Field()

    # Text of the row's second cell.
    kind = scrapy.Field()

class TenproItem_detail(scrapy.Item):
    """Detail record built from a single detail page."""

    # Description text joined from the page's "squareli" lists.
    desc = scrapy.Field()

# 对两类 item 分别进行存储，之后可利用数据库的多表联查或数据解析进行关联

class TenproPipeline(object):
    """Pipeline that tells the two item types apart by class name."""

    def process_item(self, item, spider):
        """Read the fields belonging to ``item``'s type, print it, return it.

        Items whose class is named ``TenproItem_detail`` have ``desc`` read;
        every other item has ``title`` and ``kind`` read. A missing key
        raises ``KeyError``. The item is returned unchanged.
        """
        desc = None
        item_type = item.__class__.__name__
        if item_type != 'TenproItem_detail':
            # Summary item: values are read here as a placeholder for storage.
            title, kind = item['title'], item['kind']
        else:
            # Detail item.
            desc = item['desc']
        print(item)
        return item

思路:

基于手动请求发送的形式：对所有页码对应的 url 发起请求，获取页面数据，进行解析

基于 CrawlSpider 的形式：使用链接提取器和规则解析器获取所有页码对应的页面数据，并进行指定数据的解析

巴特西

Scrapy 框架 CrawlSpider 全站数据爬取

CrawlSpider 全站数据爬取

对于简介与详情不是一个 item 的存储

思路:

最新文章

热门文章