爬虫之CrawlSpider简单案例之读书网

在以项目名命名的 .py 爬虫文件中，代码如下：

class DsSpider(CrawlSpider):
    """Crawl dushu.com book-list pages and yield one record per book.

    The crawl starts from ``start_urls``. Links found inside the
    ``div.pages`` block of each fetched page are followed, and every page
    reached that way is passed to :meth:`parse_item`.
    """

    name = 'ds'
    allowed_domains = ['dushu.com']
    start_urls = ['https://www.dushu.com/book/1163_1.html']

    rules = (
        # Restrict link extraction to the ``div.pages`` block; follow=True
        # re-applies this rule to the pages reached through those links.
        Rule(
            LinkExtractor(restrict_xpaths='//div[@class="pages"]'),
            callback='parse_item',
            follow=True,
        ),
    )

    def parse_item(self, response):
        """Yield a dict with ``name``, ``link`` and ``author`` per book entry.

        Each ``<li>`` under ``div.bookslist/ul`` produces one dict. A field
        is ``None`` when its XPath matches nothing in that entry. ``link``
        is the raw ``href`` attribute value, returned unmodified.

        Args:
            response: The response for a listing page matched by ``rules``.

        Yields:
            dict: ``{'name': ..., 'link': ..., 'author': ...}``.
        """
        for entry in response.xpath('//div[@class="bookslist"]/ul/li'):
            # Build a fresh dict for every book. Reusing one dict across
            # iterations would make all yielded items the same object, so
            # later iterations would overwrite items already handed off.
            yield {
                'name': entry.xpath('.//h3/a/text()').extract_first(),
                'link': entry.xpath('.//h3/a/@href').extract_first(),
                'author': entry.xpath('.//p[1]/a/text()').extract_first(),
            }

巴特西

爬虫之CrawlSpider简单案例之读书网

最新文章

热门文章