step_by_step_用python爬点磁力链接

爬点东西 -Scrapy

今天是小年，团聚的日子，想想这一年中发生过大大小小的事，十分感慨。

言归正传：接触 Python 之后，想着可不可以自己爬点数据，目的是过滤掉它那些令人头痛的广告，当然也可以顺带熟悉一下 Python 的 Scrapy 框架。那就开始吧。

scrapy startproject btxxxx

scrapy genspider -t crawl btxxxx xxx.info

spider的代码

 # -*- coding: utf-8 -*-
 import sys
 import urllib

 import scrapy
 from scrapy.linkextractors import LinkExtractor
 from scrapy.spiders import CrawlSpider, Rule

 # The project is created with `scrapy startproject btxxxx` and the spider
 # instantiates BtxxxxItem, so the item must be imported under that name
 # (importing XxxxItem from xxx.items left BtxxxxItem undefined).
 from btxxxx.items import BtxxxxItem

 # Python 2-only hack: switch the interpreter's default string encoding to
 # UTF-8 so implicit str/unicode conversions of non-ASCII text do not fail.
 reload(sys)
 sys.setdefaultencoding('utf8')

 class BtxxxxSpider(CrawlSpider):
     """Crawl the search results for one keyword and yield one item per detail page.

     Run as ``scrapy crawl btxxxx -a key_word=<keyword>``. The keyword is
     URL-quoted and used to build the single start URL.
     """

     name = 'btxxxx'
     allowed_domains = ['btxxxx.info']

     # Item fields filled, in this order, from the text nodes of the last
     # table row of a detail page.
     _DETAIL_FIELDS = ('file_type', 'file_createtime', 'file_hot', 'file_size')

     # CrawlSpider hands a link to the first rule whose extractor matches it.
     # The narrower pattern therefore has to precede the catch-all search
     # pattern (which also matches every URL the narrower one matches);
     # otherwise `content_url` could never be selected.
     # NOTE(review): the commas inside [a-z,A-Z,0-9] are matched literally.
     rules = (
         Rule(
             LinkExtractor(allow=r'\/search\/b-[a-z,A-Z,0-9]+\/[0-9]+-[0-9]+\.html'),
             callback='content_url',
             follow=True,
         ),
         Rule(
             LinkExtractor(allow=r'\/search\/b-[\s\S]*\.html'),
             callback='root_url',
             follow=True,
         ),
         Rule(
             LinkExtractor(allow=r'\/wiki\/.*\.html'),
             callback='parse_item',
             follow=False,
         ),
     )

     def __init__(self, key_word='', *args, **kwargs):
         """Store the search keyword and derive the start URL from it.

         Args:
             key_word: Search term, normally supplied with ``-a key_word=...``.
             *args, **kwargs: Forwarded unchanged to ``CrawlSpider.__init__``.
         """
         super(BtxxxxSpider, self).__init__(*args, **kwargs)
         self.key_words = key_word
         # The real site address is deliberately left out.
         self.start_urls = [
             'http://www.xxxx.info/search/' + urllib.quote(key_word) + '.html'
         ]

     def root_url(self, response):
         """Callback for search pages; nothing is scraped, links are only followed."""

     def content_url(self, response):
         """Callback for the narrower search-page pattern; links are only followed."""

     @staticmethod
     def _decode_file_name(script_text):
         """Return the URL-unquoted file name embedded in the inline script text.

         The JavaScript wrapper ``document.write(decodeURIComponent(...));``,
         the double quotes and every ``+`` are stripped before unquoting.
         Presumably the page splits the encoded name into ``+``-joined string
         pieces -- confirm against a live page.
         """
         encoded = (
             script_text
             .replace('document.write(decodeURIComponent(', '')
             .replace('));', '')
             .replace('"', '')
             .replace('+', '')
         )
         return urllib.unquote(str(encoded))

     def parse_item(self, response):
         """Build one BtxxxxItem from a detail page and yield it.

         Fields whose source node is absent from the page are left unset.
         """
         item = BtxxxxItem()

         script_txt = response.xpath('//*[@id="wall"]/h2/script/text()').extract()
         if script_txt:
             item['file_name'] = self._decode_file_name(script_txt[0])

         file_nodes = response.xpath(
             '//*[@id="wall"]/div/table/tr[last()]/td/text()'
         ).extract()
         self.logger.debug('detail row text nodes: %r', file_nodes)
         # zip() stops at the shorter sequence, so a row with fewer than four
         # text nodes fills what it can instead of raising IndexError.
         for field, text in zip(self._DETAIL_FIELDS, file_nodes):
             item[field] = text.replace('\n', '')

         item['file_url'] = response.url

         file_link = response.xpath(
             '//*[@id="wall"]/div[1]/div[1]/div[2]/a/@href'
         ).extract()
         if file_link:
             item['file_link'] = file_link[0]

         yield item

items的代码

 class BtxxxxItem(scrapy.Item):
     """Fields collected for one detail page by the spider's ``parse_item``."""

     # Identity of the entry: decoded name, page URL and the extracted link.
     file_name = scrapy.Field()
     file_url = scrapy.Field()
     file_link = scrapy.Field()

     # Values taken from the text nodes of the detail table's last row.
     file_type = scrapy.Field()
     file_createtime = scrapy.Field()
     file_hot = scrapy.Field()
     file_size = scrapy.Field()

     # Declared, but not assigned anywhere in the spider's parse_item.
     file_count = scrapy.Field()

settings 中添加

# Encoding for exported feeds such as the `-o xx.json` output, so non-ASCII
# text is written as UTF-8 (see Scrapy's FEED_EXPORT_ENCODING setting).
FEED_EXPORT_ENCODING = 'utf-8'

并启用 DEFAULT_REQUEST_HEADERS

执行scrapy (设置要检索的关键字和输出的文件)

scrapy crawl btxxxx  -a key_word=xx -o xx.json

简单的爬数据而已，包含网址信息的代码我都已经在文章中删掉，只做学习使用

转载请标明出自原文地址

巴特西

step_by_step_用python爬点磁力链接

最新文章

热门文章