scrapy--doutu
2024-08-24 22:44:31
年轻人都爱斗图,可是有时候手头的图片数量比较少,就想办法收藏其他人的图片。如果能直接从 doutula 网页里爬取图片,是一件很棒的事。看别人写的爬斗图的爬虫程序有点麻烦,于是自己也来动动手,简单、实用,给大家分享一下。先给大家看看成果,喜欢的话,就开始吧!
1.doutu.py
# -*- coding: utf-8 -*-
import scrapy
from doutu.items import DoutuItem
from scrapy.linkextractors import LinkExtractor
import pdb


class DoutuSpider(scrapy.Spider):
    """Spider that walks doutula.com listing pages and emits one item per image page.

    Crawl flow, as implemented below:
      * ``parse``       - listing page: follows a few links from the main column
                          and every pagination link (which re-enters ``parse``).
      * ``parse_pager`` - follows every link found inside ``div.pic-content``.
      * ``parse_img``   - extracts the first image URL and yields a ``DoutuItem``.
    """

    name = 'Doutu'
    # allowed_domains = ['www.doutula.com']
    start_urls = ['http://www.doutula.com/']

    def parse(self, response):
        """Follow content links and pagination links found on a listing page.

        Only the 2nd to 4th links extracted from ``div.col-sm-9`` are followed
        (``links[1:4]``); every link inside ``ul.pagination`` is scheduled with
        this same callback.
        """
        le = LinkExtractor(restrict_css='div.col-sm-9')
        links = le.extract_links(response)
        for link in links[1:4]:
            yield scrapy.Request(link.url, callback=self.parse_pager)

        le1 = LinkExtractor(restrict_css='ul.pagination')
        links1 = le1.extract_links(response)
        for link1 in links1:
            yield scrapy.Request(link1.url, callback=self.parse)

    def parse_pager(self, response):
        """Schedule a ``parse_img`` request for every link in ``div.pic-content``."""
        le2 = LinkExtractor(restrict_css='div.pic-content')
        links2 = le2.extract_links(response)
        for link2 in links2:
            yield scrapy.Request(link2.url, callback=self.parse_img)

    def parse_img(self, response):
        """Yield a ``DoutuItem`` holding the first image URL found on the page.

        ``image_urls`` is stored as a single URL string (not a list), which is
        the shape the original code assigned to this field.
        """
        # extract_first() returns None instead of raising IndexError when the
        # page contains no matching <img>.
        src = response.xpath(
            '//div[@class="swiper-slide"]//img/@src'
        ).extract_first()
        if not src:
            self.logger.warning('No image found on %s', response.url)
            return

        doutu = DoutuItem()
        # urljoin() leaves absolute URLs untouched and resolves relative or
        # protocol-relative ones against the page URL.
        doutu['image_urls'] = response.urljoin(src)
        # The item must be yielded, otherwise no pipeline ever receives it.
        yield doutu
2.items.py
import scrapy


class DoutuItem(scrapy.Item):
    """Item with the two fields used by the image download flow."""

    # Source URL(s) of the image to download; filled in by the spider.
    image_urls = scrapy.Field()

    # Declared for download results; nothing in this listing writes to it.
    images = scrapy.Field()
3.pipelines.py
import pdb
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class DoutuPipeline(ImagesPipeline):
    """Image pipeline that downloads the single URL stored in ``item['image_urls']``."""

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image.

        ``item['image_urls']`` is passed straight to ``scrapy.Request``, so it
        is expected to be one URL string rather than a list.
        """
        yield scrapy.Request(item['image_urls'])

    def item_completed(self, results, item, info):
        """Return the item if at least one download succeeded, else drop it.

        Each entry of ``results`` is unpacked as ``(ok, x)``; ``x['path']`` is
        read only for entries whose ``ok`` flag is true.

        Raises:
            DropItem: when no entry in ``results`` reports success.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            # DropItem must be imported (see top of this listing); without the
            # import this branch raised NameError instead of dropping the item.
            raise DropItem("Item contains no images")
        return item
4.settings.py
import random

# Directory where downloaded images are stored.
IMAGES_STORE = r'C:\Desktop\doutula'

# Pool of browser User-Agent strings to choose from.
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# USER_AGENT has to be a single string; the original assigned the whole
# collection of strings to it. One entry is picked when the settings module is
# loaded, so the value is fixed for the lifetime of the process.
# NOTE(review): rotating the User-Agent per request would need a downloader
# middleware, which is not part of this listing.
USER_AGENT = random.choice(USER_AGENT_LIST)

# Maximum number of concurrent requests.
CONCURRENT_REQUESTS = 16

# Delay, in seconds, between consecutive requests to the same site.
DOWNLOAD_DELAY = 0.2

# Thumbnail sizes; defining this setting makes the images pipeline generate
# a thumbnail of each listed size for every downloaded image.
IMAGES_THUMBS = {
    'small': (50, 50),
    'big': (200, 200),
}

# Do not obey robots.txt.
ROBOTSTXT_OBEY = False

# Disable cookies.
COOKIES_ENABLED = False

ITEM_PIPELINES = {
    # Order value of the pipeline (the original note gives the range 1-1000).
    'doutu.pipelines.DoutuPipeline': 1,
}
遇到问题的小伙伴,欢迎在下面留言。
最新文章
- COGS1008. 贪婪大陆[树状数组 模型转换]
- spark应用程序常见问题整理
- 【转】全面解析Unity3D自动生成的脚本工程文件
- 加载.properties方式
- EF CRUD 操作
- hadoop测试环境主配置简例
- 单机c/s软件如何让老板在异地看销售营业报表
- Fedora14下首次搭建Samba服务器遇到的一些问题
- HttpClient连接池抛出大量ConnectionPoolTimeoutException: Timeout waiting for connection异常排查
- JDK1.7 安装加(一劳永逸的环境配置)
- iOS开发-OC数据类型
- 学习Vue.js之vue移动端框架到底哪家强
- 基于weex的app开发脚手架weexplus学习笔记
- redis 在 php 中的应用(string篇)
- Java基础_0206:方法的定义与使用
- 如何移除HTML5 input在type="number"时的上下小箭头
- video formats vs file formats
- 利用三层判断sql数据库中编码是否已经存在(个人拙作,不喜勿喷)
- javascript中闭包最简单的简绍
- Python学习---重点模块之subprocess