python抓取不得姐动图(报错 urllib.error.HTTPError: HTTP Error 403: Forbidden)
2024-08-29 00:47:47
抓取不得姐动图(报错)
# -*- coding:utf-8 -*-
#__author__ :kusy
#__content__:文件说明
#__date__:2018/7/23 17:01
import urllib.request
import re


def getHtml(url):
    """Return the raw bytes served at ``url`` using a bare ``urlopen`` call."""
    # No headers are attached to the request; the traceback that follows
    # shows the site answering such a request with HTTP 403.
    response = urllib.request.urlopen(url)
    body = response.read()
    return body


def getImg(reg, savePath):
    """Create a closure that downloads every image URL ``reg`` finds in a page.

    The returned function expects the page as UTF-8 bytes and stores each hit
    as ``savePath + '<n>.gif'``; ``n`` keeps increasing across calls.
    """
    iCnt = 0

    def giveImg(html):
        nonlocal iCnt
        pattern = re.compile(reg)
        for imgurl in pattern.findall(html.decode('utf-8')):
            target = savePath + '%s.gif' % iCnt
            urllib.request.urlretrieve(imgurl, target)
            iCnt += 1

    return giveImg


# Alternative target kept from the original notes (Sogou pictures):
# html = getHtml("http://pic.sogou.com/")
# reg = r'"image":"(.+?)"'
reg = r'data-original="(.+?\.gif)"'
savePath = 'image/gif/'
g = getImg(reg, savePath)
for i in range(10):
    if i > 1:
        page_url = "http://www.budejie.com/" + str(i)
        print(page_url)
    else:
        page_url = "http://www.budejie.com/"
    html = getHtml(page_url)
    g(html)
报错如下
E:\kusy\python\venv\Scripts\python.exe E:/kusy/python/getJpg.py
http://www.budejie.com/2
Traceback (most recent call last):
File "E:/kusy/python/getJpg.py", line 35, in <module>
html = getHtml("http://www.budejie.com/" + str(i))
File "E:/kusy/python/getJpg.py", line 9, in getHtml
page = urllib.request.urlopen(url)
File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 532, in open
response = meth(req, response)
File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 570, in error
return self._call_chain(*args)
File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 504, in _call_chain
result = func(*args)
File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 650, in http_error_default
raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 403: Forbidden

Process finished with exit code 1
百度了一下,问题已解决(在请求中加上 User-Agent 头信息,伪装成浏览器访问):
# -*- coding:utf-8 -*-
#__author__ :kusy
#__content__:文件说明
#__date__:2018/7/23 17:01
import os
import re
import urllib.request


def getHtml(url):
    """Download ``url`` while presenting a browser User-Agent.

    Args:
        url: Address of the page to fetch.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` included).
    """
    # Without this header the request fails with
    # "urllib.error.HTTPError: HTTP Error 403: Forbidden": the site blocks
    # crawlers, so the request pretends to come from a browser. The exact
    # value can be looked up with a browser tool such as Firefox's FireBug.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
    req = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the connection as soon as the body is read.
    with urllib.request.urlopen(req) as page:
        return page.read()


def getImg(reg, savePath):
    """Build a downloader that saves every image URL matched by ``reg``.

    Args:
        reg: Regular expression whose match (or single capture group) is the
            image URL to download.
        savePath: Prefix placed in front of ``'<n>.gif'`` for each saved
            file. Its directory part is created when it does not exist.

    Returns:
        A function that takes a page body as UTF-8 ``bytes`` and downloads
        each matched URL. The file counter continues across calls.
    """
    imgre = re.compile(reg)  # compiled once instead of once per page
    target_dir = os.path.dirname(savePath)
    if target_dir:
        # urlretrieve does not create missing directories itself.
        os.makedirs(target_dir, exist_ok=True)
    iCnt = 0

    def giveImg(html):
        nonlocal iCnt
        for imgurl in imgre.findall(html.decode('utf-8')):
            urllib.request.urlretrieve(imgurl, savePath + '%s.gif' % iCnt)
            iCnt += 1

    return giveImg


# Alternative target kept from the original notes (Sogou pictures):
# html = getHtml("http://pic.sogou.com/")
# reg = r'"image":"(.+?)"'
reg = r'data-original="(.+?\.gif)"'
savePath = 'image/gif/'

if __name__ == '__main__':
    g = getImg(reg, savePath)
    # Page 1 is the site root and later pages live at /<n>. Starting at 1
    # avoids fetching the root twice and saving its images in duplicate.
    for i in range(1, 10):
        if i > 1:
            print("http://www.budejie.com/" + str(i))
            html = getHtml("http://www.budejie.com/" + str(i))
        else:
            html = getHtml("http://www.budejie.com/")
        g(html)
下载成功
最新文章
- Excel转Html
- Android之assets资源
- ASP.Net MVC开发基础学习笔记(5):区域、模板页与WebAPI初步
- hdf第一周完了,突然时间静止.,醒了就早点去公司上班,再努力一点
- android 组合控件接收不到点击事件的问题
- cocos2d-x入门笔记(1)
- [BZOJ 2594] [Wc2006]水管局长数据加强版 【LCT】
- JavaWeb之cookie
- [php] in_array 判断问题(坑)
- 接触JS的变量
- JS中函数常见的表现形式以及立即执行函数
- 受到 1 万点暴击,二狗子被 DDoS 攻击的惨痛经历
- Javascript arguments.callee和caller的区别
- Akka详细介绍
- c#的一些快捷键
- vue.js学习:1.0到2.0的变化(区别)
- 前端基础开发之HTML
- L313 珊瑚裸鼠灭绝
- JQuery常用和很有用处的方法
- linux开机、重启和用户登陆注销