python抓取不得姐动图（报错 urllib.error.HTTPError: HTTP Error 403: Forbidden）

抓取不得姐动图（报错）

# -*- coding:utf-8 -*-

#__author__ :kusy

#__content__:文件说明

#__date__:2018/7/23 17:01

import urllib.request

import re

def getHtml(url):
    """Download *url* and return the undecoded response body.

    The request is issued through ``urllib.request.urlopen`` with the
    library's default settings; no custom headers are attached.

    :param url: address of the page to fetch.
    :return: the response payload as ``bytes``.
    """
    page = urllib.request.urlopen(url)
    return page.read()

def getImg(reg,savePath):
    """Create a downloader that saves every image link found in a page.

    :param reg: regular expression whose matches are the image URLs.
    :param savePath: prefix prepended to each generated file name
        (``0.gif``, ``1.gif``, ...).
    :return: a function taking the page body as ``bytes``; it decodes the
        body as UTF-8, downloads each match and returns ``None``. The file
        counter is shared by all calls to the returned function.
    """
    saved = 0

    def giveImg(html):
        nonlocal saved
        pattern = re.compile(reg)
        for link in pattern.findall(html.decode('utf-8')):
            # The running counter keeps names unique across pages.
            urllib.request.urlretrieve(link, savePath + '%s.gif' % saved)
            saved += 1

    return giveImg

# Earlier experiment against Sogou pictures, kept for reference:
# html = getHtml("http://pic.sogou.com/")
# reg = r'"image":"(.+?)"'  #sougou

# Capture data-original attribute values that end in .gif.
reg = r'data-original="(.+?\.gif)"'
savePath = 'image/gif/'
g = getImg(reg,savePath)

# i == 0 and i == 1 both request the site root; from 2 upward the number
# is appended to the URL and the address is printed before fetching.
for i in range(10):
    if i <= 1:
        html = getHtml("http://www.budejie.com/")
    else:
        print("http://www.budejie.com/" + str(i))
        html = getHtml("http://www.budejie.com/" + str(i))
    g(html)

报错如下

E:\kusy\python\venv\Scripts\python.exe E:/kusy/python/getJpg.py

http://www.budejie.com/2

Traceback (most recent call last):

  File "E:/kusy/python/getJpg.py", line 35, in <module>

    html = getHtml("http://www.budejie.com/" + str(i))

  File "E:/kusy/python/getJpg.py", line 9, in getHtml

    page = urllib.request.urlopen(url)

  File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 223, in urlopen

    return opener.open(url, data, timeout)

  File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 532, in open

    response = meth(req, response)

  File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 642, in http_response

    'http', request, response, code, msg, hdrs)

  File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 570, in error

    return self._call_chain(*args)

  File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 504, in _call_chain

    result = func(*args)

  File "C:\Users\jingjing\AppData\Local\Programs\Python\Python36\lib\urllib\request.py", line 650, in http_error_default

    raise HTTPError(req.full_url, code, msg, hdrs, fp)

urllib.error.HTTPError: HTTP Error 403: Forbidden

Process finished with exit code 1

百度了一下，问题已解决（在请求中加上 User-Agent 头信息，伪装成浏览器访问）：

# -*- coding:utf-8 -*-

#__author__ :kusy

#__content__:文件说明

#__date__:2018/7/23 17:01

import urllib.request

import re

def getHtml(url, headers=None):
    """Download *url* while presenting a browser User-Agent and return the body.

    Without the User-Agent header the site rejects the request with
    ``urllib.error.HTTPError: HTTP Error 403: Forbidden`` because it blocks
    crawlers. A suitable header value can be copied from a browser's
    developer tools (the original author used Firefox's FireBug plug-in).

    :param url: address of the page to fetch.
    :param headers: optional mapping of request headers; when omitted a
        Firefox User-Agent is sent.
    :return: the undecoded response payload as ``bytes``.
    :raises urllib.error.HTTPError: propagated from ``urlopen`` when the
        server answers with an error status.
    """
    if headers is None:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}

    req = urllib.request.Request(url=url, headers=headers)

    # The context manager closes the response even if read() raises,
    # so repeated page fetches do not leak open connections.
    with urllib.request.urlopen(req) as page:
        return page.read()

def getImg(reg,savePath):
    """Create a downloader that saves every image link found in a page.

    :param reg: regular expression whose matches are the image URLs.
    :param savePath: prefix prepended to each generated file name
        (``0.gif``, ``1.gif``, ...). Its directory part is created on
        demand if it does not exist yet.
    :return: a function taking the page body as ``bytes``; it decodes the
        body as UTF-8, downloads each match and returns ``None``. The file
        counter is shared by all calls to the returned function.
    :raises re.error: if *reg* is not a valid regular expression.
    """
    # Local import so this function does not depend on the file's import list.
    import os

    # Compile once here rather than on every page.
    imgre = re.compile(reg)
    target_dir = os.path.dirname(savePath)
    iCnt = 0

    def giveImg(html):
        nonlocal iCnt

        imglist = imgre.findall(html.decode('utf-8'))

        # urlretrieve cannot create missing directories; make sure the
        # destination exists before the first file is written.
        if imglist and target_dir:
            os.makedirs(target_dir, exist_ok=True)

        for imgurl in imglist:
            urllib.request.urlretrieve(imgurl, savePath + '%s.gif' % iCnt)
            iCnt += 1

    return giveImg

# Earlier experiment against Sogou pictures, kept for reference:
# html = getHtml("http://pic.sogou.com/")
# reg = r'"image":"(.+?)"'  #sougou

# Capture data-original attribute values that end in .gif.
reg = r'data-original="(.+?\.gif)"'
savePath = 'image/gif/'
g = getImg(reg,savePath)

# Page 1 is the site root; pages 2-9 are addressed as /<n>. Starting at 1
# avoids requesting the root twice, which previously saved every image of
# the first page a second time.
for i in range(1, 10):
    url = "http://www.budejie.com/" + (str(i) if i > 1 else "")
    print(url)
    html = getHtml(url)
    g(html)

下载成功

巴特西

python抓取不得姐动图（报错 urllib.error.HTTPError: HTTP Error 403: Forbidden）

最新文章

热门文章