我的第一个 Python 爬虫程序

该程序用于爬取糗事百科上的图片，设有超时功能，并具备异常处理能力。
下面直接上源码：
#-*-coding:utf-8-*-

'''

Created on 2016年10月20日

@author: audi

'''

import urllib2

import re

from bs4 import BeautifulSoup

import sys

reload(sys)

sys.setdefaultencoding('utf-8')

count = 0

path = "pic/tupian"

headers = {

           'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'

        }

for x in range(1,10):

    temp_url = "http://www.qiushibaike.com/imgrank/page/%d"%x

    req = urllib2.Request(

            url = temp_url,

            headers = headers

            )

    try:

        data = urllib2.urlopen(req,timeout=10).read()

    except:

        print "打开页面链接超时！！！！"

        continue

    else:

        print "打开页面成功，开始解析数据。。"

        soup=BeautifulSoup(data,'html.parser',from_encoding='utf-8')

#         图片链接的div标签格式

#         <div class="thumb">

#             <a href="/article/117795261" target="_blank">

#                 <img src="http://pic.qiushibaike.com/system/pictures/11779/117795261/medium/app117795261.jpg" alt="我想变成妈妈的眼睛，这样我就可以一直粘在妈妈的身上">

#             </a>

#         </div>

#         查询所有图片所在的div标签内容

        content = soup.find_all('div',class_ = 'thumb')   #(jpg|JPG|jpeg)

#         links集合存放最终的图片的链接

        links = set()

#         再次过滤以获得图片的链接

        for i in content:

            temp_link = i.find_all('a',href=re.compile(r"/article/\d"))

            temp_linnk = temp_link[0].find('img',src=re.compile(r"\.(jpg|JPG|jpeg)"))

            temp_linnk = temp_linnk['src']

            links.add(temp_linnk)

        for link in links:

            try:

                picData = urllib2.urlopen(link,timeout=3).read()

            except:

                print "当前子链接打开失败。。"

                continue

            else:

                file_name = path + str(count) + '.jpg'

                count +=1

                f = file(file_name,"wb")

                f.write(picData)

                f.close()

                print "爬取第" + str(count) + "个链接" + link

print "恭喜你，爬取图片结束！！！！！！！！！！！！"
巴特西

我的第一个python爬虫程序

最新文章

热门文章