Python爬虫之路——简单的网页抓图

转载自我自己的博客:http://www.mylonly.com/archives/1401.html

用Python的urllib2库和HTMLParser库写了一个简单的抓图脚本。主要抓的是http://desk.zol.com.cn/meinv/这个链接下的图片：先得到每个图集的起始URL地址，得到第一张图片，然后不断地去获取下一张图片的URL，继而得到首页全部图集的图片。

整个源代码如下，比较简单，写这个仅仅是简单的练手而已。

#coding: utf-8 #############################################################

# File Name: girls.py

# Author: mylonly

# mail: mylonly@gmail.com

# Created Time: Mon 09 Jun 2014 09:23:18 PM CST

#########################################################################

#!/usr/bin/python

import urllib2,HTMLParser,re

# Root URL of the site; the parser prepends it to the links it extracts.
host = 'http://desk.zol.com.cn'

# Local path prefix under which downloaded images are saved.
localSavePath = "/data/girls/"

# URL of the starting image page; empty until the index parser sets it.
startHtmlUrl = ""

# htmlUrlList: image-page URLs; imageUrlList: image URLs.
# NOTE(review): neither list is filled by the code visible here - confirm use.
htmlUrlList, imageUrlList = [], []

#依据得到的图片路径URL将图片下载下来保存本地

def downloadImage(url):

	cont = urllib2.urlopen(url).read()

	patter = '[0-9]*\.jpg';

	match = re.search(patter,url);

	if match:

		print '正在下载文件：',match.group()

		filename = localSavePath+match.group()

		f = open(filename,'w+')

		f.write(cont)

		f.close()

	else:

		print 'no match'

#依据首页得到的图片集遍历每一个图片集

def getImageUrlByHtmlUrl(htmlUrl):

	parser = MyHtmlParse(False)

	request = urllib2.Request(htmlUrl)

	try:

		response = urllib2.urlopen(request)

		content = response.read()

		parser.feed(content)

	except urllib2.URLError,e:

		print e.reason

class MyHtmlParse(HTMLParser.HTMLParser):

	def __init__(self,isIndex):

		self.isIndex = isIndex;

		HTMLParser.HTMLParser.__init__(self)

	def handle_starttag(self,tag,attrs):

		if(self.isIndex):

			if(tag == 'a'):

				if(len(attrs) == 4):

					if(attrs[0] ==('class','pic')):

						newUrl = host+attrs[1][1]

						print '找到一处图片的网页链接:',newUrl

						global startHtml

						startHtmlUrl = newUrl

						getImageUrlByHtmlUrl(newUrl)

		else:

			if(tag == 'img'):

				if(attrs[0] == ('id','bigImg')):

						imageUrl = attrs[1][1]

						print '找到一张图片:',imageUrl

						downloadImage(imageUrl)

						#imageUrlList.append(imageUrl)

			if (tag == 'a'):

				if (len(attrs) == 4):

					if (attrs[1] == ('class','next')):

						nextUrl = host + attrs[2][1]

						print '找到一处图片的网页链接:',nextUrl

						global startHtmlUrl

						if (startHtmlUrl != nextUrl):

							getImageUrlByHtmlUrl(nextUrl)

#分析首页得到每一个图片集的链接

indexUrl = 'http://desk.zol.com.cn/meinv/'

# Fetch the index page.
m = urllib2.urlopen(indexUrl).read()

# True selects index mode; feeding the page starts the crawl from the
# parser's start-tag callback.
parserIndex = MyHtmlParse(True)

# The closing parenthesis was missing, which made the script unparsable.
parserIndex.feed(m)

巴特西

Python爬虫之路——简单的网页抓图

最新文章

热门文章