python爬虫小实例

1、python爬取贴吧壁纸

1.1、获取整个页面数据

#coding=utf-8

import urllib

def getHtml(url):
    """Download ``url`` and return the raw response body.

    The object returned by ``urllib.urlopen`` is closed before returning,
    even when ``read()`` raises, so the connection is not left open.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()

# Fetch one Tieba thread and write its raw HTML to stdout.
print(getHtml("http://tieba.baidu.com/p/2738151262"))

复制代码

1.2、筛选页面中想要的数据

import re

import urllib

def getHtml(url):
    """Return the body of the page at ``url``.

    The handle from ``urllib.urlopen`` is always closed, whether or not
    reading succeeds.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()

def getImg(html):
    """Return the .jpg URLs found in ``src="..."`` attributes of ``html``.

    Only attributes whose closing quote is followed by a space are matched,
    because the pattern ends with ``" ``. The result is a list in page order.
    """
    jpg_src = re.compile(r'src="(.+?\.jpg)" ')
    return jpg_src.findall(html)

# Download the thread page and print the list getImg extracts from it.
print(getImg(getHtml("http://tieba.baidu.com/p/2460150866")))

1.3、将页面筛选的数据保存到本地

#coding=utf-8

import urllib

import re

def getHtml(url):
    """Fetch ``url`` and return everything the server sent back.

    The response object is closed in a ``finally`` clause so that a failed
    read does not leave it open.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()

def getImg(html):
    """Download every .jpg referenced by a ``src="..."`` attribute in ``html``.

    Images are saved in the current working directory as ``0.jpg``,
    ``1.jpg``, ... in the order they appear in the page.

    Returns the list of image URLs that were downloaded (empty when the
    page contains no match), so callers that print the result see the URLs.
    """
    imgre = re.compile(r'src="(.+?\.jpg)" ')
    imglist = imgre.findall(html)

    # enumerate supplies the running file number.
    for x, imgurl in enumerate(imglist):
        urllib.urlretrieve(imgurl, '%s.jpg' % x)

    return imglist

# Download the thread page, save its images, and print getImg's return value.
print(getImg(getHtml("http://tieba.baidu.com/p/2460150866")))

抓取昵图网图片 --修改版

#coding=utf-8

import urllib

import re

def getHtml(url):
    """Download ``url`` and return the response body.

    The object from ``urllib.urlopen`` is closed before the function
    returns, including when ``read()`` fails.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()

def getImg(html):
    """Download every URL found in a ``src="..."`` attribute of ``html``.

    The pattern matches any ``src`` value whose closing quote is followed by
    a space, not only images. Each hit is saved as ``<n>.jpg`` using the
    path template below, numbered from 0 in page order.

    Returns the list of URLs that were downloaded (empty when nothing
    matches), so callers that print the result see the URLs.
    """
    imgre = re.compile(r'src="(.*?)" ')
    imglist = imgre.findall(html)

    # NOTE(review): 'D:360\\' has no backslash after the drive letter, so on
    # Windows it is presumably resolved relative to the current directory of
    # drive D; 'D:\\360\\' may have been intended. Confirm before changing.
    for x, imgurl in enumerate(imglist):
        urllib.urlretrieve(imgurl, 'D:360\\%s.jpg' % x)

    return imglist

# Fetch the nipic detail page, download what getImg finds, print its result.
print(getImg(getHtml("http://www.nipic.com/show/17742538.html")))

解释：

%s意思是字符串参数，就是将变量的值传入到字符串里面，字符串后的'%'后就是写要传入的参数。

在上面的例子中，就是用x的值替代%s。比如说x=5，那么'%s.jpg' % x的结果就是'5.jpg'，也就是这张图片会以'5.jpg'为文件名保存到本地。

保存的位置默认为程序的存放目录

如何保存到指定目录：urllib.urlretrieve(imgurl,'D:360\\%s.jpg' % x)

https://image.baidu.com/search/detail?ct=503316480&z=0&ipn=false&word

2、python抓取价格

前两种写法（urllib）直接把读取到的页面内容传给 etree.HTML，不用加 .text；第三种写法（requests）需要使用 html.text。

# -*- coding: utf-8 -*-

from lxml import etree

import urllib

import urllib.request

#headers构造一个字典，里面保存了user-agent

#headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }

# JD search-results page to scrape.
url = "http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"

# Read the body inside a with-block so the HTTP response is closed afterwards.
with urllib.request.urlopen(url) as response:
    html = response.read()

# Decode the bytes explicitly before handing the text to lxml.
data = html.decode('utf-8')
selector = etree.HTML(data)

# Text nodes of <i> inside <strong> within the result list items; this
# section of the article uses them as the prices.
price_texts = selector.xpath('//div/ul/li/div/div/strong/i/text()')

for price in price_texts:
    print(price)

或者

# -*- coding: utf-8 -*-

from lxml import etree

import urllib

import urllib.request

#headers构造一个字典，里面保存了user-agent

#headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }

# JD search-results page to scrape.
url = "http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"

# Read the body inside a with-block so the HTTP response is closed afterwards.
with urllib.request.urlopen(url) as response:
    html = response.read()

# The raw bytes are passed to lxml as-is (no explicit decode in this variant).
selector = etree.HTML(html)

# Text nodes of <i> inside <strong> within the result list items; this
# section of the article uses them as the prices.
price_texts = selector.xpath('//div/ul/li/div/div/strong/i/text()')

for price in price_texts:
    print(price)

或者（注意：使用 requests 时需要加 .text，即把 html.text 传给 etree.HTML）：

# -*- coding: utf-8 -*-

from lxml import etree

import requests

#headers构造一个字典，里面保存了user-agent

#headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }

# JD search-results page to scrape.
url = "http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"

# requests has no default timeout; without one a stalled server would block
# the script forever.
response = requests.get(url, timeout=10)

# .text is the decoded body; etree.HTML is given that string.
selector = etree.HTML(response.text)

# Text nodes of <i> inside <strong> within the result list items; this
# section of the article uses them as the prices.
price_texts = selector.xpath('//div/ul/li/div/div/strong/i/text()')

for price in price_texts:
    print(price)

3、python爬取昵图网图片

#coding=utf-8

import urllib

import re

def getHtml(url):
    """Return the raw content served at ``url``.

    The handle opened by ``urllib.urlopen`` is closed before returning,
    even if reading raises.
    """
    page = urllib.urlopen(url)
    try:
        return page.read()
    finally:
        page.close()

def getImg(html):
    """Save every ``src="..."`` target found in ``html`` to disk.

    Any ``src`` value whose closing quote is followed by a space is matched
    (the pattern does not restrict the file type). Hits are written as
    ``<n>.jpg`` using the path template below, numbered from 0.

    Returns the list of matched URLs in page order; the list is empty when
    the page has no match.
    """
    imgre = re.compile(r'src="(.*?)" ')
    imglist = imgre.findall(html)

    # NOTE(review): 'D:360\\' lacks a backslash after the drive letter, so on
    # Windows it is presumably relative to drive D's current directory;
    # 'D:\\360\\' may have been meant. Confirm before changing.
    for x, imgurl in enumerate(imglist):
        urllib.urlretrieve(imgurl, 'D:360\\%s.jpg' % x)

    return imglist

# Fetch the nipic detail page, run getImg over it, and print what it returns.
print(getImg(getHtml("http://www.nipic.com/show/17742538.html")))

4、爬音乐

# coding:utf-8

import urllib

import urllib.request

import re

def _first_group(pattern, text, flags=0):
    """Return group 1 of the first match of ``pattern`` in ``text``.

    Raises ValueError naming the pattern when nothing matches, instead of
    the bare IndexError that ``re.findall(...)[0]`` would produce.
    """
    match = re.search(pattern, text, flags)
    if match is None:
        raise ValueError("pattern not found in page: %s" % pattern)
    return match.group(1)


url = "http://www.yy8844.cn/ting/ccceo/ceeivi.shtml"

# Fetch the song page; the body is decoded as GBK. The with-block closes the
# HTTP response once it has been read.
with urllib.request.urlopen(url) as response:
    data = response.read().decode('GBK')

# Numeric id that appears in the page as "MusicId=<digits>".
music_id = int(_first_group(r'MusicId=(\d+)', data))

# Song name: the part of <title> before the first '/', whitespace-trimmed.
music_name = _first_group(r'<title>(.*?)</title>', data).split('/')[0].strip()

# Lyrics block; re.S lets '.' span the line breaks inside the <div>.
music_word = _first_group(
    r'<div class="textgeci_show" id="showtext">(.*?)</div>', data, re.S)

article = 'word'

# Write the lyrics as UTF-8 so the result does not depend on the platform's
# default encoding.
with open("%s.txt" % article, 'w', encoding='utf-8') as f:
    f.write(music_word)

# The mp3 lives under two directory levels derived from the id.
quanurl = "http://96.ierge.cn/%d/%d/%s.mp3" % (
    music_id // 30000, music_id // 2000, music_id)

with urllib.request.urlopen(quanurl) as response:
    audio_bytes = response.read()

with open("%s.mp3" % music_name, 'wb') as f:
    f.write(audio_bytes)

注意问题：

music_word = re.findall(r'<div class="textgeci_show" id="showtext">(.*?)</div>',data,re.S)[0]

python中AttributeError解决

【Python 脚本报错】AttributeError:'module' has no attribute 'xxx'的解决方法

http://blog.csdn.net/cn_wk/article/details/50839159

int库的.pyc文件

python 去掉 .pyc

http://blog.csdn.net/ubuntu64fan/article/details/48241985

python操作对象属性

http://www.jianshu.com/p/c38a81b8cb38

Python学习日记4|python爬虫常见报错小结及解决方法

http://www.jianshu.com/p/17c921639ad0

#coding=utf-8

from Tkinter import *

import  tkMessageBox

import urllib

import json

import mp3play

import time

import threading

from pinyin import PinYin

import os

import stat

# Shared PinYin instance; music() calls test.hanzi2pinyin_split() on the
# search text (presumably converting Chinese characters to pinyin).
test = PinYin()

# Must run after construction; presumably loads the converter's word table.
test.load_word()

# NOTE(review): `stop` is not read anywhere in the visible code.
stop=0

def music():
    """Search for the text in the entry box and list the matching songs.

    If the entry is empty a message box is shown and nothing else happens.
    Otherwise the text is converted with ``test.hanzi2pinyin_split``, sent
    to the s.music.163.com search endpoint (limit 9), and every hit is shown
    in ``listbox`` as ``name(artist)``. The audio URLs are stored in the
    global list ``x`` so that listbox row ``i`` corresponds to ``x[i]``.
    """
    global x

    if not entry.get():
        tkMessageBox.showinfo("温馨提示","搜索内容不能为空")
        return

    name = test.hanzi2pinyin_split(entry.get())

    # Close the HTTP response once the JSON body has been read.
    response = urllib.urlopen("http://s.music.163.com/search/get/?type=1&s=%s&limit=9"%name)
    try:
        js = json.loads(response.read())
    finally:
        response.close()

    # Tolerate a reply without 'result'/'songs' (presumably what the service
    # sends when nothing matches; TODO confirm) instead of raising KeyError.
    songs = (js.get('result') or {}).get('songs') or []

    # Remove rows left from an earlier search. Without this the old rows
    # stay below the new ones while x only holds the new URLs, so selecting
    # an old row would index past the end of x.
    listbox.delete(0, END)
    x = []

    for n, i in enumerate(songs):
        listbox.insert(n,'%s(%s)'%(i['name'],i['artists'][0]['name']))
        x.append(i['audio'])

# Incremented by play() on every call and used in the name of the temporary
# mp3 file it downloads.
count = 0

#isplaying = None

def play():
    """Download and play the song currently selected in ``listbox``.

    Intended to run in a worker thread: after starting playback it sleeps
    for ``mp3.seconds()`` so the thread, and the local ``mp3`` object, stay
    alive for that long. Does nothing when no row is selected.

    Side effects: increments the global ``count``, writes
    ``tmp<count>.mp3`` in the current directory, and updates ``var1`` with
    the loading/playing status text.
    """
    global count

    index = listbox.curselection()
    if not index:
        # Nothing selected; index[0] below would raise IndexError.
        return

    # Some Tkinter versions return the selection as strings, so normalise to
    # int before indexing the URL list.
    row = int(index[0])
    title = listbox.get(row)

    count += 1
    filename = 'tmp%s.mp3' % count

    var1.set(u"正在加载"+title)
    urllib.urlretrieve(x[row], filename)

    var1.set(u"正在播放"+title)
    mp3 = mp3play.load(filename)
    mp3.play()

    time.sleep(mp3.seconds())

import inspect

import ctypes

def _async_raise(tid, exctype):

    """raises the exception, performs cleanup if needed"""

    tid = ctypes.c_long(tid)

    if not inspect.isclass(exctype):

        exctype = type(exctype)

    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))

    if res == 0:

        raise ValueError("invalid thread id")

    elif res != 1:

        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)

        raise SystemError("PyThreadState_SetAsyncExc failed")

def stop_thread(thread):
    """Request that ``thread`` terminate by raising SystemExit inside it."""
    ident = thread.ident
    _async_raise(ident, SystemExit)

# Playback threads started by the double-click handler, plus the newest one.
threads = []
t = None

def excute(event):
    """Double-click handler: stop earlier playback threads and start a new one.

    ``event`` is the Tk event object and is not used. The new daemon thread
    runs ``play`` and is stored in the global ``t`` and appended to
    ``threads``.
    """
    global t

    for worker in threads:
        # Only signal threads that are still running: stop_thread raises
        # ValueError("invalid thread id") for a finished thread, which would
        # abort this handler before the new track is started.
        if worker.is_alive():
            try:
                stop_thread(worker)
            except ValueError:
                # The thread ended between the check and the call.
                pass

    # Every earlier thread has either finished or been signalled, so the
    # list no longer needs to remember them.
    del threads[:]

    t = threading.Thread(target=play)
    t.daemon = True
    t.start()
    threads.append(t)

# Build the player window. Widgets are packed in creation order, so the
# statements below are order-dependent.
root = Tk()  # create the main window

root.title("云音乐")

root.geometry("500x300+500+200")

entry=Entry(root)  # single-line input box, parented to root

entry.pack()

# The search button runs music() when clicked.
btn=Button(root,text="搜 索",command=music)

btn.pack()  # the same layout manager must be used for all widgets

var=StringVar()

# Result list; var is attached as its listvariable.
listbox=Listbox(root,width=50,listvariable=var)

# Double-clicking a row calls excute(event).
listbox.bind('<Double-Button-1>',excute)

listbox.pack()

# Status text shown by the label below; play() updates it.
var1=StringVar()

# NOTE(review): both text and textvariable are passed; presumably the
# textvariable value is what gets displayed. Confirm.
label=Label(root,text="云音乐播放器",fg="purple",textvariable=var1)

var1.set("云音乐播放器")

label.pack()

root.mainloop()  # run the Tk event loop, which shows the window
巴特西

python爬虫小实例

最新文章

热门文章