1、python爬取贴吧壁纸

1.1、获取整个页面数据

#coding=utf-8
import urllib


def getHtml(url):
    """Open *url* with ``urllib.urlopen`` and return the response body.

    The value returned is whatever ``page.read()`` yields for the whole page.
    """
    page = urllib.urlopen(url)
    html = page.read()
    return html


# Fetch one Tieba thread and dump the raw page source.
html = getHtml("http://tieba.baidu.com/p/2738151262")
# Parenthesised single-argument form prints the same text on Python 2 and 3.
print(html)
1.2、筛选页面中想要的数据

import re
import urllib


def getHtml(url):
    """Open *url* with ``urllib.urlopen`` and return the response body."""
    page = urllib.urlopen(url)
    html = page.read()
    return html


def getImg(html):
    """Return the list of ``.jpg`` addresses found in *html*.

    A match is the value of a ``src="..."`` attribute that ends in ``.jpg``
    and whose closing quote is followed by a space (the pattern requires it).
    """
    reg = r'src="(.+?\.jpg)" '
    imgre = re.compile(reg)
    imglist = re.findall(imgre,html)
    return imglist


html = getHtml("http://tieba.baidu.com/p/2460150866")
print getImg(html)

1.3、将页面筛选的数据保存到本地

#coding=utf-8
import urllib
import re


def getHtml(url):
    """Open *url* with ``urllib.urlopen`` and return the response body."""
    page = urllib.urlopen(url)
    html = page.read()
    return html


def getImg(html):
    """Download every ``.jpg`` image referenced in *html*.

    Addresses are taken from ``src="..."`` attributes ending in ``.jpg`` whose
    closing quote is followed by a space. Each one is saved under a relative
    file name built from the counter ``x`` (``'%s.jpg' % x``).
    """
    reg = r'src="(.+?\.jpg)" '
    imgre = re.compile(reg)
    imglist = re.findall(imgre,html)
    x = 0  # counter that becomes the local file name
    for imgurl in imglist:
        # Relative file name, so the image lands in the current directory.
        urllib.urlretrieve(imgurl,'%s.jpg' % x)
        x+=1


html = getHtml("http://tieba.baidu.com/p/2460150866")
print getImg(html)

抓取昵图网图片 --修改版

#coding=utf-8
import urllib
import re


def getHtml(url):
    """Open *url* with ``urllib.urlopen`` and return the response body."""
    page = urllib.urlopen(url)
    html = page.read()
    return html


def getImg(html):
    """Download every address found in a ``src="..."`` attribute of *html*.

    Unlike a ``.jpg``-only pattern, this one captures any ``src`` value whose
    closing quote is followed by a space. Each download is written to
    ``'D:360\\%s.jpg' % x`` where ``x`` is the running counter.
    """
    reg = r'src="(.*?)" '
    imgre = re.compile(reg)
    imglist = re.findall(imgre,html)
    x = 0  # counter that becomes the local file name
    for imgurl in imglist:
        # NOTE(review): 'D:360\\...' has no backslash after the drive letter,
        # so Windows resolves it relative to the current directory on D: --
        # presumably 'D:\\360\\...' was intended; confirm before changing.
        urllib.urlretrieve(imgurl,'D:360\\%s.jpg' % x)
        x+=1


html = getHtml("http://www.nipic.com/show/17742538.html")
print getImg(html)

解释: %s意思是字符串参数,就是将变量的值传入到字符串里面,字符串后的'%'后就是写要传入的参数。
在你给出的例子中,就是用x的值替代%s。比如说x=5,那么这张图片就会以'5.jpg'为文件名保存到本地。

保存的位置默认为程序的存放目录。

如何保存到指定目录:urllib.urlretrieve(imgurl,'D:360\\%s.jpg' % x)

https://image.baidu.com/search/detail?ct=503316480&z=0&ipn=false&word

2、python抓取价格

前两个不用加 text

# -*- coding: utf-8 -*-
from lxml import etree
import urllib
import urllib.request
# headers would be a dict holding the User-Agent; it is left disabled here.
#headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }
url="http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"
# Download the search result page as bytes.
html = urllib.request.urlopen(url).read()
# Decode to text before handing the page to lxml.
data=html.decode('utf-8')
selector = etree.HTML(data)
#xpath
# Text nodes of every <i> under .../strong.
# NOTE(review): presumably the price elements -- confirm against the page markup.
qiubai_text = selector.xpath('//div/ul/li/div/div/strong/i/text()')
#print(qiubai_text)
for i in qiubai_text:
    print(i)

或者

# -*- coding: utf-8 -*-
from lxml import etree
import urllib
import urllib.request
# headers would be a dict holding the User-Agent; it is left disabled here.
#headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }
url="http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"
# Download the search result page as bytes.
html = urllib.request.urlopen(url).read()
# The raw bytes are passed to lxml directly, with no explicit decode step.
selector = etree.HTML(html)
#xpath
# Text nodes of every <i> under .../strong.
# NOTE(review): presumably the price elements -- confirm against the page markup.
qiubai_text = selector.xpath('//div/ul/li/div/div/strong/i/text()')
#print(qiubai_text)
for i in qiubai_text:
    print(i)

或者:

注意:这个需要加 text,即 html.text

# -*- coding: utf-8 -*-
from lxml import etree
import requests
# headers would be a dict holding the User-Agent; it is left disabled here.
#headers= { 'User-Agent' : 'User-Agent:Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' }
url="http://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&pvid=194960f41c994e81ada43edbc276f54b"
# requests.get returns a response object, not the page text itself.
html = requests.get(url)
# Hence the .text attribute: lxml is given the body as a string.
selector = etree.HTML(html.text)
#xpath
# Text nodes of every <i> under .../strong.
# NOTE(review): presumably the price elements -- confirm against the page markup.
qiubai_text = selector.xpath('//div/ul/li/div/div/strong/i/text()')
#print(qiubai_text)
for i in qiubai_text:
    print(i)

3、python爬取昵图网图片

#coding=utf-8
import urllib
import re


def getHtml(url):
    """Open *url* with ``urllib.urlopen`` and return the response body."""
    page = urllib.urlopen(url)
    html = page.read()
    return html


def getImg(html):
    """Download every address found in a ``src="..."`` attribute of *html*.

    The pattern captures any ``src`` value whose closing quote is followed by
    a space. Each download is written to ``'D:360\\%s.jpg' % x`` where ``x``
    is the running counter.
    """
    reg = r'src="(.*?)" '
    imgre = re.compile(reg)
    imglist = re.findall(imgre,html)
    x = 0  # counter that becomes the local file name
    for imgurl in imglist:
        # NOTE(review): 'D:360\\...' has no backslash after the drive letter,
        # so Windows resolves it relative to the current directory on D: --
        # presumably 'D:\\360\\...' was intended; confirm before changing.
        urllib.urlretrieve(imgurl,'D:360\\%s.jpg' % x)
        x+=1


html = getHtml("http://www.nipic.com/show/17742538.html")
print getImg(html)

4、爬音乐

# coding:utf-8
import urllib
import urllib.request
import re

url="http://www.yy8844.cn/ting/ccceo/ceeivi.shtml"
html = urllib.request.urlopen(url).read()
# The page bytes are decoded as GBK before any regex runs on them.
data=html.decode('GBK')
#print(data)
# Digits following "MusicId=" in the page; used below to build the mp3 URL.
music_id = int(re.findall(r'MusicId=(\d+)',data)[0])
# Page title up to the first "/", with surrounding whitespace removed.
music_name = re.findall(r'<title>(.*?)</title>',data)[0].split('/')[0].strip()
# re.S lets "." match newlines, so a multi-line lyrics <div> is captured whole.
music_word = re.findall(r'<div class="textgeci_show" id="showtext">(.*?)</div>',data,re.S)[0]
article='word'
# Explicit UTF-8: the lyrics are non-ASCII, and the locale default encoding
# may be unable to represent them.
with open("%s.txt" % article,'w',encoding='utf-8') as f:
    f.write(music_word)
#print(music_word)
# Path is <id // 30000>/<id // 2000>/<id>.mp3 under the media host.
quanurl="http://96.ierge.cn/"'%d/%d/%s' % (music_id//30000,music_id//2000,music_id)+".mp3"
#print(quanurl)
# Raw mp3 bytes.
bata=urllib.request.urlopen(quanurl).read()
with open("%s.mp3" % music_name,'wb') as f:
    f.write(bata)

注意问题:

music_word = re.findall(r'<div class="textgeci_show" id="showtext">(.*?)</div>',data,re.S)[0]

python中AttributeError解决

【Python 脚本报错】AttributeError:'module' has no attribute 'xxx'的解决方法
http://blog.csdn.net/cn_wk/article/details/50839159

int库的.pyc文件 python 去掉 .pyc
http://blog.csdn.net/ubuntu64fan/article/details/48241985

python操作对象属性
http://www.jianshu.com/p/c38a81b8cb38

Python学习日记4|python爬虫常见报错小结及解决方法
http://www.jianshu.com/p/17c921639ad0

#coding=utf-8
from Tkinter import *
import tkMessageBox
import urllib
import json
import mp3play
import time
import threading
from pinyin import PinYin
import os
import stat
test = PinYin()
test.load_word()
stop=0


def music():
    """Search for the text in ``entry`` and list the results in ``listbox``.

    Shows a message box and returns when the entry is empty. Otherwise the
    entry text is passed through ``test.hanzi2pinyin_split``, sent to the
    search URL, and each returned song is added to ``listbox`` as
    ``name(artist)``. The global list ``x`` is rebuilt with each song's
    ``audio`` value, in the same order as the listbox rows.
    """
    if not entry.get():
        tkMessageBox.showinfo("温馨提示","搜索内容不能为空")
        return
    name = test.hanzi2pinyin_split(entry.get())
    html=urllib.urlopen("http://s.music.163.com/search/get/?type=1&s=%s&limit=9"%name).read()
    js=json.loads(html)
    global x
    x = []
    # x starts empty on every search, so the listbox must too; otherwise rows
    # left over from an earlier search would index past the end of x.
    listbox.delete(0, END)
    for n, i in enumerate(js['result']['songs']):
        listbox.insert(n,'%s(%s)'%(i['name'],i['artists'][0]['name']))
        x.append(i['audio'])
count = 0
#isplaying = None


def play():
    """Download the track selected in ``listbox`` and play it.

    The selected row's index is used to look up the address in the global
    list ``x``. The file is saved as ``tmp<count>.mp3``, with ``count``
    incremented per played track, and ``var1`` is updated with the
    loading / playing status. Returns without doing anything when no row is
    selected.
    """
    global count
    index=listbox.curselection()
    if not index:
        # No row selected (e.g. a double-click on an empty list), so there is
        # no entry in x to download.
        return
    count += 1
    var1.set(u"正在加载"+listbox.get(index,last=None))
    urllib.urlretrieve(x[index[0]],'tmp%s.mp3'%str(count))
    var1.set(u"正在播放"+listbox.get(index,last=None))
    mp3=mp3play.load("tmp%s.mp3"%str(count))
    mp3.play()
    # NOTE(review): presumably keeps this thread (and the mp3 object) alive
    # for the length of the track -- confirm against mp3play.
    time.sleep(mp3.seconds())


import inspect
import ctypes


def _async_raise(tid, exctype):
    """Ask the interpreter to raise *exctype* in the thread with id *tid*.

    *exctype* may be an exception class or an instance; an instance is
    replaced by its class before the call.

    Raises:
        ValueError: no thread matched *tid*
            (``PyThreadState_SetAsyncExc`` returned 0).
        SystemError: more than one thread state was affected; the request is
            revoked before raising.
    """
    tid = ctypes.c_long(tid)
    if not inspect.isclass(exctype):
        exctype = type(exctype)
    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
    if res == 0:
        raise ValueError("invalid thread id")
    elif res != 1:
        # Passing None clears the pending exception again.
        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
        raise SystemError("PyThreadState_SetAsyncExc failed")


def stop_thread(thread):
    """Request that *thread* exit by raising ``SystemExit`` inside it."""
    _async_raise(thread.ident, SystemExit)
threads=list()
t=None


def excute(event):
    """Listbox double-click handler: stop earlier playback threads, start a new one.

    Every thread still recorded in ``threads`` that is alive is asked to stop.
    The list is then emptied and a fresh daemon thread running ``play`` is
    started and recorded. *event* is the Tk event object and is not used.
    """
    global t
    for worker in threads:
        # A finished thread is no longer known to the interpreter, and
        # stop_thread would raise ValueError for it, aborting this callback
        # before the new track starts.
        if not worker.is_alive():
            continue
        try:
            stop_thread(worker)
        except ValueError:
            # The thread ended between the liveness check and the stop request.
            pass
    # Everything recorded so far is stopped or finished; forget it.
    del threads[:]
    t = threading.Thread(target=play)
    t.daemon = True
    t.start()
    threads.append(t)
# Build the player window: a search entry and button, a result list, and a
# status label, then hand control to Tk.
root = Tk()  # create the window
root.title("云音乐")
root.geometry("500x300+500+200")
entry=Entry(root)  # create the single-line input box, parented to root
entry.pack()
# The button runs the music() search callback.
btn=Button(root,text="搜 索",command=music)
btn.pack()  # all widgets must use the same layout method
var=StringVar()
listbox=Listbox(root,width=50,listvariable=var)
# Double-clicking a row triggers excute.
listbox.bind('<Double-Button-1>',excute)
listbox.pack()
# var1 backs the status label text; play() updates it.
var1=StringVar()
label=Label(root,text="云音乐播放器",fg="purple",textvariable=var1)
var1.set("云音乐播放器")
label.pack()
root.mainloop()  # show the window

最新文章

  1. MySQL 注册码
  2. 终端更新ubuntu系统
  3. TOJ 1191. The Worm Turns
  4. 讨论SEO中是锚文本有效,还是纯文本有效呢?
  5. session和cookie的简单理解
  6. c#新语法学习笔记
  7. JQuery1.11版本对prop和attr接口的含义分离导致问题分析
  8. Atitit org.eclipse.jdt 的ast 架构 Eclipse JDT API spec
  9. Sed替换行和字符shell
  10. Hadoop2.2.0(yarn)编译部署手册
  11. The First
  12. [原创]MLCC全球性缺货分析
  13. 初始化仓库(git init)
  14. MVC和MVP设计模式
  15. 无法启动此程序,因为计算机中丢失VCRUNTIME140.dll 尝试重新安装此程序以解决此问题
  16. Android--UI之Gallery
  17. iptables防火墙常用配置介绍
  18. $Django redis内存数据库 (知识回顾cmd切换目录)
  19. for...in的改进版for...of
  20. Linux之目录结构解析

热门文章

  1. zencart批量插入TEXT文本属性attributes
  2. 【ARC072 E】Alice in linear land
  3. 转 SQL连接查询语句(内、外、交叉和合并查询)
  4. CSS——简写属性(在padding和margin这样的简写属性中,值赋值的顺序是top、right、bottom、left)
  5. 给零基础的小白从0到1的react-naitve电商app——简单易学!
  6. Spring Boot 中初始化资源的几种方式(转)
  7. Python 多线程Ⅱ
  8. 「SNOI2017」礼物
  9. 灰度图像--频域滤波 傅里叶变换之连续信号傅里叶变换(FT)
  10. 2019牛客暑期多校训练营(第三场)F 单调队列