用 Python 爬取校花网
2024-10-17 13:02:01
import requests
import re
import hashlib
import time


def get_index(url):
    """Fetch a listing page and return its HTML text.

    Returns None when the server answers with anything other than 200.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None


def parse_index(res):
    """Extract the detail-page links from a listing page's HTML.

    ``res`` may be None or empty, because get_index returns None on a
    non-200 reply. An empty list is returned in that case so callers can
    iterate the result unconditionally.
    """
    if not res:
        return []
    return re.findall(r'class="items".*?href="(.*?)"', res, re.S)


def get_detail(urls):
    """Visit each detail page in ``urls`` and download its first video.

    Links that do not start with ``http`` are treated as site-relative and
    prefixed with the site root. Pages that do not answer 200, or that
    contain no ``id="media"`` source, are skipped.
    """
    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com%s' % url
        r1 = requests.get(url)
        if r1.status_code != 200:
            continue
        url_list = re.findall(r'id="media".*?src="(.*?)"', r1.text, re.S)
        if url_list:
            # Only the first match on the page is downloaded.
            save(url_list[0])


def save(url):
    """Download ``url`` and write the response body to an .mp4 file.

    The file name is the MD5 hex digest of the URL plus the current
    timestamp, so repeated downloads of one URL get distinct names.
    Nothing is written when the server does not answer 200.
    """
    print('Download:%s' % url)
    r2 = requests.get(url)
    if r2.status_code == 200:
        m = hashlib.md5()
        m.update(url.encode('utf-8'))
        m.update(str(time.time()).encode('utf-8'))
        filename = '%s.mp4' % m.hexdigest()
        # NOTE(review): hard-coded Windows target directory; open() fails if
        # it does not already exist -- confirm before running elsewhere.
        file_path = r'D:\\爬虫视频\%s' % filename
        with open(file_path, 'wb') as f:
            f.write(r2.content)


def main():
    """Crawl listing pages 0-4 one after another and download their videos."""
    for i in range(5):
        res1 = get_index('http://www.xiaohuar.com/list-3-%s.html' % i)
        res2 = parse_index(res1)
        get_detail(res2)


if __name__ == '__main__':
    main()
基于上面的代码,使用多线程爬取视频,优化下载速度
# Asynchronous, multi-threaded version to speed up downloads
import requests
import re
import hashlib,time
from concurrent.futures import ThreadPoolExecutor, as_completed

# Shared worker pool used for both the listing-page fetches and the
# per-video detail/download tasks.
p = ThreadPoolExecutor(30)


def get_index(url):
    """Fetch a listing page and return its HTML text.

    Returns None when the server answers with anything other than 200.
    """
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    return None


def parse_index(res):
    """Schedule one detail task per link found on a fetched listing page.

    ``res`` is the Future returned by submitting get_index to the pool.
    A fetch that raised, or that produced no HTML (get_index returns None
    on a non-200 reply), is skipped instead of raising here.
    """
    error = res.exception()
    if error is not None:
        print('Index fetch failed:%s' % error)
        return
    html = res.result()
    if not html:
        return
    for url in re.findall(r'class="items".*?href="(.*?)"', html, re.S):
        p.submit(get_detail, url)


def get_detail(urls):
    """Visit each detail page in ``urls`` and download its first video.

    ``urls`` may be a single URL string or an iterable of URL strings.
    parse_index submits one URL per task; without the normalisation below
    a lone string would be iterated character by character.
    """
    if isinstance(urls, str):
        urls = [urls]
    for url in urls:
        if not url.startswith('http'):
            # Site-relative link: prefix the site root.
            url = 'http://www.xiaohuar.com%s' % url
        r1 = requests.get(url)
        if r1.status_code != 200:
            continue
        url_list = re.findall(r'id="media".*?src="(.*?)"', r1.text, re.S)
        if url_list:
            # Only the first match on the page is downloaded.
            save(url_list[0])


def save(url):
    """Download ``url`` and write the response body to an .mp4 file.

    The file name is the MD5 hex digest of the URL plus the current
    timestamp, so repeated downloads of one URL get distinct names.
    Nothing is written when the server does not answer 200.
    """
    print('Download:%s' % url)
    r2 = requests.get(url)
    if r2.status_code == 200:
        m = hashlib.md5()
        m.update(url.encode('utf-8'))
        m.update(str(time.time()).encode('utf-8'))
        filename = '%s.mp4' % m.hexdigest()
        # NOTE(review): hard-coded Windows target directory; open() fails if
        # it does not already exist -- confirm before running elsewhere.
        file_path = r'D:\\爬虫视频\%s' % filename
        with open(file_path, 'wb') as f:
            f.write(r2.content)


def main():
    """Fetch listing pages 0-4 in parallel and download every video found.

    Each listing page is parsed in this thread as soon as its fetch
    completes. Scheduling the detail tasks from a done-callback instead
    would race with interpreter shutdown once main() returns, and the pool
    rejects new submissions after shutdown has begun.
    """
    index_futures = [
        p.submit(get_index, 'http://www.xiaohuar.com/list-3-%s.html' % i)
        for i in range(5)
    ]
    for future in as_completed(index_futures):
        parse_index(future)
    # Block until every queued detail/download task has finished.
    p.shutdown(wait=True)


if __name__ == '__main__':
    main()
最新文章
- mouseover事件与mouseenter事件的区别
- SpirentTestcenter测试仪的自动化
- Dubbo中服务消费者和服务提供者之间的请求和响应过程
- angular2监听页面大小变化
- 贝叶斯定理推导(Bayes' Theorem Induction)
- 自学Linux Shell9.4-基于Red Hat系统工具包存在两种方式之二:源码包
- Maven命令行使用 mvn clean package
- Queue-621. Task Scheduler
- 更改 pandas dataframe 中两列的位置
- 王者荣耀交流协会scrum立会20171111
- autocomplete.jquery 点击或进入默认显示所有结果
- 无序数组中第Kth大的数
- C# 路径的使用
- 【Python实例二】之前期准备:Windows下的BeautifulSoup安装
- 3.5.基于STC89C52+MC20的短信远程控制开关LCD1602显示
- DAY15-HTTP协议简述
- ES Docs-1:Installation Elasticsearch-2.3.1
- 春哥的nginx systemtap调试脚本简单介绍
- XON/OFF
- Unity5.1 新的网络引擎UNET(八) UNET 系统概括