# python爬取全站壁纸代码
# 2024-09-03 17:15:13
#测试网址:https://www.ivsky.com/bizhi/
#需要安装的库:requests,bs4
#本人是个强迫症患者,为了美观添加数个print(),其并没有实际意义,若是不爽删去即可。
import requests,re,os
from bs4 import BeautifulSoup
from time import sleep
from random import uniform
#网址解析
def url_open(url, timeout=10):
    """Download ``url`` and return the response body as text.

    The request carries a desktop-browser ``User-Agent`` and a ``Referer``
    header pointing at the wallpaper index page.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. The
            ``requests`` library applies no timeout unless one is passed,
            so a stalled connection would otherwise block indefinitely.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        "Referer": "https://www.ivsky.com/bizhi/",
    }
    return requests.get(url, headers=headers, timeout=timeout).text
#获取全部主题图片链接
def get_url_all(max_pages=1):
    """Collect the link and title of every wallpaper theme on the index pages.

    Each index page is fetched with ``url_open`` and every
    ``<div class="il_img">`` element contributes one theme: the ``title``
    and ``href`` attributes of its first ``<a>`` tag.

    Args:
        max_pages: Number of index pages to scan, starting at page 1 and
            capped at 100. The default of 1 matches the previous behaviour,
            where a debugging ``break`` stopped after the first page; pass
            100 to walk the whole index.

    Returns:
        A two-element list ``[theme_urls, theme_titles]`` whose inner lists
        are aligned by position.
    """
    print("正在收集整理壁纸主题网址,请稍候.....")
    print()
    theme_url_list = []
    theme_title_list = []
    page_total = 100  # the wallpaper index is stated to span 100 pages
    for page in range(1, min(max_pages, page_total) + 1):
        url = "https://www.ivsky.com/bizhi/index_{}.html".format(page)
        soup = BeautifulSoup(url_open(url), "html.parser")
        for each in soup.find_all("div", class_="il_img"):
            theme_title_list.append(each.a["title"])
            # hrefs are site-relative, so prefix the host
            theme_url_list.append("https://www.ivsky.com" + each.a["href"])
    # Pack both lists exactly once, after the loop, so scanning several
    # pages cannot append duplicate references to the result.
    data = [theme_url_list, theme_title_list]
    theme_total = len(data[0])
    print("壁纸网址收集结束,共收集%d个主题,准备进行图片下载....." % theme_total)
    sleep(1)  # purely cosmetic pause before the download phase starts
    return data
def save_img(img_url_list, theme_name, work_path):
    """Download every image of one theme into its own sub-directory.

    The directory ``<work_path>/<theme_name>`` is created when missing and
    becomes the process working directory (this side effect is kept from
    the original implementation). Images are written as ``1.jpg``,
    ``2.jpg``, ... in list order; existing files with the same name are
    overwritten.

    Args:
        img_url_list: Image URLs to download, in the order they are numbered.
        theme_name: Directory name for this theme; used verbatim.
        work_path: Parent directory under which the theme directory lives.

    Raises:
        requests.exceptions.RequestException: If a download fails or times out.
        OSError: If the directory cannot be created or a file cannot be written.
    """
    # os.path.join picks the right separator on every platform; the former
    # hard-coded backslash only produced a nested directory on Windows.
    save_path = os.path.join(work_path, theme_name)
    os.makedirs(save_path, exist_ok=True)
    os.chdir(save_path)

    # The headers never change, so build them once rather than per image.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
    }
    for num, img_url in enumerate(img_url_list, start=1):
        print("正在下载主题“%s”第%d张图片" % (theme_name, num))
        # A timeout keeps one stalled download from hanging the whole run.
        content = requests.get(img_url, headers=headers, timeout=10).content
        with open("%d.jpg" % num, "wb") as f:
            f.write(content)
        # Short random pause between downloads to reduce load on the server.
        sleep(uniform(0.18, 0.37))
def get_img(data):
    """Download the images of every theme described by ``data``.

    For each theme the listing page(s) are fetched with ``url_open``, the
    thumbnail ``src`` of every ``<div class="il_img">`` is rewritten into a
    URL under ``https://img.ivsky.com/img/bizhi/pre/``, and the resulting
    list is handed to ``save_img``. Theme directories are created under the
    working directory that is current when this function is called.

    Args:
        data: ``[theme_urls, theme_titles]`` as returned by ``get_url_all``;
            the two lists are paired by position. Titles are expected to
            end in ``(N张)`` where N is the image count.

    Returns:
        None. A summary line with the number of collected image links is
        printed at the end.
    """
    img_root_url = "https://img.ivsky.com/img/bizhi/pre/"
    # Captured once up front because save_img changes the working directory.
    work_path = os.getcwd()
    total_images = 0
    for theme_url, raw_title in zip(data[0], data[1]):
        theme_name, img_num = _split_theme_title(raw_title)
        print()
        print("正在下载主题:%s" % theme_name)
        print()
        page_total = _page_count(img_num)
        img_url_list = []
        for page in range(1, page_total + 1):
            # A single-page theme is served at the theme URL itself; only
            # multi-page themes use the index_<n>.html naming.
            if page_total == 1:
                url = theme_url
            else:
                url = theme_url + "index_{}.html".format(page)
            soup = BeautifulSoup(url_open(url), "html.parser")
            for each in soup.find_all("div", class_="il_img"):
                # Keep only the path after "/t/" and graft it onto the
                # image root URL.
                tail = each.img["src"].split("/t/")[1]
                img_url_list.append(img_root_url + tail)
                total_images += 1
        save_img(img_url_list, theme_name, work_path)
    print()
    print("任务完成,共计下载图片%d张" % total_images)


def _split_theme_title(title):
    """Split a title like ``name(12张)`` into ``(name, 12)``.

    A title without the ``(N张)`` suffix is returned unchanged with a
    count of 0 instead of raising ``IndexError``.
    """
    match = re.search(r'(.+)[(](\d+?)张[)]', title)
    if match is None:
        return title, 0
    return match.group(1), int(match.group(2))


def _page_count(img_num, per_page=16):
    """Return the number of listing pages for ``img_num`` images (minimum 1).

    Integer ceiling division keeps the result an ``int``; the previous
    ``img_num / 16`` produced a float for exact multiples of 16, which
    ``range`` rejects with ``TypeError``.
    """
    return max(1, -(-img_num // per_page))
def main(path=r'C:\Users\Administrator\Desktop\test'):
    """Run the scraper: prepare the output directory, then download.

    Args:
        path: Directory that receives one sub-directory per theme. It is
            created (including missing parent directories) when absent and
            becomes the working directory before scraping starts.

    Raises:
        OSError: If the directory cannot be created or entered.
    """
    # makedirs also creates missing parents, where os.mkdir would fail;
    # exist_ok makes the separate existence check unnecessary.
    os.makedirs(path, exist_ok=True)
    os.chdir(path)
    data = get_url_all()
    get_img(data)


if __name__ == "__main__":
    main()
最新文章
- 采用cocos2d-x lua 制作数字滚动效果样例
- Windows系统Stunnel客户端的配置
- 谈谈我的编程之路---WAMP(一)
- 配置rt-thread开发环境(配置系统,生成系统镜像)
- hdu2296Ring(ac自动机+dp)
- OA 办公自动化系统:权限管理模块的实现原理思路
- 阅读layim代码小记,实现可以更改用户签名的方法
- log4net 动态设定日志文件名
- 一些常用sqlite语句
- FormMove
- uitableview的重用重叠问题
- C++的ABI真特么是evil
- jquey(判断文本框输入的网址链接是否符合规则)
- (转)Collections类方法详解
- H-ui.admin v2.3后台模版!
- 对css盒模型的理解
- C# 启动外部进程
- 《Java编程思想》读书笔记-类与对象
- Spark基础-scala学习(三、Trait)
- jqeury-地区三级联动