# -*- coding: utf-8 -*-

import json
import multiprocessing
import os
import time
from multiprocessing import Pool
from urllib.parse import quote_plus, urlparse

import requests
from selenium import webdriver


def get_image_links(keyword, num_requested=1000):
    """Collect image URLs from a Google Images search using Selenium.

    Opens Chrome, loads the image-search page for *keyword*, scrolls to make
    the page load more thumbnails, clicks the "show more results" button
    while it is present, and finally reads the ``"ou"`` field from the JSON
    stored in every ``rg_meta`` element.

    Args:
        keyword: Search term. It is URL-encoded before being put in the query.
        num_requested: Rough number of images wanted. It only controls how
            many scroll rounds are attempted (one per 400 images, plus one);
            the result is not truncated to this size.

    Returns:
        A set of image URL strings (a set, so duplicates are dropped).
    """
    number_of_scrolls = int(num_requested / 400) + 1
    img_urls = set()

    chrome_options = webdriver.ChromeOptions()
    # Optional tweaks, disabled by default:
    # chrome_options.add_argument('--headless')  # run without a visible window
    # chrome_options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"')
    # chrome_options.add_argument("lang=en_US")  # browser language
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # chrome_options.add_experimental_option("prefs", prefs)  # do not load images
    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.maximize_window()
        # Encode the keyword so spaces and non-ASCII terms form a valid query.
        url = ("https://www.google.com/search?q=" + quote_plus(keyword)
               + "&source=lnms&tbm=isch")
        driver.get(url)

        # The button label depends on the browser language, so accept both
        # the Chinese and the English caption.
        more_button_xpath = ("//input[@value='显示更多结果'"
                             " or @value='Show more results']")
        for _ in range(number_of_scrolls):
            for _ in range(5):
                # multiple scrolls needed to show all 400 images
                driver.execute_script("window.scrollBy(0, 100000)")
                time.sleep(1)
            # Wait for the page to refresh, otherwise the button may not be
            # visible yet.
            time.sleep(5)
            try:
                driver.find_element("xpath", more_button_xpath).click()
            except Exception:
                # Missing or non-clickable button is treated as "no more
                # results"; this is deliberate best-effort behaviour.
                print("reach the end of page ")
                break

        # Debugging aid: dump driver.page_source to a file here to inspect
        # the markup the selectors below run against.
        metas = driver.find_elements(
            "xpath", '//div[contains(@class,"rg_meta")]')  # partial class match
        for meta in metas:
            try:
                img_urls.add(json.loads(meta.get_attribute('innerHTML'))["ou"])
            except (ValueError, KeyError, TypeError):
                # One malformed element must not discard everything else.
                continue
    finally:
        # Always close the browser, even when the scrape fails midway.
        driver.quit()

    print("finish getting all image urls!")
    return img_urls


def download(urls, download_dir):
    """Download every URL in *urls* into *download_dir*.

    The file name is the last path segment of the URL (query string and
    fragment are ignored); when the URL has no such segment a positional
    ``image_NNNNN`` name is used instead. Files with the same name overwrite
    each other. A failing URL is reported and skipped so that one bad link
    does not abort the batch.

    Args:
        urls: Iterable of image URL strings.
        download_dir: Directory to write into; created if missing.
    """
    print("start downloading images!")
    os.makedirs(download_dir, exist_ok=True)
    for index, url in enumerate(urls):
        name = os.path.basename(urlparse(url).path)
        if not name:
            name = "image_%05d" % index
        filename = os.path.join(download_dir, name)
        try:
            # No stream=True: the whole body is needed anyway, and a
            # non-streamed request releases its connection even when
            # raise_for_status() raises.
            r = requests.get(url, timeout=60)
            r.raise_for_status()
            with open(filename, 'wb') as f:
                f.write(r.content)
        except Exception as exc:
            # Per-item boundary of a best-effort batch: report and move on.
            print("failed to download %s: %s" % (url, exc))
            continue
    print("finish downloading images!")


keywords = ['girl', 'boy']
download_dir = './images/'


def make_keyword_dirs(base_dir, names):
    """Create one sub-directory of *base_dir* per name and return the paths.

    Directories that already exist are left untouched. The returned list has
    the same order as *names*.
    """
    paths = [os.path.join(base_dir, name) for name in names]
    for path in paths:
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(path, exist_ok=True)
    return paths


def main():
    """Scrape image links for every keyword, then download them.

    Both stages run in a pool of 4 worker processes, one task per keyword.
    A sequential alternative is simply calling ``get_image_links`` and then
    ``download`` for each keyword in turn.
    """
    download_dirs = make_keyword_dirs(download_dir, keywords)

    ###################################
    # get image links/MultiProcess
    ###################################
    # Pool() without an argument would use one process per CPU core; 4 is
    # the value this script has always used.
    with Pool(4) as pool:
        # apply_async returns ApplyResult handles; .get() blocks until the
        # worker has finished and yields its return value.
        pending = [pool.apply_async(get_image_links, (keyword,))
                   for keyword in keywords]
        img_urls = [job.get() for job in pending]

    ###################################
    # download images/MultiProcess
    ###################################
    with Pool(4) as pool:
        jobs = [pool.apply_async(download, (urls, target))
                for urls, target in zip(img_urls, download_dirs)]
        # Wait for every job inside the with-block (leaving it terminates
        # the pool) and surface worker errors instead of dropping them.
        for keyword, job in zip(keywords, jobs):
            try:
                job.get()
            except Exception as exc:
                print("download task for %r failed: %s" % (keyword, exc))


if __name__ == '__main__':
    # The guard keeps worker processes that re-import this module (spawn
    # start method) from creating pools of their own; freeze_support() has
    # to be the first call under it.
    multiprocessing.freeze_support()
    main()

最新文章

  1. select for update行锁
  2. [转] MySQL 查询表数据大小的总结
  3. mac系统上使用压缩包版的mysql(非安装版)
  4. 20145218 《Java程序设计》第9周学习总结
  5. iOS开发——UI篇&提示效果
  6. SPFA中 正逆邻接表的建立
  7. webkit 渲染机制
  8. python常用模块详解
  9. sublime text3空格和tab的显示
  10. AngularJS进阶(十六)脏值检查
  11. 依赖背包——cf855C好题
  12. 自定义一个全屏的AlertDialog。
  13. Cs231n-assignment 1作业笔记
  14. 解决PHP乱码
  15. SpringBoot 项目打包分开lib,配置和资源文件
  16. Python GUI之tkinter窗口视窗教程大集合(看这篇就够了)
  17. 章节七、1-ArrayList
  18. Golang标准库——io-结构
  19. CentOS搭建NAT和DHCP服务,实现共享上网
  20. 在C#中对枚举进行位运算--枚举组合

热门文章

  1. Unity创作赛车游戏的四款插件
  2. 【深入学习linux】系统分区与格式化
  3. Java12新特性 -- Shenandoah GC
  4. tomcat乱码解决
  5. TortoiseGit 查看单个文件日志显示全部提交记录了 解决办法
  6. 非LODOP的打印其他问题-简短问答
  7. 算法练习之x的平方根,爬楼梯,删除排序链表中的重复元素, 合并两个有序数组
  8. 【OpenCV开发】OpenCV3后IPLimage转换成MAT的问题
  9. K8S+GitLab+.net core-自动化分布式部署-1
  10. Linux下zookeeper单机版详细安装