Python + Selenium：按关键字搜索并爬取 Google 图片

 # -*- coding: utf-8 -*-

 import json

 import os

 import time

 from multiprocessing import Pool

 import multiprocessing

 import requests

 from selenium import webdriver

 def get_image_links(keyword, num_requested = 1000):
     """Collect original-image URLs from a Google Images search via Selenium.

     Opens a Chrome window, loads the image-search results for ``keyword``,
     scrolls to trigger lazy loading, clicks the "show more results" button
     between rounds, and finally reads the ``"ou"`` field from the JSON
     embedded in every ``rg_meta`` element on the page.

     Args:
         keyword: Search term. It is URL-encoded before being placed in the
             query string, so spaces and characters such as ``&`` are safe.
         num_requested: Rough number of images wanted. It only controls how
             many scroll rounds are attempted (one round per 400, plus one).

     Returns:
         A set of image URL strings (duplicates removed). The set may hold
         fewer or more than ``num_requested`` entries.
     """
     # Function-scope import keeps this block self-contained.
     from urllib.parse import quote_plus

     number_of_scrolls = int(num_requested / 400) + 1
     img_urls = set()  # a set drops duplicate links automatically

     chrome_options = webdriver.ChromeOptions()
     # Optional settings, left disabled as in the original script:
     # chrome_options.add_argument('--headless')  # run without a visible window
     # chrome_options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"')
     # chrome_options.add_argument("lang=en_US")  # browser language
     # prefs = {"profile.managed_default_content_settings.images":2}
     # chrome_options.add_experimental_option("prefs",prefs)  # do not load images

     # ``options=`` replaces the ``chrome_options=`` keyword that newer
     # Selenium releases no longer accept.
     driver = webdriver.Chrome(options=chrome_options)
     try:
         driver.maximize_window()
         url = ("https://www.google.com/search?q=" + quote_plus(str(keyword))
                + "&source=lnms&tbm=isch")
         driver.get(url)

         for _ in range(number_of_scrolls):
             for _ in range(5):
                 # multiple scrolls needed to show all 400 images
                 driver.execute_script("window.scrollBy(0, 100000)")
                 time.sleep(1)
             # Wait for the page to refresh; otherwise the button may not be
             # visible yet.
             time.sleep(5)
             try:
                 # The button label depends on the browser language; the
                 # English UI uses value='Show more results' instead.
                 driver.find_element(
                     "xpath", "//input[@value='显示更多结果']").click()
             except Exception:
                 # No button (or not clickable): treat as end of results.
                 print("reach the end of page ")
                 break

         # Debug aid: dump driver.page_source to a file here to inspect markup.
         # Fuzzy class match: any div whose class contains "rg_meta".
         metas = driver.find_elements(
             "xpath", '//div[contains(@class,"rg_meta")]')
         for meta in metas:
             try:
                 img_urls.add(json.loads(meta.get_attribute('innerHTML'))["ou"])
             except (TypeError, ValueError, KeyError):
                 # Skip entries whose metadata is missing or malformed rather
                 # than losing every link collected so far.
                 continue
     finally:
         # Always shut the browser down, even when scraping fails midway.
         driver.quit()

     print("finish getting all image urls!")
     return img_urls

 def download(urls,download_dir):
     """Download every URL in ``urls`` into ``download_dir`` (best effort).

     The file name is taken from the last path segment of the URL, ignoring
     any query string or fragment. URLs without a usable segment get a
     generated ``image_<index>`` name, and a name already used earlier in the
     same call is prefixed with the item index so files are not overwritten.

     Args:
         urls: Iterable of image URL strings.
         download_dir: Existing directory that receives the files.

     Returns:
         None. A failed item is reported on stdout and skipped; it never
         aborts the remaining downloads.
     """
     # Function-scope import keeps this block self-contained.
     from urllib.parse import urlparse

     print("start downloading images!")
     used_names = set()
     for index, url in enumerate(urls):
         try:
             name = os.path.basename(urlparse(url).path)
             if name in ("", ".", ".."):
                 name = "image_{}".format(index)
             if name in used_names:
                 name = "{}_{}".format(index, name)
             used_names.add(name)
             filename = os.path.join(download_dir, name)

             r = requests.get(url, timeout=60)
             r.raise_for_status()
             # Fetch the whole body before opening the file so that a failed
             # transfer does not leave an empty file behind.
             data = r.content
             with open(filename, 'wb') as f:
                 f.write(data)
         except Exception as exc:
             # Deliberately broad: one bad URL must not stop the batch, but
             # the failure is reported instead of being silently dropped.
             print("failed to download {}: {}".format(url, exc))
             continue
     print("finish downloading images!")

 # Search terms; each keyword gets its own sub-directory under download_dir.
 keywords = ['girl','boy']
 download_dir = './images/'
 # download_dirs[i] is the target directory for keywords[i].
 download_dirs = []
 for keyword in keywords:
     path = os.path.join(download_dir,keyword)
     download_dirs.append(path)
     # exist_ok=True replaces the separate exists() check, which could race
     # with another process creating the same directory in between.
     os.makedirs(path, exist_ok=True)

 # for keyword in main_keywords:

 #     image_urls = get_image_links(keyword)

 #     download(image_urls,download_dir)

 ###################################

 # get image links/MultiProcess

 ###################################

 # After stage 1, img_urls[i] holds the URLs collected for keywords[i].
 img_urls = []

 # Worker processes may re-import this module (spawn start method, e.g. on
 # Windows). Without this guard each worker would try to create its own pools.
 if __name__ == "__main__":
     multiprocessing.freeze_support()

     ###################################
     # get image links/MultiProcess
     ###################################
     # Pool() defaults to one process per CPU core; adjust the size as needed.
     p = Pool(4)
     try:
         pending = [p.apply_async(get_image_links, (keyword,))
                    for keyword in keywords]
         # apply_async returns ApplyResult handles; get() blocks until the
         # worker finishes and re-raises any exception it hit.
         img_urls = [result.get() for result in pending]
     finally:
         p.close()
         p.join()

     ###################################
     # download images/MultiProcess
     ###################################
     p = Pool(4)
     try:
         downloads = [p.apply_async(download, (urls, target_dir))
                      for urls, target_dir in zip(img_urls, download_dirs)]
         for task in downloads:
             # Surface worker errors instead of discarding the results.
             task.get()
     finally:
         p.close()
         p.join()
巴特西

Python + Selenium：按关键字搜索并爬取 Google 图片

最新文章

热门文章