# coding=utf-8
from urllib.parse import urlencode
import requests
from requests.exceptions import RequestException,Timeout
import json
from bs4 import BeautifulSoup
from pymongo import MongoClient
from multiprocessing import Pool
import os
import string
from hashlib import md5


# Characters that Windows forbids in file and directory names.
_ILLEGAL_NAME_CHARS = str.maketrans('', '', '\\/:*?"<>|')


def get_response(url):
    """Fetch *url* with a browser-like User-Agent and a 5 second timeout.

    Returns the ``requests`` response object on success, or ``None`` when the
    request times out. Any other ``RequestException`` propagates to the
    caller.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers, timeout=5)
    except Timeout:
        print(url + 'request timeout')
        return None
    print(url + 'request success')
    return response


def get_page_index(offset, keyword):
    """Request one page of search results and return the response body.

    Args:
        offset: value sent as the ``offset`` query parameter.
        keyword: value sent as the ``keyword`` query parameter.

    Returns:
        The response text when the status code is 200; ``None`` on timeout,
        on any other request error, or on a non-200 status.
    """
    data = {
        "offset": offset,
        "format": "json",
        "keyword": keyword,
        "autoload": "true",
        "count": "20",
        "cur_tab": "1",
        "from": "search_tab",
    }
    url = "https://www.toutiao.com/search_content/?" + urlencode(data)
    print(url)
    try:
        response = get_response(url)
    except RequestException:
        print('request error')
        return None
    # get_response() returns None on timeout; there is no status to inspect.
    if response is None:
        return None
    print(response.status_code)
    if response.status_code == 200:
        return response.text
    return None


def conn_mongodb():
    """Return the ``jiepai`` collection of the ``jiepai`` database on localhost:27017."""
    client = MongoClient('localhost', 27017)
    db = client['jiepai']
    return db['jiepai']


def save_image_url(data):
    """Upsert *data* into the collection, using its ``title`` as the match key."""
    jiepai = conn_mongodb()
    # Collection.update() is deprecated (removed in PyMongo 4); update_one()
    # performs the same single-document upsert.
    jiepai.update_one({'title': data.get('title')}, {'$set': data}, upsert=True)


def get_image_url():
    """Return a cursor over all stored documents, projected to ``title`` and ``images_list``."""
    jiepai = conn_mongodb()
    return jiepai.find({}, {'title': 1, 'images_list': 1, '_id': 0})


def _safe_dir_name(title):
    """Turn a title into a directory name usable on Windows and POSIX."""
    name = str(title).strip(string.punctuation)
    return name.translate(_ILLEGAL_NAME_CHARS).strip()


def download_image(data):
    """Download every image referenced by the documents in *data*.

    Args:
        data: iterable of dicts carrying a ``title`` and an ``images_list``
            (list of image URLs). Documents missing either are skipped.

    Images are written to ``<script dir>/jiepai/<title>/<md5 of content>.jpg``.
    URLs that time out, fail, or answer with a non-200 status are skipped.
    """
    base_dir = os.path.abspath(os.path.dirname(__file__))
    root_dir = os.path.join(base_dir, 'jiepai')
    # exist_ok avoids the exists()/mkdir() race between worker processes.
    os.makedirs(root_dir, exist_ok=True)
    for item in data:
        title = item.get('title')
        images_list = item.get('images_list')
        print(title)
        print('images_lsit', images_list)
        if not title or not images_list:
            continue
        dir_name = _safe_dir_name(title)
        if not dir_name:
            continue
        # Directory that receives this group's images.
        file_path = os.path.join(root_dir, dir_name)
        os.makedirs(file_path, exist_ok=True)
        for image_url in images_list:
            if not image_url:
                continue
            print(image_url)
            try:
                response = get_response(image_url)
            except RequestException:
                print('request error')
                continue
            # Skip timeouts and error pages instead of saving them as images.
            if response is None or response.status_code != 200:
                continue
            content = response.content
            # Name the file after the content hash so duplicates collapse.
            image_name = md5(content).hexdigest() + '.jpg'
            with open(os.path.join(file_path, image_name), 'wb') as f:
                f.write(content)
            print('download success')


def parse_page_index(html):
    """Parse a search-result JSON body and store each titled image group.

    Args:
        html: JSON text as returned by ``get_page_index``; may be ``None``.

    For every entry under the top-level ``data`` key that has a ``title``,
    the de-duplicated ``url`` values of its ``image_detail`` list are saved
    via ``save_image_url``. Missing, empty or malformed input is ignored.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:
        print('invalid json')
        return
    if not isinstance(data, dict):
        return
    for item in data.get('data') or []:
        if not isinstance(item, dict):
            continue
        title = item.get('title')
        if title is None:
            continue
        images = item.get('image_detail') or []
        urls = {image.get('url') for image in images if image.get('url')}
        a_group_image_detail = {'title': title, 'images_list': list(urls)}
        print(a_group_image_detail)
        save_image_url(a_group_image_detail)


def main(offset):
    """Crawl one result page for the fixed keyword and store its image links."""
    html = get_page_index(offset, '街拍')
    parse_page_index(html)


if __name__ == "__main__":
    # Stage 1 (disabled): crawl the image links with a process pool and
    # store them in MongoDB.
    # groups = [x*20 for x in range(0,5)]
    # pool = Pool()
    # pool.map(main, groups)

    # Stage 2: read the links back from MongoDB and download the images
    # with a process pool.
    datas = list(get_image_url())
    with Pool() as pool:
        # download_image() expects an iterable of documents, so hand each
        # worker call a one-element list.
        pool.map(download_image, [[doc] for doc in datas])

  

最新文章

  1. MySQL~ IN , FIND_IN_SET , LIKE
  2. list,set,map,数组之间的相互转换详细解析
  3. HDU 1847 Good Luck in CET-4 Everybody!(找规律,或者简单SG函数)
  4. No ongoing transaction. Did you forget to call multi?
  5. MySQL设置
  6. D - Cow Ski Area
  7. IOS7学习之路三(UISpriteKit游戏开发SKNode)
  8. 监控-CPU使用率
  9. 在HTML页面中加载js文件和css文件的方法
  10. leetcode 第4题 Median of Two Sorted Arrays
  11. hnsdfz -- 6.20 -- day5
  12. 如何在 PhpStorm 使用 Code Generation?
  13. JS脚本-零星片段
  14. iOS视频流开发(1)—视频基本概念
  15. linux之常见命令
  16. Android-消息处理学习总结(Handler,Looper)
  17. springboot项目生成jar包(带静态资源)方法
  18. 谈谈Enter回车键提交表单那些事 回车搜索 enter搜索
  19. 【Unity笔记】Terrain地形制作坍塌/深坑
  20. [AX]AX2012 R2 采购订单的“Request change”

热门文章

  1. rac_安装软件时报版本号过高问题
  2. zabbix proxy 安装
  3. 自己动手一步一步安装hadoop(含编译hadoop的native本地包)
  4. CS0016: 未能写入输出文件“c:\Windows\Microsoft.NET\Framework\v4.0.30319\Temporary ASP.NET Files\
  5. MII_GMII_RGMII_RMII_SMII_SSMII_TBI_RTBI比较
  6. EFM8单片机与I2C外设通信
  7. intent 启动activity、service的方法
  8. Dig HOWTO 中文手册--dig命令使用大全
  9. Hash索引和BTREE索引2
  10. Mysql之sync-binlog参数