前提:需要安装 MongoDB

注:因今日头条网页发生变更,如下代码不保证能正常使用

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Crawl Toutiao search results for a keyword and archive the image galleries.

For each search-result page the script fetches the JSON index, visits every
article URL listed in it, extracts the embedded gallery data, downloads each
image into the current working directory, and stores a summary record
(title, URL, image URLs) in MongoDB.
"""
import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError

MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

# Search-result pages are requested at offsets GROUP_START*20 .. GROUP_END*20.
GROUP_START = 1
GROUP_END = 20
KEYWORD = '街拍'

# connect=False defers the actual connection until first use, so each worker
# process spawned by the Pool opens its own connection.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]

# Matches the gallery payload embedded in an article page as
# `gallery: JSON.parse("...")`; group 1 is the escaped JSON text.
IMAGES_PATTERN = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)


def get_page_index(offset, keyword):
    """Fetch one page of search results.

    Args:
        offset: value sent as the ``offset`` query parameter.
        keyword: value sent as the ``keyword`` query parameter.

    Returns:
        The response body as text when the server answers 200; ``None`` for
        any other status code or when the connection fails.
    """
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def download_image(url):
    """Download ``url`` and hand the bytes to :func:`save_image`.

    Nothing is saved for a non-200 response or a connection failure.
    Always returns ``None``.
    """
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except ConnectionError:
        return None


def save_image(content):
    """Write ``content`` (bytes) to ``<cwd>/<md5 of content>.jpg``.

    The file name is derived from the content hash, so identical images map
    to the same path; an existing file is left untouched.
    """
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        # The context manager closes the file; no explicit close() needed.
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    """Yield the ``article_url`` of every item in a search-result payload.

    Args:
        text: JSON text as returned by :func:`get_page_index`, or ``None``.

    Yields:
        ``item.get('article_url')`` for each entry under the ``data`` key
        (``None`` for entries that lack the key). Yields nothing when
        ``text`` is empty/``None`` or is not valid JSON.
    """
    if not text:
        # A failed fetch returns None; json.loads(None) would raise TypeError.
        return
    try:
        data = json.loads(text)
    except JSONDecodeError:
        return
    if data and 'data' in data.keys():
        # `or []` covers a JSON null under "data".
        for item in data.get('data') or []:
            yield item.get('article_url')


def get_page_detail(url):
    """Fetch an article page.

    Returns:
        The response body as text on HTTP 200; ``None`` for any other status
        code or when the connection fails.
    """
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def parse_page_detail(html, url):
    """Extract the title and gallery images from an article page.

    Every image URL found is downloaded as a side effect.

    Args:
        html: article page source.
        url: the article URL, copied into the result.

    Returns:
        ``{'title': ..., 'url': ..., 'images': [...]}`` when the page embeds
        gallery data containing ``sub_images``; otherwise ``None`` (including
        when the embedded payload cannot be decoded as JSON).
    """
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    result = re.search(IMAGES_PATTERN, html)
    if not result:
        return None
    try:
        # The payload is a JSON string escaped inside a JS string literal;
        # stripping every backslash is a crude way to unescape it.
        data = json.loads(result.group(1).replace('\\', ''))
    except JSONDecodeError:
        # One malformed page should not abort the whole worker.
        return None
    if data and 'sub_images' in data.keys():
        sub_images = data.get('sub_images')
        images = [item.get('url') for item in sub_images]
        for image in images:
            download_image(image)
        return {
            'title': title,
            'url': url,
            'images': images
        }
    return None


def save_to_mongo(result):
    """Insert ``result`` into the ``MONGO_TABLE`` collection.

    Returns:
        ``True`` when the insert reports an id, ``False`` otherwise.
    """
    # Collection.insert() was removed in PyMongo 4; insert_one() replaces it.
    if db[MONGO_TABLE].insert_one(result).inserted_id:
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    """Process one search-result page: fetch, parse, download and store.

    Args:
        offset: search-result offset passed to :func:`get_page_index`.
    """
    text = get_page_index(offset, KEYWORD)
    urls = parse_page_index(text)
    for url in urls:
        if not url:
            # Index entries without an article_url cannot be fetched.
            continue
        html = get_page_detail(url)
        if html is None:
            # Fetch failed or returned a non-200 status; skip this article.
            continue
        result = parse_page_detail(html, url)
        print(result)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    pool = Pool()
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()

最新文章

  1. Android之SAX解析XML
  2. JS or C#?不存在的脚本之争
  3. Beta阶段第八次Scrum Meeting
  4. 使用nodeJs安装Vue-cli
  5. C#和Java在重写上的区别
  6. PowerDesigner新装后的设置
  7. SpringMVC学习--参数绑定
  8. Effective Java 读书笔记之一 创建和销毁对象
  9. extjs grid renderer用法
  10. Zabbix全方位告警接入-电话/微信/短信都支持
  11. Drupal如何更新注册表?
  12. struts2,hibernate4,spring3配置时问题汇总及解决办法
  13. Openjudge-NOI题库-变幻的矩阵
  14. mysql 一张表的数据插入另一张表的sql语句
  15. Python 元组tuple相关知识
  16. 在Ubuntu内制作自己的VOC数据集
  17. C语言 · 猜算式 · 乘法竖式
  18. java基础语法2.
  19. 解决android有的手机拍照后上传图片被旋转的问题
  20. Codeforces Round #364 (Div. 1) (差一个后缀自动机)

热门文章

  1. HTTP基本认证(Basic Authentication)的JAVA演示样例
  2. Thread.suspend和println使线程死锁
  3. 多个线程作用于同一个runnable对象
  4. HDU 3104 Combination Lock(数学题)
  5. python清除数据库错误日志
  6. rabbitmq 入门基础(一)
  7. [专辑] 也晒晒我的RBAC系统 ——行一山人的博客
  8. JS函数种类详解
  9. LVS十种调度算法介绍
  10. WPF MVVM 关闭窗体