分析Ajax来爬取今日头条街拍美图并保存到MongoDB
2024-08-30 18:00:34
前提:需要安装MongoDB
注:因今日头条网页发生变更,如下代码不保证能正常使用
#!/usr/bin/env python
#-*- coding: utf-8 -*-
"""Crawl Toutiao search results for a keyword and archive the galleries.

For every search-result offset the script queries the Ajax search endpoint,
follows each returned article URL, extracts the embedded gallery JSON,
downloads every gallery image into the current working directory and stores
the article title, URL and image URLs in MongoDB.
"""
import json
import os
import re
from hashlib import md5
from json.decoder import JSONDecodeError
from multiprocessing import Pool
from urllib.parse import urlencode

import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError  # noqa: F401 - kept so the name stays importable from this module
from requests.exceptions import RequestException

MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'

GROUP_START = 1
GROUP_END = 20
KEYWORD = '街拍'

# Seconds to wait for any single HTTP request; without a timeout a stalled
# connection would block a pool worker forever.
REQUEST_TIMEOUT = 10

# Matches the gallery payload embedded in an article page's inline script.
_GALLERY_PATTERN = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)

# connect=False defers the actual connection until first use, so the client
# is only connected inside each worker process after the pool has forked.
client = pymongo.MongoClient(MONGO_URL, connect=False)
db = client[MONGO_DB]


def _fetch(url, *, report_errors=True):
    """Return the HTTP 200 response for *url*, or ``None`` on any failure.

    Args:
        url: Address to request with GET.
        report_errors: When true, print a short notice if the request raises.

    Returns:
        The ``requests`` response when the status code is 200, else ``None``.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except RequestException:
        if report_errors:
            print('Error occurred')
        return None
    # Compare against 200 explicitly instead of relying on the truthiness of
    # the response object.
    if response.status_code == 200:
        return response
    return None


def get_page_index(offset, keyword):
    """Fetch one page of search results as raw JSON text.

    Args:
        offset: Result offset passed to the search endpoint.
        keyword: Search keyword.

    Returns:
        The response body as text, or ``None`` if the request failed or did
        not return HTTP 200.
    """
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    response = _fetch(url)
    return response.text if response is not None else None


def download_image(url):
    """Download the image at *url* and store it via :func:`save_image`.

    Failures are ignored silently; the function always returns ``None``.
    """
    print('Downloading', url)
    response = _fetch(url, report_errors=False)
    if response is not None:
        save_image(response.content)
    return None


def save_image(content):
    """Write *content* to ``<cwd>/<md5 of content>.jpg`` unless it exists.

    Naming the file after the MD5 digest of its bytes means identical images
    map to the same path and are written only once.
    """
    file_path = os.path.join(os.getcwd(), md5(content).hexdigest() + '.jpg')
    print(file_path)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    """Yield the article URLs contained in a search-result JSON document.

    Args:
        text: JSON text returned by :func:`get_page_index`; may be ``None``
            or empty when the request failed.

    Yields:
        Each non-empty ``article_url`` found under the top-level ``data``
        list. Nothing is yielded for missing, malformed or unexpected input.
    """
    if not text:
        return
    try:
        data = json.loads(text)
    except JSONDecodeError:
        return
    if not isinstance(data, dict):
        return
    # 'data' may be absent or null; treat both as "no results".
    for item in data.get('data') or []:
        if not isinstance(item, dict):
            continue
        article_url = item.get('article_url')
        if article_url:
            yield article_url


def get_page_detail(url):
    """Fetch an article page.

    Returns:
        The page HTML as text, or ``None`` if the request failed or did not
        return HTTP 200.
    """
    response = _fetch(url)
    return response.text if response is not None else None


def parse_page_detail(html, url):
    """Extract title and gallery images from an article page.

    Every image URL found in the gallery is downloaded as a side effect.

    Args:
        html: Article page HTML; may be ``None`` when the fetch failed.
        url: URL the page was fetched from, echoed back in the result.

    Returns:
        A dict with keys ``title``, ``url`` and ``images`` when the page
        embeds a gallery with ``sub_images``; otherwise ``None``.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    match = _GALLERY_PATTERN.search(html)
    if not match:
        return None
    try:
        # The payload is a JSON string escaped inside a JavaScript string
        # literal; stripping the backslashes recovers plain JSON.
        data = json.loads(match.group(1).replace('\\', ''))
    except JSONDecodeError:
        return None
    if not isinstance(data, dict) or 'sub_images' not in data:
        return None
    sub_images = data.get('sub_images') or []
    # Skip entries without a URL so no request is attempted for them.
    images = [item.get('url') for item in sub_images
              if isinstance(item, dict) and item.get('url')]
    for image in images:
        download_image(image)
    return {
        'title': title,
        'url': url,
        'images': images,
    }


def save_to_mongo(result):
    """Insert *result* into the configured MongoDB collection.

    Uses ``insert_one`` because ``Collection.insert`` no longer exists in
    PyMongo 4.

    Returns:
        ``True`` when the write was acknowledged, ``False`` otherwise.
    """
    outcome = db[MONGO_TABLE].insert_one(result)
    if outcome.acknowledged:
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    """Process one page of search results starting at *offset*.

    Fetches the result page, then for each article fetches the detail page,
    parses it and saves any extracted gallery record to MongoDB. Articles
    whose detail page could not be fetched are skipped.
    """
    text = get_page_index(offset, KEYWORD)
    urls = parse_page_index(text)
    for url in urls:
        html = get_page_detail(url)
        if not html:
            continue
        result = parse_page_detail(html, url)
        print(result)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    pool = Pool()
    # Offsets advance in steps of 20, matching the 'count' of 20 results
    # requested per page.
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()
最新文章
- Android之SAX解析XML
- JS or C#?不存在的脚本之争
- Beta阶段第八次Scrum Meeting
- 使用nodeJs安装Vue-cli
- C#和Java在重写上的区别
- PowerDesigner新装后的设置
- SpringMVC学习--参数绑定
- Effective Java 读书笔记之一 创建和销毁对象
- extjs grid renderer用法
- Zabbix全方位告警接入-电话/微信/短信都支持
- Drupal如何更新注册表?
- struts2,hibernate4,spring3配置时问题汇总及解决办法
- Openjudge-NOI题库-变幻的矩阵
- mysql 一张表的数据插入另一张表的sql语句
- Python 元组tuple相关知识
- 在Ubuntu内制作自己的VOC数据集
- C语言 · 猜算式 · 乘法竖式
- java基础语法2.
- 解决android有的手机拍照后上传图片被旋转的问题
- Codeforces Round #364 (Div. 1) (差一个后缀自动机)