1.抓包

  

2.代码

  抓取:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#author tom
import requests
from multiprocessing import Queue
from handle_pymongo import mongo
import json
from concurrent.futures import ThreadPoolExecutor


class Douguo():
    """Scraper for the Douguo recipe API (api.douguo.net).

    Workflow: ``handle_index`` downloads the three-level category tree and
    queues one search payload per leaf category on ``queue_list``;
    ``handle_caipu_list`` consumes one payload, pages through the search
    results, fetches each recipe's detail page and stores the merged record
    through the module-level ``mongo`` helper.
    """

    # Seconds to wait for the API before giving up. Without a timeout a
    # stalled connection would block a pool worker forever.
    REQUEST_TIMEOUT = 30

    # Every request uses the same headers, so they are defined once here.
    # Device identifiers from the captured traffic (mac, android-id,
    # pseudo-id, imsi, Cookie) are intentionally left out: the original
    # author notes it is best not to send them when avoidable.
    HEADERS = {
        "client": "",
        "version": "6920.4",
        "device": "SM-G9350",
        "sdk": "22,5.1.1",
        "imei": "",
        "channel": "qqkp",
        "resolution": "1024*576",
        "dpi": "1.19375",
        "brand": "samsung",
        "scale": "1.19375",
        "timezone": "",
        "language": "zh",
        "cns": "",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G9350 Build/LMY48Z) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "reach": "",
        "newbie": "",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Host": "api.douguo.net",
        "Connection": "keep-alive",
    }

    def __init__(self):
        # Holds one search payload (dict) per leaf category.
        # NOTE(review): this is multiprocessing.Queue although only threads
        # are used; its qsize() is documented as unavailable on some
        # platforms (e.g. macOS) - confirm the target platform.
        self.queue_list = Queue()

    def handle_request(self, url, data):
        """POST form-encoded ``data`` to ``url`` with the shared headers.

        :param url: full API URL.
        :param data: dict sent as the form body.
        :return: the ``requests`` response object.
        :raises requests.exceptions.RequestException: on connection errors
            or when no answer arrives within ``REQUEST_TIMEOUT`` seconds.
        """
        return requests.post(
            url=url,
            headers=self.HEADERS,
            data=data,
            timeout=self.REQUEST_TIMEOUT,
        )

    def handle_index(self):
        """Fetch the recipe category index and queue one search per leaf.

        Walks the three nested ``cs`` levels of the response and puts a
        search payload keyed by the leaf category name on ``queue_list``.
        """
        url = 'http://api.douguo.net/recipe/flatcatalogs'
        data = {
            "client": "",
            "_vs": "",
        }
        catalog = self.handle_request(url, data).json()
        for top_level in catalog['result']['cs']:
            for second_level in top_level['cs']:
                for leaf in second_level['cs']:
                    self.queue_list.put({
                        "client": "",
                        "keyword": leaf['name'],
                        "order": "",
                        "_vs": "",
                    })

    @staticmethod
    def _build_detail_ext(keyword, recipe_id):
        """Return the ``_ext`` form field for a recipe detail request.

        The keyword is JSON-encoded so the field is well-formed JSON; the
        id is rendered verbatim, exactly as before.
        """
        return (
            '{"query": {"kw": ' + json.dumps(keyword, ensure_ascii=False)
            + ', "src": "2803", "idx": "1", "type": "13", "id": '
            + str(recipe_id) + '}}'
        )

    def _fetch_detail(self, caipu_info):
        """Request the detail page for ``caipu_info`` and return its recipe dict."""
        detail_url = 'http://api.douguo.net/recipe/detail/' + str(caipu_info['shicai_id'])
        detail_data = {
            "client": "",
            "author_id": "",
            "_vs": "",
            "_ext": self._build_detail_ext(caipu_info['shicai'], caipu_info['shicai_id']),
        }
        detail = self.handle_request(url=detail_url, data=detail_data).json()
        return detail['result']['recipe']

    def handle_caipu_list(self, data):
        """Scrape and store the recipes found for one search payload.

        :param data: search payload produced by ``handle_index``; its
            ``keyword`` entry is the ingredient/category being searched.

        Requests up to ten result pages of 20 entries each. For every entry
        of type 13 the detail page is fetched and the merged record is
        passed to ``mongo.inset_item``; other entry types are skipped (the
        original author identifies them as advertisements).
        """
        print("当前处理的食材是:", data['keyword'])
        # NOTE(review): the offset starts at 20, so results 0-19 are never
        # requested - confirm whether the first page should use offset 0.
        for page in range(1, 11):
            list_url = 'http://api.douguo.net/recipe/v2/search/{0}/20'.format(page * 20)
            result = self.handle_request(url=list_url, data=data).json()['result']
            # Stop paging once the API flags the end of the result set.
            # NOTE(review): entries delivered together with end == 1 are
            # dropped - presumably that page is empty; verify.
            if result['end'] == 1:
                break
            for item in result['list']:
                if item['type'] != 13:
                    # Not a recipe entry (advertisement) - skip it.
                    continue
                recipe = item['r']
                caipu_info = {}
                caipu_info['shicai'] = data['keyword']
                caipu_info['user_name'] = recipe['an']
                caipu_info['shicai_id'] = recipe['id']
                # Strip line breaks and spaces from the description.
                caipu_info['describe'] = recipe['cookstory'].replace('\n', '').replace(' ', '')
                caipu_info['caipu_name'] = recipe['n']
                caipu_info['zuoliao_list'] = recipe['major']
                detail = self._fetch_detail(caipu_info)
                caipu_info['tips'] = detail['tips']
                caipu_info['cook_step'] = detail['cookstep']
                print('当前入库的是:', caipu_info['caipu_name'])
                mongo.inset_item(caipu_info)


if __name__ == '__main__':
    d = Douguo()
    d.handle_index()
    # The context manager waits for all workers and shuts the pool down.
    with ThreadPoolExecutor(max_workers=20) as pool:
        futures = []
        while d.queue_list.qsize() > 0:
            futures.append(pool.submit(d.handle_caipu_list, d.queue_list.get()))
        # Surface worker failures instead of losing them inside the futures.
        for future in futures:
            error = future.exception()
            if error is not None:
                print("scrape task failed:", repr(error))

  存储:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#author tom
import pymongo


class Connect_Mongo(object):
    """Thin wrapper around the MongoDB collection that stores scraped recipes."""

    def __init__(self):
        # Client for the MongoDB server on 127.0.0.1:27017.
        self.client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        self.db = self.client['doukou_meishi']
        # Resolve the target collection once instead of on every insert.
        self.collection = self.db['meishi']

    def inset_item(self, item):
        """Insert one recipe document into the ``meishi`` collection.

        The method name (sic) is kept unchanged because callers use it.

        :param item: dict describing a single recipe.
        """
        # Collection.insert() is deprecated since PyMongo 3.0 and removed in
        # 4.0; insert_one() is the supported single-document equivalent.
        self.collection.insert_one(item)


# Shared module-level instance imported by the scraper.
mongo = Connect_Mongo()

最新文章

  1. 表单 - Validatebox - 表单参数校验
  2. [转]JDBC中日期时间的处理技巧
  3. 使用ASP.NET Web Api构建基于REST风格的服务实战系列教程【六】——实现资源间的关联
  4. leveldb - log格式
  5. CATALOGUE 目录
  6. c++ static用法总结【转载】
  7. is 和==的区别
  8. window下mySql数据库设置密码
  9. sql server replace的替换字符,replace的使用
  10. shell脚本实现并发控制
  11. 3.20 总结 java程序流程控制
  12. JavaScript入门学习笔记(二)
  13. 非node环境下的vue.js 实现简单的购物车计算功能 样式请无视
  14. Nginx动静分离
  15. tfs项目管理
  16. HTTP Error 500.22 - Internal Server Error 错误解决方案
  17. Chrome 的 Material Design Refresh UI初探
  18. bootstrap轮播图
  19. 1-100求和 sum(range(101))
  20. MySQL锁分类

热门文章

  1. RSA算法原理(简单易懂)
  2. https协议 和 Charles 进行https抓包原理
  3. Trailing Zeroes (III) LightOJ - 1138 二分+找规律
  4. R的基础数据结构
  5. idea激活教程(永久)支持2019 3.1 亲测
  6. 联想拯救者y7000使用体验
  7. 《ASP.NET Core 高性能系列》Span<T>和Memory<T>
  8. in和exists比较
  9. VMware安装CentOS6.X 系统
  10. AE脚本:把SubRip/SRT/TXT/VTT字幕导入到AE