1. 使用 re 模块(正则表达式)实现
import json
import re
from urllib.parse import quote

import requests
import xlrd
import xlwt
from requests.exceptions import RequestException

# Data
# Accumulator for scraped records; get_target() appends one dict per item
# and save_to_excel() writes them out.
DATA = []

# Search term; interpolated into the request URLs and the output file name.
KEYWORD = 'python'

# HTTP headers sent with every request made by get_html().
HEADERS = {
    'user-agent': (
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome'
        '/63.0.3239.132 Safari/537.36'
    ),
}
# Number of result pages main() iterates over.
MAX_PAGE = 10


def get_target(data_list):
    """Pick the fields of interest out of raw items and append them to DATA.

    Args:
        data_list: Iterable of item dicts. Each item must provide the keys
            'title', 'view_price', 'view_sales', 'view_fee', 'item_loc',
            'nick' and 'detail_url'; any other keys are ignored.

    Returns:
        True, always.

    Raises:
        KeyError: If an item lacks one of the required keys.
        ValueError: If an item's 'view_fee' cannot be converted to float.
    """
    for item in data_list:
        record = {
            'title': item['title'],
            'price': item['view_price'],
            'sales': item['view_sales'],
            # '是' (yes) when the fee parses to zero, '否' (no) otherwise.
            # NOTE(review): this treats a zero 'view_fee' as the marker for a
            # Tmall listing -- confirm that heuristic against real data.
            'isTmall': '否' if float(item['view_fee']) else '是',
            'area': item['item_loc'],
            'name': item['nick'],
            'url': item['detail_url'],
        }
        DATA.append(record)
    return True


# Send the HTTP request and fetch the page source.
# Cookies captured by the most recent successful argument-less get_html() call.
# Starts as None so a follow-up request issued first does not raise NameError.
COOKIES = None

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10


def get_html(url, *args):
    """Fetch ``url`` with the shared HEADERS and return the decoded body.

    Args:
        url: Address to request.
        *args: Presence of any extra positional argument marks a follow-up
            request: the cookies stored in COOKIES are sent along. Without
            extra arguments the response's cookies are stored in COOKIES
            instead. The argument values themselves are not inspected.

    Returns:
        The response text, decoded with the encoding detected from the
        content, or None when the request fails.
    """
    global COOKIES
    try:
        if not args:
            response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
            COOKIES = response.cookies  # remember cookies for follow-up calls
        else:
            response = requests.get(
                url, headers=HEADERS, cookies=COOKIES, timeout=REQUEST_TIMEOUT
            )
        # Prefer the encoding detected from the body over the header default.
        response.encoding = response.apparent_encoding
        return response.text
    except RequestException:
        print('请求源码出错!')
        return None


# Parse the page source and extract the target information.
# Captures the JSON assigned to g_page_config in a search result page.
_PAGE_CONFIG_RE = re.compile(r'g_page_config = (.*?)g_srp_loadCss', re.S)
# Captures the outermost {...} object inside the API's JSONP response.
_JSONP_RE = re.compile(r'{.*}', re.S)
# Column order of the exported sheet; these are also the keys of DATA records.
_COLUMNS = ('title', 'price', 'sales', 'isTmall', 'area', 'name', 'url')


def extract_auctions(html, *args):
    """Return the list of raw item dicts embedded in ``html``.

    Args:
        html: Page source of a search page (no extra arguments) or body of
            the API response (any extra positional argument). May be None.
        *args: Presence of any extra argument selects the API layout.

    Returns:
        The 'auctions' list found in the embedded JSON, or an empty list when
        ``html`` is empty, the pattern does not match, the JSON is invalid,
        or the expected keys are missing.
    """
    if not html:
        return []
    if not args:
        match = _PAGE_CONFIG_RE.search(html)
        if match is None:
            return []
        # Strip surrounding whitespace, then drop the trailing ';'.
        payload = match.group(1).strip()[:-1]
        path = ('mods', 'itemlist', 'data', 'auctions')
    else:
        match = _JSONP_RE.search(html)
        if match is None:
            return []
        payload = match.group(0)
        path = ('API.CustomizedApi', 'itemlist', 'auctions')
    try:
        node = json.loads(payload)
        for key in path:
            node = node[key]
    except (ValueError, KeyError, TypeError):
        # Invalid JSON or an unexpected structure: treat as "no items".
        return []
    return node if isinstance(node, list) else []


def parse_html(html, *args):
    """Parse a fetched page and hand its items to get_target().

    Args:
        html: Text returned by get_html(); None is tolerated.
        *args: Forwarded to extract_auctions() to select the API layout.

    Returns:
        None. Prints a notice instead of raising when no items are found.
    """
    data_list = extract_auctions(html, *args)
    if not data_list:
        print('未解析到商品数据!')
        return
    get_target(data_list)


def save_to_excel():
    """Write every record in DATA to an .xls workbook named after KEYWORD.

    The first row holds the column names from _COLUMNS; each following row
    holds one record. The file is saved in the current working directory.
    """
    f_name = '淘宝%s数据' % KEYWORD
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    # NOTE(review): the sheet name embeds KEYWORD; Excel restricts sheet-name
    # length and characters -- confirm behaviour for long or unusual keywords.
    sheet = book.add_sheet(f_name)
    for col, header in enumerate(_COLUMNS):
        sheet.write(0, col, header)
    for row, record in enumerate(DATA, start=1):
        for col, key in enumerate(_COLUMNS):
            sheet.write(row, col, record[key])
    book.save(f_name + '.xls')


def main():
    """Scrape MAX_PAGE search pages for KEYWORD, export them and print the count."""
    # Percent-encode the keyword so characters such as '&' or '#' cannot
    # corrupt the query string.
    keyword = quote(KEYWORD)
    for offset in range(MAX_PAGE):
        url = 'https://s.taobao.com/search?q={}&s={}'.format(keyword, offset * 44)
        parse_html(get_html(url))
        if offset == 0:
            # Per the original author's note, the first page loads 12 more
            # items asynchronously through this API endpoint.
            api_url = (
                'https://s.taobao.com/api?_ksTS=1532524504679_226&callback=jsonp227&ajax=true&m=customized&'
                'stats_click=search_radio_all:1&q={}'.format(keyword)
            )
            parse_html(get_html(api_url, 2), 2)
    save_to_excel()
    print(len(DATA))


if __name__ == '__main__':
    main()

最新文章

  1. mysql高性能索引策略
  2. UIImage NSData 相互转化
  3. .Net中使用aliases让相同命名空间的dll引用共存
  4. OneAlert 入门(二)——事件分析
  5. AspectJ的安装和Eclipse的AJDT插件的配置
  6. Windows8.1硬盘安装Ubuntu14.04双系统参考教程和多硬盘的注意事项[画画]
  7. SSH证书登录方式(无password验证登录)
  8. vs2013安装visual assist和viemu之后提示功能等无效解决
  9. RAC(ReactiveCocoa)
  10. 从 Bridge 到 OVS,探索虚拟交换机
  11. [HNOI2011]数矩形
  12. 深度解密Go语言之Slice
  13. shell编程规范:引用
  14. sublime text 安装nodejs开发插件
  15. js算法初窥07(算法复杂度)
  16. MUI + Spring MVC 实现多图片上传
  17. centos7下安装docker(9容器对资源的使用限制-内存)
  18. 使用absolute布局
  19. Spring4 mvc+maven 框架搭建(1)
  20. 解决Android sdk无法下载的问题

热门文章

  1. PAT(B) 1019 数字黑洞(Java)
  2. 谈谈Python中pop与remove的用法
  3. 希尔排序——C语言
  4. java基础知识学习 内存相关
  5. 数据格式转换string.Format
  6. SQLAlchemy 在查询期间丢失与MySQL服务器的连接
  7. sql 视图的好处
  8. Seaborn(二)之数据集分布可视化
  9. 工作流引擎Activiti
  10. MMU简介