1 # -*- coding: utf-8 -*-
2 # @Time : 2020/9/11 16:23
3 # @Author : Chunfang
4 # @Email : 3470959534@qq.com
5 # @File : amazon_bestseller_cate_url.py
6 # @Software: PyCharm
7
8 import random,requests
9 import re
10
def secend_cates_url(url):
    """Collect the second-level category links found on one category page.

    The page at ``url`` is downloaded with ``get_data``; every
    ``(link, label)`` pair whose link starts with
    ``https://www.amazon.com/Best`` is extracted with a regular expression,
    and the resulting list is appended, as one element, to the module-level
    ``url_cate_all`` list.

    Args:
        url: Address of a first-level category page.
    """
    html_text = get_data(url)
    link_pattern = "<li><a href='(https://www.amazon.com/Best.*?)'>(.*?)</a></li>"
    second_level = re.findall(link_pattern, html_text, re.S)
    url_cate_all.append(second_level)
18
def get_html_data(page_data):
    """Extract first-level category links and visit each of them.

    The ``(link, label)`` pairs matched in ``page_data`` are appended, as one
    list, to the module-level ``url_cate_all``; afterwards every matched link
    is handed to ``secend_cates_url`` so that its sub-categories are collected
    as well.

    Args:
        page_data: HTML text of the starting best-seller page.
    """
    link_pattern = "<li><a href='(https://www.amazon.com/Best.*?)'>(.*?)</a></li>"
    first_level = re.findall(link_pattern, page_data, re.S)
    url_cate_all.append(first_level)
    # Two capture groups, so every match is a (link, label) tuple.
    for link, _label in first_level:
        secend_cates_url(link)
26
def randHeader():
    """Build a request-header dict with a randomly chosen User-Agent.

    ``Connection``, ``Accept`` and ``Accept-Language`` always carry the same
    values; only ``User-Agent`` varies between calls.

    Returns:
        dict: The four header names mapped to their values.
    """
    user_agents = (
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
    )

    return {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': random.choice(user_agents),
    }
59
def get_data(url):
    """Download ``url`` and return its body as text with HTML entities decoded.

    A header set produced by ``randHeader`` is sent with the request and the
    request uses a 20 second timeout.

    Args:
        url: Address of the page to fetch.

    Returns:
        str: The response text after ``html.unescape``.

    Raises:
        Nothing is caught here; any exception raised by ``requests.get``
        propagates to the caller.
    """
    import html  # kept local: the module is not imported at file level

    # The headers must be passed by keyword. The second positional parameter
    # of requests.get is ``params``, so a positional dict would be encoded
    # into the query string instead of being sent as HTTP headers.
    response = requests.get(url, headers=randHeader(), timeout=20)
    return html.unescape(response.text)
66
def save_to_excel(url_cate_all):
    """Flatten the nested category lists and drop duplicate entries.

    Despite its name this function writes nothing to Excel; it only
    de-duplicates. Entries are returned in the order in which they were first
    seen, so the result is reproducible between runs (a plain ``set`` round
    trip would return them in arbitrary order).

    Args:
        url_cate_all: A list of lists of hashable entries. The callers in this
            file pass lists of ``(link, label)`` tuples.

    Returns:
        list: Every distinct entry exactly once, in first-seen order.
    """
    seen = set()
    unique_entries = []
    for group in url_cate_all:
        for entry in group:
            if entry not in seen:
                seen.add(entry)
                unique_entries.append(entry)
    return unique_entries
76
def url_cate_all_only():
    """Crawl the women's-clothing best-seller page and return its category links.

    Resets the module-level ``url_cate_all`` list, downloads the hard-coded
    starting page, lets ``get_html_data`` fill ``url_cate_all`` with the
    first- and second-level matches, and finally de-duplicates them through
    ``save_to_excel``.

    Returns:
        list: The de-duplicated entries produced by ``save_to_excel``.
    """
    global url_cate_all
    url_cate_all = []
    start_page = 'https://www.amazon.com/Best-Sellers-Womens-Clothing/zgbs/fashion/1040660/ref=zg_bs_unv_3_9522931011_1'
    get_html_data(get_data(start_page))
    return save_to_excel(url_cate_all)
  1 # -*- coding: utf-8 -*-
2 # @Time : 2020/9/9 17:30
3 # @Author : Chunfang
4 # @Email : 3470959534@qq.com
5 # @File : amazon_best_sellers.py
6 # @Software: PyCharm
7
8 import requests
9 import re,os,random
10 from openpyxl import load_workbook
11
12 from amazon_bestseller_cate_url2 import url_cate_all_only
13
def down_imgs(url_xuhao, url_img, pro_name):
    """Download the product images of one category into ``images_amazon``.

    Each image is stored as ``<pro_name>_<rank>.jpg`` inside the
    ``images_amazon`` folder below the module-level ``cwd``. The folder is
    created when it does not exist yet. An image that cannot be fetched is
    reported and skipped; the remaining images are still processed.

    Args:
        url_xuhao: Rank strings, one per product.
        url_img: Image addresses, parallel to ``url_xuhao``. When the two
            lists differ in length the surplus items are ignored.
        pro_name: Category name used as file-name prefix.
    """
    image_dir = os.path.join(cwd, 'images_amazon')
    os.makedirs(image_dir, exist_ok=True)

    for count, (rank, img_url) in enumerate(zip(url_xuhao, url_img), start=1):
        print('正在下载第' + str(count) + '张图片,图片地址:' + str(img_url))
        try:
            # Headers go in by keyword; the second positional parameter of
            # requests.get is ``params`` (query string), not ``headers``.
            pic = requests.get(img_url, headers=randHeader(), timeout=10)
            # Do not store an HTTP error page under a .jpg name.
            pic.raise_for_status()
        except requests.exceptions.RequestException:
            # Covers connection errors, timeouts and HTTP error statuses.
            print('错误!当前图片无法下载')
            continue
        target = os.path.join(image_dir, pro_name + '_' + rank + '.jpg')
        with open(target, 'wb') as file:
            file.write(pic.content)
26
def save_pro_to_excel(products_inf, pro_name):
    """Append the scraped product data of one category to the worksheet.

    Writing starts on the row after the current ``ws.max_row``. Column 1
    receives ``pro_name``; the k-th list in ``products_inf`` fills column
    ``k + 2`` (counting k from 0), one value per row. The workbook is saved
    to the module-level ``path`` afterwards.

    Args:
        products_inf: List of columns, each a list of cell values.
        pro_name: Category name written into the first column of every row.
    """
    first_free_row = ws.max_row + 1
    for col_offset, column_values in enumerate(products_inf):
        for row_offset, cell_value in enumerate(column_values):
            row = first_free_row + row_offset
            ws.cell(row, 1).value = pro_name
            ws.cell(row, col_offset + 2).value = cell_value  # product field
    wb.save(path)
35
def down_products(result, pro_name):
    """Pull the product fields out of a best-seller page and persist them.

    Five regular expressions extract rank, title, price, product link and
    image link. The five lists are printed, written to the worksheet through
    ``save_pro_to_excel`` and the images are fetched through ``down_imgs``.

    Args:
        result: HTML text of a best-seller listing page.
        pro_name: Category name forwarded to the two helpers.
    """
    def grab(pattern):
        # Every pattern is matched against the same page with DOTALL enabled.
        return re.findall(pattern, result, re.S)

    # An earlier, stricter title pattern spelled the class out as
    # "p13n-sc-truncate p13n-sc-line-clamp-2"; the generic form below is used.
    # NOTE(review): the whitespace around the captured title must match the
    # page markup exactly - confirm against a live page.
    titles = grab('<div class=".*?" aria-hidden=".*?" data-rows=".*?">\n (.*?)\n </div>')
    links = grab('<div class="a-row"><a class="a-link-normal a-text-normal" href="(.*?)"><span class="a-size-base a-color-price">')
    prices = grab('<span class="a-size-base a-color-price"><span class=.*?>(.*?)</span>')
    ranks = grab('<span class="zg-badge-text">#(.*?)</span></span>')
    images = grab('<div class="a-section a-spacing-small"><img alt=".*?src="(https.*?)" height="200" width="200"></div></span>')

    # Column order expected by the worksheet: rank, title, price, link, image.
    products_inf = [ranks, titles, prices, links, images]
    print(products_inf)

    save_pro_to_excel(products_inf, pro_name)
    down_imgs(ranks, images, pro_name)
55
# Generate a randomised request header.
def randHeader():
    """Return request headers whose User-Agent is picked at random.

    The ``Connection``, ``Accept`` and ``Accept-Language`` values are fixed;
    the ``User-Agent`` is drawn from a built-in pool of browser strings.

    Returns:
        dict: Header names mapped to header values.
    """
    agent_pool = (
        'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',
        'Opera/9.27 (Windows NT 5.2; U; zh-cn)',
        'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',
        'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',
        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11',
    )

    chosen_agent = random.choice(agent_pool)
    return {
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
        'User-Agent': chosen_agent,
    }
89
def start_url(pro_name, url):
    """Fetch one best-seller page and hand its HTML to ``down_products``.

    Args:
        pro_name: Category name forwarded to ``down_products``.
        url: Address of the best-seller listing page.

    Raises:
        Nothing is caught here; any exception raised by ``requests.get``
        propagates to the caller.
    """
    import html  # kept local: the module is not imported at file level

    # The headers must be passed by keyword. The second positional parameter
    # of requests.get is ``params``, so a positional dict would end up in the
    # query string instead of the HTTP headers.
    response = requests.get(url, headers=randHeader(), timeout=20)
    result = html.unescape(response.text)
    # Extract rank, title, lowest price, product link and image link, then
    # store them and download the images.
    down_products(result, pro_name)
98
if __name__ == '__main__':
    # cwd, path, wb and ws are module-level names read by the helper
    # functions above, so they have to keep exactly these names.
    cwd = os.getcwd()
    # os.path.join uses the platform's separator; a hard-coded backslash
    # only resolves to the intended file on Windows.
    path = os.path.join(cwd, 'AmazonBestsellers.xlsx')
    wb = load_workbook(path)
    ws = wb.worksheets[0]

    # Header row: category, rank, title, lowest price, product link, image link.
    table_titles = ['产品类别','序号','产品标题','产品最低价格','产品链接','产品图片链接']
    for column, table_title in enumerate(table_titles, start=1):
        ws.cell(1, column).value = table_title
    wb.save(path)

    # Examples of the listing URLs this script was first written against:
    #   first level - women's clothing
    #     https://www.amazon.com/Best-Sellers-Womens-Clothing/zgbs/fashion/1040660/ref=zg_bs_pg_1?_encoding=UTF8&pg=1
    #     https://www.amazon.com/Best-Sellers-Womens-Clothing/zgbs/fashion/1040660/ref=zg_bs_pg_2?_encoding=UTF8&pg=2
    #   second level - women's dresses
    #     https://www.amazon.com/Best-Sellers-Womens-Dresses/zgbs/fashion/1045024/ref=zg_bs_pg_1?_encoding=UTF8&pg=1
    #     https://www.amazon.com/Best-Sellers-Womens-Dresses/zgbs/fashion/1045024/ref=zg_bs_pg_2?_encoding=UTF8&pg=2
    #   third level - women's casual dresses
    #     https://www.amazon.com/Best-Sellers-Womens-Casual-Dresses/zgbs/fashion/2346727011/ref=zg_bs_pg_1?_encoding=UTF8&pg=1
    #     https://www.amazon.com/Best-Sellers-Womens-Casual-Dresses/zgbs/fashion/2346727011/ref=zg_bs_pg_2?_encoding=UTF8&pg=2

    # Every entry is a (link, label) pair; only the link is needed here.
    amazon_urls = [entry[0] for entry in url_cate_all_only()]
    print(len(amazon_urls))
    print(amazon_urls)

    for category_url in amazon_urls:
        # 'https://www.amazon.com/<slug>/...' -> the slug is the 4th piece.
        slug = category_url.split('/')[3]
        print(slug)
        # Drop the leading 'Best-Sellers-' (13 characters) to get the name.
        category = slug[len('Best-Sellers-'):]
        print(category)
        start_url(category, category_url)

最新文章

  1. C#设计模式-外观模式
  2. STM32F4读写内部FLASH【使用库函数】
  3. Memcached服务介绍及安装指南
  4. 二十一、【.Net开源框架】EFW框架Web前端开发之目录结构和使用FireBug调试方法
  5. Class 实现IDisposing方法
  6. cell点击按钮崩的一种情况
  7. php安装过程中遇到的需要安装的问题
  8. javadoc 生成帮助文档时,注意以下几点
  9. javascript 里找元素操作元素
  10. js scrollTop 事件
  11. Tesseract-OCR4.0识别中文与训练字库实例
  12. Spring出现事务代理的原因
  13. AGC030 简要题解
  14. Django 学习第四天——Django 模板标签
  15. freeswitch用户整合(使用mysql数据库的用户表)
  16. Unity5 AssetBundle系列——基本流程
  17. 【python51--__name__属性】
  18. Java设计模式(17)解释器模式(Interpreter模式)
  19. C++ 枚举转字符串
  20. KVM--安装及初步使用

热门文章

  1. 绕过CDN找到目标站点真实IP
  2. 配置文件 /etc/profile出错导致ls,vi不能用
  3. Snort中pcre和正则表达式的使用
  4. javascript的比较运算符
  5. Java并发机制(3)--volatile关键字与内存模型
  6. Springmvc入门基础(六) ---拦截器应用demo
  7. 学习FastDfs(一)
  8. 如何设置出IDEA中VCS下的Enable Version Control Integration
  9. 数据结构:DHU顺序表ADT模板设计及简单应用:找匹配
  10. Leetcode刷题之链表中箭头转移和内容转移