通过Selenium模拟浏览器抓取淘宝商品美食信息,并存储到MongoDB数据库中。

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery as pq
import re
import json
from config import *
import pymongo
# Shared MongoDB handle and browser session used by every function below.
client = pymongo.MongoClient(MONGO_URL)
db = client[MONGO_DB]
browser = webdriver.Firefox()
wait = WebDriverWait(browser, 10)  # explicit waits give up after 10 seconds


def search(max_retries=None):
    """Open the Taobao home page, search for '美食' and scrape the first page.

    The whole sequence is retried whenever an explicit wait times out.

    Args:
        max_retries: Maximum number of retries after a timeout. ``None``
            (the default) retries indefinitely.

    Returns:
        The text of the pager's ``div.total`` element.

    Raises:
        TimeoutException: If ``max_retries`` is set and has been exhausted.
    """
    retries = 0
    # Retry in a loop rather than by recursion so that a long run of
    # timeouts cannot exhaust the interpreter's recursion limit.
    while True:
        try:
            browser.get('https://www.taobao.com')
            search_box = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '#q'))
            )
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR, '#J_TSearchForm > div.search-button > button')
            ))
            search_box.send_keys('美食')
            submit.click()
            total = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.total')
            ))
            get_products()
            return total.text
        except TimeoutException:
            retries += 1
            if max_retries is not None and retries > max_retries:
                raise


def next_page(page_number, max_retries=None):
    """Jump to result page ``page_number`` via the pager form and scrape it.

    The page number is typed into the pager's input box and submitted; the
    function then waits until the active pager item shows that number before
    calling ``get_products()``. The sequence is retried on timeout.

    Args:
        page_number: Number of the result page to load.
        max_retries: Maximum number of retries after a timeout. ``None``
            (the default) retries indefinitely.

    Raises:
        TimeoutException: If ``max_retries`` is set and has been exhausted.
    """
    retries = 0
    while True:
        try:
            page_box = wait.until(EC.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-pager > div > div > div > div.form > input')
            ))
            submit = wait.until(EC.element_to_be_clickable(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit')
            ))
            page_box.clear()
            page_box.send_keys(str(page_number))
            submit.click()
            # The highlighted pager item must show the requested number
            # before the page source is parsed.
            wait.until(EC.text_to_be_present_in_element(
                (By.CSS_SELECTOR,
                 '#mainsrp-pager > div > div > div > ul > li.item.active > span'),
                str(page_number)
            ))
            get_products()
            return None
        except TimeoutException:
            retries += 1
            if max_retries is not None and retries > max_retries:
                raise


# Optional alternative sink (disabled): append each product as a JSON line.
# def write_to_file(content):
#     with open('E:/python/Projects/test1/result.txt', 'a', encoding='utf-8') as f:
#         f.write(json.dumps(content, ensure_ascii=False) + '\n')


def get_products():
    """Parse the current result page and store every listed product.

    Waits until at least one item is present in the item list, parses the
    page source with PyQuery, and for each item builds a dict with the keys
    ``image``, ``price``, ``deal``, ``title``, ``shop`` and ``location``,
    prints it, and passes it to ``save_to_mongo``.
    """
    wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')
    ))
    doc = pq(browser.page_source)
    for item in doc('#mainsrp-itemlist .items .item').items():
        product = {
            'image': item.find('.pic .img').attr('src'),
            'price': item.find('.price').text(),
            # Drops the last three characters of the deal text
            # (presumably a fixed suffix such as '人付款' -- TODO confirm).
            'deal': item.find('.deal-cnt').text()[:-3],
            'title': item.find('.title').text(),
            'shop': item.find('.shop').text(),
            'location': item.find('.location').text(),
        }
        print(product)
        save_to_mongo(product)
        # write_to_file(product)
def save_to_mongo(result):
    """Insert one product document into the ``MONGO_TABLE`` collection.

    Storage is best-effort: any failure is reported on stdout and swallowed
    so that one bad document does not abort the crawl.

    Args:
        result: The product dict to store.
    """
    try:
        # insert_one replaces Collection.insert, which was deprecated in
        # PyMongo 3 and removed in PyMongo 4.
        db[MONGO_TABLE].insert_one(result)
        print('存储到MONGODB成功', result)
    except Exception as exc:
        # Include the exception so the cause of the failure is visible.
        print('存储到MONGODB失败', result, exc)


def parse_total_pages(text):
    """Return the first run of digits in ``text`` as an int.

    Args:
        text: The pager summary text returned by ``search()``.

    Raises:
        ValueError: If ``text`` contains no digits.
    """
    match = re.search(r'(\d+)', text)
    if match is None:
        raise ValueError('no page count found in {!r}'.format(text))
    return int(match.group(1))


def main():
    """Run the crawl: search, then visit every remaining result page.

    The browser is shut down when the crawl ends, whether it finished
    normally or raised.
    """
    try:
        total = parse_total_pages(search())
        # Page 1 was already scraped by search(); continue from page 2.
        for page in range(2, total + 1):
            next_page(page)
    finally:
        browser.quit()


if __name__ == '__main__':
    main()

最新文章

  1. HDU 1817Necklace of Beads(置换+Polya计数)
  2. <工作一周的心情总结>
  3. javascript除法如何取整
  4. Xcode 设置 ARC&MRC混用
  5. 颜色表及html代码
  6. CodeForces 478C Table Decorations
  7. (转)VS.NET2010水晶报表安装部署[VS2010]
  8. android学习笔记五——AutoCompleteTextView
  9. ThinkPHP函数详解:L方法
  10. BZOJ 1600: [Usaco2008 Oct]建造栅栏
  11. POJ 3422 Kaka's Matrix Travels(费用流)
  12. 安卓培训第五天---上传文件SD卡
  13. Shell学问: 调用脚本之间
  14. java进行图片和字符串的互相转换
  15. vue+element-ui实现表格编辑(增加或删除行,删除单行或删除多行)
  16. [20171120]理解v$session的state字段(11G).txt
  17. Vue的watch监听事件
  18. JAVA多线程基础学习三:volatile关键字
  19. python(unittest)报告导出(一):使用HTMLTestRunner导出
  20. js千分位处理

热门文章

  1. Day 1:线程与进程系列问题(一)
  2. 每天一点点之laravel框架开发 - passport授权报invalid_credentials
  3. Tensorflow学习教程------tensorboard网络运行和可视化
  4. 如何在MySQL目录下找到my.ini
  5. 目录服务不能与此服务器复制,因为距上一次与此服务器复制的时间已经超过了 tombstone 生存时间。
  6. JavaScript 之 原型及原型链
  7. 因子分析和PCA总结
  8. 面试准备 HTTP协议
  9. MySQL--通过.frm和.ibd对mysql数据恢复
  10. MySQL--InnoDB 关键特性