#coding=utf-8

import time
import requests
from lxml import etree
from pymongo import MongoClient
from selenium import webdriver

# Crawl question pages of wenda.autohome.com.cn with a Selenium-driven Chrome
# and store one MongoDB document per page that has both a question title and
# at least one answer.

client = MongoClient("IP", 27017)
db = client["Automobile"]
collection = db["wenda_autohome"]
db.authenticate("","")

driver = webdriver.Chrome(executable_path=r"D:\chromedriver_win32\chromedriver.exe")


def splist(l, s):
    """Split sequence ``l`` into consecutive chunks of length ``s``.

    The final chunk is shorter when ``len(l)`` is not a multiple of ``s``.
    An empty ``l`` yields an empty list.
    """
    return [l[i: i+s] for i in range(0, len(l), s)]


try:
    for i in range(36726, 40202):
        # Example page: https://wenda.autohome.com.cn/topic/detail/40195
        url = 'https://wenda.autohome.com.cn/topic/detail/' + str(i)

        # Pause between page loads so the site is not hit back-to-back.
        time.sleep(1)
        driver.get(url)
        html = driver.page_source
        tree = etree.HTML(html)
        if tree is None:
            # lxml returns None when the page source has no parsable content.
            continue

        question = tree.xpath("//h1[@class='card-title']/text()")
        answer_list = tree.xpath("//a[@class='text']/text()")
        if not question or not answer_list:
            # Nothing to store without a title and at least one answer.
            continue

        for n, j in enumerate(answer_list, 1):
            idx = n - 1
            # Drop the fixed-width prefix/suffix around each answer preview
            # (presumably whitespace padding in the markup -- TODO confirm).
            answer_list[idx] = j[41:-37]
            if not answer_list[idx].endswith('...'):
                # Preview is not truncated; keep it as the full answer.
                continue

            # Truncated preview: follow the n-th reply's "more" link (XPath
            # positions are 1-based) and read the full text from that page.
            s = "//div[@class='card-reply-wrap'][" + str(n) + "]//a[@class='more']"
            try:
                driver.find_element_by_xpath(s).click()

                html_answer = driver.page_source
                tree_answer = etree.HTML(html_answer)
                answer_part = tree_answer.xpath("//div[@class='answer-content']/div/div[@class='ahe__area ahe__block ahe__text']/p/text()")
                answer_list[idx] = ''.join(answer_part)

                # Return to the question page so the next "more" link can be
                # located.
                time.sleep(1)
                driver.get(url)
            except Exception as e:
                # Best effort: report the failure and keep the truncated
                # preview for this answer.
                print(e)

        keywords = tree.xpath("//ul[@class='card-tag-list']/li/text()")
        discription_list = tree.xpath("//div[@class='ahe__area ahe__block ahe__text']/p/text()")
        discription = ''.join(discription_list)

        # Praise counters are grouped two per entry in the stored document.
        zancai = tree.xpath("//span[@class='js-praise-count']/text()")
        zancai_list = splist(zancai, 2)

        # Key names (including the 'discription' spelling) are kept as-is so
        # new documents match those already stored in the collection.
        dc = {
            'keywords': keywords,
            'question': question[0],
            'discription': discription,
            'answer': answer_list,
            'zancai': zancai_list,
            'url': url,
        }
        collection.insert(dc)
finally:
    # Always shut the browser down, even when the crawl aborts part-way;
    # quit() also ends the chromedriver process, which close() leaves running.
    driver.quit()

最新文章

  1. 面对bug和困难的心态
  2. Linux SHELL 命令入门题目(一)
  3. DS实验题 融合软泥怪-2 Heap实现
  4. myeclipse 10打开status.xml 卡死
  5. sql 读取本地txt文件批量插入数据库
  6. hdu5255 魔法因子
  7. 第二回 认识CDN
  8. php_1
  9. Beaglebone Back学习二(功能测试)
  10. socketpair的使用
  11. Mysql主从配置+读写分离(转)
  12. p1349星屑幻想
  13. WEB安全:文件上传漏洞
  14. ES6箭头函数Arrow Function
  15. Feature Extractor[googlenet v1]
  16. 面试 12:玩转 Java 快速排序
  17. POJ 2987 Firing (最大权闭合图)
  18. Android 虹软2.0人脸识别,注册失败问题 分析synchronized的作用
  19. 深度学习原理与框架-CNN在文本分类的应用 1.tf.nn.embedding_lookup(根据索引数据从数据中取出数据) 2.saver.restore(加载sess参数)
  20. 数独 php

热门文章

  1. 洛谷P3943星空
  2. js相关数组迭代方法图解
  3. Eclipse控制台
  4. Loj10222 佳佳的Fibonacci(矩阵乘法)
  5. 第4天:Ansible模块
  6. Flask实战第37天:服务器权限验证
  7. Web应用扫描测试工具Vega
  8. 主数据及其管理MDM
  9. Spring的事务传播性
  10. [BZOJ4710][JSOI2011]分特产(组合数+容斥原理)