# Scratch scraper: query magi.com for one term and print every extracted
# fact together with the number found in its first <span> text.
import re

url = "https://magi.com/search"

querystring = {"q": "堕却乡"}

# Browser headers captured from a Postman session.
# NOTE(review): the cookie and Postman-Token are hard-coded session values
# and will presumably expire — confirm they are still needed.
# NOTE(review): "br" is advertised in accept-encoding; requests can only
# decode Brotli bodies when a brotli package is installed — confirm, or
# drop "br" from the header.
headers = {
    'authority': "magi.com",
    'pragma': "no-cache",
    'cache-control': "no-cache,no-cache",
    'upgrade-insecure-requests': "1",
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    'referer': "https://magi.com/search?q=%E7%89%B9%E6%96%AF%E6%8B%89",
    'accept-encoding': "gzip, deflate, br",
    'accept-language': "zh-CN,zh;q=0.9",
    'cookie': "acw_tc=7af6142615735221487104171e68298facdedf1e07add2205636582990",
    'Postman-Token': "dda0d475-41b9-44b4-812a-6dd489fe19dd,64d3ddc4-7036-4c42-bff6-53dcbc065db2",
    'Host': "magi.com",
    'Connection': "keep-alive",
}

# Compiled once instead of on every loop iteration; raw string so that
# ``\d`` is a regex escape rather than an invalid string escape.
RELIABILITY_DIGITS = re.compile(r"(\d{1,3})")


def parse_reliability(texts):
    """Return the last 1-3 digit group found in ``texts[0]``.

    Args:
        texts: Sequence of text fragments (the span text nodes of one fact).

    Returns:
        The digits as a string, or ``None`` when ``texts`` is empty or its
        first fragment contains no digits (the original code raised
        ``IndexError`` in both cases).
    """
    if not texts:
        return None
    groups = RELIABILITY_DIGITS.findall(texts[0])
    return groups[-1] if groups else None


def fetch_facts(search_url, params, request_headers, timeout=10):
    """Fetch a magi.com result page and extract its fact cells.

    Args:
        search_url: URL of the search endpoint.
        params: Query-string parameters for the GET request.
        request_headers: HTTP headers to send.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script.

    Returns:
        A list of ``(reliability, sop, sop_url)`` tuples, one per fact cell:
        ``reliability`` comes from :func:`parse_reliability`, ``sop`` is the
        list of ``<dd>`` text nodes and ``sop_url`` the list of link hrefs.
    """
    # Third-party modules are imported here so that the pure helper above
    # stays importable without them.
    import lxml.etree
    import requests

    response = requests.request(
        "GET",
        search_url,
        headers=request_headers,
        params=params,
        timeout=timeout,
    )
    tree = lxml.etree.HTML(response.text)

    facts = []
    for cell in tree.xpath("//main//div[@data-type='fact']//article[@class='fact']"):
        sop = cell.xpath(".//dl/dd//text()")
        sop_url = cell.xpath(".//div/ul//ol//li//a//@href")
        reliability = parse_reliability(cell.xpath(".//div//span//text()"))
        facts.append((reliability, sop, sop_url))
    return facts


if __name__ == "__main__":
    for reliability, sop, sop_url in fetch_facts(url, querystring, headers):
        print(reliability, sop, sop_url)
# Scratch extractor: download one article and collect every sentence that
# mentions a given (subject, predicate, object) triple.
import re

url = "https://www.tuicool.com/articles/jiyEnq7"

# Only the User-Agent is sent; the other browser headers captured from the
# Postman session were disabled in the original script.
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
}

spo = ['特斯拉', '电池供应商', '松下']

# Sentence terminators: ideographic full stop plus ASCII and full-width
# "!" / "?". A character class is used because the former alternation
# "(。|!|?)" is not a valid pattern ("?" has nothing to repeat). The
# full-width forms are written as escapes so they survive text normalisation.
SENTENCE_END = re.compile(r"([。!?\uFF01\uFF1F])")

# Predicates for which only subject and object must occur in the sentence;
# presumably these are generic labels that never appear literally in text.
GENERIC_PREDICATES = frozenset({"描述", "标签", "近义项"})


def split_sentences(text):
    """Split ``text`` into sentences, keeping each terminator attached.

    Args:
        text: One text fragment (e.g. a single HTML text node).

    Returns:
        A list of sentences. Text after the last terminator is kept as a
        final element when non-empty (the original code dropped it); an
        empty ``text`` yields an empty list.
    """
    # With one capture group, re.split returns
    # [body, terminator, body, terminator, ..., tail].
    parts = SENTENCE_END.split(text)
    sentences = [body + end for body, end in zip(parts[0::2], parts[1::2])]
    tail = parts[-1]
    if tail:
        sentences.append(tail)
    return sentences


def matches_spo(sentence, spo):
    """Tell whether ``sentence`` mentions the triple ``spo``.

    Args:
        sentence: Sentence to inspect.
        spo: ``[subject, predicate, object]`` strings.

    Returns:
        ``True`` when subject and object both occur in ``sentence`` and,
        unless the predicate is one of ``GENERIC_PREDICATES``, the predicate
        occurs as well.
    """
    subject, predicate, obj = spo
    if predicate in GENERIC_PREDICATES:
        required = (subject, obj)
    else:
        required = (subject, predicate, obj)
    return all(term in sentence for term in required)


def find_spo_sentences(texts, spo):
    """Return, in document order, every sentence in ``texts`` matching ``spo``.

    Args:
        texts: Iterable of text fragments.
        spo: ``[subject, predicate, object]`` strings.
    """
    found = []
    for text in texts:
        for candidate in split_sentences(text):
            if matches_spo(candidate, spo):
                found.append(candidate)
    return found


def fetch_body_texts(page_url, request_headers, timeout=10):
    """Download ``page_url`` and return the text nodes under ``<body>``.

    Args:
        page_url: URL of the article to fetch.
        request_headers: HTTP headers to send.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script.
    """
    # Third-party modules are imported here so that the pure helpers above
    # stay importable without them.
    import chardet
    import lxml.etree
    import requests

    response = requests.request(
        "GET", page_url, headers=request_headers, timeout=timeout
    )
    # Decode with the charset sniffed from the raw bytes rather than the one
    # taken from the HTTP headers.
    response.encoding = chardet.detect(response.content)["encoding"]
    tree = lxml.etree.HTML(response.text)
    return tree.xpath("//body//text()")


if __name__ == "__main__":
    spo_sentence = find_spo_sentences(fetch_body_texts(url, headers), spo)
    for sentence in spo_sentence:
        print(sentence)
    if spo_sentence:
        item = {
            "spo_sentence": spo_sentence,
            "spo": spo,
        }
        print(item)

  

最新文章

  1. 无限分页//////////////zz
  2. 黄聪:解决Web部署 svg/woff/woff2字体 404错误
  3. Android TextView 高亮字体并添加点击事件
  4. Git add 常见用法
  5. Ubuntu 环境 运行Asp.net mvc +EntityFramework+ Mysql
  6. 【转】SSM框架——详细整合教程(Spring+SpringMVC+MyBatis)
  7. 夺命雷公狗---DEDECMS----20dedecms取出栏目页对应的内容
  8. java基础学习之 消息对话款
  9. Nunit 使用介绍
  10. 55个高质量的Magento主题,助你构建电子商务站点
  11. 利用ExcelDataReader封装类 导入表格数据
  12. delphi ExecWB
  13. linux系统安装对硬件有什么要求
  14. LinkedList和ArrayList的区别
  15. 【JAVA编码专题】深入分析 Java 中的中文编码问题
  16. Mock以及Mockito的使用
  17. FZU 2256 迷宫
  18. 如何在web项目中添加javamelody monitoring 监控。
  19. C# 根据路径删除文件或文件夹
  20. [Leetcode 771]宝石和石子 Jewels and Stones HashSet简单应用

热门文章

  1. 爬虫实现51job谁看过我的简历多条记录功能
  2. Hive概述
  3. [转帖]差之毫厘谬之千里!带你认识CPU后缀含义
  4. JVM 和 GC
  5. Snoopy.class.php使用手册
  6. kettle处理未发现hadoop插件问题
  7. 交替方向乘子法(ADMM)的原理和流程的白话总结
  8. java语言中使用三元式的时候应该注意的问题
  9. 【原创】大数据基础之ETL vs ELT or DataWarehouse vs DataLake
  10. Qtspim和MIPS的坑