import requests
import cchardet
import traceback
from lxml import etree


def downloader(url, timeout=10, headers=None, debug=False, binary=False):
    """Fetch *url* with a GET request and return ``(status, html, redirected_url)``.

    Args:
        url: Address to download.
        timeout: Timeout in seconds handed to ``requests.get``.
        headers: Optional request headers. When omitted (or empty) a default
            browser-like ``User-Agent`` header is sent instead.
        debug: When true, print the traceback of a failed download.
        binary: When true, return the raw body as ``bytes``; otherwise decode
            it to ``str`` using the encoding detected by ``cchardet``.

    Returns:
        A ``(status, html, redirected_url)`` tuple. ``status`` is the HTTP
        status code, or ``0`` when the request or the decoding failed, in
        which case ``html`` is empty (``b""`` or ``""``) and
        ``redirected_url`` is *url* unchanged. On success ``redirected_url``
        is the final URL reported by the response.
    """
    default_headers = {
        'User-Agent': ('Mozilla/5.0 (compatible; MSIE 9.0; '
                       'Windows NT 6.1; Win64; x64; Trident/5.0)')
    }
    # Fall back to the default headers only when the caller supplied none.
    if not headers:
        headers = default_headers
    redirected_url = url
    try:
        # ``headers`` must be passed by keyword: the second positional
        # argument of ``requests.get`` is ``params`` (the query string).
        res = requests.get(url, headers=headers, timeout=timeout)
        if binary:
            html = res.content
        else:
            # Detection may report ``None`` (e.g. for an empty body);
            # fall back to UTF-8 instead of failing on ``decode(None)``.
            encoding = cchardet.detect(res.content)["encoding"] or "utf-8"
            html = res.content.decode(encoding)
        status = res.status_code
        redirected_url = res.url
    except Exception:
        # Best-effort download: report the failure and return an empty body
        # rather than propagating network or decoding errors to the caller.
        if debug:
            traceback.print_exc()
        msg = "failed download:{}".format(url)
        print(msg)
        html = b"" if binary else ""
        status = 0
    return status, html, redirected_url


def parser(html):
    """Print and return the links found in the page's ``list-a`` lists.

    Looks for ``<a>`` elements inside ``<ul>`` elements whose class contains
    ``list-a``, within ``clearfix`` divs that are direct children of
    ``<div class="main">``. Each link is printed as ``n (href, title)`` with
    a running 1-based counter.

    Args:
        html: HTML document as ``str`` or ``bytes``. Empty input (what
            ``downloader`` returns after a failed download) yields no links.

    Returns:
        A list of ``(href, title)`` tuples in document order. Anchors lacking
        an ``href`` attribute or a direct text node are skipped.
    """
    if not html:
        return []
    tree = etree.HTML(html)
    if tree is None:
        # Nothing parseable in the document.
        return []

    links = []
    sections = tree.xpath(".//div[@class = 'main']/div[contains(@class,'clearfix')]")
    for section in sections:
        for anchor in section.xpath(".//ul[contains(@class,'list-a')]//a"):
            hrefs = anchor.xpath("./@href")
            texts = anchor.xpath("./text()")
            if not hrefs or not texts:
                continue
            # NOTE(review): these calls remove the literal two-character
            # sequences backslash-n / backslash-t, not real newline or tab
            # characters -- confirm that this is what was intended.
            href = hrefs[0].strip().replace("\\n", '').replace('\\t', '')
            title = texts[0].strip().replace("\\n", '').replace('\\t', '')
            links.append((href, title))
            print(len(links), (href, title))
    return links


if __name__ == '__main__':
    url = r"https://www.sina.com.cn/"
    status, html, redirected_url = downloader(url)
    links = parser(html)
    # Uncomment to inspect the raw download result:
    # print(status, html, redirected_url)

最新文章

  1. SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");//设置日期格式
  2. YII 的源码分析(-)
  3. HTTP协议发展脉络
  4. 【CC评网】2013.第44周 把握每天的第一个小时
  5. winfrom LED时钟
  6. PHPCMS搭建wap手机网站
  7. Android开发必知--自定义Toast提示
  8. 调试出不来 断点不起作用 调试技巧 MyEclipse进不了调试
  9. ps 命令的十个简单用法
  10. 19. vue的原理
  11. android + eclipse + 后台静默安装(一看就会)
  12. ShopEx customSchema 定制能够依据客户的需求对站点进行对应功能的加入改动或者删除
  13. LeetCode 318. Maximum Product of Word Lengths (状态压缩)
  14. linux下的pd
  15. J2SE 8的注解
  16. [BZOJ 4573][ZJOI 2016]大森林
  17. ECShop 2.x 3.0代码执行漏洞分析
  18. 浅谈ES6新特性
  19. Android Studio NDK环境配置
  20. 问题集录04--json和jsonp讲解

热门文章

  1. Logstash:Email output plugin 检查日志中是否还有某些错误信息并发送邮件报警
  2. 使用docker-compose方式安装redash
  3. “kill -9”一时爽,秋后算账泪两行
  4. Lock 锁底层实现
  5. bootstrapValidator 参数校验框架
  6. Java注解(2):实现自己的ORM
  7. SpringBoot后端接口项目
  8. Vue3 JS 与 SCSS 变量相互使用
  9. GitLab CI/CD 自动化部署入门
  10. 盘它!基于CANN的辅助驾驶AI实战案例,轻松搞定车辆检测和车距计算!