# -*- coding: utf-8 -*-
import os
import sys
import urllib.request
import requests
import re
from lxml import etree

# Seconds to wait on a stalled connection before giving up; requests.get()
# has no timeout by default and could otherwise block forever.
REQUEST_TIMEOUT = 30

# Matches one ranking section header on the index page and captures
# (section title, "more" link URL).
_SECTION_RE = re.compile(
    r'<div class="titleBar" id=".*?"><h2>(.*?)</h2><div class="more"><a href="(.*?)">.*?</a></div></div>',
    re.S,
)


def StringListSave(save_path, filename, slist):
    """Write pairs of text to ``<save_path>/<filename>.txt``.

    Each pair becomes one line of the form ``first<TAB><TAB>second``.  The
    directory is created when missing and an existing file is overwritten.

    Args:
        save_path: Directory that receives the file.
        filename: File name without the ``.txt`` extension.
        slist: Iterable of 2-item sequences of ``str``.
    """
    os.makedirs(save_path, exist_ok=True)
    path = os.path.join(save_path, filename + ".txt")
    # Encode once at the file boundary.  Formatting ``s.encode("utf8")`` with
    # "%s" on Python 3 would write the bytes repr (b'...') instead of the text.
    with open(path, "w", encoding="utf-8") as fp:
        fp.writelines("%s\t\t%s\n" % (s[0], s[1]) for s in slist)


def Page_Info(myPage):
    """Extract the ranking sections from the index page HTML.

    Args:
        myPage: HTML source of the index page as ``str``.

    Returns:
        A list of ``(section_title, section_url)`` tuples, one per
        ``titleBar`` block found by the regular expression.
    """
    return _SECTION_RE.findall(myPage)


def New_Page_Info(new_page):
    """Extract the linked entries from the table rows of a section page.

    Uses XPath (via lxml) to visit every ``<a>`` directly inside a
    ``<tr>/<td>``.

    Args:
        new_page: HTML source of a section page as ``str``.

    Returns:
        A list of ``(title, url)`` tuples.  The title is the anchor's own
        text and the url is its ``href``; anchors without an ``href`` are
        skipped.
    """
    dom = etree.HTML(new_page)
    if dom is None:
        return []
    results = []
    # Read title and href from the same anchor element so the two can never
    # get out of step (two independent XPath queries may differ in length).
    for anchor in dom.xpath('//tr/td/a'):
        href = anchor.get("href")
        if href is None:
            continue
        title = "".join(anchor.xpath("text()"))
        results.append((title, href))
    return results


def _fetch_page(url):
    """Download *url* and return the response body decoded as GBK text."""
    print("downloading ", url)
    return requests.get(url, timeout=REQUEST_TIMEOUT).content.decode("gbk")


def Spider(url):
    """Download the index page at *url* and every section it links to.

    The section list is saved as file ``0_新闻排行榜.txt`` and each section's
    entries as ``<n>_<section title>.txt`` (n starting at 1), all inside the
    ``网易新闻抓取`` directory relative to the current working directory.

    Args:
        url: Address of the index page.
    """
    myPageResults = Page_Info(_fetch_page(url))
    save_path = u"网易新闻抓取"
    StringListSave(save_path, "0_" + u"新闻排行榜", myPageResults)
    for i, (item, section_url) in enumerate(myPageResults, start=1):
        newPageResults = New_Page_Info(_fetch_page(section_url))
        StringListSave(save_path, str(i) + "_" + item, newPageResults)


if __name__ == '__main__':
    print("start")
    start_url = "http://news.163.com/rank/"
    Spider(start_url)
    print("end")

  

最新文章

  1. ios图文混排
  2. C# 托管和非托管混合编程
  3. Java内存浅析分类
  4. 带你走近AngularJS - 体验指令实例
  5. Chapter 5. Graph Theory:: Fundamentals:: Intermediate
  6. windows查看端口占用
  7. 【Deep Learning学习笔记】Efficient Estimation of Word Representations in Vector Space_google2013
  8. 使用nRF51822/nRF51422创建一个简单的BLE应用 ---入门实例手册(中文)之五
  9. codeblocks快捷键及设置
  10. Cocoapods最新安装教程
  11. nth-child(n)和nth-of-type(n)的区别
  12. .net core 2.x - 日志 - to elasticsearch - (2)
  13. x变成y的最少操作次数(层次遍历)
  14. java 判断字符串IP合法性以及获取IP的数值形式
  15. uoj407 【IOI2018】狼人
  16. Nginx负载均衡的五种策略
  17. [转载]一步一步教你如何在Virtualbox虚拟机中安装Remix
  18. JSP小例子——实现用户登录小例子(不涉及DB操作)
  19. 十个书写Node.js REST API的最佳实践(下)
  20. Eclipse项目上红叉

热门文章

  1. python摸爬滚打之day02----while循环,运算符,格式化输出
  2. unittest框架assert断言
  3. 注解之@CookieValue
  4. 20170725 Python 必须使用的Url编码
  5. TlistView基本使用
  6. 006-优化web请求二-应用缓存、异步调用【Future、ListenableFuture、CompletableFuture】、ETag、WebSocket【SockJS、Stomp】
  7. vue.js安装问题
  8. C++的函数功能总结
  9. javascript封装animate动画
  10. Python Singleton模式