网易新闻 spider
2024-10-15 01:30:32
# -*- coding: utf-8 -*-
import os
import sys
import urllib.request
import requests
import re
from lxml import etree

# Seconds to wait for the server before giving up on a request, so that a
# stalled connection cannot hang the whole crawl.
_REQUEST_TIMEOUT = 30

# Matches one ranking section header on the start page and captures
# (section title, URL of the section's "more" page).
_SECTION_RE = re.compile(
    r'<div class="titleBar" id=".*?"><h2>(.*?)</h2><div class="more"><a href="(.*?)">.*?</a></div></div>',
    re.S,
)

# Characters that cannot safely appear in a file name on common platforms.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def StringListSave(save_path, filename, slist):
    """Write pairs of strings to ``<save_path>/<filename>.txt`` as UTF-8 text.

    Each element of *slist* is written as one line of the form
    ``first<TAB><TAB>second``.

    Args:
        save_path: Directory to write into; created if it does not exist.
        filename: Base name of the output file, without the ``.txt`` suffix.
            Characters that are invalid in file names are replaced by ``_``.
        slist: Iterable of indexable items whose elements 0 and 1 are strings.
    """
    os.makedirs(save_path, exist_ok=True)
    # Section titles come from scraped HTML and may contain path separators.
    safe_name = _UNSAFE_FILENAME_CHARS.sub("_", filename)
    path = os.path.join(save_path, safe_name + ".txt")
    # Write str directly with an explicit encoding: formatting encoded bytes
    # with "%s" would put the b'...' repr into the file under Python 3.
    with open(path, "w", encoding="utf-8") as fp:
        fp.writelines("%s\t\t%s\n" % (s[0], s[1]) for s in slist)


def Page_Info(myPage):
    """Extract the ranking sections from the start page using a regex.

    Args:
        myPage: Decoded HTML text of the start page.

    Returns:
        A list of ``(section_title, section_url)`` tuples, one per matching
        ``titleBar`` block; empty if nothing matches.
    """
    return _SECTION_RE.findall(myPage)


def New_Page_Info(new_page):
    """Extract the linked entries from a section page using XPath.

    Every ``<a>`` directly inside a ``<tr>/<td>`` that has an ``href``
    attribute yields one result. Text and URL are read from the same element,
    so the two can never get out of step with each other.

    Args:
        new_page: Decoded HTML text of a section page.

    Returns:
        A list of ``(link_text, link_url)`` tuples; empty if the page is
        blank or contains no matching links.
    """
    if not new_page or not new_page.strip():
        return []
    dom = etree.HTML(new_page)
    if dom is None:
        return []
    results = []
    for anchor in dom.xpath('//tr/td/a'):
        href = anchor.get("href")
        if href is None:
            continue
        # itertext() also collects text nested in child tags (e.g. <b>).
        results.append(("".join(anchor.itertext()), href))
    return results


def _fetch_gbk(url):
    """Download *url* and return its body decoded as GBK text.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an
            HTTP error status.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    # A single undecodable byte should not abort the crawl.
    return response.content.decode("gbk", errors="replace")


def Spider(url):
    """Crawl the ranking start page at *url* and save every section to disk.

    The section list of the start page is saved as file ``0_...``; the
    entries of each section page are saved to their own numbered file named
    after the section title. A section that fails to download is reported
    and skipped.

    Args:
        url: URL of the ranking start page.

    Raises:
        requests.RequestException: if the start page cannot be downloaded.
    """
    print("downloading ", url)
    myPage = _fetch_gbk(url)
    myPageResults = Page_Info(myPage)
    save_path = "网易新闻抓取"
    StringListSave(save_path, "0_" + "新闻排行榜", myPageResults)
    for index, (item, section_url) in enumerate(myPageResults, start=1):
        print("downloading ", section_url)
        try:
            new_page = _fetch_gbk(section_url)
        except requests.RequestException as exc:
            print("failed to download ", section_url, exc)
            continue
        newPageResults = New_Page_Info(new_page)
        StringListSave(save_path, str(index) + "_" + item, newPageResults)


if __name__ == '__main__':
    print("start")
    start_url = "http://news.163.com/rank/"
    Spider(start_url)
    print("end")
最新文章
- ios图文混排
- C# 托管和非托管混合编程
- Java内存浅析分类
- 带你走近AngularJS - 体验指令实例
- Chapter 5. Graph Theory:: Fundamentals:: Intermediate
- windows查看端口占用
- 【Deep Learning学习笔记】Efficient Estimation of Word Representations in Vector Space_google2013
- 使用nRF51822/nRF51422创建一个简单的BLE应用 ---入门实例手册(中文)之五
- codeblocks快捷键及设置
- Cocoapods最新安装教程
- nth-child(n)和nth-of-type(n)的区别
- .net core 2.x - 日志 - to elasticsearch - (2)
- x变成y的最少操作次数(层次遍历)
- java 判断字符串IP合法性以及获取IP的数值形式
- uoj407 【IOI2018】狼人
- Nginx负载均衡的五种策略
- [转载]一步一步教你如何在Virtualbox虚拟机中安装Remix
- JSP小例子——实现用户登录小例子(不涉及DB操作)
- 十个书写Node.js REST API的最佳实践(下)
- Eclipse项目上红叉
热门文章
- python摸爬滚打之day02----while循环,运算符,格式化输出
- unittest框架assert断言
- 注解之@CookieValue
- 20170725 Python 必须使用的Url编码
- TlistView基本使用
- 006-优化web请求二-应用缓存、异步调用【Future、ListenableFuture、CompletableFuture】、ETag、WebSocket【SockJS、Stomp】
- vue.js安装问题
- C++的函数功能总结
- javascript封装animate动画
- Python Singleton模式