新浪新闻按keyword抓取实例

import urllib2

import requests

#import MySQLdb

import webbrowser

import string

import re

from BeautifulSoup import BeautifulSoup

def getHtml(page, timeout=10):
    """Fetch one page of Sina news search results and return its HTML.

    The search URL is fixed: it carries a hard-coded, percent-encoded
    keyword in ``q=`` (presumably GBK-encoded -- confirm before editing
    it), restricts the search to news and sorts the results by time.

    Args:
        page: Page number to request. It is converted with ``str`` and
            appended to the ``page=`` query parameter.
        timeout: Seconds ``requests`` waits on the server before giving
            up. Without it a stalled connection blocks forever.

    Returns:
        The response body as decoded by ``requests`` (``Response.text``).

    Raises:
        requests.exceptions.RequestException: on connection failure or
            when the timeout expires.
    """
    url = ("http://search.sina.com.cn/?q=%BD%F0%D0%E3%CF%CD"
           "&range=all&c=news&sort=time&page=" + str(page))
    return requests.get(url, timeout=timeout).text

def getPage(timeout=10):
    """Return how many full result pages the hard-coded search yields.

    Downloads the search landing page, takes the ``div`` elements whose
    class is ``l_v2``, and reads the article count that sits between the
    two marker strings used below. The count may contain thousands
    separators (","), which are stripped before conversion.

    Args:
        timeout: Seconds ``requests`` waits on the server before giving
            up. Without it a stalled connection blocks forever.

    Returns:
        The article count floor-divided by 20, as an int.

    Raises:
        IndexError: if the first marker string is missing from the page.
        ValueError: if the text between the markers is not a number.
        requests.exceptions.RequestException: on connection failure or
            when the timeout expires.
    """
    # The original literal was broken across two lines; this is the same URL.
    url = ("http://search.sina.com.cn/?"
           "range=all&c=news&q=%BD%F0%D0%E3%CF%CD&from=home")
    html = requests.get(url, timeout=timeout).text

    soup = BeautifulSoup(html)
    summary_divs = soup('div', {'class': 'l_v2'})

    # Slice the article count out of the rendered markup, then drop the
    # thousands separators so that e.g. "1,234" becomes "1234".
    count_text = str(summary_divs).split("新闻")[1].split("篇")[0]
    article_count = int(count_text.replace(",", ""))

    # 20 is presumably the number of results per page -- TODO confirm.
    # NOTE(review): floor division ignores a trailing, partially filled
    # page; this matches the original behaviour and is kept on purpose.
    return article_count // 20

def getContents(html):
    """Print each result's source and time; return titles and summaries.

    Two kinds of elements are read from the parsed page:

    * ``span`` elements whose class matches ``fgray_time``. Their text is
      split on single spaces. With exactly three parts, the first is
      printed as the news source and the last two, re-joined with a
      space, as the publication time. Otherwise the whole text is
      printed as the publication time.
    * ``div`` elements whose class matches ``box-result clearfix``. The
      text of their ``h2 > a`` is taken as the title and the text of
      their ``p`` as the short summary.

    Args:
        html: Page markup, either a single string or an iterable of
            string chunks (the chunks are concatenated before parsing).

    Returns:
        A list of ``(title, summary)`` tuples, one per result box, in
        document order. Previously these values were extracted and then
        discarded, so the function always returned ``None``.

    Raises:
        AttributeError: if a result box lacks the ``h2``/``a``/``p``
            element that is dereferenced.
    """
    soup = BeautifulSoup(''.join(html))
    time_spans = soup.findAll(
        'span', attrs={'class': re.compile("fgray_time")})
    result_boxes = soup.findAll(
        'div', attrs={'class': re.compile("box-result clearfix")})

    for span in time_spans:
        parts = span.text.split(' ')  # split once, reuse below
        if len(parts) == 3:
            # Three parts: news source followed by a two-part timestamp.
            published = parts[1] + ' ' + parts[2]
            print(parts[0])
        else:
            # No separate source: the whole text is the timestamp.
            published = span.text
        print(published)

    return [(box.h2.a.text, box.p.text) for box in result_boxes]

if __name__ == "__main__":
    # Script entry point: look up the page count, then fetch and process
    # every page index in range(total_pages), echoing what getContents
    # hands back for each one.
    total_pages = getPage()
    print(111)  # debug marker: page count retrieved
    for page_no in range(total_pages):
        page_html = getHtml(page_no)
        print(getContents(page_html))
    # NOTE(review): the original indentation was lost; the closing marker
    # is assumed to run once, after the loop.
    print(222)  # debug marker: all pages processed

巴特西

新浪新闻按keyword抓取实例

最新文章

热门文章