BeautifulSoup 有何用途



pip install beautifulsoup4



在抓取网页数据时,一般和 requests 库一起使用,如下:

import requests
from bs4 import BeautifulSoup

# NOTE: the target address is blank in this listing; fill it in before running.
url = ''
# Browser-like User-Agent header sent along with the request.
HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'}

# Fetch the page, passing the custom headers and a 5-second timeout.
req = requests.get(url, headers=HEADERS, timeout=5)
# Parse the response text with the "html.parser" backend.
soup = BeautifulSoup(req.text, "html.parser")


Connection Aborted Error (10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond') 解决方法:
# Replace the empty string with the proxy's IP address and port.
PROXY = {'http': ''}
# timeout is the connection timeout, in seconds; pick whatever value suits you.
req = requests.get(url, headers=HEADERS, proxies=PROXY, timeout=5)
soup = BeautifulSoup(req.text, "html.parser")



其中的 hangzhou 可以替换为其他任何城市

import requests
from bs4 import BeautifulSoup
import sys
# Browser-like User-Agent header sent with every request.
HEADERS = {'User-Agent':'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'}


def get_soup_from_link(url):
    """Download ``url`` and return its body parsed into a BeautifulSoup tree.

    The request carries ``HEADERS`` and uses a 5-second timeout.
    """
    req = requests.get(url, headers=HEADERS, timeout=5)
    return BeautifulSoup(req.text, "html.parser")


def get_weather_report_list(weather_info):
    """Turn the matched forecast elements into a list of plain dicts.

    Args:
        weather_info: iterable of elements offering ``find``/``find_all``,
            e.g. the result of ``soup.find_all(class_='table_day7')``.

    Returns:
        One dict per element with the keys ``date``, ``description``,
        ``temperature`` and ``wind`` (all text values).
    """
    weather_report_list = []
    for group in weather_info:
        # The two class="txt" children hold the temperature range and the wind.
        txt = group.find_all(class_='txt')
        # Each group is itself the <dl> element, so searching its descendants
        # for 'dl' finds nothing; the day label lives in class="week".
        # Fall back to the old lookup for markup that does nest a <dl>.
        date_tag = group.find(class_='week')
        if date_tag is None:
            date_tag = group.find('dl')
        detail_info = {
            'date': date_tag.get_text(),
            'description': group.find(class_='temp').get_text(),
            'temperature': txt[0].get_text(),
            'wind': txt[1].get_text(),
        }
        weather_report_list.append(detail_info)
    return weather_report_list


def print_day_weather(weather_list, city):
    """Print the city name, then one tab-separated line per forecast entry."""
    print('City: ' + city)
    for weather in weather_list:
        print(weather['date'] + '\t' + weather['description'] + '\t' + weather['temperature'] + '\t' + weather['wind'])


def main():
    """Fetch and print the forecast for the city named on the command line.

    With no argument the city defaults to 'hangzhou'; with one argument that
    argument (lower-cased) is used; any other argument count prints a usage
    message and returns without fetching anything.
    """
    print(sys.argv)
    if len(sys.argv) == 1:
        city = 'hangzhou'
    elif len(sys.argv) == 2:
        city = sys.argv[1].lower()
    else:
        print('Usage: python city_name\n city_name: hangzhou, shanghai, ...')
        return
    # NOTE: the site's base address is blank in this listing; prepend it here.
    weather_url = '' + city + '/7'
    soup = get_soup_from_link(weather_url)
    weather_table_day7 = soup.find_all(class_='table_day7')
    weather_report_list = get_weather_report_list(weather_table_day7)
    print_day_weather(weather_report_list, city)


if __name__ == '__main__':
    main()

通过 Chrome 的 F12 开发者工具,我们可以看到需要解析的标签数据如下,它们都在 class="table_day7 tbg" 或者 class="table_day7 " 的 <dl> 标签中:

<dl class="table_day7 tbg">
<dd class="week">今天</dd>
<dd class="air">
<b style="background-color:#79b800;" title="空气质量:优">优</b>
</dd>
<dd class="img">
<img src=""/>
</dd>
<dd class="temp">雪</dd>
<dd class="txt">
-1℃ ~ <b>1</b>℃
</dd>
<dd class="txt">东北风 3级</dd>
</dl>
<dl class="table_day7 ">
<dd class="week">星期四</dd>
<dd class="air">
<b style="background-color:#79b800;" title="空气质量:优">优</b>
</dd>
<dd class="img">
<img src=""/>
</dd>
<dd class="temp">多云</dd>
<dd class="txt">
-3℃ ~ <b>6</b>℃
</dd>
<dd class="txt">东北风 1级</dd>
</dl>


# NOTE: the site's base address is blank in this listing; prepend it here.
weather_url = '' + city + '/7'
soup = get_soup_from_link(weather_url)
# find_all returns a list containing every element that matched.
weather_table_day7 = soup.find_all(class_='table_day7')
入参使用带下划线的 class_,是因为 class 是 Python 中的关键字,所以用 class_ 来代替。

下面就是我们暂时得到的数据结果:

[<dl class="table_day7 tbg">
<dd class="week">今天</dd>
<dd class="air"><b style="background-color:#79b800;" title="空气质量:优">优</b></dd>
<dd class="img"><img src=""/></dd>
<dd class="temp">雪</dd>
<dd class="txt">-1℃ ~ <b>1</b>℃</dd>
<dd class="txt">东北风 3级</dd>
</dl>, <dl class="table_day7 tbg">
<dd class="week">明天</dd>
<dd class="air"><b style="background-color:#79b800;" title="空气质量:优">优</b></dd>
<dd class="img"><img src=""/></dd>
<dd class="temp">雪</dd>
<dd class="txt">-2℃ ~ <b>0</b>℃</dd>
<dd class="txt">北风 2级</dd>
</dl>, <dl class="table_day7 tbg">
<dd class="week">后天</dd>
<dd class="air"><b style="background-color:#79b800;" title="空气质量:优">优</b></dd>
<dd class="img"><img src=""/></dd>
<dd class="temp">雪</dd>
<dd class="txt">0℃ ~ <b>2</b>℃</dd>
<dd class="txt">北风 3级</dd>
</dl>, <dl class="table_day7 ">
<dd class="week">星期一</dd>
<dd class="air"><b style="background-color:#79b800;" title="空气质量:优">优</b></dd>
<dd class="img"><img src=""/></dd>
<dd class="temp">多云</dd>
<dd class="txt">-2℃ ~ <b>3</b>℃</dd>
<dd class="txt">北风 3级</dd>
</dl>, <dl class="table_day7 ">
<dd class="week">星期二</dd>
<dd class="air"><b style="background-color:#79b800;" title="空气质量:优">优</b></dd>
<dd class="img"><img src=""/></dd>
<dd class="temp">晴</dd>
<dd class="txt">-2℃ ~ <b>4</b>℃</dd>
<dd class="txt">东北风 2级</dd>
</dl>, <dl class="table_day7 ">
<dd class="week">星期三</dd>
<dd class="air"><b style="background-color:#79b800;" title="空气质量:优">优</b></dd>
<dd class="img"><img src=""/></dd>
<dd class="temp">晴</dd>
<dd class="txt">-2℃ ~ <b>5</b>℃</dd>
<dd class="txt">北风 2级</dd>
</dl>, <dl class="table_day7 ">
<dd class="week">星期四</dd>
<dd class="air"><b style="background-color:#79b800;" title="空气质量:优">优</b></dd>
<dd class="img"><img src=""/></dd>
<dd class="temp">多云</dd>
<dd class="txt">-3℃ ~ <b>6</b>℃</dd>
<dd class="txt">东北风 1级</dd>
</dl>]


def get_weather_report_list(weather_info):
    """Turn the matched forecast elements into a list of plain dicts.

    Args:
        weather_info: iterable of elements offering ``find``/``find_all``,
            e.g. the result of ``soup.find_all(class_='table_day7')``.

    Returns:
        One dict per element with the keys ``date``, ``description``,
        ``temperature`` and ``wind`` (all text values).
    """
    weather_report_list = []
    for group in weather_info:
        detail_info = {}
        # The class="txt" children hold the temperature range and the wind.
        txt = group.find_all(class_='txt')
        # The date label. Each group is itself the <dl> element, so searching
        # its descendants for 'dl' finds nothing; the label lives in
        # class="week". Fall back to 'dl' for markup that does nest one.
        date_tag = group.find(class_='week')
        if date_tag is None:
            date_tag = group.find('dl')
        detail_info['date'] = date_tag.get_text()
        # class="temp" holds the weather condition text.
        detail_info['description'] = group.find(class_='temp').get_text()
        detail_info['temperature'] = txt[0].get_text()
        detail_info['wind'] = txt[1].get_text()
        weather_report_list.append(detail_info)
    return weather_report_list

爬取的数据,我们只用了BeautifulSoup中 find()、find_all()及get_text() 这几个方法,就取得了我们想要的文本内容,非常的方便。


