使用Beautiful Soup

Beautiful Soup初了解

# 解析工具Beautiful Soup，借助网页的结构和属性等特性来解析网页（简单地说，就是Python的一个HTML或XML解析库）
# Beautiful Soup支持的解析器

解析器	使用方法	优势	劣势
Python标准库	BeautifulSoup(markup, "html.parser")	Python 的内置标准库、执行速度适中、文档容错能力强	Python 2.7.3及 Python3.2.2 之前的版本文档容错能力差
lxml HTML解析器	BeautifulSoup(markup,"lxml")	速度快、文档容错能力强	需要安装C语言库
lxml XML解析器	BeautifulSoup(markup,"xml")	速度快、唯一支持 XML 的解析器	需要安装C语言库
html5lib	BeautifulSoup(markup,"html5lib")	最好的容错性、以浏览器的方式解析文档、生成 HTML5 格式的文档	速度慢、不依赖外部扩展

实例引入：

 from bs4 import BeautifulSoup

 soup = BeautifulSoup('<p>Hello</p>', 'lxml')

 print(soup.p.string)

 # 输出：

 Hello

Beautiful Soup基本用法

 from bs4 import BeautifulSoup

 html = """

 <html><head><title>The Dormouse's story</title></head>

 <body>

 <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

 <p class="story">Once upon a time there were three little sisters; and their names were

 <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

 and they lived at the bottom of a well.</p>

 <p class="story">...</p>

 """

 soup = BeautifulSoup(html, 'lxml')

 print(soup.prettify(), soup.title.string, sep='\n\n')

 # 初始化BeautifulSoup时，自动更正了不标准的HTML

 # prettify()方法可以把要解析的字符串以标准的缩进格式输出

 # soup.title 可以选出HTML中的title节点，再调用string属性就可以得到里面的文本了

 # 输出：

 <html>

  <head>

   <title>

    The Dormouse's story

   </title>

  </head>

  <body>

   <p class="title" name="dromouse">

    <b>

     The Dormouse's story

    </b>

   </p>

   <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a class="sister" href="http://example.com/elsie" id="link1">

     <!-- Elsie -->

    </a>

    ,

    <a class="sister" href="http://example.com/lacie" id="link2">

     Lacie

    </a>

    and

    <a class="sister" href="http://example.com/tillie" id="link3">

     Tillie

    </a>

    ;

 and they lived at the bottom of a well.

   </p>

   <p class="story">

    ...

   </p>

  </body>

 </html>

 The Dormouse's story

节点选择器

# 选择元素

 from bs4 import BeautifulSoup

 html = """

 <html><head><title>The Dormouse's story</title></head>

 <body>

 <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

 <p class="story">Once upon a time there were three little sisters; and their names were

 <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

 and they lived at the bottom of a well.</p>

 <p class="story">...</p>

 """

 soup = BeautifulSoup(html, 'lxml')

 print(soup.title)               # 打印输出title节点的选择结果

 print(type(soup.title))         # 输出soup.title类型

 print(soup.title.string)        # 输出title节点的内容

 print(soup.head)                # 打印输出head节点的选择结果

 print(soup.p)                   # 打印输出p节点的选择结果

 # 输出：

 <title>The Dormouse's story</title>

 <class 'bs4.element.Tag'>

 The Dormouse's story

 <head><title>The Dormouse's story</title></head>

 <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

# 提取信息
    # 调用string属性获取文本的值
    # 利用name属性获取节点的名称
    # 调用attrs获取所有HTML节点属性

 from bs4 import BeautifulSoup

 html = """

 <html><head><title>The Dormouse's story</title></head>

 <body>

 <p class="title" name="dromouse"><b>The Dormouse's story</b></p>

 <p class="story">Once upon a time there were three little sisters; and their names were

 <a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,

 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

 and they lived at the bottom of a well.</p>

 <p class="story">...</p>

 """

 soup = BeautifulSoup(html, 'lxml')

 print(soup.title.name)          # 选取title节点，然后调用name属性获得节点名称

 # 输出：title

 print(soup.title.string)        # 调用string属性，获取title节点的文本值

 # 输出：The Dormouse's story

 print(soup.p.attrs)             # 调用attrs，获取p节点的所有属性

 # 输出：{'class': ['title'], 'name': 'dromouse'}

 print(soup.p.attrs['name'])         # 获取name属性

 # 输出：dromouse

 print(soup.p['name'])               # 获取name属性

 # 输出：dromouse

# 嵌套选择

 from bs4 import BeautifulSoup

 html = """

 <html><head><title>The Dormouse's story</title></head>

 <body>

 """

 soup = BeautifulSoup(html, 'lxml')

 print(soup.head.title)

 print(type(soup.head.title))

 print(soup.head.title.string)

 # 输出：

 <title>The Dormouse's story</title>

 <class 'bs4.element.Tag'>

 The Dormouse's story

# 关联选择
    # 1、子节点和子孙节点
        # contents属性得到的结果是直接子节点的列表。

 from bs4 import BeautifulSoup

 html = """

 <html>

  <head>

   <title>

    The Dormouse's story

   </title>

  </head>

  <body>

   <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a class="sister" href="http://example.com/elsie" id="link1">

     <!-- Elsie -->

    </a>

    ,

    <a class="sister" href="http://example.com/lacie" id="link2">

     Lacie

    </a>

    and

    <a class="sister" href="http://example.com/tillie" id="link3">

     Tillie

    </a>

    ;

 and they lived at the bottom of a well.

   </p>

   <p class="story">

    ...

   </p>

  </body>

 </html>

 """

 soup = BeautifulSoup(html, 'lxml')

 # 选取节点元素之后，可以调用contents属性获取它的直接子节点

 print(soup.p.contents)

 # 输出：

 ['\n   Once upon a time there were three little sisters; and their names were\n   ', <a class="sister" href="http://example.com/elsie" id="link1">

 <!-- Elsie -->

 </a>, '\n   ,\n   ', <a class="sister" href="http://example.com/lacie" id="link2">

     Lacie

    </a>, '\n   and\n   ', <a class="sister" href="http://example.com/tillie" id="link3">

     Tillie

    </a>, '\n   ;\nand they lived at the bottom of a well.\n  ']

 # 返回结果是一个列表，列表中的元素是所选节点的直接子节点（不包括孙节点）

直接子节点

        # children属性，返回结果是迭代器类型（如下方输出所示为list_iterator）。获取的内容与contents属性一样，都是直接子节点，只是返回结果的类型不同。

 from bs4 import BeautifulSoup

 html = """

 <html>

  <head>

   <title>

    The Dormouse's story

   </title>

  </head>

  <body>

   <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a class="sister" href="http://example.com/elsie" id="link1">

     <span>Elsie</span>

    </a>

    ,

    <a class="sister" href="http://example.com/lacie" id="link2">

     Lacie

    </a>

    and

    <a class="sister" href="http://example.com/tillie" id="link3">

     Tillie

    </a>

    ;

 and they lived at the bottom of a well.

   </p>

   <p class="story">

    ...

   </p>

  </body>

 </html>

 """

 soup = BeautifulSoup(html, 'lxml')

 print(soup.p.children)                          # 输出：<list_iterator object at 0x1159b7668>

 for i, child in enumerate(soup.p.children):

     print(i, child)

 # for 循环的输出结果:

 0

    Once upon a time there were three little sisters; and their names were

 1 <a class="sister" href="http://example.com/elsie" id="link1">

 <span>Elsie</span>

 </a>

 2

    ,

 3 <a class="sister" href="http://example.com/lacie" id="link2">

     Lacie

    </a>

 4

    and

 5 <a class="sister" href="http://example.com/tillie" id="link3">

     Tillie

    </a>

 6

    ;

 and they lived at the bottom of a well.

直接子节点

        # descendants属性会递归查询所有子节点，得到所有子孙节点。

 from bs4 import BeautifulSoup

 html = """

 <html>

  <head>

   <title>

    The Dormouse's story

   </title>

  </head>

  <body>

   <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a class="sister" href="http://example.com/elsie" id="link1">

     <span>Elsie</span>

    </a>

    ,

    <a class="sister" href="http://example.com/lacie" id="link2">

     Lacie

    </a>

    and

    <a class="sister" href="http://example.com/tillie" id="link3">

     Tillie

    </a>

    ;

 and they lived at the bottom of a well.

   </p>

   <p class="story">

    ...

   </p>

  </body>

 </html>

 """

 soup = BeautifulSoup(html, 'lxml')

 print(soup.p.descendants)                          # 输出：<generator object Tag.descendants at 0x1131d0048>

 for i, child in enumerate(soup.p.descendants):

     print(i, child)

 # for 循环输出结果：

 0

    Once upon a time there were three little sisters; and their names were

 1 <a class="sister" href="http://example.com/elsie" id="link1">

 <span>Elsie</span>

 </a>

 2 

 3 <span>Elsie</span>

 4 Elsie

 5 

 6

    ,

 7 <a class="sister" href="http://example.com/lacie" id="link2">

     Lacie

    </a>

 8

     Lacie

 9

    and

 10 <a class="sister" href="http://example.com/tillie" id="link3">

     Tillie

    </a>

 11

     Tillie

 12

    ;

 and they lived at the bottom of a well.

获取子孙节点

    # 2、父节点和祖先节点

 from bs4 import BeautifulSoup

 html = """

 <html>

  <head>

   <title>

    The Dormouse's story

   </title>

  </head>

  <body>

   <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a class="sister" href="http://example.com/elsie" id="link1">

     <span>Elsie</span>

    </a>

   </p>

   <p class="story">

    ...

   </p>

  </body>

 </html>

 """

 soup = BeautifulSoup(html, 'lxml')

 print(soup.a.parent)

 # 输出：

 <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a class="sister" href="http://example.com/elsie" id="link1">

 <span>Elsie</span>

 </a>

 </p>

parent获取某个节点的一个父节点

 from bs4 import BeautifulSoup

 html = """

 <html>

  <head>

   <title>

    The Dormouse's story

   </title>

  </head>

  <body>

   <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a class="sister" href="http://example.com/elsie" id="link1">

     <span>Elsie</span>

    </a>

   </p>

   <p class="story">

    ...

   </p>

  </body>

 </html>

 """

 soup = BeautifulSoup(html, 'lxml')

 print(soup.a.parents, type(soup.a.parents), list(enumerate(soup.a.parents)), sep='\n\n')

 # 输出：

 <generator object PageElement.parents at 0x11c76e048>

 <class 'generator'>

 [(0, <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a class="sister" href="http://example.com/elsie" id="link1">

 <span>Elsie</span>

 </a>

 </p>), (1, <body>

 <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a class="sister" href="http://example.com/elsie" id="link1">

 <span>Elsie</span>

 </a>

 </p>

 <p class="story">

    ...

   </p>

 </body>), (2, <html>

 <head>

 <title>

    The Dormouse's story

   </title>

 </head>

 <body>

 <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a class="sister" href="http://example.com/elsie" id="link1">

 <span>Elsie</span>

 </a>

 </p>

 <p class="story">

    ...

   </p>

 </body>

 </html>), (3, <html>

 <head>

 <title>

    The Dormouse's story

   </title>

 </head>

 <body>

 <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a class="sister" href="http://example.com/elsie" id="link1">

 <span>Elsie</span>

 </a>

 </p>

 <p class="story">

    ...

   </p>

 </body>

 </html>

 )]

parents获取所有祖先节点

        # 涉及内置函数enumerate()
        # enumerate() 函数用于将一个可遍历的数据对象(如列表、元组或字符串)组合为一个索引序列，同时列出数据和数据下标，一般用在 for 循环当中。

 # enumerate() 函数用于将一个可遍历的数据对象(如列表、元组或字符串)组合为一个索引序列，同时列出数据和数据下标，一般用在 for 循环当中。

 a = ["恕", "我", "直", "言", "在", "坐", "的", "各", "位", "都", "是", "爱", "学", "习", "的"]

 print(a)            # 输出：['恕', '我', '直', '言', '在', '坐', '的', '各', '位', '都', '是', '爱', '学', '习', '的']

 b = enumerate(a)

 print(enumerate(a))     # 输出：<enumerate object at 0x11a1f8b40>

 print(list(b))

 # [(0, '恕'), (1, '我'), (2, '直'), (3, '言'), (4, '在'), (5, '坐'), (6, '的'), (7, '各'), (8, '位'), (9, '都'),

 # (10, '是'), (11, '爱'), (12, '学'), (13, '习'), (14, '的')]

 for m, n in enumerate(a):

     print(m, n)

 # for 循环 输出：

 0 恕

 1 我

 2 直

 3 言

 4 在

 5 坐

 6 的

 7 各

 8 位

 9 都

 10 是

 11 爱

 12 学

 13 习

 14 的

enumerate()内置函数

    # 3、兄弟节点

 from bs4 import BeautifulSoup

 html = """

 <html>

  <head>

   <title>

    The Dormouse's story

   </title>

  </head>

  <body>

   <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a class="sister" href="http://example.com/elsie" id="link1">

     <span>Elsie</span>

    </a>

    ,

    <a class="sister" href="http://example.com/lacie" id="link2">

     Lacie

    </a>

    and

    <a class="sister" href="http://example.com/tillie" id="link3">

     Tillie

    </a>

    ;

 and they lived at the bottom of a well.

   </p>

   <p class="story">

    ...

   </p>

  </body>

 </html>

 """

 soup = BeautifulSoup(html, 'lxml')

 print(

     # 获取下一个兄弟元素

     {'Next Sibling': soup.a.next_sibling},

     # 获取上一个兄弟元素

     {'Previous Sibling': soup.a.previous_sibling},

     # 返回后面的兄弟元素

     {'Next Siblings': list(enumerate(soup.a.next_siblings))},

     # 返回前面的兄弟元素

     {'Previous Siblings': list(enumerate(soup.a.previous_siblings))},

     sep='\n\n'

 )

 # 输出：

 {'Next Sibling': '\n   ,\n   '}

 {'Previous Sibling': '\n   Once upon a time there were three little sisters; and their names were\n   '}

 {'Next Siblings': [(0, '\n   ,\n   '), (1, <a class="sister" href="http://example.com/lacie" id="link2">

     Lacie

    </a>), (2, '\n   and\n   '), (3, <a class="sister" href="http://example.com/tillie" id="link3">

     Tillie

    </a>), (4, '\n   ;\nand they lived at the bottom of a well.\n  ')]}

 {'Previous Siblings': [(0, '\n   Once upon a time there were three little sisters; and their names were\n   ')]}

获取同级节点

    # 4、提取信息

 from bs4 import BeautifulSoup

 html = """

 <html>

  <body>

   <p class="story">

    Once upon a time there were three little sisters; and their names were

    <a class="sister" href="http://example.com/elsie" id="link1">Bob</a>

    <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

   </p>

  </body>

 </html>

 """

 soup = BeautifulSoup(html, 'lxml')

 print(

     'Next Sibling:',

     [soup.a.next_sibling],        # 获取下一个兄弟节点

     # \n

     type(soup.a.next_sibling),      # 下一个兄弟节点的类型

     # <class 'bs4.element.NavigableString'>

     [soup.a.next_sibling.string],     # 获取下一个兄弟节点的内容

     # \n

     sep='\n'

 )

 print(

     'Parent:',

     [type(soup.a.parents)],      # 获取所有的祖先节点

     # <class 'generator'>

     [list(soup.a.parents)[0]],           # 获取第一个祖先节点

     # <p class="story">

     #    Once upon a time there were three little sisters; and their names were

     #    <a class="sister" href="http://example.com/elsie" id="link1">Bob</a>

     # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

     # </p>

     [list(soup.a.parents)[0].attrs['class']],        # 获取第一个祖先节点的"class属性"的值

     # ['story']

     sep='\n'

 )

 # 为了输出返回的结果，均以列表形式

 # 输出：

 Next Sibling:

 ['\n']

 <class 'bs4.element.NavigableString'>

 ['\n']

 Parent:

 [<class 'generator'>]

 [<p class="story">

    Once upon a time there were three little sisters; and their names were

    <a class="sister" href="http://example.com/elsie" id="link1">Bob</a>

 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

 </p>]

 [['story']]

方法选择器

find_all(name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs)
# 查询所有符合条件的元素

 from bs4 import BeautifulSoup

 html = """

 <div>

 <ul>

 <li class="item-O"><a href="linkl.html">first item</a></li>

 <li class="item-1"><a href="link2.html">second item</a></li>

 <li class="item-inactive"><a href="link3.html">third item</a></li>

 <li class="item-1"><a href="link4.html">fourth item</a></li>

 <li class="item-0"><a href="link5.html">fifth item</a>

 </ul>

 </div>

 """

 soup = BeautifulSoup(html, 'lxml')

 print(soup.find_all(name='li'),

       type(soup.find_all(name='li')[0]),

       sep='\n\n')

 # 输出：

 [<li class="item-O"><a href="linkl.html">first item</a></li>, <li class="item-1"><a href="link2.html">second item</a></li>, <li class="item-inactive"><a href="link3.html">third item</a></li>, <li class="item-1"><a href="link4.html">fourth item</a></li>, <li class="item-0"><a href="link5.html">fifth item</a>

 </li>]

 <class 'bs4.element.Tag'>

 # 返回值是一个列表，列表的元素是名为"li"的节点，每个元素都是bs4.element.Tag类型

 # 遍历每个a节点

 from bs4 import BeautifulSoup

 html = """

 <div>

 <ul>

 <li class="item-O"><a href="linkl.html">first item</a></li>

 <li class="item-1"><a href="link2.html">second item</a></li>

 <li class="item-inactive"><a href="link3.html">third item</a></li>

 <li class="item-1"><a href="link4.html">fourth item</a></li>

 <li class="item-0"><a href="link5.html">fifth item</a>

 </ul>

 </div>

 """

 soup = BeautifulSoup(html, 'lxml')

 li = soup.find_all(name='li')

 for a in li:

     print(a.find_all(name='a'))

 # 输出：

 [<a href="linkl.html">first item</a>]

 [<a href="link2.html">second item</a>]

 [<a href="link3.html">third item</a>]

 [<a href="link4.html">fourth item</a>]

 [<a href="link5.html">fifth item</a>]

name参数

 from bs4 import BeautifulSoup

 html = """

 <div>

 <ul>

 <li class="item-O"><a href="linkl.html">first item</a></li>

 <li class="item-1"><a href="link2.html">second item</a></li>

 <li class="item-inactive"><a href="link3.html">third item</a></li>

 <li class="item-1"><a href="link4.html">fourth item</a></li>

 <li class="item-0"><a href="link5.html">fifth item</a>

 </ul>

 </div>

 """

 soup = BeautifulSoup(html, 'lxml')

 print(soup.find_all(attrs={'class': 'item-0'}))

 print(soup.find_all(attrs={'href': 'link5.html'}))

 # 输出：

 [<li class="item-0"><a href="link5.html">fifth item</a>

 </li>]

 [<a href="link5.html">fifth item</a>]

 # 可以通过attrs参数传入一些属性来进行查询，即通过特定的属性来查询

 # find_all(attrs={'属性名': '属性值', ......})

attrs参数

 from bs4 import BeautifulSoup

 import re

 html = """

 <div class="panel">

 <div class="panel-body">

 <a>Hello, this is a link</a>

 <a>Hello, this is a link, too</a>

 <div/>

 <div/>

 """

 soup = BeautifulSoup(html, 'lxml')

 # 正则表达式规则对象

 regular = re.compile('link')

 # text参数可用来匹配节点的文本，传入的形式可以是字符串，也可以是正则表达式对象

 print(soup.find_all(text=regular))

 # 正则匹配输出

 print(re.findall(regular, html))

 # 输出：

 ['Hello, this is a link', 'Hello, this is a link, too']

 ['link', 'link']

text参数

find(name=None, attrs={}, recursive=True, text=None, **kwargs)

仅返回与给定条件匹配标记的第一个元素

CSS选择器

Beautiful Soup 提供了CSS选择器，调用select()方法即可
css选择器用法：http://www.w3school.com.cn/cssref/css_selectors.asp

select(selector, namespaces=None, limit=None, **kwargs)

 html = '''

 <div class="panel">

 <div class="panel-heading">

 <h4>Hello</h4>

 </div>

 <div class="panel-body">

 <ul class="list" id="list-1">

 <li class="element">Foo</li>

 <li class="element">Bar</li>

 <li class="element">Jay</li>

 </ul>

 <ul class="list list-small" id="list-2">

 <li class="element">Foo</li>

 <li class="element">Bar</li>

 </ul>

 </div>

 </div>

 '''

 from bs4 import BeautifulSoup

 soup = BeautifulSoup(html, 'lxml')

 print(

     soup.select('.panel .panel-heading'),

     soup.select('ul li'),

     soup.select('#list-2 .element'),

     type(soup.select('ul')[0]),

     sep='\n\n'

 )

 # 输出：

 [<div class="panel-heading">

 <h4>Hello</h4>

 </div>]

 [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]

 [<li class="element">Foo</li>, <li class="element">Bar</li>]

 <class 'bs4.element.Tag'>

简单示例

 html = '''

 <div class="panel">

 <div class="panel-heading">

 <h4>Hello</h4>

 </div>

 <div class="panel-body">

 <ul class="list" id="list-1">

 <li class="element">Foo</li>

 <li class="element">Bar</li>

 <li class="element">Jay</li>

 </ul>

 <ul class="list list-small" id="list-2">

 <li class="element">Foo</li>

 <li class="element">Bar</li>

 </ul>

 </div>

 </div>

 '''

 from bs4 import BeautifulSoup

 soup = BeautifulSoup(html, 'lxml')

 ul_all = soup.select('ul')

 print(ul_all)

 for ul in ul_all:

     print()

     print(

         ul['id'],

         ul.select('li'),

         sep='\n'

     )

 # 输出：

 [<ul class="list" id="list-1">

 <li class="element">Foo</li>

 <li class="element">Bar</li>

 <li class="element">Jay</li>

 </ul>, <ul class="list list-small" id="list-2">

 <li class="element">Foo</li>

 <li class="element">Bar</li>

 </ul>]

 list-1

 [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]

 list-2

 [<li class="element">Foo</li>, <li class="element">Bar</li>]

嵌套选择

 html = '''

 <div class="panel">

 <div class="panel-heading">

 <h4>Hello</h4>

 </div>

 <div class="panel-body">

 <ul class="list" id="list-1">

 <li class="element">Foo</li>

 <li class="element">Bar</li>

 <li class="element">Jay</li>

 </ul>

 <ul class="list list-small" id="list-2">

 <li class="element">Foo</li>

 <li class="element">Bar</li>

 </ul>

 </div>

 </div>

 '''

 from bs4 import BeautifulSoup

 soup = BeautifulSoup(html, 'lxml')

 ul_all = soup.select('ul')

 print(ul_all)

 for ul in ul_all:

     print()

     print(

         ul['id'],

         ul.attrs['id'],

         sep='\n'

     )

 # 直接传入中括号和属性名  或者  通过attrs属性获取属性值 都可以成功获得属性值

 # 输出：

 [<ul class="list" id="list-1">

 <li class="element">Foo</li>

 <li class="element">Bar</li>

 <li class="element">Jay</li>

 </ul>, <ul class="list list-small" id="list-2">

 <li class="element">Foo</li>

 <li class="element">Bar</li>

 </ul>]

 list-1

 list-1

 list-2

 list-2

获取属性

 html = '''

 <div class="panel">

 <div class="panel-heading">

 <h4>Hello</h4>

 </div>

 <div class="panel-body">

 <ul class="list" id="list-1">

 <li class="element">Foo</li>

 <li class="element">Bar</li>

 <li class="element">Jay</li>

 </ul>

 <ul class="list list-small" id="list-2">

 <li class="element">Foo</li>

 <li class="element">Bar</li>

 </ul>

 </div>

 </div>

 '''

 from bs4 import BeautifulSoup

 soup = BeautifulSoup(html, 'lxml')

 ul_all = soup.select('li')

 print(ul_all)

 for li in ul_all:

     print()

     print(

         'get_text()方法获取文本：'+li.get_text(),

         'string属性获取文本：'+li.string,

         sep='\n'

     )

 # 输出：

 [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>, <li class="element">Foo</li>, <li class="element">Bar</li>]

 get_text()方法获取文本：Foo

 string属性获取文本：Foo

 get_text()方法获取文本：Bar

 string属性获取文本：Bar

 get_text()方法获取文本：Jay

 string属性获取文本：Jay

 get_text()方法获取文本：Foo

 string属性获取文本：Foo

 get_text()方法获取文本：Bar

 string属性获取文本：Bar

获取文本

巴特西

使用Beautiful Soup

Beautiful Soup初了解

Beautiful Soup基本用法

节点选择器

方法选择器

CSS选择器

最新文章

热门文章