安装使用

# 安装 pip3 install beautifulsoup4

from bs4 import BeautifulSoup

# Build the soup from markup already held in memory.
# NOTE(review): `ret` is assumed to be an HTTP response fetched earlier; it is
# not defined in this snippet.
soup=BeautifulSoup(ret.text,'html.parser')

# Build the soup from a file. The with-block closes the file handle as soon as
# parsing is finished instead of leaving it open.
with open('a.html','r') as f:
    soup=BeautifulSoup(f)

# html.parser: built-in parser, somewhat slower, but needs no third-party module

# lxml: faster, but has to be installed first: pip3 install lxml

soup=BeautifulSoup(ret.text,'html.parser')

soup=BeautifulSoup(ret.text,'lxml')

# find: returns the first match

# find_all: returns every match

# Find all <li> tags in the page

li_list=soup.find_all(name='li')

.text     # 全部拿出来，拼在一起

.string    # 只拿第一个

.strings   # 全部拿出来生成一个迭代器

遍历文档树

# Sample HTML document parsed into `soup` below.
html_doc = """

<html><head><title>The Dormouse's story</title></head>

<body>

<p class="title"id="id_p"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were

<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,

<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

and they lived at the bottom of a well.</p>

<p class="story">...</p>

"""

# Parse the sample document with the lxml parser.
soup=BeautifulSoup(html_doc,'lxml')

# 1. Pretty-print the document (missing closing tags are filled in automatically)

print(soup.prettify())

# 2. Dot access (soup.<tag>) only ever finds the first matching tag

head=soup.head  # Tag object

title=head.title  # Tag object

# 3. Get a tag's name

p=soup.body

print(p.name)  # body    (rarely useful)

# 4. Get a tag's attributes

p=soup.p

# class may hold several values, so it comes back as a list

# Option 1: index the tag directly

print(p['class'])  # ['title']

print(p['id'])     # id_p

# Option 2: .get() or the .attrs dict

print(p.get('class'))   # ['title']

print(p.attrs['class'])  # ['title']

# 5. Get a tag's content

p=soup.p

print(p)  # <p class="title" id="id_p"><b>The Dormouse's story</b></p>

print(p.text)  # The Dormouse's story

print(p.string) # The Dormouse's story

# 6. Nested (chained) selection

title = soup.head.title.text  # text of the first <title> under <head>

print(title)  # The Dormouse's story

# 7. Child nodes and descendants

p1=soup.p.children   # iterator

p2=soup.p.contents  # list

print(p1)  # <list_iterator object at 0x000001FA66E0C4A8>

print(list(p1))  # [<b>The Dormouse's story</b>]

print(p2)  # [<b>The Dormouse's story</b>]

# 8. Parent node and ancestors

p1=soup.p.parent  # direct parent

p2=soup.p.parents  # all ancestors; turned into a list below for printing

print(p1)

# # print(len(list(p2)))

print(list(p2))

# 9. Sibling nodes

print(soup.a.next_sibling) # next sibling

print(soup.a.previous_sibling) # previous sibling

print(list(soup.a.next_siblings)) # all following siblings => generator, materialised with list()

print(soup.a.previous_siblings) # all preceding siblings => generator object (printed as-is, not expanded)

查找文档树

# Searching the tree (find, find_all) is slower than navigating it

# The two can be combined, e.g. soup.p.find()

# Five kinds of filter: string, regular expression, list, True, function

# 1. String filter: the quoted value is matched as a plain string

p=soup.find(name='p')  # <p class="title" id="id_p"><b>The Dormouse's story</b></p>

p=soup.find(name='body')  # the whole <body> tag with all of its contents

print(p)

# Find every tag whose class is "title"; class is a Python keyword, hence class_

ret=soup.find_all(class_='title')  # [<p class="title" id="id_p"><b>The Dormouse's story</b></p>]

# Find tags by id

ret=soup.find_all(id='id_p')  # [<p class="title" id="id_p"><b>The Dormouse's story</b></p>]

# Tags whose href attribute is http://example.com/elsie

ret=soup.find_all(href='http://example.com/elsie') # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

# 2. Regular expression filter

import re

reg=re.compile('^id')  # values that start with "id"

ret=soup.find_all(id=reg)  # tags whose id attribute starts with "id"

print(ret)  # [<p class="title" id="id_p"><b>The Dormouse's story</b></p>]

# 3. List filter

# OR: any value in the list may match

ret=soup.find_all(id=['id_psafdsaf','link1'])  # tags whose id is id_psafdsaf or link1

ret=soup.find_all(class_=['title1','story'])  # tags whose class is title1 or story

# AND: every keyword filter has to match

ret=soup.find_all(class_='title',name='p') # <p> tags whose class is title

# 4. True

# Every tag that has a name

ret=soup.find_all(name=True)

# Every tag that has an id attribute

ret=soup.find_all(id=True)

# Every tag that has an href attribute

ret=soup.find_all(href=True)

print(ret)

# 5.方法

def has_class_but_no_id(tag):
    """Return True when *tag* has a class attribute and no id attribute.

    Written to be passed to find_all as a function filter.
    """
    # Without a class attribute the tag can never qualify.
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')

# The function itself is passed as the filter
print(soup.find_all(has_class_but_no_id))

# 6. Other usage: attribute filters given as a dict through attrs

ret=soup.find_all(attrs={'class':"title"})

ret=soup.find_all(attrs={'id':"id_p1",'class':'title'})

print(ret)

# 7. limit (cap the number of results)

# soup.find() behaves like find_all with limit=1

ret=soup.find_all(name=True,limit=2)

print(len(ret)) # 2

# 8. recursive

# recursive=False: no recursion, only direct children (first level) are searched

ret=soup.body.find_all(name='p',recursive=False)

print(ret)

选择器介绍

# bs4：自己的选择器，css选择器

# lxml：css选择器，xpath选择器

# selenium：自己的选择器，css选择器，xpath选择器

# scrapy框架：自己的选择器，css选择器，xpath选择器

# css选择器，xpath选择器会用了，它就是个通行证

CSS选择器

# Tag对象.select("css选择器")

from bs4 import BeautifulSoup

import requests

# Request pages 1 to 4 in turn; each request's Referer ends with the previous
# page number.
# NOTE(review): the proxies mapping only has an 'http' entry while the URL is
# https - confirm the proxy is actually meant to apply to these requests.
for page in range(1, 5):
    request_headers = {
        'User-Agent': 'request',
        'Referer': 'https://www.mzitu.com/206122/' + str(page - 1),
    }

    url = 'https://www.mzitu.com/202340/' + str(page)

    ret = requests.get(
        url,
        headers=request_headers,
        proxies={'http': '47.115.54.89'},
    )

# Runs once, after the loop has finished: only the last response is parsed.
soup = BeautifulSoup(ret.text, 'lxml')

案例：

#   div>p: direct children      div p: all descendants

#   Last <a> under a div: div a:last-child

print(soup.select('#list-1 li:nth-child(1)')[0].text)    # first li

print(soup.select('#list-1 li:nth-child(2)')[0].text)    # second li

print(soup.select('#list-1 li:nth-last-child(1)')[0].text)    # last li

print(soup.select('#list-1 li:nth-last-child(2)')[0].text)    # second-to-last li

print(soup.p.select('.sister'))  # select can be combined with tag navigation

print(soup.select('.sister span'))

print(soup.select('#link1'))

print(soup.select('#link1 span'))

print(soup.select('#list-2 .element.xxx'))

print(soup.select('#list-2')[0].select('.element')) # select calls can be chained, but one selector is usually enough

# 2. Getting attributes

print(soup.select('#list-2 h1')[0].attrs)

href = soup.select('body > div.main > div.content > div.main-image > p > a > img')

print(href[0].attrs['src'])  # image URL

# 3. Getting content

# .get_text()

# .text

# .string

# .strings  (an iterator)

print(soup.select('#list-2 h1')[0].get_text())

xpath选择器

# xpath选择

# / 从根节点选取  /a   从根节点开始，往下找a标签（子）

# //从匹配选择的当前节点选择文档中的节点，而不考虑它们的位置  //a 从根节点开始找a标签（子子孙孙中所有a）

# . 	选取当前节点。

# .. 	选取当前节点的父节点。

# @ 	选取属性。

# 取值 /text()

# 取属性 /@属性名

使用：

from lxml import etree

html=etree.HTML(doc) # parse from a string

html=etree.parse('search.html',etree.HTMLParser())  # parse from a file

# 1. Text: append /text() to the path  (important)

a=html.xpath('//body//a[@href="image1.html"]/text()')

a=html.xpath('//body//a/text()')

# 2. Attribute value: append /@href to the path  (important)

a=html.xpath('//body//a/@href')

# Note: positions start at 1, not 0

a=html.xpath('//body//a[3]/@href')

# 3. All nodes

a=html.xpath('//*')

# 4. A specific node (the result is a list)

a=html.xpath('//head')

# 5. Child nodes and descendants

a=html.xpath('//div/a')

a=html.xpath('//body/a') # no matches in the author's sample document

a=html.xpath('//body//a')

# 6. Parent node

a=html.xpath('//body//a[@href="image1.html"]/..')

a=html.xpath('//body//a[@href="image1.html"]')

a=html.xpath('//body//a[1]/..')

# The parent can also be selected with the parent axis:

a=html.xpath('//body//a[1]/parent::*')

# 7. Attribute matching

a=html.xpath('//body//a[@href="image1.html"]')

# 8. Matching an attribute that holds several values

# When an <a> tag has several classes, an exact @class match no longer works; use contains() instead

a=html.xpath('//body//a[@class="li"]')

a=html.xpath('//body//a[@href="image1.html"]')

a=html.xpath('//body//a[contains(@class,"li")]')

a=html.xpath('//body//a[contains(@class,"li")]/text()')

a=html.xpath('//body//a[contains(@class,"li")]/@name')

# 9. Matching several attributes with or / and (for reference)

a=html.xpath('//body//a[contains(@class,"li") or @name="items"]')

a=html.xpath('//body//a[contains(@class,"li") and @name="items"]/text()')

a=html.xpath('//body//a[contains(@class,"li")]/text()')

# 10. Selecting by position

a=html.xpath('//a[2]/text()')

a=html.xpath('//a[2]/@href')

# The last one (for reference)

a=html.xpath('//a[last()]/@href')

a=html.xpath('//a[last()]/text()')

# Positions lower than 3

a=html.xpath('//a[position()<3]/@href')

a=html.xpath('//a[position()<3]/text()')

# last()-2 is the third from the end (the second-to-last would be last()-1)

a=html.xpath('//a[last()-2]/@href')

# 11. Selecting along node axes

# ancestor: ancestor nodes

# With *, every ancestor node is selected

a=html.xpath('//a/ancestor::*')

# Only the div elements among the ancestors

a=html.xpath('//a/ancestor::div')

a=html.xpath('//a/ancestor::div/a[2]/text()')

# attribute: attribute values

a=html.xpath('//a[1]/attribute::*')

a=html.xpath('//a[1]/@href')

# child: direct child nodes

a=html.xpath('//a[1]/child::*')

a=html.xpath('//a[1]/img/@src')

# descendant: all descendant nodes

a=html.xpath('//a[6]/descendant::*')  # every element nested under each <a> that is the 6th <a> child of its parent

# following: every node after the current node (recursive)

a=html.xpath('//a[1]/following::*')

a=html.xpath('//a[1]/following::*[1]/@href')

# following-sibling: nodes after the current node on the same level

a=html.xpath('//a[1]/following-sibling::*')

a=html.xpath('//a[1]/following-sibling::a')

a=html.xpath('//a[1]/following-sibling::*[2]')

a=html.xpath('//a[1]/following-sibling::*[2]/@href')

巴特西

bs4使用

安装使用

遍历文档树

查找文档树

选择器介绍

CSS选择器

xpath选择器

最新文章

热门文章