Python3 调用 BeautifulSoup 进行简单的网页处理

from bs4 import BeautifulSoup

# Read the saved page and parse it with the lxml parser.
# The original author flagged the encoding argument as a pitfall; presumably it
# must match how index.html was actually saved (UTF-16 little-endian here).
# The `with` block closes the file handle once parsing is done.
with open('index.html', 'r', encoding='utf-16-le') as file:
    soup = BeautifulSoup(file, 'lxml')

# Visual divider printed between the individual demo outputs.
separator = '\n ------------- \n'

# Show the parsed document three ways: as parsed, text only, and pretty-printed.
print(soup)
print(separator)
print(soup.get_text())  # text of all tags, markup removed
print(separator)
print(soup.prettify())  # indented, formatted markup

# Access individual elements as attributes of the soup object.
print(soup.title)
print(separator)
print(soup.body)
print(separator)
print(soup.body.div)

import re

# find_all() with a plain string searches for tags with that name.
br_tags = soup.find_all('br')
print(br_tags)
print('\n ------------- \n')

# A compiled regular expression also works: here, tags whose name starts with "b".
starts_with_b = re.compile('^b')
print(soup.find_all(starts_with_b))
print('\n ------------- \n')

# Keyword arguments filter on attributes, e.g. the element id.
custom_css = soup.find_all(id='wiz_custom_css')
print(custom_css)
print('\n ------------- \n')

# Print each text string found in the document, one per line;
# .stripped_strings can be used instead to drop surrounding whitespace.
for text_piece in soup.strings:
    print(text_piece)
print('\n ------------- \n')

# Strip the tags from <body> and save the remaining text to a Markdown file
# (marked by the original author as "to be improved").
# Remove all <script> and <style> elements so their contents are not exported.
for script in soup(["script", "style"]):
    script.extract()    # detach the current tag from the tree

# soup.title is None when the page has no <title>; treat that as an empty title
# instead of failing with AttributeError.
title_text = soup.title.get_text() if soup.title is not None else ''

# Join every string found under <body>, each followed by a newline;
# .stripped_strings could be used instead to drop surrounding whitespace.
str_text = ''.join(strr + '\n' for strr in soup.body.strings)

print(str_text)

# Name the output after the page title, falling back to 'index' when it is empty.
md_name = (title_text or 'index') + '.md'
# Write explicitly as UTF-8 so non-ASCII text does not depend on the locale's
# default encoding; the `with` block flushes and closes the file.
with open(md_name, 'w', encoding='utf-8') as md_file:
    md_file.write(str_text)

# Approach found online; <br/> tags are not turned into line breaks here
# (the original note says another approach exists for that).

# Remove all <script> and <style> elements.
for node in soup(["script", "style"]):
    node.extract()    # detach the current tag from the tree

# Collect the remaining text of the document.
text = soup.get_text()

# splitlines() breaks the text on \r, \r\n and \n; strip() with no argument
# removes leading and trailing whitespace from each line.
lines = [raw_line.strip() for raw_line in text.splitlines()]

# Break lines holding several phrases (separated by two spaces) into one phrase each.
chunks = []
for line in lines:
    for phrase in line.split("  "):
        chunks.append(phrase.strip())

# Drop empty chunks and put the rest back together, one per line.
non_empty = [chunk for chunk in chunks if chunk]
text = '\n'.join(non_empty)

# The result could be written to a file (e.g. 'aa.md') instead of being printed.
print(text)

From WizNote

巴特西

python3 调用 beautifulSoup 进行简单的网页处理

python3 调用 beautifulSoup 进行简单的网页处理

最新文章

热门文章