解析HTML文件
2024-08-28 03:52:24
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
"""Pick out exported HTML forms that concern a fixed set of hosts.

Every file under ``WORK_DIR`` is parsed as HTML.  A file is copied to
``TARGET_DIR`` when all of the following hold:

* its ``资源类型`` input has the value ``主机``;
* a four-cell table row labelled ``帐号管理人员处理`` carries the opinion
  ``同意`` inside a ``<font>`` element;
* at least one host named in the ``239385_资源名称`` input or in the
  ``所在的硬件设备/软件平台`` input belongs to ``TARGET_HOSTS``.
"""
import operator  # noqa: F401 -- not used below; retained so the module's imports are unchanged
import os
import re
import shutil

from bs4 import BeautifulSoup

# Host names that make a form relevant.
TARGET_HOSTS = frozenset({
    'cmszsoaa', 'cmszsoab', 'cmszdcss', 'cmszicss', 'cmsznpsa', 'cmsznpsb',
    'cmszinta', 'cmszintb', 'cmszdpsa', 'cmszdpsb', 'mcbsoaa', 'mcbsoab',
    'mcbinta', 'mcbintb', 'mcbdpsa', 'mcbdpsb', 'mcbnpsa', 'mcbnpsb',
    'mcbdcss', 'mcbicss', 'newdcss', 'newicss',
})

# Directory tree that is scanned, and directory that receives the matches.
WORK_DIR = '/root/XmlOut/'
TARGET_DIR = '/root/AccountOut/'


def processhtml(item):
    """Parse the HTML file at path ``item`` and return the BeautifulSoup tree.

    The file is opened in binary mode so that BeautifulSoup detects the
    document encoding itself instead of relying on the platform default.
    """
    with open(item, "rb") as fp:
        return BeautifulSoup(fp, "html.parser")


def IsComputer(soup_arg):
    """Return True when the ``资源类型`` input of the form has the value ``主机``.

    The detected resource type is printed.  When the input element is absent
    nothing is printed and False is returned.
    """
    field = soup_arg.find('input', {'name': '资源类型'})
    if field is None:
        return False

    value = field.get('value')
    if value == '主机':
        print('资源类型:主机')
        return True
    if value == '数据库':
        print('资源类型:数据库')
    else:
        print('资源类型:其他')
    return False


def IsAgree(soup_arg):
    """Return True when the ``帐号管理人员处理`` row carries the opinion ``同意``.

    Only table rows with exactly four ``<td>`` cells whose second cell
    contains a ``<font>`` element are considered.  The opinion text of every
    matching row is printed, so all rows are visited even after a hit.
    """
    result = False
    for row in soup_arg.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) != 4:
            continue
        opinion = cells[1].find('font')
        if opinion is None:
            continue
        # ``.string`` is None when the cell holds anything other than a single
        # text node; treat that as an empty label instead of raising.
        label = (cells[0].string or '').split()
        if label != ['帐号管理人员处理']:
            continue
        print(opinion.string)
        if opinion.string == '同意':
            print("满足条件为:%s && 审批意见(同意)" % label[0])
            result = True
    return result


def _mentions_target_host(soup, field_name, echo_hosts=False):
    """Return True when the input ``field_name`` names a host in TARGET_HOSTS.

    The input's value is split on ``、``, ``:`` and newlines; pieces that
    contain a lowercase ASCII letter are taken as host names and stripped of
    surrounding whitespace.  With ``echo_hosts`` each host name is printed.
    When the input or its ``value`` attribute is missing, nothing is printed
    and False is returned.
    """
    field = soup.find('input', {'name': field_name})
    value = field.get('value') if field is not None else None
    if value is None:
        return False

    hosts = [
        piece.strip()
        for piece in re.split('[、:\n]', value)
        if re.search('[a-z]', piece)
    ]
    if echo_hosts:
        for host in hosts:
            print(host)

    if TARGET_HOSTS.intersection(hosts):
        print('非空,有交集')
        return True
    print("空,无交集")
    return False


def IsIntersect(soup_arg):
    """Return True when ``239385_资源名称`` names a host in TARGET_HOSTS.

    Each host name found in the field is printed before the verdict.
    """
    return _mentions_target_host(soup_arg, '239385_资源名称', echo_hosts=True)


def IsIntersect2(soup_arg):
    """Return True when ``所在的硬件设备/软件平台`` names a host in TARGET_HOSTS."""
    return _mentions_target_host(soup_arg, '所在的硬件设备/软件平台')


def main():
    """Scan WORK_DIR and copy every qualifying file into TARGET_DIR."""
    for parent, _dirnames, filenames in os.walk(WORK_DIR, followlinks=True):
        for filename in filenames:
            file_path = os.path.join(parent, filename)
            print("filename with full path: %s" % file_path)
            soup = processhtml(file_path)
            # All four checks run unconditionally because each one prints
            # diagnostics; short-circuiting would drop part of the output.
            flag1 = IsComputer(soup)
            flag2 = IsAgree(soup)
            flag3 = IsIntersect(soup)
            flag4 = IsIntersect2(soup)
            if flag1 and flag2 and (flag3 or flag4):
                print('%s, ok----' % (file_path))
                # shutil.copy fails if the destination directory is missing.
                os.makedirs(TARGET_DIR, exist_ok=True)
                shutil.copy(file_path, TARGET_DIR)


if __name__ == '__main__':
    main()
最新文章
- Linux查看物理CPU个数、核数、逻辑CPU个数
- Extjs TabPanel 选项卡延迟加载
- [原创]Matlab获取当前时间信息
- 菜鸟学JS(五)——window.onload与$(document).ready()
- Java并发之:生产者消费者问题
- ECMAScript 5.1中对属性的操作
- Oracle 10g 数据文件的第一个数据块结构
- POJ 3667 Hotel (线段树区间合并)
- xampp
- python连接zookeeper的日志问题
- Win32 GDI 非矩形区域剪裁,双缓冲技术
- spring3.0事务的配置
- android SDK和ADT的更新
- MongoDB学习笔记-命令
- 第三次冲刺spring会议(第五次会议)
- 上传图文{"errcode":40007,"errmsg":"invalid media_id"}解决方案
- 数据定义语言(DDL Data Definition Language)基础学习笔记
- PythonStudy——三种字符串 Three strings
- 用excel批量生成insert语句
- 【16】命令模式(Command Pattern)
热门文章
- STL_string用法总结
- js中关于new Object时传参的一些细节分析
- CWnd* pParent
- 类 Fabric 主机管理程序开发
- c++ map: 使用struct或者数组做value
- Bequeath Connection and SYS Logon
- case....when ...多重判断
- Linxu基础入门
- BZOJ 1617 Usaco 2008 Mar. River Crossing渡河问题
- TypeError: CleanWebpackPlugin is not a constructor