【概述】做好一个 Web 系统的安全运维,除了常规的防注入、防入侵之外,还有一项重要工作:检测并过滤敏感词、脏词。这件事做得不好,轻则引发投诉或纠纷,重则导致产品被勒令关停。

废话少说,先看下代码,可以拿过去直接使用。

 using Microsoft.VisualBasic;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text; namespace OpenCore.ContentSecurity
{
/// <summary>
/// Detects and masks forbidden ("sensitive") words in a text with a single left-to-right scan.
/// Dictionary words are indexed by their first character, so at each position the input is only
/// compared against the words that can start there, instead of running one replace per word.
/// Reference consulted during development: https://blog.csdn.net/u011966339/article/details/72832197
/// Change log:
/// 2020-4-15: the dictionary is loaded once into static storage (see <see cref="Init"/>) instead of
/// being reloaded repeatedly; several dictionary files may be loaded; algorithm details were hardened.
/// </summary>
/// <remarks>
/// The dictionary is shared by all instances. Each instance keeps the hit list of its most recent
/// <see cref="getDataByFilter"/> call, so a single instance should not be shared between threads.
/// </remarks>
public class SensitiveWordFilter
{
    /// <summary>
    /// Number of slots in the first-character index: one per UTF-16 code unit (0..65535),
    /// so that every possible <see cref="char"/> value is a valid index.
    /// </summary>
    private const int LexiconSize = char.MaxValue + 1;

    /// <summary>
    /// Dictionary file paths passed to the most recent <see cref="Init"/> call.
    /// </summary>
    private static string[] dictionaryPathList = null;

    /// <summary>
    /// In-memory dictionary, indexed by the first character of each word. A slot is null when no
    /// word starts with that character; otherwise it holds the remainders of the words that do.
    /// The array is replaced as a whole by <see cref="LoadDictionary"/> and is not modified after
    /// it has been assigned here, so a scan can keep using the reference it read.
    /// </summary>
    private static volatile WordGroup[] MEMORYLEXICON = new WordGroup[LexiconSize];

    /// <summary>
    /// Serializes dictionary (re)loads.
    /// </summary>
    private static readonly object lockObj = new object();

    /// <summary>
    /// Configures the dictionary files and loads them into memory. Intended to be called once at
    /// start-up; calling it again replaces the previously loaded dictionary.
    /// </summary>
    /// <param name="sDictionaryFileName">Paths of the dictionary files, one word per line.</param>
    public static void Init(string[] sDictionaryFileName)
    {
        lock (lockObj)
        {
            dictionaryPathList = sDictionaryFileName;
            LoadDictionary();
        }
    }

    public SensitiveWordFilter()
    { }

    private readonly List<string> illegalWords = new List<string>();

    /// <summary>
    /// Forbidden words found by the most recent <see cref="getDataByFilter"/> call, in order of
    /// appearance and exactly as they were written in the input (separators included).
    /// </summary>
    public List<string> IllegalWords
    {
        get { return illegalWords; }
    }

    /// <summary>
    /// Tells whether the character is a CJK unified ideograph in the range U+4E00..U+9FA5.
    /// </summary>
    /// <param name="character">Character to classify.</param>
    /// <returns>True for a character inside that range.</returns>
    private static bool isCHS(char character)
    {
        return character >= 0x4e00 && character <= 0x9fa5;
    }

    /// <summary>
    /// Tells whether the character is an ASCII digit.
    /// </summary>
    /// <param name="character">Character to classify.</param>
    /// <returns>True for '0'..'9'.</returns>
    private static bool isNum(char character)
    {
        return character >= '0' && character <= '9';
    }

    /// <summary>
    /// Tells whether the character is an ASCII letter.
    /// </summary>
    /// <param name="character">Character to classify.</param>
    /// <returns>True for 'a'..'z' and 'A'..'Z'.</returns>
    private static bool isAlphabet(char character)
    {
        return (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z');
    }

    /// <summary>
    /// Tells whether the character can be part of a word (ideograph, digit or letter).
    /// Everything else is treated as a separator that may be inserted between the characters
    /// of a forbidden word to disguise it.
    /// </summary>
    /// <param name="character">Character to classify.</param>
    /// <returns>True when the character is not a separator.</returns>
    private static bool IsWordChar(char character)
    {
        return isCHS(character) || isNum(character) || isAlphabet(character);
    }

    /// <summary>
    /// Converts full-width characters to half-width and lower-cases the result.
    /// </summary>
    /// <param name="input">Any string; null is treated as empty.</param>
    /// <returns>The normalized string. It has the same length as the input.</returns>
    /// <remarks>
    /// The full-width space is 12288 and the half-width space is 32.
    /// The other half-width characters (33-126) and full-width characters (65281-65374)
    /// differ by exactly 65248.
    /// </remarks>
    private static string ToDBC(string input)
    {
        if (string.IsNullOrEmpty(input))
        {
            return string.Empty;
        }
        char[] c = input.ToCharArray();
        for (int i = 0; i < c.Length; i++)
        {
            if (c[i] == 12288)
            {
                c[i] = (char)32;
                continue;
            }
            if (c[i] > 65280 && c[i] < 65375)
            {
                c[i] = (char)(c[i] - 65248);
            }
        }
        // Invariant casing: dictionary and input must normalize identically whatever the
        // current thread culture is.
        return new string(c).ToLowerInvariant();
    }

    /// <summary>
    /// Converts Traditional Chinese characters to Simplified Chinese.
    /// </summary>
    /// <param name="sInput">Text to convert.</param>
    /// <returns>
    /// The converted text, an empty string for null or empty input, or the unchanged input when
    /// the conversion is not available.
    /// </returns>
    private static string ToSimplifiedChiniese(string sInput)
    {
        if (string.IsNullOrEmpty(sInput))
        {
            return string.Empty;
        }
        try
        {
            // LocaleID 0 means "use the system locale".
            return Strings.StrConv(sInput, VbStrConv.SimplifiedChinese, 0);
        }
        catch (Exception)
        {
            // Best effort: the conversion can be unavailable for the current locale or
            // platform; in that case only the word as written is kept.
        }
        return sInput;
    }

    /// <summary>
    /// Appends a message to today's log file in the "SecurityLog" folder under the application
    /// base directory (not meant for cross-AppDomain scenarios). Failures are ignored.
    /// </summary>
    /// <param name="Msg">Message to append.</param>
    private static void SaveLog(string Msg)
    {
        try
        {
            string sPath = Path.Combine(AppDomain.CurrentDomain.SetupInformation.ApplicationBase, "SecurityLog");
            // No-op when the directory already exists.
            Directory.CreateDirectory(sPath);
            sPath = Path.Combine(sPath, DateTime.Now.ToString("yyyyMMdd") + ".log");
            File.AppendAllText(sPath, "[" + DateTime.Now.ToString() + "]" + Msg + "\r\n");
        }
        catch (Exception)
        {
            // Logging is best effort and must never break filtering.
        }
    }

    /// <summary>
    /// Loads all configured dictionary files into a fresh index and publishes it.
    /// If the path list is empty or any file is missing, the problem is logged and the
    /// previously loaded dictionary is left untouched.
    /// </summary>
    private static void LoadDictionary()
    {
        string[] paths = dictionaryPathList;
        if (paths == null || paths.Length == 0)
        {
            SaveLog("SensitiveWordFilter.LoadDictionary.字典路径配置为空");
            return;
        }
        foreach (string sFileName in paths)
        {
            if (!File.Exists(sFileName))
            {
                SaveLog($"SensitiveWordFilter.LoadDictionary.路径:{sFileName}不是一个有效的文件");
                return;
            }
        }

        // The set removes duplicates across files.
        HashSet<string> uniqueWords = new HashSet<string>(StringComparer.Ordinal);
        foreach (string sDictionaryFile in paths)
        {
            // Encoding.Default is platform dependent; files with a byte-order mark are
            // decoded according to the mark.
            foreach (string line in File.ReadAllLines(sDictionaryFile, Encoding.Default))
            {
                // Normalize first so that full-width spaces are trimmed as well.
                string key = ToDBC(line).Trim();
                if (key.Length == 0)
                {
                    continue;
                }
                uniqueWords.Add(key);
                // Also register the Simplified Chinese spelling of Traditional Chinese entries.
                string key_simple = ToSimplifiedChiniese(key);
                if (key_simple != key)
                {
                    uniqueWords.Add(key_simple);
                }
            }
        }

        // Build the index aside and publish it with one assignment, so a filter running on
        // another thread sees either the old or the new dictionary, never a half-filled one.
        WordGroup[] lexicon = new WordGroup[LexiconSize];
        foreach (string word in uniqueWords)
        {
            WordGroup group = lexicon[word[0]];
            if (group == null)
            {
                group = new WordGroup();
                lexicon[word[0]] = group;
            }
            group.Add(word.Substring(1));
        }
        MEMORYLEXICON = lexicon;
    }

    /// <summary>
    /// Tries to match the remainder of a dictionary word right after a matched first character.
    /// Separators in the text are skipped before each word character of the remainder.
    /// </summary>
    /// <param name="text">Normalized text being scanned.</param>
    /// <param name="headIndex">Index in the text of the already matched first character.</param>
    /// <param name="tail">Dictionary word without its first character; may be empty.</param>
    /// <returns>Index of the last text character covered by the match, or -1 when there is no match.</returns>
    private static int MatchTail(string text, int headIndex, string tail)
    {
        int last = headIndex;
        foreach (char expected in tail)
        {
            int next = last + 1;
            if (IsWordChar(expected))
            {
                // Step over separators inserted between the characters of the word.
                while (next < text.Length && !IsWordChar(text[next]))
                {
                    next++;
                }
            }
            // A separator that is part of the dictionary word itself must match in place.
            if (next >= text.Length || text[next] != expected)
            {
                // Also covers a text that ends before the whole word was seen.
                return -1;
            }
            last = next;
        }
        return last;
    }

    /// <summary>
    /// Finds the longest dictionary word starting at the given position.
    /// </summary>
    /// <param name="lexicon">Dictionary index to search.</param>
    /// <param name="text">Normalized text being scanned.</param>
    /// <param name="headIndex">Position at which a word has to start.</param>
    /// <returns>Index of the last text character of the longest match, or -1 when nothing matches.</returns>
    private static int FindLongestMatch(WordGroup[] lexicon, string text, int headIndex)
    {
        WordGroup group = lexicon[text[headIndex]];
        if (group == null)
        {
            return -1;
        }
        int best = -1;
        int count = group.Count();
        for (int z = 0; z < count; z++)
        {
            int end = MatchTail(text, headIndex, group.GetWord(z));
            if (end > best)
            {
                best = end;
            }
        }
        return best;
    }

    /// <summary>
    /// Replaces every character of every forbidden word found in the input with the given
    /// character and records the hits in <see cref="IllegalWords"/>.
    /// Matching ignores case and full-width/half-width differences, and tolerates separators
    /// (anything that is not an ideograph, digit or letter) between the characters of a word.
    /// When several dictionary words start at the same position, the longest one wins.
    /// </summary>
    /// <param name="sSourceInput">Text to check.</param>
    /// <param name="replaceChar">Mask character, for example '*'.</param>
    /// <returns>The masked text; null or empty input is returned unchanged.</returns>
    public string getDataByFilter(string sSourceInput, char replaceChar)
    {
        // Always reset, so that hits of an earlier call never leak into this result.
        illegalWords.Clear();
        if (string.IsNullOrEmpty(sSourceInput))
        {
            return sSourceInput;
        }
        WordGroup[] lexicon = MEMORYLEXICON;
        if (lexicon == null || lexicon.Length == 0)
        {
            SaveLog("SensitiveWordFilter.getDataByFilter.内存字典为空");
            return sSourceInput;
        }

        // Normalized once; it has the same length as the input, so positions found in it
        // are valid positions in the original text as well.
        string normalized = ToDBC(sSourceInput);
        char[] masked = sSourceInput.ToCharArray();
        int position = 0;
        while (position < normalized.Length)
        {
            int matchEnd = FindLongestMatch(lexicon, normalized, position);
            if (matchEnd < 0)
            {
                position++;
                continue;
            }
            illegalWords.Add(sSourceInput.Substring(position, matchEnd - position + 1));
            for (int pos = position; pos <= matchEnd; pos++)
            {
                masked[pos] = replaceChar;
            }
            // Continue behind the masked word.
            position = matchEnd + 1;
        }
        return new string(masked);
    }
}
/// <summary>
/// Duplicate-free, insertion-ordered collection of the words that share the same first
/// character. SensitiveWordFilter stores each word here without that first character.
/// </summary>
public class WordGroup
{
    /// <summary>
    /// Words in insertion order; backs <see cref="GetWord"/> and <see cref="Count"/>.
    /// </summary>
    private readonly List<string> groupList = new List<string>();

    /// <summary>
    /// Same words as <see cref="groupList"/>, kept for a constant-time duplicate check
    /// instead of scanning the list on every <see cref="Add"/>.
    /// </summary>
    private readonly HashSet<string> knownWords = new HashSet<string>();

    public WordGroup()
    { }

    /// <summary>
    /// Adds a word unless an equal word is already present.
    /// </summary>
    /// <param name="word">Word to add.</param>
    public void Add(string word)
    {
        // HashSet.Add returns false for a value that is already contained.
        if (knownWords.Add(word))
        {
            groupList.Add(word);
        }
    }

    /// <summary>
    /// Gets the number of distinct words in the group.
    /// </summary>
    /// <returns>Number of stored words.</returns>
    public int Count()
    {
        return groupList.Count;
    }

    /// <summary>
    /// Gets a word by its zero-based insertion index.
    /// </summary>
    /// <param name="index">Index in the range 0..Count()-1.</param>
    /// <returns>The word stored at that index.</returns>
    public string GetWord(int index)
    {
        return groupList[index];
    }
}
}

上面是一个完整的,独立的实现类。 下面给一个简单的调用示例:

   // Global configuration: do this once for the whole program; no further configuration is needed afterwards.
SensitiveWordFilter.Init(new string[] {
@"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\暴恐词库.txt",
@"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\反动词库.txt",
@"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\民生词库.txt",
@"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\色情词库.txt",
@"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\贪腐词库.txt",
@"C:\Users\x\Downloads\网站需要过滤的敏感词\mgck-master\其他词库.txt"
});// Note: these paths must be correct; if a file is missing the dictionary is not loaded and nothing is filtered.
// Per the author, the filter can be instantiated in several places and run concurrently
// (NOTE(review): each instance keeps its own IllegalWords list, so use one instance per caller).
SensitiveWordFilter wordFilter = new SensitiveWordFilter();
Dictionary<string, string> dictTestData = new Dictionary<string, string>();
// A few sample inputs to see the effect.
dictTestData["杀^人游戏,有人找一夜q"] = "";// Note: the original sample did not say "一夜q"; the blog platform's own sensitive-word check refused to publish it, so it was changed to q. To test, replace it locally with entries from your dictionary.
dictTestData["数学学习课堂"] = "";
dictTestData["打击法0功有,法0功毒害大众"] = "";
Dictionary<string, string> dictResult = new Dictionary<string, string>();
// Filter every sample and record the masked text together with the detected words.
foreach(string sKey in dictTestData.Keys)
{
dictResult[sKey] = $"替换后:{wordFilter.getDataByFilter(sKey,'|')}, ------------检测违禁词:{string.Join(",",(wordFilter.IllegalWords==null?new List<string>():wordFilter.IllegalWords))}";
}
// JsonConverter and Utils are helpers that are not defined in this snippet.
string sResultJson = JsonConverter.SerializeObject(dictResult);
Utils.SaveLog(sResultJson);

 最后,给一下打印的结果:

"杀^人游戏,有人找一夜q":     "替换后:杀^人游戏,有人找|||, ------------检测违禁词:一夜q",
"数学学习课堂":     "替换后:数学学习课堂, ------------检测违禁词:",
"打击法0功有,法0功毒害大众":   "替换后:打击|||有,|||毒害大众, ------------检测违禁词:法0功,法0功"

-------------附

词库下载地址:https://codeload.github.com/chason777777/mgck/zip/master

最新文章

  1. hdu-1823 Luck and Love
  2. maven的简单安装与配置
  3. Java——匿名内部类
  4. Oralce11 客户端的安装和 PlSql Developer 的配置
  5. ASP.NET - 跳转页面
  6. 菜鸟学习spring IOC有感
  7. c语言scanf详解
  8. 第5天:js-系统时间对象
  9. Java开发小技巧(一)
  10. JAVA入门[18]-JdbcTemplate简单实例
  11. java数组排序,并将数组内的数据求和
  12. python ftp批量上传文件下载文件
  13. linux 批量测试域名返回码脚本
  14. js实现复制内容自动添加版权信息
  15. VMware中安装Centos 7
  16. @Autowired 与@Resource的区别(详细)
  17. python迭代器的说明
  18. [Coding Study]——目录
  19. JAVA 中的 StringBuilder 和 StringBuffer 适用的场景是什么?
  20. bootstrap Table API和一些简单使用方法

热门文章

  1. consoleInfo 输出 数组套对象 不显示...的方法 序列化 再反序列化
  2. C语言-转义字符
  3. Druid连接池和springJDbc框架-Java(新手)
  4. npm和yarn使用
  5. 记一次Metasploit心脏出血漏洞攻击测试
  6. 全国职业技能大赛信息安全管理与评估-MySQL爆破脚本
  7. go micro实战01:快速搭建服务
  8. 我是如何用IDEA调试BUG的?
  9. Convert JS object to JSON string
  10. 【翻译】.NET 5 Preview2发布