[C#] - 从 HTML 代码中转换 / 提取可读文字（PlainText）的方法

背景

在做网页数据分析时，我们关注的是页面的正文内容，因此需要过滤掉 HTML 标签、JavaScript、CSS 等代码。

目标输入

<b>Hello World.</b><br/><p><i>Is there anyone out there?</i><p>

输出结果

Hello World. Is there anyone out there?

开发工具

Html Agility Pack
http://html-agility-pack.net/

实现方案1：（过滤规则严谨，保留HTML版式，推荐使用！）

//small but important modification to class https://github.com/zzzprojects/html-agility-pack/blob/master/src/Samples/Html2Txt/HtmlConvert.cs

/// <summary>
/// Converts HTML into readable plain text by walking an Html Agility Pack DOM.
/// Block-level elements produce line breaks, list items get a bullet or a number,
/// links and images have their target appended, and script/style text and
/// comments are dropped.
/// </summary>
public static class HtmlToText
{
    /// <summary>Loads the HTML file at <paramref name="path"/> and converts it to plain text.</summary>
    /// <param name="path">Path of the HTML file, passed to <c>HtmlDocument.Load</c>.</param>
    /// <returns>The plain-text rendering of the document.</returns>
    public static string Convert(string path)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.Load(path);
        return ConvertDoc(doc);
    }

    /// <summary>Parses an HTML string and converts it to plain text.</summary>
    /// <param name="html">HTML markup, passed to <c>HtmlDocument.LoadHtml</c>.</param>
    /// <returns>The plain-text rendering of the markup.</returns>
    public static string ConvertHtml(string html)
    {
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);
        return ConvertDoc(doc);
    }

    /// <summary>Converts an already loaded document to plain text.</summary>
    /// <param name="doc">The document whose <c>DocumentNode</c> is rendered.</param>
    /// <returns>The plain-text rendering of the document.</returns>
    public static string ConvertDoc(HtmlDocument doc)
    {
        using (StringWriter sw = new StringWriter())
        {
            ConvertTo(doc.DocumentNode, sw);
            sw.Flush();
            return sw.ToString();
        }
    }

    /// <summary>Renders every child of <paramref name="node"/> in document order.</summary>
    /// <param name="node">Node whose children are rendered.</param>
    /// <param name="outText">Writer receiving the text.</param>
    /// <param name="textInfo">Whitespace/list state shared by the rendered children.</param>
    internal static void ConvertContentTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        foreach (HtmlNode subnode in node.ChildNodes)
        {
            ConvertTo(subnode, outText, textInfo);
        }
    }

    /// <summary>Renders <paramref name="node"/> and its descendants as plain text, starting with fresh state.</summary>
    /// <param name="node">Root node to render.</param>
    /// <param name="outText">Writer receiving the text.</param>
    public static void ConvertTo(HtmlNode node, TextWriter outText)
    {
        // "false" converts implicitly to a BoolWrapper: no text has been written yet.
        ConvertTo(node, outText, new PreceedingDomTextInfo(false));
    }

    /// <summary>
    /// Renders a single node (and, for elements, its children) as plain text.
    /// </summary>
    /// <param name="node">Node to render.</param>
    /// <param name="outText">Writer receiving the text.</param>
    /// <param name="textInfo">
    /// State describing the text written so far; it is read and updated to decide
    /// whether leading whitespace is kept and how list items are numbered.
    /// </param>
    internal static void ConvertTo(HtmlNode node, TextWriter outText, PreceedingDomTextInfo textInfo)
    {
        string html;
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // don't output comments
                break;

            case HtmlNodeType.Document:
                ConvertContentTo(node, outText, textInfo);
                break;

            case HtmlNodeType.Text:
                // script and style must not be output.
                // Guard against a text node that has no parent (e.g. one passed in directly
                // by a caller) so the public ConvertTo overload does not throw on it.
                string parentName = node.ParentNode != null ? node.ParentNode.Name : null;
                if ((parentName == "script") || (parentName == "style"))
                {
                    break;
                }

                // get text
                html = ((HtmlTextNode)node).Text;

                // is it in fact a special closing node output as text?
                if (HtmlNode.IsOverlappedClosingElement(html))
                {
                    break;
                }

                // nothing to write for an empty text node
                if (html.Length == 0)
                {
                    break;
                }

                // At the start of a block/line, or right after a space was emitted,
                // leading whitespace would be redundant: drop it.
                if (!textInfo.WritePrecedingWhiteSpace || textInfo.LastCharWasSpace)
                {
                    html = html.TrimStart();
                    if (html.Length == 0)
                    {
                        break;
                    }

                    textInfo.IsFirstTextOfDocWritten.Value = textInfo.WritePrecedingWhiteSpace = true;
                }

                // Collapse runs of whitespace to one space and decode HTML entities.
                outText.Write(HtmlEntity.DeEntitize(Regex.Replace(html.TrimEnd(), @"\s{2,}", " ")));

                // Trailing whitespace was trimmed above; re-emit it as a single space and
                // remember that, so the next text node does not start with another one.
                textInfo.LastCharWasSpace = char.IsWhiteSpace(html[html.Length - 1]);
                if (textInfo.LastCharWasSpace)
                {
                    outText.Write(' ');
                }

                break;

            case HtmlNodeType.Element:
                string endElementString = null; // written after the element's children, if set
                bool isInline;                  // inline elements share the caller's text state
                bool skip = false;              // true: do not render the element's children
                int listIndex = 0;              // 0 = bulleted list items, >0 = next item number

                switch (node.Name)
                {
                    case "nav":
                        // navigation content is not rendered at all
                        skip = true;
                        isInline = false;
                        break;

                    case "body":
                    case "section":
                    case "article":
                    case "aside":
                    case "h1":
                    case "h2":
                    case "header":
                    case "footer":
                    case "address":
                    case "main":
                    case "div":
                    case "p": // stylistic - adjust as you tend to use
                        // Start on a new line unless nothing has been written yet.
                        if (textInfo.IsFirstTextOfDocWritten)
                        {
                            outText.Write("\r\n");
                        }

                        endElementString = "\r\n";
                        isInline = false;
                        break;

                    case "br":
                        outText.Write("\r\n");
                        skip = true;
                        // The next text starts a fresh line, so its leading whitespace is trimmed.
                        textInfo.WritePrecedingWhiteSpace = false;
                        isInline = true;
                        break;

                    case "a":
                        if (node.Attributes.Contains("href"))
                        {
                            string href = node.Attributes["href"].Value.Trim();

                            // Append the target only when the link text does not already contain it.
                            if (node.InnerText.IndexOf(href, StringComparison.InvariantCultureIgnoreCase) == -1)
                            {
                                endElementString = "<" + href + ">";
                            }
                        }

                        isInline = true;
                        break;

                    case "li":
                        if (textInfo.ListIndex > 0)
                        {
                            // numbered item inside an "ol": write the number, then advance it
                            outText.Write("\r\n{0}.\t", textInfo.ListIndex++);
                        }
                        else
                        {
                            outText.Write("\r\n*\t"); //using '*' as bullet char, with tab after, but whatever you want eg "\t->", if utf-8 0x2022
                        }

                        isInline = false;
                        break;

                    case "ol":
                        // ordered lists number their items starting at 1
                        listIndex = 1;
                        goto case "ul";

                    case "ul": //not handling nested lists any differently at this stage - that is getting close to rendering problems
                        endElementString = "\r\n";
                        isInline = false;
                        break;

                    case "img": //inline-block in reality
                        if (node.Attributes.Contains("alt"))
                        {
                            outText.Write('[' + node.Attributes["alt"].Value);
                            endElementString = "]";
                        }

                        if (node.Attributes.Contains("src"))
                        {
                            outText.Write('<' + node.Attributes["src"].Value + '>');
                        }

                        isInline = true;
                        break;

                    default:
                        isInline = true;
                        break;
                }

                if (!skip && node.HasChildNodes)
                {
                    // Block-level elements give their children fresh whitespace state (and the
                    // list numbering chosen above) while still sharing the document-wide
                    // "first text written" flag.
                    ConvertContentTo(node, outText, isInline ? textInfo : new PreceedingDomTextInfo(textInfo.IsFirstTextOfDocWritten) { ListIndex = listIndex });
                }

                if (endElementString != null)
                {
                    outText.Write(endElementString);
                }

                break;
        }
    }
}

/// <summary>
/// Mutable state that HtmlToText threads through its DOM walk: it describes the
/// text written so far so that the next text node can decide how to treat
/// whitespace, and it carries the current list numbering.
/// </summary>
internal class PreceedingDomTextInfo
{
    /// <summary>
    /// Flag wrapped in a reference type so that several instances can share it;
    /// HtmlToText sets its value once text has been written.
    /// </summary>
    public readonly BoolWrapper IsFirstTextOfDocWritten;

    /// <summary>Creates state that uses the given "first text written" flag.</summary>
    /// <param name="isFirstTextOfDocWritten">Flag instance to store (a plain bool converts implicitly).</param>
    public PreceedingDomTextInfo(BoolWrapper isFirstTextOfDocWritten)
    {
        this.IsFirstTextOfDocWritten = isFirstTextOfDocWritten;
    }

    /// <summary>When false, HtmlToText trims leading whitespace from the next text node.</summary>
    public bool WritePrecedingWhiteSpace { get; set; }

    /// <summary>Whether the most recently written text ended in whitespace.</summary>
    public bool LastCharWasSpace { get; set; }

    /// <summary>Number for the next list item; HtmlToText writes a bullet instead when it is not positive.</summary>
    public int ListIndex { get; set; }
}

/// <summary>
/// Reference-type box around a <see cref="bool"/>, so one flag can be shared and
/// updated through several holders. Converts implicitly to and from bool.
/// </summary>
internal class BoolWrapper
{
    /// <summary>Creates a wrapper whose <see cref="Value"/> is false.</summary>
    public BoolWrapper()
    {
    }

    /// <summary>The wrapped flag.</summary>
    public bool Value { get; set; }

    /// <summary>Boxes a bool into a new wrapper instance.</summary>
    public static implicit operator BoolWrapper(bool value)
    {
        BoolWrapper wrapper = new BoolWrapper();
        wrapper.Value = value;
        return wrapper;
    }

    /// <summary>Reads the wrapped flag.</summary>
    public static implicit operator bool(BoolWrapper wrapper)
    {
        return wrapper.Value;
    }
}

实现方案2：（过滤规则不严谨，适用于结构简单的HTML）

/// <summary>
/// Removes everything that looks like a markup tag (an opening angle bracket up to
/// the next closing one) from the input and optionally decodes HTML entities.
/// Text between tags is kept as is, including the content of script and style
/// elements, so this is only suitable for simply structured HTML.
/// </summary>
/// <param name="HTMLText">The markup to strip; null or empty yields an empty string.</param>
/// <param name="decode">When true, HTML entities in the result are decoded.</param>
/// <returns>The input without tags, entity-decoded if requested.</returns>
public static string StripHTML(string HTMLText, bool decode = true)
{
    // Regex.Replace throws ArgumentNullException on null input; there is nothing to strip anyway.
    if (string.IsNullOrEmpty(HTMLText))
    {
        return string.Empty;
    }

    // No RegexOptions.IgnoreCase: the pattern contains no letters, so case cannot matter.
    string stripped = Regex.Replace(HTMLText, "<[^>]+>", string.Empty);

    return decode ? HttpUtility.HtmlDecode(stripped) : stripped;
}

参考资料

https://stackoverflow.com/a/25178738
https://stackoverflow.com/a/732110

巴特西

[C#] - 从 HTML 代码中转换 / 提取可读文字（PlainText）的方法

最新文章

热门文章

巴特西

[C#] - 从 HTML 代码中 转换 / 提取 可读文字（PlainText）的方法

最新文章

热门文章

[C#] - 从 HTML 代码中转换 / 提取可读文字（PlainText）的方法