ASP.NET 使用正则表达式和网络编程抓取网页数据(有用)

ASP.NET 使用正则表达式和网络编程抓取网页数据(有用)

        /// <summary>
/// Downloads the page at <paramref name="strUrl"/>, takes the first fragment that lies
/// between the <paramref name="Begin"/> and <paramref name="End"/> markers, and returns
/// that fragment after passing it through <c>NoHTML</c>.
/// </summary>
/// <param name="strUrl">Address of the page to fetch.</param>
/// <param name="Begin">
/// Marker that precedes the wanted content. It is concatenated into the regular
/// expression as-is (not escaped), so regex metacharacters in it act as pattern syntax.
/// </param>
/// <param name="End">
/// Marker that follows the wanted content. Concatenated into the pattern unescaped,
/// like <paramref name="Begin"/>.
/// </param>
/// <returns>
/// The result of <c>NoHTML</c> applied to the matched fragment. When the markers are
/// not found, the empty match value is what gets passed to <c>NoHTML</c>.
/// </returns>
private static String GetContent(String strUrl, String Begin, String End)
{
    String result;
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(strUrl);

    // Both the response and the reader are disposed even when reading throws.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    // NOTE(review): the body is decoded with the platform default encoding rather than
    // the charset announced by the response - confirm this suits the target pages.
    using (StreamReader reader = new StreamReader(response.GetResponseStream(), System.Text.Encoding.Default))
    {
        result = reader.ReadToEnd();
    }

    // Shortest run of any characters (line breaks included) that is preceded by Begin
    // and followed by End; the markers themselves are excluded by the look-arounds.
    // The match is case-insensitive.
    Match table = Regex.Match(result, "(?<=" + Begin + ")[\\s\\S]*?(?=" + End + ")", RegexOptions.IgnoreCase);
    return NoHTML(table.Value);
}

///<summary>
/// Strips HTML markup from a fragment: removes script elements and tags, decodes a
/// small set of character entities and drops any remaining angle brackets.
/// The cleaned text is then trimmed to what follows its first line feed and, when it
/// contains an apostrophe, to what follows the last apostrophe.
///</summary>
///<param name="Htmlstring">Source text that may contain HTML. May be null or empty.</param>
///<returns>The cleaned text; an empty string when the input is null or empty.</returns>
private static string NoHTML(string Htmlstring)
{
    // Nothing to clean; this also avoids Regex.Replace throwing on a null input.
    if (string.IsNullOrEmpty(Htmlstring))
    {
        return string.Empty;
    }

    // Remove script elements together with their bodies. [\s\S] is used instead of
    // '.' so that scripts spanning several lines are matched as well.
    Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>[\s\S]*?</script>", "",
        RegexOptions.IgnoreCase);

    // Remove every remaining tag.
    Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);

    // Remove a line break that is followed by whitespace, along with that whitespace.
    Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);

    // Remove leftovers of HTML comments: closing markers, and an opening marker
    // together with the rest of its line.
    Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
    Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

    // Decode the supported entities in ONE pass, so text produced by decoding is never
    // decoded a second time (e.g. "&amp;lt;" becomes "&lt;", not "<").
    Htmlstring = Regex.Replace(Htmlstring, @"&(quot|amp|lt|gt|nbsp|iexcl|cent|pound|copy|#\d+);",
        new MatchEvaluator(DecodeHtmlEntity), RegexOptions.IgnoreCase);

    // Drop any angle brackets that are left (including decoded ones) and CR/LF pairs.
    Htmlstring = Htmlstring.Replace(">", "").Replace("<", "").Replace("\r\n", "");

    // Keep only what follows the first line feed (the whole text when there is none).
    // The char overload performs an ordinal search, independent of the current culture.
    // NOTE(review): this and the apostrophe cut below look tailored to one particular
    // page layout - confirm they are still wanted.
    Htmlstring = Htmlstring.Substring(Htmlstring.IndexOf('\n') + 1);

    // Keep only what follows the last apostrophe, if any.
    int lastApostrophe = Htmlstring.LastIndexOf('\'');
    if (lastApostrophe >= 0)
    {
        Htmlstring = Htmlstring.Substring(lastApostrophe + 1);
    }

    return Htmlstring;
}

/// <summary>
/// Maps one matched character entity to its replacement text. Group 1 of the match
/// holds the entity name, or '#' followed by decimal digits.
/// </summary>
/// <param name="entity">Match produced by the entity pattern used in <c>NoHTML</c>.</param>
/// <returns>
/// The decoded character for the supported entities; an empty string for any other
/// numeric entity, which is thereby removed from the text.
/// </returns>
private static string DecodeHtmlEntity(Match entity)
{
    switch (entity.Groups[1].Value.ToLowerInvariant())
    {
        case "quot":
        case "#34":
            return "\"";
        case "amp":
        case "#38":
            return "&";
        case "lt":
        case "#60":
            return "<";
        case "gt":
        case "#62":
            return ">";
        case "nbsp":
        case "#160":
            return " ";
        case "iexcl":
        case "#161":
            return "\u00a1";
        case "cent":
        case "#162":
            return "\u00a2";
        case "pound":
        case "#163":
            return "\u00a3";
        case "copy":
        case "#169":
            return "\u00a9";
        default:
            // Any other numeric entity is dropped.
            return string.Empty;
    }
}

最新文章

  1. mciSendString 的两个小坑
  2. 11个Visual Studio代码性能分析工具
  3. 解决osg路径与文件名中的中文字符问题
  4. world machine, 输出lightmap
  5. php的SAPI,CLI SAPI,CGI SAPI
  6. Redis主从是否生效的特殊测试方法
  7. java复制File文件操作
  8. hdoj 5461 Largest Point
  9. Amoeba实现mysql主从读写分离
  10. Rxjava+Retrofit2+Okhttp3多文件上传(服务器端代码+客户端代码)
  11. PHP封装的一个单例模式Mysql操作类
  12. MongoDB集群之分片
  13. Java中谈尾递归--尾递归和垃圾回收的比较
  14. sftp无法连接问题
  15. linux 内核模块makefile通用模板
  16. Linux 小知识翻译 - 「版本号」的命名方式
  17. Integer Sequence Dividing CodeForces - 1102A (规律)
  18. GDB 调试器使用手冊
  19. mybatis的批量操作
  20. Netty学习第三节Netty的入门级学习

热门文章

  1. Xamarin Mono Android Ios 安装、破解(4.12)
  2. [GRYZ2015]INCR
  3. 微软控制台带来的PHP控制台输出问题
  4. PackageManager获取版本号
  5. 树形DP CCPC网络赛 HDU5834 Magic boy Bi Luo with his excited tree
  6. What do data scientist do?
  7. anaconda在linux下的安装注意事项
  8. JSP学习笔记(一)
  9. SRM DIV1 500pt DP
  10. Cocos2d-x 对于中文的支持-----iconv库