根据抓取到的页面内容,用正则表达式匹配其中的 href 和 src 属性。

/// <summary>User-Agent header value sent with each upstream download.</summary>
string UserAgent = "Mozilla/5.0 (Windows NT 5.2; rv:29.0) Gecko/20100101 Firefox/29.0";

/// <summary>Content-Type header of the most recent successful upstream download.</summary>
string ContentType = "";

/// <summary>Base address of the upstream site; incoming request paths are resolved against it.</summary>
Uri strReqUrl = new Uri("http://m.lhrb.ufstone.net/");
/// <summary>
/// Handles an incoming request by downloading the same path from the upstream
/// site (<c>strReqUrl</c>), decoding the body as GB2312 text, passing it through
/// <c>GetHtmlUrl</c> and writing the result to the response.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Application_BeginRequest(object sender, EventArgs e)
{
    // Resolve the requested path and query string against the upstream base address.
    Uri upstreamUrl = new Uri(strReqUrl, Request.RawUrl);

    byte[] payload = getVerificationCode(upstreamUrl);
    if (payload == null)
    {
        // getVerificationCode returns null when the download fails; report a gateway
        // error instead of letting Encoding.GetString throw on a null buffer.
        Response.StatusCode = 502;
        Response.End();
        return;
    }

    // NOTE(review): every body is decoded as GB2312 text regardless of the upstream
    // Content-Type, so non-text resources fetched through this handler would be
    // altered - confirm that only HTML pages are expected here.
    StringBuilder strHtml = new StringBuilder(Encoding.GetEncoding("gb2312").GetString(payload));
    GetHtmlUrl(ref strHtml);

    Response.Write(strHtml.ToString());
    Response.End();
}
/// <summary>
/// Downloads the raw bytes at <paramref name="url"/> using browser-like request
/// headers and stores the response's Content-Type header in the
/// <c>ContentType</c> field.
/// </summary>
/// <param name="url">Absolute address to download.</param>
/// <returns>The response body, or <c>null</c> when the download fails.</returns>
public byte[] getVerificationCode(Uri url)
{
    // WebClient is IDisposable; the using block releases it on every exit path.
    using (WebClient client = new WebClient())
    {
        client.Headers.Add("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
        client.Headers.Add("Accept-Language", " zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
        client.Headers.Add("User-Agent", this.UserAgent);
        client.Credentials = CredentialCache.DefaultCredentials;

        try
        {
            byte[] pageData = client.DownloadData(url.AbsoluteUri);
            ContentType = client.ResponseHeaders["Content-Type"];
            return pageData;
        }
        catch (Exception ex)
        {
            // Callers treat null as "download failed"; keep that best-effort contract
            // but leave a trace so the failure is not completely silent.
            System.Diagnostics.Trace.TraceWarning("Download of {0} failed: {1}", url, ex.Message);
            return null;
        }
    }
}
    /// <summary>
    /// Scans the page markup for <c>src</c> and <c>href</c> attributes and collects
    /// the text of every match.
    /// </summary>
    /// <param name="strHtml">Page markup to scan. This method does not modify it.</param>
    void GetHtmlUrl(ref StringBuilder strHtml)
    {
        if (strHtml == null)
        {
            return;
        }

        // src= or href= followed by either a double-quoted value or an unquoted
        // run of non-whitespace characters.
        string reg = "(src|href)\\s*=\\s*(?:\"(?<1>[^\"]*)\"|(?<1>\\S+))";
        Regex r = new Regex(reg, RegexOptions.None);

        StringBuilder sb = new StringBuilder();
        for (Match match = r.Match(strHtml.ToString()); match.Success; match = match.NextMatch())
        {
            // Whole matched attribute text (name, '=' and value), one per line.
            sb.Append(match + "\n");
        }

        // NOTE(review): the collected matches in sb are not consumed anywhere yet and
        // strHtml is left untouched.
        // TODO: an earlier draft resolved each matched URL against strReqUrl and replaced
        // it with a root-relative path; re-introduce that rewrite here if it is still wanted.
    }

最新文章

  1. Undefined symbols for architecture arm64解决方案
  2. BZOJ 1030: [JSOI2007]文本生成器 [AC自动机 DP]
  3. 安装zookeeper遇到的问题以及解决方案
  4. 【C语言】C语言函数
  5. 关于IOS应用程序视图
  6. Java线程池应用
  7. ie10中元素超出父元素的宽度时不能自动隐藏
  8. 字符流缓冲区BufferedReader之readLine方法的原理
  9. Stage3D学习笔记(六):旋转动画效果
  10. JavaScript高级程序设计(六):关键字 void 和 delete 使用
  11. Gartner 如何看 RASP 和 WAF?
  12. CodeForces 158C - Cd and pwd commands(模拟)
  13. partial类修饰符
  14. Android适配难题全面总结
  15. dev控件 xtraTabbedMdiManager 如何将关闭子窗体改为收回主窗体内
  16. python 3.5 import theano ::hypot error
  17. sql优化问题笔记(mysql)
  18. Codeforces 235E. Number Challenge DP
  19. elk之elasticsearch 入门
  20. css3动画属性系列之transform细讲scale缩放

热门文章

  1. c语言结构体指针必须初始化
  2. Default Constructor的构造操作
  3. sql 查询一段时间内某个时间点数据
  4. 第一次点击button, view视图出现;第二次点击button,view视图消失
  5. java类的初始化
  6. 关于prototype属性的理解
  7. OpenCV 图片尺寸调整
  8. Educational Codeforces Round 15_B. Powers of Two
  9. 关于C++中虚函数表存放位置的思考
  10. dfs Codeforces Round #356 (Div. 2) D