1 爬虫,爬虫攻防

2 下载html

3 xpath解析html,获取数据和深度抓取(和正则匹配)

4 多线程抓取

熟悉http协议

提供两个方法Post和Get

/// <summary>
/// Fetches <paramref name="url"/> with an HTTP GET request and returns the response body as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="encoding">
/// Encoding used to decode the response body. Defaults to UTF-8 when null;
/// pass another encoding (e.g. GB2312) if the page comes back garbled.
/// </param>
/// <param name="headDic">Optional extra request headers (header name to value); may be null.</param>
/// <returns>
/// The decoded body on 200 OK; <c>string.Empty</c> when the response status is not 200 OK;
/// <c>null</c> when the request or the read failed with an exception (the failure is logged).
/// </returns>
public static string HttpGet(string url, Encoding encoding = null,  Dictionary<string,string> headDic=null)
{
    string html = string.Empty;
    try
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url); // simulate a browser request
        // NOTE(review): the original timeout expression was lost ("= * ;"); 30 seconds is assumed — confirm.
        request.Timeout = 30 * 1000;
        request.UserAgent = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36";
        request.ContentType = "text/html; charset=utf-8";
        if (headDic != null)
        {
            foreach (var item in headDic)
            {
                request.Headers.Add(item.Key, item.Value);
            }
        }

        // Honor the caller's encoding; previously any non-null value was overwritten with GB2312.
        if (encoding == null)
        {
            encoding = Encoding.UTF8;
        }

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            if (response.StatusCode != HttpStatusCode.OK)
            {
                log.Warn(string.Format("抓取{0}地址返回失败,response.StatusCode为{1}", url, response.StatusCode));
            }
            else
            {
                try
                {
                    // using guarantees the reader is closed even when ReadToEnd throws.
                    using (StreamReader sr = new StreamReader(response.GetResponseStream(), encoding))
                    {
                        html = sr.ReadToEnd();
                    }
                }
                catch (Exception ex)
                {
                    log.Error(string.Format("DownloadHtml抓取{0}保存失败", url), ex);
                    html = null;
                }
            }
        }
    }
    catch (WebException ex)
    {
        // Detect 306 from the status code rather than the localized exception message,
        // so the check also works on runtimes whose messages are not in Chinese.
        HttpWebResponse errorResponse = ex.Response as HttpWebResponse;
        if (errorResponse != null && (int)errorResponse.StatusCode == 306)
        {
            log.Error("远程服务器返回错误: (306)。", ex);
        }
        else
        {
            // Other web failures used to be swallowed without any trace.
            log.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
        }
        html = null;
    }
    catch (Exception ex)
    {
        log.Error(string.Format("DownloadHtml抓取{0}出现异常", url), ex);
        html = null;
    }
    return html;
}
        /// <summary>
        /// Calls an HTTP endpoint with a POST request whose body is sent as
        /// <c>application/x-www-form-urlencoded</c>, UTF-8 encoded.
        /// </summary>
        /// <param name="url">Endpoint address.</param>
        /// <param name="value">Request body (form parameters); null is sent as an empty body.</param>
        /// <returns>
        /// The trimmed response text when the status is 200 OK; otherwise a string starting with
        /// "失败:Status:" (non-OK status) or "失败:ex:" (any exception, including an invalid URL).
        /// </returns>
        public static string HttpPost(string url, string value)
        {
            // Treat a null body as empty so the method keeps its "never throws" contract.
            byte[] postData = Encoding.UTF8.GetBytes(value ?? string.Empty);
            try
            {
                HttpWebRequest myRequest = (HttpWebRequest)WebRequest.Create(url);
                myRequest.Method = "POST";
                myRequest.ContentType = "application/x-www-form-urlencoded";
                myRequest.ContentLength = postData.Length;

                // Close the request stream before asking for the response.
                using (Stream stream = myRequest.GetRequestStream())
                {
                    stream.Write(postData, 0, postData.Length);
                }

                using (HttpWebResponse myResponse = (HttpWebResponse)myRequest.GetResponse())
                {
                    if (myResponse.StatusCode != HttpStatusCode.OK)
                    {
                        return "失败:Status:" + myResponse.StatusCode.ToString();
                    }

                    using (StreamReader sr = new StreamReader(myResponse.GetResponseStream(), Encoding.UTF8))
                    {
                        return sr.ReadToEnd().Trim();
                    }
                }
            }
            catch (Exception ex)
            {
                return "失败:ex:" + ex.ToString();
            }
        }

下载Html

// Append the crawled HTML to a UTF-8 text file (second argument true = append).
// The using block closes the file even if Write throws.
using (StreamWriter sw = new StreamWriter("路径.txt", true, Encoding.GetEncoding("utf-8")))
{
    sw.Write("爬取的html字符串");
}

 xpath

http://www.cnblogs.com/zhaozhan/archive/2009/09/09/1563617.html

http://www.cnblogs.com/zhaozhan/archive/2009/09/09/1563679.html

http://www.cnblogs.com/zhaozhan/archive/2009/09/10/1563703.html

正则匹配

目前使用起来最好用的正则

<title>(?<html>[\s\S]+?)</title>  意思是:非贪婪地匹配 <title> 与 </title> 之间的任意字符(包括换行),并把匹配到的内容捕获到名为 html 的分组中

// Extract the page title: the named group "html" lazily captures everything between
// <title> and </title>. Groups["html"].Value is an empty string when nothing matches.
Regex reTitle = new Regex(@"<title>(?<html>[\s\S]+?)</title>");
string title = reTitle.Match(html).Groups["html"].Value;

多个选择

// Capture several named groups per match: each match is a run of adjacent <td> cells
// holding company, id, category, grade and date, in that order.
Regex rowPattern = new Regex(@"<td align=""left"">(?<company>[^<>]+)</td><td align=""center"">(?<id>[\dA-Z]+)</td><td align=""center"">(?<cat>[^<>]+)</td><td align=""center"">(?<grade>[A-Z]+)</td><td align=""center"">(?<date>[^\s&]*)");
foreach (Match row in rowPattern.Matches(strHtml))
{
    GroupCollection cells = row.Groups;
    string company = cells["company"].Value;
    string id = cells["id"].Value;
    // The category cell may contain &nbsp; entities; remove them.
    string category = cells["cat"].Value.Replace("&nbsp;", "");
    string grade = cells["grade"].Value;
    // The date group stops at the first whitespace character or '&'.
    string date = cells["date"].Value;
}

多线程

// Run Crawler 100 times on the task scheduler while throttling how many tasks are in flight.
// NOTE(review): the numeric literals were missing in the original; 0, 100 and 15 are restored
// from the accompanying comments ("15 threads", "100 threads") — confirm.
List<Task> taskList = new List<Task>();
TaskFactory taskFactory = new TaskFactory();
for (int i = 0; i < 100; i++)
{
    taskList.Add(taskFactory.StartNew(Crawler)); // create and start a task that runs Crawler, and track it
    if (taskList.Count > 15) // throttle: more than 15 tracked tasks
    {
        // Keep only tasks that are still running (IsCompleted is also true for canceled and faulted tasks),
        // then block until at least one of them finishes before starting more.
        taskList = taskList.Where(t => !t.IsCompleted).ToList();
        Task.WaitAny(taskList.ToArray());
    }
}
Task.WaitAll(taskList.ToArray()); // wait for the remaining tasks to finish
// The format string previously had no placeholder, so the timestamp argument was silently ignored.
Console.WriteLine("抓取全部完成 - {0} -", DateTime.Now);

该文档只是自己记录,纯属记事本

最新文章

  1. Python之路Day21-自定义分页和cookie
  2. lua学习例子
  3. scroll事件实现监控滚动条并分页显示示例(zepto.js)
  4. 表单form的enctype="multipart/form-data"使用疑惑
  5. Ecshop 单选按钮组功能 颜色多选
  6. 基于NodeJS的全栈式开发
  7. HTML表单元素Emil和密码
  8. Mongodb 创建索引
  9. linux设备驱动归纳总结(二):模块的相关基础概念【转】
  10. Spring的5种通知
  11. IOS学习教程
  12. 必须知道的ADO.NET 数据库连接池
  13. Qt---Xml文件解析
  14. Activity内切换fragment实现底部菜单切换遇到的坑
  15. Springboot admin 发送邮件失败:com.sun.mail.smtp.SMTPSenderFailedException: 553 Mail from must equal authorized user
  16. shell 关于路径查询显示pwd
  17. php调用API支付接口 转至http://www.cnblogs.com/chaochao00o/p/6490463.html
  18. javascript 常用的正则表达式验证表单
  19. 用开源项目FlipImageView实现图片的翻转效果
  20. 用css3实现社交分享按钮

热门文章

  1. Grunt实战 --- 通过nodejs和Grunt实现项目在线构建
  2. 我的Android进阶之旅------>Android实现音乐示波器、均衡器、重低音和音场功能
  3. 怎样做大做强企业中的ERP?
  4. 关于TensorFlow若干问题的汇总
  5. Machine Learning No.9: Dimensionality reduction
  6. git设置只允许特定类型的文件
  7. BZOJ 4435 [Cerc2015]Juice Junctions 分治最小割+hash
  8. 磁卡ID卡IC卡的区别【转】
  9. POJ3261 Milk Patterns —— 后缀数组 出现k次且可重叠的最长子串
  10. 1 准备学习redis