




[Code 2.1.1]

 using System.Crawler;

#region Request data with GET
// WorkerId is just a label derived from the current time of day.
// The hash is widened to long before Math.Abs because Math.Abs(int) throws
// OverflowException when the hash happens to be int.MinValue.
var ant = new WorkerAnt
{
    WorkerId = (uint)Math.Abs((long)DateTime.Now.ToString("HHmmssfff").GetHashCode()),
};
var job = new JobContext
{
    JobName = "Mike test job 1",
    Uri = @"https://www.cnblogs.com/mikecheers/p/12090487.html",
};
ant.Work(job);
#endregion

#region Request data with POST
// Build a small C# program as text; it is sent url-encoded as the "code" form field.
var requestDataBuilder = new StringBuilder();
requestDataBuilder.AppendLine("using System;");
requestDataBuilder.AppendLine("namespace HelloWorldApplication");
requestDataBuilder.AppendLine("{");
requestDataBuilder.AppendLine(" class HelloWorld");
requestDataBuilder.AppendLine(" {");
requestDataBuilder.AppendLine(" static void Main(string[] args)");
requestDataBuilder.AppendLine(" {");
requestDataBuilder.AppendLine(" Console.WriteLine(\"《C# 爬虫 破境之道》\");");
requestDataBuilder.AppendLine(" }");
requestDataBuilder.AppendLine(" }");
requestDataBuilder.AppendLine("}");

var requestData = Encoding.UTF8.GetBytes(
    @"code=" + System.Web.HttpUtility.UrlEncode(requestDataBuilder.ToString())
    + @"&token=4381fe197827ec87cbac9552f14ec62a&language=10&fileext=cs");

new WorkerAnt
{
    WorkerId = (uint)Math.Abs((long)DateTime.Now.ToString("HHmmssfff").GetHashCode())
}.Work(new JobContext
{
    JobName = "Mike test job 2",
    Uri = @"https://tool.runoob.com/compile.php",
    ContentType = @"application/x-www-form-urlencoded; charset=UTF-8",
    Method = WebRequestMethods.Http.Post,
    Buffer = requestData,
});
#endregion

// Work() only starts the request; the response is handled later in asynchronous callbacks.
// The host program must therefore stay alive (for example by waiting on Console.ReadLine())
// until both workers have reported their final JobStatus.
Console.WriteLine("End of Main method.");


[Code 2.1.2]

Worker 471365603 JobStatus: WaitingForActivation
Worker 471365603 is starting a job named 'Mike test job 1'.
Worker 471365603 JobStatus: WaitingToRun
Worker 471365603 JobStatus: Running
Worker 1110506678 JobStatus: WaitingForActivation
Worker 1110506678 is starting a job named 'Mike test job 2'.
Worker 1110506678 JobStatus: WaitingToRun
Worker 1110506678 JobStatus: Running
End of Main method.
Totally 0 downloaded.
Totally 512 downloaded.
Totally 1024 downloaded.
Totally 1536 downloaded.
Totally 2048 downloaded.
Totally 2560 downloaded.
Totally 2624 downloaded.
Totally 3136 downloaded.
Totally 3648 downloaded.
Totally 4024 downloaded.
Totally 4028 downloaded.
Totally 4540 downloaded.
Totally 5052 downloaded.
Totally 5422 downloaded.
Totally 5934 downloaded.
Totally 6446 downloaded.
Totally 6822 downloaded.
Totally 7334 downloaded.
Totally 7846 downloaded.
Totally 8222 downloaded.
Totally 8734 downloaded.
Totally 9246 downloaded.
Totally 9758 downloaded.
Totally 10270 downloaded.
Totally 10782 downloaded.
Totally 10886 downloaded.
<!DOCTYPE html>
<html lang="zh-cn">
<meta charset="utf-8" />
<meta name="v...
/* ********************** using 000.75ms / request ******************** */

Worker 471365603 JobStatus: RanToCompletion
Totally 0 downloaded.
Totally 81 downloaded.
{"output":"\u300aC# \u722c\u866b \u7834\u5883\u4e4b\u9053\u300b\n","errors":"\n"}
/* ********************** using 012.32ms / request ******************** */

Worker 1110506678 JobStatus: RanToCompletion


在[Code 2.1.1]中,主要涉及两个类:

  • WorkerAnt(工蚁):在我的爬虫世界里,工蚁就是最小的工作单位了,它们勤勤恳恳,辛勤劳作,任劳任怨,颇为辛苦!致敬!
  • JobContext(任务上下文):主要承载了任务的描述(参数信息),任务的状态信息,一个计时器(统计任务消耗的时间),还有Request相关的对象、缓存等。


在[Code 2.1.2]中,我们可以跟踪到每只小工蚁的运行状态,采集到的数据以及耗时。同时也可以看到“End of Main method.”出现在比较靠前的位置,这说明我们的小工蚁还是有点儿小聪明的,可以采用异步的方式采集数据。


[Code 2.1.3]

 namespace System.Crawler
{
    using System;
    using System.Diagnostics;
    using System.IO;
    using System.Net;
    using System.Security.Cryptography.X509Certificates;
    using System.Text;
    using System.Threading.Tasks;

    /// <summary>
    /// Describes one crawl job: the request parameters, the job's current status,
    /// a stopwatch for timing, and the request/response objects and buffers used while it runs.
    /// </summary>
    public class JobContext
    {
        /// <summary>
        /// Name of the job.
        /// </summary>
        public String JobName { get; set; }

        /// <summary>
        /// Current status of the job.
        /// </summary>
        public TaskStatus JobStatus { get; set; }

        /// <summary>
        /// Stopwatch used to time the job.
        /// </summary>
        public Stopwatch Watch { get; set; }

        /// <summary>
        /// The request object created for this job.
        /// </summary>
        public WebRequest Request { get; set; }

        /// <summary>
        /// The response received for <see cref="Request"/>.
        /// </summary>
        public WebResponse Response { get; set; }

        /// <summary>
        /// Stream the request body is written to.
        /// </summary>
        public Stream RequestStream { get; set; }

        /// <summary>
        /// Stream the response body is read from.
        /// </summary>
        public Stream ResponseStream { get; set; }

        /// <summary>
        /// In-memory stream that accumulates the downloaded response body.
        /// </summary>
        public MemoryStream Memory { get; set; }

        /// <summary>
        /// Request body to send. When it is non-empty, WorkerAnt writes it to the request stream;
        /// WorkerAnt also uses this array as the scratch buffer while reading the response.
        /// </summary>
        public byte[] Buffer { get; set; }

        /// <summary>
        /// Target URI of the request.
        /// </summary>
        public String Uri { get; set; }

        /// <summary>
        /// X.509 client certificates to attach to the request.
        /// </summary>
        public X509CertificateCollection ClientCertificates { get; set; }

        /// <summary>
        /// Additional request headers.
        /// </summary>
        public WebHeaderCollection Headers { get; set; }

        /// <summary>
        /// Proxy to use for the request.
        /// </summary>
        public IWebProxy Proxy { get; set; }

        /// <summary>
        /// Credentials used for authentication.
        /// </summary>
        public ICredentials Credentials { get; set; }

        /// <summary>
        /// HTTP version used for the request. Defaults to <see cref="System.Net.HttpVersion.Version11"/>.
        /// </summary>
        public Version ProtocolVersion { get; set; }

        /// <summary>
        /// Whether 100-Continue behaviour is used: true if POST requests expect a
        /// 100-Continue response, otherwise false. Defaults to true.
        /// </summary>
        public bool Expect100Continue { get; set; }

        /// <summary>
        /// HTTP method of the request. Defaults to GET.
        /// </summary>
        public String Method { get; set; }

        /// <summary>
        /// Time-out for System.Net.HttpWebRequest.GetResponse() and
        /// System.Net.HttpWebRequest.GetRequestStream(). Defaults to 100 seconds
        /// (the framework default of 100,000 milliseconds).
        /// </summary>
        public TimeSpan Timeout { get; set; }

        /// <summary>
        /// Time-out for writing to or reading from a stream. Defaults to 5 minutes
        /// (the framework default of 300,000 milliseconds).
        /// </summary>
        public TimeSpan ReadWriteTimeout { get; set; }

        /// <summary>
        /// Value of the Accept HTTP header. Defaults to null.
        /// </summary>
        public string Accept { get; set; }

        /// <summary>
        /// Value of the Content-type HTTP header. Defaults to null.
        /// </summary>
        public string ContentType { get; set; }

        /// <summary>
        /// Value of the User-agent HTTP header. Defaults to null.
        /// </summary>
        public string UserAgent { get; set; }

        /// <summary>
        /// Encoding of the returned data. Defaults to null.
        /// NOTE(review): the original comment says null means the encoding is detected automatically
        /// (typically utf-8, gbk or gb2312); the WorkerAnt listing always decodes as UTF-8 - confirm.
        /// </summary>
        public Encoding Encoding { get; set; }

        /// <summary>
        /// Raw Cookie header value to send with the request.
        /// </summary>
        public string Cookie { get; set; }

        /// <summary>
        /// Cookies to send with the request through a cookie container.
        /// </summary>
        public CookieCollection Cookies { get; set; }

        /// <summary>
        /// Referer (source address) of the request.
        /// </summary>
        public string Referer { get; set; }

        /// <summary>
        /// Whether redirects are followed automatically. Defaults to true.
        /// </summary>
        public bool AllowAutoRedirect { get; set; }

        /// <summary>
        /// Maximum number of connections.
        /// </summary>
        public int ConnectionLimit { get; set; }

        /// <summary>
        /// Creates a context with the defaults described on each property.
        /// </summary>
        public JobContext()
        {
            Uri = null;
            ClientCertificates = null;
            Headers = null;
            Proxy = null;
            ProtocolVersion = System.Net.HttpVersion.Version11;
            Expect100Continue = true;
            Method = WebRequestMethods.Http.Get;
            // Same values as the HttpWebRequest defaults (100 s and 5 min).
            Timeout = TimeSpan.FromSeconds(100);
            ReadWriteTimeout = TimeSpan.FromMinutes(5);
            Accept = null;
            ContentType = null;
            UserAgent = null;
            Encoding = null;
            Cookie = null;
            Cookies = null;
            Referer = null;
            AllowAutoRedirect = true;
            // NOTE(review): the literal was missing from the listing; 100 is an assumed value - confirm.
            ConnectionLimit = 100;
            Credentials = null;
        }
    }
}




[Code 2.1.4]

 namespace System.Crawler
{
    using System;
    using System.Diagnostics;
    using System.IO;
    using System.Net;
    using System.Net.Security;
    using System.Security.Cryptography.X509Certificates;
    using System.Text;
    using System.Threading.Tasks;

    /// <summary>
    /// The smallest unit of work of the crawler, a "worker ant": it sends the single HTTP request
    /// described by a <see cref="JobContext"/>, downloads the response and prints progress to the console.
    /// </summary>
    public class WorkerAnt
    {
        /// <summary>Size of the read buffer allocated when the context does not supply a usable one.</summary>
        private const int ReadBufferSize = 512;

        /// <summary>
        /// Maximum number of characters of the response echoed to the console.
        /// NOTE(review): the literal was missing from the listing; 100 is an assumed value - confirm.
        /// </summary>
        private const int PreviewLength = 100;

        /// <summary>
        /// Identifier of this worker; it prefixes every status line the worker prints.
        /// </summary>
        public UInt32 WorkerId { get; set; }

        /// <summary>
        /// Validates the job, builds the HTTP request from <paramref name="context"/> and starts it.
        /// The method returns as soon as the request has been started; the response is processed
        /// in asynchronous callbacks.
        /// </summary>
        /// <param name="context">Description and state holder of the job to run.</param>
        /// <exception cref="ArgumentNullException"><paramref name="context"/> or its Method is null.</exception>
        /// <exception cref="FormatException">The Uri of <paramref name="context"/> is null or not a well-formed absolute URI.</exception>
        public void Work(JobContext context)
        {
            // Check for null before touching the context; the status update below dereferences it.
            if (null == context)
            {
                throw new ArgumentNullException(nameof(context), $"Worker { WorkerId } can not start a job with no context.");
            }

            Console.WriteLine($"Worker { WorkerId } JobStatus: " + (context.JobStatus = TaskStatus.WaitingForActivation).ToString());

            if (null == context.Method)
            {
                throw new ArgumentNullException(nameof(context), $"Worker { WorkerId } can not start a job with no method.");
            }

            // WebRequest.CreateHttp cannot build a request from a relative URI, so require an absolute one here.
            if (null == context.Uri || !Uri.IsWellFormedUriString(context.Uri, UriKind.Absolute))
            {
                throw new FormatException($"Worker { WorkerId } can not start a job with uri '{context.Uri}' is not well formed.");
            }

            if (string.IsNullOrEmpty(context.JobName))
            {
                Trace.WriteLine($"Worker {WorkerId} is starting a job with no name.");
            }
            else
            {
                Trace.WriteLine($"Worker {WorkerId} is starting a job named '{context.JobName}'.");
            }

            Console.WriteLine($"Worker { WorkerId } JobStatus: " + (context.JobStatus = TaskStatus.WaitingToRun).ToString());
            context.Watch = new Stopwatch();
            context.Watch.Start();

            bool hasClientCertificates = null != context.ClientCertificates && 0 < context.ClientCertificates.Count;

            // This must be set before the connection is created.
            // SECURITY NOTE: the callback accepts every server certificate and is process-wide
            // (ServicePointManager is static). Acceptable for a crawler, not for a browser-like program.
            if (hasClientCertificates)
            {
                ServicePointManager.ServerCertificateValidationCallback = (sender, certificate, chain, errors) => true;
            }

            HttpWebRequest request = WebRequest.CreateHttp(context.Uri);
            context.Request = request;

            if (hasClientCertificates)
            {
                foreach (X509Certificate item in context.ClientCertificates)
                {
                    request.ClientCertificates.Add(item);
                }
            }

            if (null != context.Headers && context.Headers.Count > 0)
            {
                request.Headers.Add(context.Headers);
            }

            // Assigned even when null: an explicit null replaces the request's non-null default proxy.
            request.Proxy = context.Proxy;

            if (null != context.ProtocolVersion)
            {
                request.ProtocolVersion = context.ProtocolVersion;
            }

            request.ServicePoint.Expect100Continue = context.Expect100Continue;
            request.Method = context.Method;
            request.Timeout = (Int32)context.Timeout.TotalMilliseconds;
            request.ReadWriteTimeout = (Int32)context.ReadWriteTimeout.TotalMilliseconds;
            request.Accept = context.Accept;
            request.ContentType = context.ContentType;
            request.UserAgent = context.UserAgent;

            if (!string.IsNullOrEmpty(context.Cookie))
            {
                request.Headers[HttpRequestHeader.Cookie] = context.Cookie;
            }

            if (null != context.Cookies)
            {
                request.CookieContainer = new CookieContainer();
                request.CookieContainer.Add(context.Cookies);
            }

            request.Referer = context.Referer;
            request.AllowAutoRedirect = context.AllowAutoRedirect;

            if (0 < context.ConnectionLimit)
            {
                request.ServicePoint.ConnectionLimit = context.ConnectionLimit;
            }

            Console.WriteLine($"Worker { WorkerId } JobStatus: " + (context.JobStatus = TaskStatus.Running).ToString());

            if (null != context.Buffer && 0 < context.Buffer.Length)
            {
                // There is a request body: write it first, then ask for the response.
                request.ContentLength = context.Buffer.Length;
                request.BeginGetRequestStream(acGetRequestStream =>
                {
                    var contextGetRequestStream = (JobContext)acGetRequestStream.AsyncState;
                    try
                    {
                        contextGetRequestStream.RequestStream = contextGetRequestStream.Request.EndGetRequestStream(acGetRequestStream);
                        contextGetRequestStream.RequestStream.BeginWrite(contextGetRequestStream.Buffer, 0, contextGetRequestStream.Buffer.Length, acWriteRequestStream =>
                        {
                            var contextWriteRequestStream = (JobContext)acWriteRequestStream.AsyncState;
                            try
                            {
                                contextWriteRequestStream.RequestStream.EndWrite(acWriteRequestStream);
                                contextWriteRequestStream.RequestStream.Close();
                                GetResponse(contextWriteRequestStream);
                            }
                            catch (Exception ex) when (ex is WebException || ex is IOException)
                            {
                                Fail(contextWriteRequestStream, ex);
                            }
                        }, contextGetRequestStream);
                    }
                    catch (Exception ex) when (ex is WebException || ex is IOException)
                    {
                        Fail(contextGetRequestStream, ex);
                    }
                }, context);
            }
            else
            {
                GetResponse(context);
            }
        }

        /// <summary>
        /// Starts receiving the response of the job's request. The callback downloads the body into
        /// the context's Memory stream, prints a preview and the elapsed time, and marks the job completed.
        /// </summary>
        /// <param name="context">The job whose Request has already been prepared (and sent, for POST).</param>
        private void GetResponse(JobContext context)
        {
            context.Request.BeginGetResponse(new AsyncCallback(acGetResponse =>
            {
                var contextGetResponse = (JobContext)acGetResponse.AsyncState;
                try
                {
                    using (contextGetResponse.Response = contextGetResponse.Request.EndGetResponse(acGetResponse))
                    using (contextGetResponse.ResponseStream = contextGetResponse.Response.GetResponseStream())
                    using (contextGetResponse.Memory = new MemoryStream())
                    {
                        var readCount = 0;

                        // An existing buffer (the request body of a POST) is reused as the read buffer.
                        // A zero-length buffer would end the loop at once, so it is replaced as well.
                        if (null == contextGetResponse.Buffer || 0 == contextGetResponse.Buffer.Length)
                        {
                            contextGetResponse.Buffer = new byte[ReadBufferSize];
                        }

                        IAsyncResult ar = null;
                        do
                        {
                            // Store what the previous read returned, then start the next read.
                            if (0 < readCount)
                            {
                                contextGetResponse.Memory.Write(contextGetResponse.Buffer, 0, readCount);
                            }

                            ar = contextGetResponse.ResponseStream.BeginRead(
                                contextGetResponse.Buffer, 0, contextGetResponse.Buffer.Length, null, contextGetResponse);
                            Console.WriteLine($"Totally {contextGetResponse.Memory.Length} downloaded.");
                        } while (0 < (readCount = contextGetResponse.ResponseStream.EndRead(ar)));

                        contextGetResponse.Request.Abort();
                        contextGetResponse.Buffer = null;

                        var content = new UTF8Encoding(false).GetString(contextGetResponse.Memory.ToArray());
                        Console.WriteLine(content.Length > PreviewLength ? content.Substring(0, PreviewLength) + "..." : content);

                        contextGetResponse.Watch.Stop();
                        // NOTE(review): the divisor literal was missing from the listing. One job issues
                        // exactly one request, so the elapsed milliseconds are reported undivided.
                        Console.WriteLine("/* ********************** using {0}ms / request ******************** */"
                            + Environment.NewLine + Environment.NewLine, contextGetResponse.Watch.Elapsed.TotalMilliseconds.ToString("000.00"));
                        Console.WriteLine($"Worker { WorkerId } JobStatus: " + (contextGetResponse.JobStatus = TaskStatus.RanToCompletion).ToString());
                    }
                }
                catch (Exception ex) when (ex is WebException || ex is IOException)
                {
                    Fail(contextGetResponse, ex);
                }
            }), context);
        }

        /// <summary>
        /// Records a network failure raised inside an asynchronous callback: stops the stopwatch,
        /// traces the error and marks the job as faulted. Without this, the exception would be
        /// unhandled on the callback thread.
        /// </summary>
        /// <param name="context">The job that failed.</param>
        /// <param name="error">The exception that ended the job.</param>
        private void Fail(JobContext context, Exception error)
        {
            if (null != context.Watch)
            {
                context.Watch.Stop();
            }

            Trace.WriteLine($"Worker {WorkerId} failed the job named '{context.JobName}': {error.Message}");
            Console.WriteLine($"Worker { WorkerId } JobStatus: " + (context.JobStatus = TaskStatus.Faulted).ToString());
        }
    }
}



  • WorkerId:给每只小工蚁分配一个编号,就像人的身份证一样,没准等它老了,也能凭此编号领个社保什么的……
  • Work(JobContext context):对小工蚁发号施令,要它去完成某采集任务。
  • GetResponse(JobContext context):处理收到的回复数据。


Work(JobContext context)方法



  • 当JobContext.Method="GET"的时候,就不能指定JobContext.Buffer,否则会抛出异常;
  • 提供了JobContext.Credentials对象,但对象的属性是否合法;
  • 提供了JobContext.Method,但它的值是否有效;
  • 提供了JobContext.Headers,但其值是否可以加入到Request.Headers中,因为有些Key是只能通过Request的属性指定的,请参考:WebHeaderCollection.IsRestricted 方法




  • ServicePointManager.ServerCertificateValidationCallback = (sender, certificate, chain, errors) => true; :这一句一定要写在创建连接的前面。使用回调的方法进行证书验证。作为爬虫,我们就不要为难自己去验证证书了,但如果我们做的是浏览器程序,那么,还是守点儿规矩吧:)
  • Request.Proxy = context.Proxy; :Proxy属性比较特殊,当我们Create一个Request后,这个属性的默认值并不是Null,这也导致我们在使用过程中,总感觉第一个请求的耗时很长,后面的请求就很快。作为优化的一种方式,就是不去判断JobContext.Proxy是不是Null,都给这个属性赋值,即使JobContext.Proxy是Null,也要明确地给Request.Proxy赋值。
  • Request.ProtocolVersion = context.ProtocolVersion; :ProtocolVersion属性的类型是Version,但在赋值的时候,不要自己随便new一个Version,而是使用System.Net.HttpVersion进行赋值,参考JobContext的构造函数。目前支持的值有Version10和Version11,分别对应HTTP 1.0和HTTP 1.1。
  • Request.Timeout与Request.ReadWriteTimeout的区别在于:Timeout是用来限制GetResponse和GetRequestStream的超时时间,通俗点理解就是Timeout是单次采集的整体(包括Socket.Connect()、Request和Response、读写流)超时时间,默认值是100秒,通常我们会把这个时间设置的短一点,比如5~8秒,但需要注意的是,MSDN上说它只对同步操作有效,异步操作是不起作用的,所以,JobContext中的初始值并没有修改,还是保留了原来的初始值,如果你的小虫子是同步方式工作的,那么就需要注意它们的意义了;ReadWriteTimeout则是限制实际数据流读写操作的时间,同理,也是同步有效,异步无效。
  • Request.ContinueTimeout:这个属性是在Framework 4.5加入的,它表示在接收到 100-Continue 之前要等待的超时(以毫秒为单位),如果在超时过期之前接收到 100-Continue 响应,则可以发送实体正文。如果我们配置了Request.ServicePoint.Expect100Continue = true,这个timeout就会起作用。因为100Continue的报文很小,所以这个超时的默认时间也很短,仅为350毫秒。这也算是Http协议的优化吧,本节结尾会贴一小段100Continue的报文,让大家有个直观的赶脚:)
  • 还有示例中没有列举的其他属性,包括在新版本中加入的属性,大家可以参考MSDN文档的相关主题。也可以直接在VS中跳转到HttpWebRequest的定义,了解您当前版本适用的属性、方法。



GetResponse(JobContext context)方法:






另外就是附上说好的100 Continue报文:(通过Wireshark捕获)

[Code 2.1.5]

>>> 1. 发送HTTP报头,包含Expect: 100-continue ----------------------------------------------
POST /xxxxxxxxxxxxxx.ashx HTTP/1.1
Content-Type: application/x-www-form-urlencoded
encode_key: Ha1P29PAhyzRRmBiBkTJ6Q==
Content-Length: 480
Expect: 100-continue
Connection: Keep-Alive

<<< 2. 收到100Continue ----------------------------------------------
HTTP/1.1 100 Continue
Content-Length: 0
Date: Sat, 21 Sep 2019 01:27:18 GMT
Server: WebSphere Application Server/8.0

>>> 3. 发送请求的数据实体 ----------------------------------------------
LloRmU0xleMjr8VibuqgDUvL9++cFpBDwtRt89fbWw2UsHjS1+cCPVmn0t9y4NysUZXAYIAlS5odowFdI/h5HAsSNk7jjaVsEK9dFseNfN+TaIIlwagFwvEEZ6tjZ0pF90hmq90iiHzH5ylDjuSfC3OJUpPrDEfAogcq/nRe8TwVRtVVSZ20RH5o0hDc/ibMSOBI/qVW+c1Ala2xfknQHi5RRGXSd3NauL9Bd0Oxk4lDIbGcWxVByoU9oZCeB8in4KdbjQtiHebigTRNiyS6lglZXY482ArxRq2Gourld/9F/gFhSCExiiBGkfwy6nzmdB66/JxBk4GYiO9fEfjamQAt3hPs8cE7zEDnPN25dVvpwhP66e3c81aUigOi6+P6634CyoSjMqyivy5p9SJsdFLeZueqH7QhZUAkR4+o4lyHVcdfs2FXlZnl23AWyBEMlcrwwzuGEYzLJqzkoxWVJ9KJP5qRbjQM

<<< 4. 收到 Response ----------------------------------------------
HTTP/1.1 200 OK
Content-Language: zh-CN
Set-Cookie: JSESSIONID=0000Hc6wSQjAvFXM1m2GbqKaRSE:-1; Path=/; HttpOnly
Transfer-Encoding: chunked
Date: Sat, 21 Sep 2019 01:27:18 GMT
Server: WebSphere Application Server/8.0
Expires: Thu, 01 Dec 1994 16:00:00 GMT
Cache-Control: no-cache="set-cookie, set-cookie2"

168

100 Continue 参考报文


喜欢本系列丛书的朋友,可以点击链接加入QQ交流群(994761602)【C# 破境之道】


