
  WebDriver 与 PhantomJS 组合使用即可完成此任务。简单介绍一下:WebDriver 是一个前端自动化测试框架;PhantomJS 是一个基于 WebKit 的无界面(headless)浏览器。WebDriver 通过调用 phantomjs.exe 来工作。下面是 WebDriver 提供的 API,可以看出它能驱动多种浏览器工作。



在 NuGet 上下载 Selenium.WebDriver 和 Selenium.PhantomJS.WebDriver 两个包,在项目中引用 WebDriver.dll,并确保输出目录下有 phantomjs.exe。



using OpenQA.Selenium;
using OpenQA.Selenium.PhantomJS;
using OpenQA.Selenium.Support.UI;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

namespace ConsoleApplication1
{
    /// <summary>
    /// Contract for a crawler that loads a single page and reports progress through events.
    /// </summary>
    public interface ICrawler
    {
        /// <summary>Raised before the page is requested.</summary>
        event EventHandler<OnStartEventArgs> OnStart;

        /// <summary>Raised after the page has been loaded and all requested steps have run.</summary>
        event EventHandler<OnCompletedEvent> OnCompleted;

        /// <summary>Raised when loading or processing the page throws.</summary>
        event EventHandler<OnErrorEventArgs> OnError;

        /// <summary>
        /// Starts crawling <paramref name="uri"/> and returns the task that performs the work.
        /// </summary>
        /// <param name="uri">Address of the page to load.</param>
        /// <param name="script">Optional JavaScript to execute after navigation.</param>
        /// <param name="opreation">
        /// Optional action/wait-condition pair. The parameter name keeps its original
        /// spelling so existing named-argument call sites continue to compile.
        /// </param>
        Task Start(Uri uri, Script script, Operation opreation);
    }

    /// <summary>
    /// Steps to run against the browser once the page has been requested.
    /// </summary>
    public class Operation
    {
        /// <summary>Optional callback invoked with the driver after navigation.</summary>
        public Action<PhantomJSDriver> Action;

        /// <summary>Optional condition polled until it returns true or the timeout elapses.</summary>
        public Func<IWebDriver, bool> Condition;

        /// <summary>Maximum time to wait for <see cref="Condition"/>, in milliseconds.</summary>
        public int timeout { get; set; }
    }

    /// <summary>
    /// A JavaScript snippet and the arguments handed to it.
    /// </summary>
    public class Script
    {
        /// <summary>JavaScript source passed to the driver's ExecuteScript.</summary>
        public string Code { set; get; }

        /// <summary>Arguments passed to ExecuteScript together with <see cref="Code"/>.</summary>
        public object[] Args { set; get; }
    }

    /// <summary>
    /// Event data for <see cref="ICrawler.OnStart"/>.
    /// </summary>
    public class OnStartEventArgs : EventArgs
    {
        /// <summary>Address that is about to be crawled.</summary>
        public Uri Uri { set; get; }

        public OnStartEventArgs(Uri uri)
        {
            this.Uri = uri;
        }
    }

    /// <summary>
    /// Event data for <see cref="ICrawler.OnError"/>.
    /// </summary>
    public class OnErrorEventArgs : EventArgs
    {
        /// <summary>Address that was being crawled when the error occurred.</summary>
        public Uri Uri { set; get; }

        /// <summary>The exception that was caught.</summary>
        public Exception Exception { set; get; }

        public OnErrorEventArgs(Uri uri, Exception ex)
        {
            this.Uri = uri;
            this.Exception = ex;
        }
    }

    /// <summary>
    /// Event data for <see cref="ICrawler.OnCompleted"/>.
    /// </summary>
    public class OnCompletedEvent : EventArgs
    {
        /// <summary>Address that was crawled.</summary>
        public Uri Uri { set; get; }

        /// <summary>Managed thread id of the thread that performed the crawl.</summary>
        public int ThreadId { set; get; }

        /// <summary>Page source as reported by the driver after all steps ran.</summary>
        public string PageSource { get; private set; }

        /// <summary>Total elapsed time of the crawl, in milliseconds.</summary>
        public long Milliseconds { get; private set; }

        /// <summary>The driver that loaded the page, so handlers can query the live DOM.</summary>
        public PhantomJSDriver Driver { get; private set; }

        public OnCompletedEvent(Uri uri, int threadId, string pageSource, long milliseconds, PhantomJSDriver driver)
        {
            this.Uri = uri;
            this.ThreadId = threadId;
            this.PageSource = pageSource;
            this.Milliseconds = milliseconds;
            this.Driver = driver;
        }
    }

    /// <summary>
    /// <see cref="ICrawler"/> implementation that loads pages with PhantomJS through Selenium WebDriver.
    /// </summary>
    public class HighCrawler : ICrawler
    {
        public event EventHandler<OnStartEventArgs> OnStart;
        public event EventHandler<OnCompletedEvent> OnCompleted;
        public event EventHandler<OnErrorEventArgs> OnError;

        // Shared by every driver created by this class.
        private static readonly PhantomJSOptions _options;
        private static readonly PhantomJSDriverService _service;

        static HighCrawler()
        {
            var service = PhantomJSDriverService.CreateDefaultService();
            service.DiskCache = true;
            service.IgnoreSslErrors = true;
            service.HideCommandPromptWindow = true;
            service.LoadImages = false;
            service.LocalToRemoteUrlAccess = true;

            _service = service;
            _options = new PhantomJSOptions();
        }

        /// <summary>
        /// Loads <paramref name="uri"/> on a background task, optionally runs a script, an action
        /// and a wait condition, then raises <see cref="OnCompleted"/>.
        /// </summary>
        /// <param name="uri">Address of the page to load.</param>
        /// <param name="script">Optional JavaScript to execute after navigation; may be null.</param>
        /// <param name="operation">Optional action and wait condition; may be null.</param>
        /// <returns>The task performing the crawl. Exceptions thrown while crawling are reported
        /// through <see cref="OnError"/> rather than faulting the task.</returns>
        public Task Start(Uri uri, Script script, Operation operation)
        {
            return Task.Factory.StartNew(() =>
            {
                // Snapshot each event before invoking it so an unsubscribe between the
                // null check and the call cannot cause a NullReferenceException.
                var onStart = this.OnStart;
                if (onStart != null)
                {
                    onStart(this, new OnStartEventArgs(uri));
                }

                try
                {
                    // NOTE(review): the driver is never quit/disposed here. Handlers receive it via
                    // OnCompletedEvent.Driver, and it is built on the shared static service, so
                    // confirm the intended lifetime before adding cleanup.
                    var driver = new PhantomJSDriver(_service, _options);

                    // Stopwatch gives the total elapsed time; TimeSpan.Milliseconds would only
                    // return the 0-999 millisecond component of the interval.
                    var watch = System.Diagnostics.Stopwatch.StartNew();
                    driver.Navigate().GoToUrl(uri.ToString());

                    if (script != null)
                    {
                        driver.ExecuteScript(script.Code, script.Args);
                    }

                    if (operation != null)
                    {
                        if (operation.Action != null)
                        {
                            operation.Action.Invoke(driver);
                        }

                        if (operation.Condition != null)
                        {
                            // Wait at most operation.timeout milliseconds for the condition.
                            var driverWait = new WebDriverWait(driver, TimeSpan.FromMilliseconds(operation.timeout));
                            driverWait.Until(operation.Condition);
                        }
                    }

                    var threadId = Thread.CurrentThread.ManagedThreadId;
                    var milliseconds = watch.ElapsedMilliseconds;
                    var pageSource = driver.PageSource;

                    var onCompleted = this.OnCompleted;
                    if (onCompleted != null)
                    {
                        onCompleted(this, new OnCompletedEvent(uri, threadId, pageSource, milliseconds, driver));
                    }
                }
                catch (Exception ex)
                {
                    var onError = this.OnError;
                    if (onError != null)
                    {
                        onError(this, new OnErrorEventArgs(uri, ex));
                    }
                }
            });
        }
    }
}


        /// <summary>
        /// Crawls a page with <c>HighCrawler</c>, waits for a given element to be displayed and
        /// then queries the loaded page with an XPath expression, logging progress to the console.
        /// </summary>
        /// <param name="url">Address of the page to crawl.</param>
        /// <param name="waitId">Id of the element to wait for, e.g. "search-main".</param>
        /// <param name="xpath">XPath used to select the result elements, e.g.
        /// "//div[@class=\"article panel article-result\"]//h5[@class=\"title\"]//a".</param>
        /// <param name="timeout">Value assigned to <c>Operation.timeout</c>, which HighCrawler
        /// interprets as milliseconds.</param>
        // NOTE(review): the default for `timeout` was lost from the original text ("int timeout = )");
        // 10000 is an assumed replacement - confirm against the intended value.
        private static void TestWaitForReady(string url, string waitId, string xpath, int timeout = 10000)
        {
            var crawler = new HighCrawler();

            crawler.OnStart += (s, e) =>
            {
                Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
            };

            crawler.OnError += (s, e) =>
            {
                Console.WriteLine("爬虫出现错误:" + e.Uri.ToString() + ",异常信息" + e.Exception.ToString());
            };

            crawler.OnCompleted += (s, e) =>
            {
                Console.WriteLine("接收到的源码长度:" + e.PageSource.Length);

                // NOTE(review): the original Thread.Sleep argument was lost ("Thread.Sleep();");
                // a 1000 ms pause is an assumed replacement - confirm or remove.
                Thread.Sleep(1000);
                Console.WriteLine("爬虫结束,花费时间:" + e.Milliseconds);

                var items = e.Driver.FindElements(By.XPath(xpath));
                foreach (var item in items)
                {
                    // NOTE(review): the loop body was missing from the original text; printing the
                    // element text is an assumed reconstruction - confirm.
                    Console.WriteLine(item.Text);
                }
            };

            var operation = new Operation
            {
                // No extra browser interaction is needed before waiting.
                Action = (x) =>
                {
                },
                // The page counts as ready once the element with id `waitId` is displayed.
                Condition = (x) =>
                {
                    return x.FindElement(By.Id(waitId)).Displayed;
                },
                timeout = timeout
            };

            // The returned task is not awaited; results arrive through the event handlers above.
            crawler.Start(new Uri(url), null, operation);
        }




