Recently I have been working on ship recognition and need a large number of positive samples to train an AdaBoost classifier.

So I turned to the marinetraffic site to download ship photos. Writing a crawler to download them automatically is clearly much more convenient.

Site characteristics

Before introducing the crawler, here are a few characteristics of the marinetraffic site:

1. The site periodically checks for crawler behavior. If it decides that a crawler is downloading images in bulk, it blacklists that connection and downloads are blocked for the next few days.

2. Photo coverage varies widely between ships: some ships have more than 1000 photos, others have none at all. We need many photos of many different ships, so the ships to download have to be prioritized.

3. The positive samples used to train the classifier must show the target at the same resolution. When downloading from marinetraffic you can set the width of the downloaded image, and the site generates the corresponding height from the aspect ratio, so different images end up with different heights and need post-processing.

Solutions

  1. Against crawler detection: add a random wait of roughly 10 seconds between requests, which is enough to get past the site's crawler-behavior check.
  2. Sort ships by photo count and download the ships with the most photos first, without taking too many photos per ship, so that the samples stay diverse.
  3. Use a uniform width for every download.

    Post-processing then crops ships of identical resolution out of the downloaded images (a sketch follows below).
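The article does not include that post-processing step, so here is a minimal sketch of what it could look like, under my own assumptions: a hypothetical helper CropShipSample takes a downloaded photo, the bounding box of the ship, and a fixed sample size (the 80×40 default, the method name, and the use of System.Drawing are all assumptions, not part of the original crawler), and returns a patch rescaled to that uniform resolution.

// Minimal post-processing sketch (assumption, not part of the original crawler):
// crop the ship region out of a downloaded photo and rescale it to a fixed
// resolution so that all positive samples have the same size.
using System.Drawing;
using System.Drawing.Drawing2D;

static class SampleCropper
{
    // srcPath:      path of a downloaded photo
    // shipBox:      region of the ship inside that photo
    // sampleWidth/sampleHeight: the uniform resolution required for training
    public static Bitmap CropShipSample(string srcPath, Rectangle shipBox,
                                        int sampleWidth = 80, int sampleHeight = 40)
    {
        using (var src = new Bitmap(srcPath))
        {
            var sample = new Bitmap(sampleWidth, sampleHeight);
            using (var g = Graphics.FromImage(sample))
            {
                g.InterpolationMode = InterpolationMode.HighQualityBicubic;
                // Draw the ship region into the fixed-size sample bitmap.
                g.DrawImage(src,
                            new Rectangle(0, 0, sampleWidth, sampleHeight),
                            shipBox,
                            GraphicsUnit.Pixel);
            }
            return sample;
        }
    }
}

The ship bounding box still has to come from somewhere (manual annotation or a separate detector); the sketch only shows the crop-and-rescale step.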

Crawler source code

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Runtime.Serialization.Formatters.Binary;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;

namespace 船仅仅图像爬虫
{
    class Program
    {
        // Collect ship ids from the listing pages, sorted by photo count (descending).
        static void download_all_shipid(List<string> shipid_list)
        {
            try
            {
                WebClient MyWebClient = new WebClient();
                MyWebClient.Headers["User-Agent"] = "blah";
                MyWebClient.Credentials = CredentialCache.DefaultCredentials; // credentials used to authenticate the request to the Internet resource

                // Example URLs:
                // http://www.marinetraffic.com/en/photos/of/ships/shipid:281519/
                // http://www.marinetraffic.com/en/ais/index/ships/all
                // http://www.marinetraffic.com/ais/index/ships/all/page:2/sort:COUNT_PHOTOS/direction:desc
                for (int pageNum = 1; pageNum < 100; pageNum++)
                {
                    Console.WriteLine("Analyzing listing page " + pageNum);
                    MyWebClient.Credentials = CredentialCache.DefaultCredentials;
                    MyWebClient.Headers["User-Agent"] = "blah";
                    try
                    {
                        // Download the listing page, sorted by number of photos.
                        Byte[] pageData = MyWebClient.DownloadData(@"http://www.marinetraffic.com/en/ais/index/ships/all/page:" + pageNum + "/sort:COUNT_PHOTOS/direction:desc/per_page:50");
                        //string pageHtml = Encoding.Default.GetString(pageData); // use this line if the page is GB2312-encoded
                        string pageHtml = Encoding.UTF8.GetString(pageData);      // use this line if the page is UTF-8-encoded

                        // Scan the HTML for every "shipid:<id>/" occurrence.
                        int urlindex = -1;
                        string org_label = "shipid:";
                        urlindex = pageHtml.IndexOf(org_label, urlindex + 1);
                        while (urlindex != -1)
                        {
                            int endOfUrl = pageHtml.IndexOf("/", urlindex + org_label.Length);
                            string shipid = pageHtml.Substring(urlindex + org_label.Length, endOfUrl - urlindex - org_label.Length);
                            if (!shipid_list.Contains(shipid))
                            {
                                Console.WriteLine("New id: " + shipid);
                                shipid_list.Add(shipid);
                            }
                            urlindex = pageHtml.IndexOf(org_label, urlindex + 1);
                        }

                        // Optionally save the page for inspection:
                        //using (StreamWriter sw = new StreamWriter("ouput.html"))
                        //{
                        //    sw.Write(pageHtml);
                        //}
                        Console.WriteLine("Finished analyzing page " + pageNum);
                    }
                    catch (WebException webEx)
                    {
                        Console.WriteLine(webEx.Message.ToString());
                    }

                    // Wait a random 10-19 seconds before the next request to get past the crawler-behavior check.
                    Console.Write("Waiting to avoid the site's crawler detection......");
                    Random rd = new Random();
                    int time_sleep = rd.Next() % 10 + 10;
                    Thread.Sleep(time_sleep * 1000);
                    Console.WriteLine();
                }
                Console.WriteLine("Analysis finished");

                // Serialize the ship id list to a file so it can be reloaded later.
                string file = @"C:\Users\dragonfive\Desktop\爬虫获得船仅仅图片\第三批\0_100page_shipid.txt";
                using (FileStream fsWriter = new FileStream(file, FileMode.OpenOrCreate, FileAccess.Write))
                {
                    BinaryFormatter bf = new BinaryFormatter();
                    bf.Serialize(fsWriter, shipid_list);
                }
            }
            catch (WebException webEx)
            {
                Console.WriteLine(webEx.Message.ToString());
            }
        }

        /// <summary>
        /// Download all photos of the ship with the given ship_id.
        /// </summary>
        /// <param name="ship_id"></param>
        static void download_jpg(string ship_id)
        {
            try
            {
                Console.WriteLine("Start downloading photos for shipid: " + ship_id);
                WebClient MyWebClient = new WebClient();
                MyWebClient.Credentials = CredentialCache.DefaultCredentials; // credentials used to authenticate the request to the Internet resource
                MyWebClient.Headers["User-Agent"] = "blah";

                // Example URLs:
                // http://www.marinetraffic.com/en/photos/of/ships/shipid:281519/
                // http://www.marinetraffic.com/en/photos/of/ships/shipid:371668/per_page:1000/page:1
                Byte[] pageData = MyWebClient.DownloadData(@"http://www.marinetraffic.com/en/photos/of/ships/shipid:" + ship_id + @"/per_page:100/page:1");
                //string pageHtml = Encoding.Default.GetString(pageData); // use this line if the page is GB2312-encoded
                string pageHtml = Encoding.UTF8.GetString(pageData);      // use this line if the page is UTF-8-encoded
                Console.WriteLine("Photo listing page downloaded");

                // Scan the HTML for every data-original='<image url>' attribute.
                int urlindex = -1;
                string org_label = "data-original='";
                urlindex = pageHtml.IndexOf(org_label, urlindex + 1);
                int i = 0;
                while (urlindex != -1)
                {
                    int endOfUrl = pageHtml.IndexOf("'", urlindex + org_label.Length);
                    string url = pageHtml.Substring(urlindex + org_label.Length, endOfUrl - urlindex - org_label.Length);

                    // Wait a random 10-19 seconds before each image download to get past the crawler-behavior check.
                    Console.Write("Waiting to avoid the site's crawler detection......");
                    Random rd = new Random();
                    int time_sleep = rd.Next() % 10 + 10;
                    Thread.Sleep(time_sleep * 1000);
                    Console.WriteLine();

                    try
                    {
                        // Download the image and save it to disk.
                        Console.WriteLine(url);
                        MyWebClient.Credentials = CredentialCache.DefaultCredentials;
                        MyWebClient.Headers["User-Agent"] = "blah";
                        Byte[] jpgdata = MyWebClient.DownloadData(url);
                        using (FileStream fs = new FileStream(@"C:\Users\dragonfive\Desktop\爬虫获得船仅仅图片\第三批\" + ship_id + "_" + i + ".jpg", FileMode.OpenOrCreate, FileAccess.Write))
                        {
                            fs.Write(jpgdata, 0, jpgdata.Length);
                        }
                    }
                    catch (WebException webEx)
                    {
                        Console.WriteLine("Caught an exception while downloading:");
                        Console.WriteLine(webEx.Message.ToString());
                    }
                    Console.WriteLine("Downloaded image " + (i++));
                    urlindex = pageHtml.IndexOf(org_label, urlindex + 1);
                }

                Console.WriteLine("*****************************************");
                Console.WriteLine("Downloaded " + i + " images for ship_id " + ship_id);
                Console.WriteLine("*****************************************");
                //Console.ReadLine(); // keep the console open, otherwise it closes immediately
            }
            catch (WebException webEx)
            {
                Console.WriteLine(webEx.Message.ToString());
            }
        }

        static void Main(string[] args)
        {
            List<string> shipid_list = new List<string>();
            //shipid_list.Add("371681"); // use a single id when images are needed quickly

            download_all_shipid(shipid_list);

            // Alternatively, reload a previously saved ship id list instead of re-crawling:
            //string file = @"C:\Users\dragonfive\Desktop\爬虫获得船仅仅图片\第三批\0_100page_shipid.txt";
            //using (FileStream fsReader = new FileStream(file, FileMode.Open, FileAccess.Read))
            //{
            //    // Deserialize the list saved by download_all_shipid.
            //    BinaryFormatter bf = new BinaryFormatter();
            //    shipid_list = (List<string>)bf.Deserialize(fsReader);
            //    Console.WriteLine("Loaded " + shipid_list.Count + " ship ids");
            //}

            // Ids that have already been downloaded can be removed first:
            ////371652 371668 371681 1252401
            //shipid_list.Remove("371652");
            //shipid_list.Remove("371668");
            //shipid_list.Remove("371681");
            //shipid_list.Remove("1252401");
            ////132264
            //shipid_list.Remove("371077");
            //shipid_list.Remove("132264");
            //shipid_list.Remove("224871");
            //shipid_list.Remove("279923");
            //shipid_list.Remove("369163");
            //shipid_list.Remove("266342");
            //shipid_list.Remove("371216");
            //shipid_list.Remove("368174");
            //shipid_list.Remove("369163");

            foreach (var ship_id in shipid_list)
            {
                download_jpg(ship_id);
            }
            Console.ReadLine(); // keep the console open, otherwise it closes immediately
        }
    }
}
