using AnfleCrawler.Common;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks; namespace AnfleCrawler.DataAnalyzer
{
/// <summary>
/// Analyzer for the qy.58.com company directory, seeded with the "caohejing" listing.
/// Listing pages are scanned for company links; each company page is parsed into a
/// <c>CompanyEntity</c> and saved through <c>Repository</c>.
/// </summary>
internal class Qy58 : AnalyzerBase
{
    // NOTE(review): the numeric depth literals were missing from the recovered source
    // (`PushUrl(url, )` and `case :`). The values below are reconstructed: the seed URL is a
    // listing page, listing pages push company links, and company links are handled by the
    // second case. Any two distinct values keep that routing consistent; 0/1 is an assumption
    // to be confirmed against the original project.
    private const int ListPageDepth = 0;
    private const int DetailPageDepth = 1;

    /// <summary>
    /// Initializes the base analyzer and queues the first listing page.
    /// </summary>
    /// <param name="crawler">Crawler that receives the seed URL.</param>
    public override void Init(PageCrawler crawler)
    {
        base.Init(crawler);

        var url = new Uri("http://qy.58.com/caohejing/pn1/?PGTID=14177711280840.45006677554920316&ClickID=1");
        // Example of a company detail URL:
        // http://qy.58.com/19583455460359/?PGTID=14177659184690.5166369006238447&ClickID=4
        crawler.PushUrl(url, ListPageDepth);
    }

    /// <summary>
    /// Processes one landed page according to its depth: listing pages queue company links,
    /// company pages are parsed and persisted.
    /// </summary>
    /// <param name="current">The page being analyzed; its Depth selects the branch.</param>
    protected override void AnalyzeInternal(PageLandEntity current)
    {
        var lander = Crawler.Lander;
        var pHandler = CreateContentHandler(current);
        switch (current.Depth)
        {
            case ListPageDepth:
                {
                    // HACK is not defined in this file; presumably inherited from AnalyzerBase.
                    pHandler.AjaxBlocks.Add(HACK);
                    var dom = lander.GetDocument(pHandler);
                    // Presumably follows the ".next" pager link; verify against DoPerPaging.
                    DoPerPaging(current, dom.DocumentNode, ".next");

                    foreach (var node in QueryNodes(dom.DocumentNode, ".compList a"))
                    {
                        var url = GetHref(node, current.Url);
                        Crawler.PushUrl(url, DetailPageDepth);
                    }
                }
                break;
            case DetailPageDepth:
                {
                    var dom = lander.GetDocument(pHandler);
                    var attr = new AttributeFiller();
                    attr.Append("Name:{0}", QueryTexts(dom.DocumentNode, ".compT").First());

                    // NOTE(review): the Skip count was missing from the recovered source
                    // (`.Skip()`); 1 is an assumption (skip the first header cell) - confirm.
                    foreach (var th in QueryNodes(dom.DocumentNode, ".basicMsg table th").Skip(1))
                    {
                        string sTh = th.InnerText, sTd;
                        switch (sTh)
                        {
                            case "联系电话":
                            case "邮箱":
                                {
                                    // These two values are read from an <img> in the sibling
                                    // cell, so the image is downloaded and passed to OCR.
                                    var iNode = QueryNode(th.NextSibling, "img");
                                    var imgUrl = GetHref(iNode, current.Url, attrName: "src");
                                    byte[] imgRaw;
                                    using (var client = new System.Net.WebClient())
                                    {
                                        imgRaw = client.DownloadData(imgUrl);
                                    }
                                    // The stream must stay open for the lifetime of the Bitmap,
                                    // so it is disposed only after the bitmap.
                                    using (var imgStream = new System.IO.MemoryStream(imgRaw))
                                    using (var img = new System.Drawing.Bitmap(imgStream))
                                    {
                                        sTd = OCR(img);
                                    }
                                }
                                break;
                            case "公司地址":
                                sTd = QueryTexts(th.NextSibling, "span").First();
                                break;
                            default:
                                sTd = th.NextSibling.InnerText.HtmlTrim();
                                break;
                        }
                        attr.Append("{0}:{1}", sTh, sTd);
                    }

                    var bo = new CompanyEntity();
                    bo.City = "上海";
                    bo.GroupName = "漕河泾企业";
                    bo.PageUrl = current.Url.OriginalString;
                    bo.UpdateDate = DateTime.Now;
                    // Maps the page's header labels to CompanyEntity property names.
                    attr.FillEntity(bo, new Dictionary<string, string>()
                    {
                        {"公司性质", "Nature"},
                        {"公司行业", "Industry"},
                        {"公司规模", "Scale"},
                        {"联系人", "ContactPerson"},
                        {"企业网址", "Website"},
                        {"联系电话", "Tel"},
                        {"邮箱", "Email"},
                        {"公司地址", "Address"},
                    });
                    Repository.SaveCompany(bo);
                    Crawler.OutWrite("保存企业 {0}", bo.Name);
                }
                break;
        }
    }
}
}

最新文章

  1. 面试web前端开发,被打击了
  2. 从网易与淘宝的font-size思考前端设计稿与工作流 (转)
  3. 【吉光片羽】ie6兼容性的几个点
  4. PHP Lex Engine Sourcecode Analysis(undone)
  5. oracle数据库创建表空间和表临时空间
  6. linux ubuntu关于U盘的安装 开机启动u盘的时候出现/casper/vmlinuz.efi: file not found
  7. bzoj2763
  8. 【转】谁说Vim不是IDE?(三)
  9. scikit learn 模块 调参 pipeline+gridsearch 数据举例:文档分类 (python代码)
  10. C#重写Equals方法步骤
  11. [Python学习笔记][Python内置函数]
  12. Git学习笔记总结和注意事项
  13. Installshield设置feature为必须选中状态,即必定安装状态
  14. wemall app商城源码Android之支付宝通知处理类
  15. 《CSS动画实用技巧》课程笔记
  16. eclipse导入maven时,pom文件的project一直报错(Failure to transfer org.apache.maven.plugins:maven-surefire-plugin:pom:2.12.)
  17. 【翻译】Neural Collaborative Filtering--神经协同过滤
  18. 数据结构Java实现02----单向链表的插入和删除
  19. Git branch 分支与合并分支
  20. Raft 一致性算法论文译文

热门文章

  1. Application.DoEvents():概念
  2. IOS调试lldb命令常用,po,
  3. Mac下安装LNMP(Nginx+PHP5.6)环境
  4. dom4j-1.6.1.jar与dom4j-1.4.jar
  5. android获取状态栏高度
  6. [转]tomcat中的session管理
  7. Doolittle分解 三对角矩阵分解 拟三对角分解
  8. python_way ,day26 django_admin 自定义
  9. 《第一本docker书》第4章 使用docker镜像和仓库 读书笔记
  10. Linux变量