#!/usr/bin/env perl
#
# Breadth-limited, same-site link crawler.
#
# Starting from a base URL (first command-line argument, or a built-in
# default), fetch pages without blocking via Mojo::UserAgent, follow every
# <a href> that stays on the start site, and store each newly discovered URL
# together with its link depth in a MySQL table.  A Bloom filter remembers
# which URLs have already been seen.
#
# Usage: perl crawler.pl [START_URL]

use strict;
use warnings;

use Mojo::UserAgent;
use Mojo::URL;
use Mojo::IOLoop;
use Bloom::Filter;
use Smart::Comments;    # source filter: keep ordinary comments to one or two '#'
use DBI;

# --- Crawl settings --------------------------------------------------------
# NOTE(review): every numeric literal was missing from this script as
# received (e.g. `my $dept_level = ;`, `$ARGV[]`), so it did not compile.
# The numbers below are reconstructed assumptions - confirm before relying
# on them.
my $dept_level     = 3;          # pages deeper than this are never fetched
my $start_depth    = 1;          # depth assigned to the start URL
my $filter_size    = 100_000;    # Bloom filter capacity (distinct URLs)
my $max_redirects  = 5;
my $progress_every = 100;        # one progress line per this many new URLs
my %store_depth    = map { $_ => 1 } (1, 2, 3);    # link depths written to the DB

# File extensions that are never worth fetching as HTML.
my $skip_ext_re = qr/\.(?:jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf)\z/i;

# --- Database --------------------------------------------------------------
my $dbname   = "bbs_url";
my $location = "localhost";
my $port     = "3306";
my $database = "DBI:mysql:$dbname:$location:$port";
my $db_user  = "root";
my $db_pass  = "toor";

# RaiseError turns every DBI failure into an exception, so a failed connect
# or CREATE TABLE stops the script instead of being silently ignored.
my $dbh = DBI->connect($database, $db_user, $db_pass,
    { RaiseError => 1, PrintError => 0 })
    or die "cannot connect to $database: $DBI::errstr";

my $name  = "xxxxx";
my $table = $dbh->quote_identifier($name);

# IF NOT EXISTS lets the crawler be re-run against an existing table.
$dbh->do(<<"SQL");
CREATE TABLE IF NOT EXISTS $table (
    `No`    int(100) NOT NULL auto_increment,
    `depth` int(10)  NOT NULL,
    `Url`   text     NOT NULL,
    PRIMARY KEY (`No`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8
SQL

# Prepared once and reused.  Placeholders keep crawled (untrusted) URLs out
# of the SQL text.
my $insert = $dbh->prepare("INSERT INTO $table (depth, Url) VALUES (?, ?)");

# --- Crawler state ---------------------------------------------------------
my $baseUrl = Mojo::URL->new($ARGV[0] || 'http://bbs.xxxxx.cn/');

# Site to stay on: the start host without a leading "www.".  A link is
# accepted when its host is exactly this name or a sub-domain of it.
my $domain = lc($baseUrl->host // '');
die "start URL '$baseUrl' has no host\n" if $domain eq '';
$domain =~ s/\Awww\.//;
my $domain_re = qr/(?:\A|\.)\Q$domain\E\z/i;

my $filter = Bloom::Filter->new(capacity => $filter_size, error_rate => 0.0001);
my $ua     = Mojo::UserAgent->new(max_redirects => $max_redirects);

# crawl($url, $depth)
#
# Queue a non-blocking GET for $url; when the response arrives its links are
# processed by handle_page().  Nothing is fetched once $depth exceeds
# $dept_level.  The depth travels in the closure: the original carried it in
# a custom `dept` request header, which was sent to the remote server.
sub crawl {
    my ($url, $depth) = @_;

    return if $depth > $dept_level;

    $ua->get($url => sub {
        my ($agent, $tx) = @_;
        handle_page($tx, $depth);
    });
    return;
}

# handle_page($tx, $depth)
#
# Extract the links of one fetched page.  Each link that stays on the start
# site and has not been seen before is stored (when its depth is listed in
# %store_depth), remembered in the Bloom filter and crawled in turn.
# $tx is the finished transaction, $depth the depth of the fetched page.
sub handle_page {
    my ($tx, $depth) = @_;

    return if $tx->error;    # connection failure or 4xx/5xx response

    my $page_url   = $tx->req->url;
    my $link_depth = $depth + 1;

    $tx->res->dom->find('a[href]')->each(sub {
        my ($anchor) = @_;

        my $href = $anchor->attr('href');
        return if !defined $href;

        # Resolve relative links against the page they were found on.
        my $newUrl = Mojo::URL->new($href)->to_abs($page_url);
        $newUrl->fragment(undef);

        # `return` leaves this callback for the current link only; loop
        # control such as `next` must not be used to exit a subroutine.
        my $scheme = lc($newUrl->scheme // '');
        return if $scheme ne 'http' && $scheme ne 'https';
        return if ($newUrl->host // '') !~ $domain_re;
        return if $newUrl->path->to_string =~ $skip_ext_re;

        my $url_str = $newUrl->to_string;
        return if $filter->check($url_str);

        my $seen = $filter->key_count;
        print "$seen $link_depth $newUrl\n" if $seen % $progress_every == 0;

        if ($store_depth{$link_depth}) {
            # A single failed insert should not abort the whole crawl.
            eval { $insert->execute($link_depth, $url_str); 1 }
                or warn "insert of $url_str failed: $@";
        }

        $filter->add($url_str);
        crawl($newUrl, $link_depth);
    });
    return;
}

# --- Main ------------------------------------------------------------------
crawl($baseUrl, $start_depth);
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;

$dbh->disconnect;

# perl stops reading at the marker below; whatever follows in this file is
# not code.
__END__

最新文章

  1. ubuntu中pycharm安装激活第二种方法的密钥
  2. [转]runtime 消息机制
  3. ASP.NET MVC 部署全站HTTPS
  4. Xcode里-ObjC, -all_load, -force_load
  5. zjuoj 3606 Lazy Salesgirl
  6. html5-websocket初探
  7. C# Dictionary 的几种遍历方法
  8. 也不知怎么了LVS.SH找不到,网上搜了一篇环境搭配CENTOS下面的高可用 参考
  9. Problem A 栈
  10. DOS 命令 attrib:修改文件属性
  11. ado.net与各种orm操作数据方式的比较
  12. C# 32位程序访问64位系统注册表
  13. socket串口通信
  14. Dynamics CRM Trigger plugin for N:N relationships
  15. eclipse 代码模板
  16. FZU2150 :Fire Game (双起点BFS)
  17. 5W2H+35问
  18. HDU 4768 Flyer (2013长春网络赛1010题,二分)
  19. LeetCode: Partition List 解题报告
  20. 安装cartographer_ros

热门文章

  1. note name
  2. 条码解析的一片js
  3. openldap安装配置
  4. Ax Grid 的显示根据用户的需求动态排序。
  5. dedecms后台登录如何去除验证码设置
  6. Windows phone 8 学习笔记(9) 集成(转)
  7. Loadrunner:安装LR11时提示缺少vc2005_sp1_with_atl_fix_redist
  8. 1.3查看Linux内核版本
  9. PayPal 开发详解(六):下载paypal立即付款SDK 并编译打包
  10. Python之路【第二篇】:Python基础(一)