- 论坛徽章:
- 1
|
本帖最后由 iakuf 于 2013-06-01 09:06 编辑
回复 1# 墨迹哥
我在上次被你们华丽地忽略的那个程序基础上改进了一下,可以达到你们的要求。其中 $dept_level 用来控制抓取的深度,代码中匹配 php-oa.com 的正则用于限制只抓取指定的域名。达到指定层次且没有新的 url 时,程序会自动退出。
#!/usr/bin/perl
# Non-blocking, depth-limited web crawler.
#
# Usage: crawler.pl <start-url>
#
# Starting from <start-url>, follows <a href> links that stay on the allowed
# host, printing every newly discovered URL as
#     "<urls seen so far> <depth> <url>"
# A Bloom filter remembers the URLs already seen.  The event loop is stopped
# as soon as no request is in flight any more.
use strict;
use warnings;
use Mojo::UserAgent;
use Mojo::IOLoop;
use Mojo::URL;
use Bloom::Filter;
use Smart::Comments;    # NOTE: comments starting with three '#' become debug output

# Deepest level whose pages are still fetched and parsed (the start page is
# level 1).  Links found on the deepest level are printed but not fetched.
my $dept_level = 2;

# Only URLs whose host is this domain (or one of its sub-domains) are crawled.
my $allowed_host = qr/(?:\A|\.)php-oa\.com\z/i;

my $start_url = $ARGV[0];
die "Usage: $0 <start-url>\n"
    unless defined $start_url && length $start_url;

my $filter = Bloom::Filter->new(capacity => 100000, error_rate => 0.0001);
my $ua     = Mojo::UserAgent->new;

# Number of requests that were started but have not called back yet.
my $pending = 0;

# Start a non-blocking GET for $url; $dept is the level of that page.
sub crawl {
    my ($url, $dept) = @_;

    ++$pending;
    $ua->get($url => sub {
        my (undef, $tx) = @_;
        --$pending;

        # Connection errors and 4xx/5xx responses carry no links to follow.
        extract_links($tx, $dept + 1) unless $tx->error;

        # Nothing in flight and nothing newly queued: the crawl is complete.
        Mojo::IOLoop->stop unless $pending;
    });
    return;
}

# Report every unseen, in-domain link of the response in $tx; $dept is the
# level assigned to those links.  Links are fetched only while $dept does not
# exceed $dept_level.
sub extract_links {
    my ($tx, $dept) = @_;

    my $base = $tx->req->url;
    for my $link ($tx->res->dom->find('a[href]')->each) {
        my $href = $link->attr('href');
        next unless defined $href && length $href;

        # Resolve relative links against the page they were found on.
        my $url = Mojo::URL->new($href)->to_abs($base);
        next unless ($url->scheme // '') =~ /\Ahttps?\z/i;    # skip mailto:, javascript:, ...
        next unless ($url->host   // '') =~ $allowed_host;

        # The fragment never changes which document is fetched.
        my $new_url = $url->fragment(undef)->to_string;
        next if $filter->check($new_url);

        print $filter->key_count(), " $dept ", $new_url, "\n";
        $filter->add($new_url);
        crawl($new_url, $dept) if $dept <= $dept_level;
    }
    return;
}

# Remember the start page so that links pointing back to it are not re-fetched.
$filter->add(Mojo::URL->new($start_url)->to_string);
crawl($start_url, 1);
Mojo::IOLoop->start unless Mojo::IOLoop->is_running;
复制代码 |
|