- 论坛徽章:
- 3
|
本帖最后由 墨迹哥 于 2013-06-04 20:39 编辑
回复 36# iakuf
@grshrd49 本想改写下之前云总那个做模块使用。。可是各种错误。。。你那个我没试过改写的效果。。。
- package Web_PageAnaly;
- # Web_PageAnaly: multi-threaded, same-host link crawler.
- #
- # Page_Analy($base_url, $max_threads) seeds a work queue with $base_url and
- # keeps spawning ProcessUrl worker threads until the queue is empty and no
- # worker is running. Every new same-host http/https link is printed, appended
- # to "<host>.html" and queued for crawling in turn.
- use strict;
- use warnings;
- use threads;
- use threads::shared;
- use Thread::Queue;
- use Thread::Semaphore;
- use Carp qw(croak);
- use Bloom::Filter;
- use URI;
- use URI::URL;
- use Web::Scraper;
- require Exporter;
- our @ISA    = ('Exporter');
- our @EXPORT = qw( Page_Analy ProcessUrl );
- # Crawl state. Kept as package variables (same names as before) but only
- # declared here: the values depend on the arguments of Page_Analy, so they
- # are built inside Page_Analy rather than at module load time.
- our $host :shared;    # host name every crawled link must match
- our $queue;           # Thread::Queue of URLs still to be fetched
- our $semaphore;       # limits the number of concurrent worker threads
- our $mutex;           # serialises filter / output-file / enqueue updates
- our $filter;          # shared Bloom filter of URLs already seen
- # Page_Analy($base_url, $max_threads)
- #   $base_url    - URL to start from; its host restricts the whole crawl.
- #   $max_threads - maximum number of concurrent workers; anything that is
- #                  not a positive integer falls back to 1.
- # Croaks when $base_url is missing or has no host. Returns nothing.
- sub Web_PageAnaly::Page_Analy {
-     my ( $base_url, $max_threads ) = @_;
-     croak 'Page_Analy: a base URL is required'
-         unless defined $base_url && length $base_url;
-     $max_threads = 1
-         unless defined $max_threads
-         && $max_threads =~ /\A[0-9]+\z/
-         && $max_threads > 0;
-     my $base_host = eval { URI::URL->new($base_url)->host };
-     croak "Page_Analy: cannot determine the host of '$base_url'"
-         unless defined $base_host && length $base_host;
-     # (Re)build all crawl state now that the arguments are known.
-     $host      = $base_host;
-     $queue     = Thread::Queue->new();
-     $semaphore = Thread::Semaphore->new($max_threads);
-     $mutex     = Thread::Semaphore->new(1);
-     $filter    = shared_clone(
-         Bloom::Filter->new( capacity => 1000000, error_rate => 0.0001 ) );
-     $queue->enqueue($base_url);
-     $filter->add($base_url);
-     while (1) {
-         # Reap workers that have already finished.
-         $_->join() for threads->list(threads::joinable);
-         if ( $queue->pending() == 0 ) {
-             # list() in scalar context yields the number of running threads.
-             my $active = threads->list(threads::running);
-             if ( $active == 0 ) {
-                 print "All done!\n";
-                 last;
-             }
-             # Workers are still running and may enqueue more URLs.
-             sleep 1;
-             next;
-         }
-         # Wait for a free worker slot; the worker releases it when it ends.
-         $semaphore->down;
-         if ( !threads->create( \&Web_PageAnaly::ProcessUrl ) ) {
-             # No worker was started, so give the slot back ourselves.
-             $semaphore->up;
-             warn "Page_Analy: could not create a worker thread\n";
-             sleep 1;
-         }
-     }
-     $_->join() for threads->list();
-     return;
- }
- # ProcessUrl()
- #   Worker body: drains the queue, scrapes the <a href> values of each page
- #   and records every new link. Always releases its semaphore slot, even
- #   when something inside dies. Does nothing if Page_Analy has not set up
- #   the crawl state.
- sub Web_PageAnaly::ProcessUrl {
-     return unless defined $queue && defined $semaphore;
-     my $finished = eval {
-         my $scraper = scraper { process '//a', 'links[]' => '@href'; };
-         while ( defined( my $url = $queue->dequeue_nb() ) ) {
-             my $links;
-             my $scraped = eval {
-                 $links = $scraper->scrape( URI->new($url) )->{'links'};
-                 1;
-             };
-             if ( !$scraped ) {
-                 warn "$@\n";
-                 next;
-             }
-             next unless ref $links eq 'ARRAY';
-             for my $href ( @{$links} ) {
-                 my $link = _normalize_link( $href, $url );
-                 _record_link($link) if defined $link;
-             }
-         }
-         1;
-     };
-     my $error = $@;
-     # Release the slot first so a failed worker cannot starve the crawl.
-     $semaphore->up();
-     warn "ProcessUrl aborted: $error\n" if !$finished;
-     return;
- }
- # Turns one scraped href into an absolute, fragment-free URL string.
- # Returns nothing when the link is not http/https, points at another host,
- # or ends in one of the skipped media/archive extensions.
- sub _normalize_link {
-     my ( $href, $page_url ) = @_;
-     return unless defined $href;
-     # Stringify first so URI objects and plain strings are both accepted,
-     # and resolve against the page before looking at scheme and host.
-     my $uri = eval { URI::URL->new( "$href", $page_url )->abs };
-     return unless defined $uri;
-     my $scheme = $uri->scheme;
-     return unless defined $scheme;
-     return unless $scheme eq 'http' || $scheme eq 'https';
-     my $link_host = eval { $uri->host };
-     return unless defined $link_host && $link_host eq $host;
-     my $link = $uri->as_string;
-     $link =~ s/#.*\z//s;    # drop the fragment
-     return if $link =~ /\.(?:jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf)\z/i;
-     return $link;
- }
- # Records a link that has not been seen yet: prints it, appends it to
- # "<host>.html", adds it to the filter and queues it. Runs under $mutex and
- # releases the mutex even on failure; the error is then re-thrown.
- sub _record_link {
-     my ($link) = @_;
-     $mutex->down();
-     my $recorded = eval {
-         if ( !$filter->check($link) ) {
-             print $filter->key_count(), " ", $link, "\n";
-             my $file = "$host.html";
-             my $safe = _html_escape($link);
-             open my $out, '>>', $file or die "Cannot append to $file: $!\n";
-             print {$out} "<a href='$safe'>$safe</a>\n";
-             close $out or die "Cannot close $file: $!\n";
-             $filter->add($link);
-             $queue->enqueue($link);
-         }
-         1;
-     };
-     my $error = $@;
-     $mutex->up();
-     die $error if !$recorded;
-     return;
- }
- # Escapes the characters that could break out of the generated HTML.
- sub _html_escape {
-     my ($text) = @_;
-     my %entity_for = (
-         '&' => '&amp;',
-         '<' => '&lt;',
-         '>' => '&gt;',
-         '"' => '&quot;',
-         "'" => '&#39;',
-     );
-     $text =~ s/([&<>"'])/$entity_for{$1}/g;
-     return $text;
- }
- 1;
复制代码 |
|