- 论坛徽章:
- 3
|
#!C:\Perl\bin\perl.exe
use strict;
use warnings;
use LWP::UserAgent;
use HTTP::Request::Common qw(GET);

# Breadth-first crawl starting from $url, collecting every link whose URL
# contains $base_url, skipping anything that matches an exclusion pattern.
# Results are written to url.txt, one URL per line.
my $url      = "http://www.chinaunix.net/";
my $base_url = "chinaunix";
my @exclude = ("thread","uid-","uid/","forum-","Start_","forummodule-","fid=","search.php","=","download/","css","js","news","shtml","peixun","page-"); # URL exclusion patterns
my @storeurl;       # every unique URL collected so far (output order)
my %seen;           # O(1) duplicate lookup, lowercased to keep the original /i semantics
my @waitfindurl;    # BFS queue of pages still to be fetched
my $temp = 0;       # count of pages processed (progress display only)

push @waitfindurl, $url;
push @storeurl,    $url;
$seen{ lc $url } = 1;

# BUG FIX: the original iterated @waitfindurl with foreach while push-ing
# and shift-ing it inside the loop body; modifying an array being iterated
# by foreach is undefined behaviour in Perl. A shift-driven while loop is
# the safe work-queue idiom and expresses the same traversal.
while (my $wfu = shift @waitfindurl) {
    #last if (@storeurl > 150);
    print scalar(@storeurl) . " " . scalar(@waitfindurl) . " $temp $wfu\n";

    my @findurl = findpageurl($wfu, $base_url, \@exclude);
    # findpageurl returns an HTTP status code on failure — ignore those
    next unless @findurl && $findurl[0] =~ /http/i;

    for my $fu (@findurl) {
        next unless $fu =~ /http/i;     # drop non-http links
        # BUG FIX: the original tested duplicates with /^$_$/i, which
        # interpolates stored URLs (containing ?, ., +) unquoted into a
        # regex — that can die ("Quantifier follows nothing") and is O(n)
        # per link. A hash lookup is exact and constant-time.
        next if $seen{ lc $fu }++;
        push @storeurl,    $fu;
        push @waitfindurl, $fu;
        print scalar(@storeurl) . " storeurl _ newurl \n";
    }
    $temp++;
}

# 3-arg open with a lexical handle, and check both open and close
# (the original used an unchecked 2-arg open on a bareword handle).
open my $out, '>', 'url.txt' or die "cannot open url.txt: $!";
print {$out} "$_\n" for @storeurl;
close $out or die "cannot close url.txt: $!";
# findpageurl($url, $base_url, \@exclude)
#
# Fetch $url and return the list of href targets that contain $base_url
# and match none of the exclusion patterns. Trailing slashes are stripped.
# On a non-200 response, returns the numeric HTTP status code instead —
# callers detect this by checking the first element against /http/.
sub findpageurl {
    my ($url, $base_url, $exclude_ref) = @_;
    my @exclude = @$exclude_ref;

    # Give the UA a timeout so one dead host cannot hang the whole crawl.
    my $ua   = LWP::UserAgent->new(timeout => 30);
    my $resp = $ua->request(HTTP::Request->new(GET => $url));

    my $return_code = $resp->code;
    print "requset return code:$return_code\n";
    return $return_code unless $return_code == 200;

    my @hrefurl;
    for my $line (split /\n/, $resp->content) {
        # BUG FIX: the original used  if (s/href="(.+?)"//g)  and then read
        # $1, which yields only the LAST href on each line. Iterating with
        # a while-/g match visits every href occurrence.
        while ($line =~ /href="(.+?)"/g) {
            my $tempurl = $1;
            # grep (not map) for a boolean membership test; \Q...\E quotes
            # pattern metacharacters so e.g. the '.' in "search.php" is
            # matched literally rather than as "any character".
            next if grep { $tempurl =~ /\Q$_\E/i } @exclude;
            $tempurl =~ s{/\z}{};    # strip a single trailing slash
            push @hrefurl, $tempurl if $tempurl =~ /\Q$base_url\E/i;
        }
    }
    print "requset return url\n";
    return @hrefurl;
}
复制代码 我拿chinaunix做了测试,@exclude数组会排除站点中的各种分页。下一步将进一步细化找到的url的筛选规则,从而逐步去掉@exclude这个数组
@waitfindurl中所有元素都检查过后脚本就结束。细细想下,这个办法应该还是有问题的 |
|