- 论坛徽章:
- 3
|
本帖最后由 grshrd49 于 2013-07-09 16:49 编辑
试试这个,哈哈!测试用了2天!爬hao123用了基本2天...
$rooturl最后不要/结尾,$base_url这个可以确保不要爬到外面去
(jpg|png|exe|rar|zip|bmp|apk)$/i)这些结尾的连接就不再深度查找
控制了302页面重定向的深度
如果遇到实在无法理解的非法页面,加到 @exclude = () 数组中,发现了就不再请求。

#!C:\Perl\bin\perl.exe
use strict;
use warnings;                       # was missing: surfaces undef/numeric-context mistakes early
use LWP::UserAgent;
use HTTP::Request::Common qw(GET);

# --- Crawl configuration ---
my $rooturl  = "http://www.hao123.com";   # site root; must NOT end with '/'
my $base_url = "hao123";                  # substring that keeps the crawl on-site
my @exclude  = ();                        # URL patterns to skip entirely
#my @exclude = ("thread");                # example: exclude URLs containing "thread"

# --- Crawl state ---
my @storeurl;      # every unique URL discovered so far
my @newurl;        # on-site URLs found in the current page, queued for deeper crawling
my @findurl;       # raw URLs extracted from the page just fetched
my @waitfindurl;   # BFS work queue of pages still to fetch
my $temp = 0;      # count of pages fetched so far (progress counter)

# Seed both the work queue and the result list with the root URL.
push @waitfindurl, $rooturl;
push @storeurl,    $rooturl;
# Breadth-first crawl: pop one page, extract its links, queue unseen on-site
# links for further crawling. A hash gives O(1) duplicate detection instead of
# the original O(n) scan of @storeurl for every candidate URL.
my %seen = map { $_ => 1 } @storeurl;
while (@waitfindurl) {
    my $wfu = shift @waitfindurl;
    #last if (@storeurl > 150);
    print scalar(@storeurl) . " " . scalar(@waitfindurl) . " $temp $wfu\n";
    $temp++;

    @findurl = findpageurl($rooturl, $wfu, $base_url, \@exclude);
    # findpageurl returns an HTTP status code (one scalar) on failure; the
    # defined-guard avoids an undef warning when the list comes back empty.
    next unless defined $findurl[0] && $findurl[0] =~ /http/i;

    for my $fu (@findurl) {
        next if $seen{$fu}++;           # already known: discard
        next unless $fu =~ /http/i;     # drop non-http links
        push @storeurl, $fu;

        # Queue for deeper crawling only when the link is on-site and is not a
        # binary/download file ('\.' was unescaped in the original, so e.g.
        # "foojpg" endings were wrongly excluded too).
        if ($fu !~ /\.(jpg|png|exe|rar|zip|bmp|apk)$/i && $fu =~ /$base_url/i) {
            push @newurl, $fu;
        }

        # URLs could also be appended to a file here, as they are found:
        #open FF, ">>url.txt"; print FF "$fu\n"; close FF;
    }
    push @waitfindurl, @newurl;
    @newurl = ();                       # reset the per-page discovery list
}
# URLs are strings, so sort them lexically; the original numeric sort
# ({$a<=>$b}) compared every URL as 0 and was effectively a no-op (and warns
# under 'use warnings').
@storeurl = reverse sort @storeurl;

# Write the complete list of discovered URLs once crawling has finished.
open my $out, '>', 'url-last.txt' or die "cannot open url-last.txt: $!";
print {$out} "$_\n" for @storeurl;
close $out or die "cannot close url-last.txt: $!";

#####################
# Post-process the discovered links (example, disabled)
#####################
#my @item;
#for (@storeurl) {
#    push @item, $_ if /item/i;
#}
#@item = sort @item;
#print "$_\n" for @item;
###################################
# Fetch a URL and extract the hyperlinks from the returned page.
#
# Args:    $rooturl  - site root, prepended to root-relative ("/...") links
#          $url      - page to fetch
#          $base_url - unused here; kept so the call signature is unchanged
#          $a        - array ref of exclusion patterns (matched as literals)
# Returns: list of absolute href URLs on success, or the HTTP status code
#          (a single scalar) when the response is not 200.
###################################
sub findpageurl {
    my ($rooturl, $url, $base_url, $a) = @_;
    my @exclude = @$a;

    my $UA = LWP::UserAgent->new();
    $UA->max_redirect(1);      # limit 302 redirect depth
    $UA->timeout(10);          # must be set BEFORE the request to take effect
    my $resp = $UA->request(HTTP::Request->new(GET => $url));

    my $return_code = $resp->code;
    print "request return code:$return_code\n";
    return $return_code unless $return_code == 200;

    my @hrefurl;
    for my $line (split /\n/, $resp->content) {
        # while + m//g visits EVERY href on the line; the original
        # "if (s/href="..."//g)" captured only the last match per line.
        while ($line =~ /href="(.+?)"/g) {
            my $tempurl = $1;
            $tempurl = $rooturl . $tempurl if $tempurl =~ m{^/};

            # BUG FIX: the original used map here, and map in scalar context
            # returns the NUMBER OF PATTERNS (not matches), so any non-empty
            # @exclude list skipped every single URL. grep counts real matches.
            # \Q..\E treats exclusion entries as literal substrings.
            next if grep { $tempurl =~ /\Q$_\E/i } @exclude;

            $tempurl =~ s{/\z}{};      # strip one trailing slash
            push @hrefurl, $tempurl;
        }
    }
    print "request return url\n";
    return @hrefurl;
}
复制代码 |
|