- 论坛徽章:
- 3
|
回复 17# grshrd49
在网上找了一段,感觉这个貌似跟我想法有重叠。
目前还在分析和整理…… - -! 蛋疼了。你也来看看这个,看有什么可以借鉴的?
#!/usr/bin/perl
use URI;
use LWP;

# NOTE(review): 'use strict; use warnings;' should be enabled for this
# file, but doing so requires first quoting the bareword GET inside
# getweb() -- enabling strictures here alone would break compilation of
# the subs below.

# Start a fresh progress log (truncate any previous run's file).
open my $proc_fh, '>', 'process.txt' or die "process.txt: $!\n";
print {$proc_fh} "PROCESS:\n";
close $proc_fh or die "process.txt: $!\n";

my $t1      = time();
my $mainweb = $ARGV[0];    # start page URL (first command-line argument)
my $deep    = 3;           # maximum crawl depth
my $report;                # accumulated text of the final report

# Shared crawl state, also read/written by geturls() and getweb() below.
%hashurls = ();            # url => depth at which it was discovered
@allurls  = ();            # crawl queue / full list of urls, in discovery order
%hashtwo  = ();            # url => index of the local web/webN.html file
$hashurls{$mainweb} = 1;
$allurls[0]         = $mainweb;
$hashtwo{$mainweb}  = 0;

my $i   = 1;               # current length of @allurls
my $j   = 0;               # number of pages written to disk so far
my $k   = 1;               # next file index recorded in %hashtwo
my $suc = 0;               # pages fetched successfully
my $fai = 0;               # pages that failed to fetch
my $dom;                   # site domain extracted from $mainweb

print "mainweb=" . $mainweb . "\n";
# BUG FIX: the dot in "www." was an unescaped regex metacharacter, so
# e.g. "http://wwwXexample.com" also matched.  Escape it and anchor the
# match at the start of the string.
if ( defined $mainweb and $mainweb =~ m{\Ahttp://www\.([^/\s]{1,500})}i ) {
    $dom = $1;             # domain: everything after "www." up to "/" or space
    print "domain is: " . $dom . "\n";
    $report .= "mainweb=" . $mainweb . "\n";
    $report .= "domain is: " . $dom . "\n";
}
else {
    print "wrong input\n";
    exit;
}

my $agent = LWP::UserAgent->new();
$agent->agent("zzq spider");
$agent->timeout(5);

# Pages are saved under web/; create the directory instead of dying on
# the first open when it is missing.
mkdir 'web' unless -d 'web';

# Crawl pages in discovery order.  @allurls grows while we iterate over
# it -- Perl's foreach picks up elements appended during the loop, which
# is what drives this breadth-first crawl.
foreach $u (@allurls) {
    my $ret = getweb($u);
    open my $web_fh, '>', "web/web$j.html" or die "web/web$j.html: $!\n";
    print {$web_fh} "$ret\n";
    close $web_fh or die "web/web$j.html: $!\n";
    $j++;
    # Only follow links from pages that are above the depth limit.
    if ( $hashurls{$u} < $deep ) {
        geturls( $ret, $dom, $hashurls{$u}, $u );
    }
}

print "Finished!\n";
open $proc_fh, '>>', 'process.txt' or die "process.txt: $!\n";
print {$proc_fh} "Finished!\n";
close $proc_fh;

# Build and emit the final report.
my $t2       = time();
my $costtime = $t2 - $t1;
my $ti1      = localtime($t1);
my $ti2      = localtime($t2);
print "\n\n\n\n\nreport:\n";
$report .= "start time = " . $ti1 . "\tfinish time = " . $ti2 . "\n";
$report .= "Costtime = " . $costtime . " seconds\n";
$report .= "get " . $j . " webs\tsuccess : " . $suc . "\tfail : " . $fai . "\n";
foreach $ur (@allurls) {
    $report .= $ur . "\n ======> web" . $hashtwo{$ur} . ".html\n";
}
print "$report";
open my $rep_fh, '>', 'report.txt' or die "report.txt: $!\n";
print {$rep_fh} "$report";
close $rep_fh or die "report.txt: $!\n";
print "You can read this report in report.txt\n";
- #get the urls form the web.
# Extract in-domain links from a fetched page and enqueue the new ones.
#
# Arguments:
#   $ret     - raw HTML of the page just fetched
#   $domain  - site domain; only urls containing it are kept
#   $tmpdeep - crawl depth of the page the links came from
#   $webu    - url of that page (base for resolving relative links)
#
# Side effects: appends new urls to @allurls, records their depth in
# %hashurls and their file index in %hashtwo, bumps $i / $k, and logs
# each discovery to process.txt.
sub geturls {
    my ( $ret, $domain, $tmpdeep, $webu ) = @_;

    # The four link styles the original handled with four copy-pasted
    # while-loops: double-quoted href, single-quoted href, unquoted
    # href, and window.open('...').  Factored into one loop over
    # precompiled patterns; each pattern captures the url in $1.
    my @link_patterns = (
        qr/href="([^"\?\s>]{1,500})"/i,
        qr/href='([^'\?\s>]{1,500})'/i,
        qr/href=([^"\?'\s>]{1,500})/i,
        qr/window\.open\('([^'\?\s>]{1,500})/i,
    );

    foreach my $pat (@link_patterns) {
        # A failed /g match resets pos($ret), so each pattern rescans
        # the whole page, exactly as the original four loops did.
        while ( $ret =~ /$pat/g ) {
            my $realurl = URI->new_abs( $1, $webu );

            # BUG FIX: \Q...\E quotes regex metacharacters in the
            # domain (its dots), so "example.com" can no longer match
            # "exampleXcom".
            next unless $realurl =~ /\Q$domain\E/;
            # Skip urls we have already seen at any depth.
            next if defined $hashurls{$realurl};

            print "get a new url:$realurl\n";
            open my $log_fh, '>>', 'process.txt' or die "process.txt: $!\n";
            print {$log_fh} "get a new url:$realurl\n";
            close $log_fh;

            $hashurls{$realurl} = $tmpdeep + 1;
            $allurls[$i]        = $realurl;
            $hashtwo{$realurl}  = $k;
            $k++;
            $i++;
        }
    }
}
- #get a web form internet
# Fetch a single url and return its body.  On failure it returns the
# HTTP status line instead, matching the original behavior: the caller
# still writes a web/webN.html placeholder file either way.
#
# Arguments:
#   $u - absolute url to fetch
#
# Side effects: logs progress to process.txt and bumps the global
# success ($suc) / failure ($fai) counters; reads globals $j, $i, $agent.
sub getweb {
    my ($u) = @_;
    my $h = $j + 1;    # 1-based index of the page being fetched

    print "webaddress: " . $u . "\n";
    print "getting the " . $h . " th web of $i in total\n";
    open my $log_fh, '>>', 'process.txt' or die "process.txt: $!\n";
    print {$log_fh} "webaddress: " . $u . "\n";
    print {$log_fh} "getting the " . $h . " th web of $i in total\n";
    close $log_fh;

    # BUG FIX: GET was an unquoted bareword.  It happens to work without
    # 'use strict', but becomes a compile error the moment strictures
    # are enabled; quote it.
    my $request  = HTTP::Request->new( 'GET', $u );
    my $response = $agent->request($request);

    if ( $response->is_success ) {
        print "successed!\n";
        open $log_fh, '>>', 'process.txt' or die "process.txt: $!\n";
        print {$log_fh} "successed!\n";
        close $log_fh;
        $suc++;
        return $response->content;
    }
    else {
        print "failed! Status = " . $response->status_line . "\n";
        open $log_fh, '>>', 'process.txt' or die "process.txt: $!\n";
        print {$log_fh} "failed! Status = " . $response->status_line . "\n";
        close $log_fh;
        $fai++;
        return $response->status_line;
    }
}
复制代码 |
|