- 论坛徽章:
- 0
|
我写了脚本,用来下载用网页形式的手册,就是说如果脚本名为down.pl,那么./down.pl http://book/index.html,就会把index及相关的网页全部下载到硬盘上。但我的脚本始终不能下载全部的网页。
[code]
#!/usr/bin/perl
#script for download html page.
#Author Huang Yong.
#2007-9-1
use File::Basename;
use File::Spec;
sub downloadpage{
my $main_path = dirname($_[0]);
getpage($_[0],$main_path);
}
sub catfile{
$dirname = $_[0];
$filename = $_[1];
if($filename =~ m@^\./(.+)@){
$filename = $1;
}
while($filename =~ m@^\.\./(.+)@){
$dirname = dirname($dirname);
$filename = $1;
}
my $re = File::Spec->catfile($dirname,$filename);
$re =~ s@http:/@http://@;
return $re;
}
sub getpage{
my $filename;
my $str=$_[1];
my $pageurl = $_[0];
my $main_path=$_[1];
$str=~s/\./\\./g;
my $pattern = "$str/(.+)";
$str=$pageurl;
if($str=~m@$pattern@){
$filename = $1;
}
my $dirname = dirname($filename);
unless (-e $dirname) {mkdir $dirname};
if (-e $filename){
return;
}
print "Page $pageurl is download now...\n";
`wget $pageurl -P $dirname`;
my $ower = $pageurl;
my $remote_path = dirname($pageurl);
if($filename =~ /htm[l]?$/){
open fp, "< $filename" or die "Can not open file $filename";
while(<fp>){
while(m@href\s*=\s*"([^"]+)"(.*)$@){
my $link_url = $1;
my $keep = $2;
$link_url =~ s@([^#]+)#.+$@$1@;
#print "$ower:\n$link_url\n";
#print "$2\n";
unless ($link_url !~ m@http://@) {
$_ = $keep;
last;
}
unless ($link_url !~ m@mailto@){
$_ = $keep;
last;
}
unless ($link_url !~ m@^#@){
$_ = $keep;
last;
}
$pageurl = catfile($remote_path,$link_url);
print "$pageurl\n";
#getpage($pageurl,$main_path);
$_ = $keep;
}
}
close fp;
}
}
foreach(@ARGV){
print "Page $_ download now,please wait...\n";
downloadpage($_);
}
print "Page download success!\n";
[/code] |
|