#!/usr/bin/perl #script for download html page. #Author Huang Yong. #2007-9-1 use File::Basename; use File::Spec; sub downloadpage{ my $main_path = dirname($_[0]); getpage($_[0],$main_path); } sub catfile{ $dirname = $_[0]; $filename = $_[1]; if($filename =~ m@^\./(.+)@){ $filename = $1; } while($filename =~ m@^\.\./(.+)@){ $dirname = dirname($dirname); $filename = $1; } my $re = File::Spec->catfile($dirname,$filename); $re =~ s@http:/@http://@; return $re; } sub getpage{ my $filename; my $str=$_[1]; my $pageurl = $_[0]; my $main_path=$_[1]; $str=~s/\./\\./g; my $pattern = "$str/(.+)"; $str=$pageurl; if($str=~m@$pattern@){ $filename = $1; } my $dirname = dirname($filename); unless (-d $dirname) {mkdir $dirname}; if (-e $filename){ return; } print "Page $pageurl is download now...\n"; `wget $pageurl -P $dirname`; my $remote_path = dirname($pageurl); if($filename =~ /htm[l]?$/){ open fp, "< $filename" or die "Can not open file $filename"; my @file = <fp>; close fp; my $line; foreach $line(@file){ while($line =~ m@href\s*=\s*"([^"]+)"(.*$)@){ my $link_url = $1; #超链接 $line = $2; # 匹配剩余部分继续匹配 $link_url =~ s@([^#]+)#.+$@$1@; #去除锚点描述 unless ($link_url !~ m@http://@) {next;} unless ($link_url !~ m@mailto@) {next;} unless ($link_url !~ m@^#@) {next;} $pageurl = catfile($remote_path,$link_url); getpage($pageurl,$main_path); } } } } foreach(@ARGV){ print "Page $_ download now,please wait...\n"; downloadpage($_); } print "Page download success!\n |
欢迎光临 Chinaunix (http://bbs.chinaunix.net/) | Powered by Discuz! X3.2 |