- 论坛徽章:
- 0
|
本帖最后由 xti9er 于 2010-09-20 11:11 编辑
几年前写的旧代码,好久不用了,现在还能用。此代码既能用来做好事也能用来做坏事,本来用来给老婆查论文资料的,结果她用不惯命令行的东西,就算了。做坏事就不用我说了吧。
配置文件格式如下:
#########################
#=任意区域
#countryTD=乍得
#countryGI=直布罗陀
#countryCL=智利
#countryCF=中非共和国
#countryCN=中国
#countryMO=中国澳门特别行政区
#countryHK=中国香港特别行政区
#....等等
country=all
inurl:/index.htm
#########################
- #!/usr/bin/perl -w
- #By xti9er
- require LWP::UserAgent;
- use LWP::Simple;
- use Color::Output;
- Color::Output::Init;
- # Unbuffer STDOUT so progress output shows up immediately.
- $|=1;
- #----------------------------
- # Load the search strings from the config file
- # (first command-line argument, "inurl.ini" when none is given):
- #   lines starting with "#"   are comments and skipped
- #   "country=XX"              sets $countryset to "countryXX"
- #   "country=" (empty)        leaves $countryset empty
- #   any other non-blank line  is one search string, stored in @inurl
- #----------------------------
- my $inrulfile=shift||"inurl.ini";
- my $countryset="";
- # Three-argument open with a lexical handle, so the file name is never
- # parsed for a mode character or a pipe.
- open(my $inurl_fh,'<',$inrulfile) or die "Cannot open $inrulfile: $!";
- while(my $line=<$inurl_fh>)
- {
-     chomp($line);
-     $line=~s/\r$//;          # tolerate CRLF line endings
-     next if $line=~/^#/;
-     next if $line!~/\S/;     # a blank line would otherwise become an empty search
-     if($line=~/^country=(\w+)/i)
-     {
-         $countryset="country$1";
-     }
-     elsif($line=~/^country=$/i)
-     {
-         # Explicitly empty country: $countryset stays "".
-     }
-     else
-     {
-         push(@inurl,$line);
-     }
- }
- close $inurl_fh;
- # Main loop: one Google search per configured string.  The first result page
- # is fetched to learn the total hit count, then spider() is called once per
- # 100-result page.  $sleeptime and $stime stay package globals because
- # spider() and the closing summary read them.
- for my $inurl(@inurl)
- {
-     # Whitespace becomes "+" so the string can be placed in the query URL.
-     $inurl=~s/\s/\+/g;
-     # Regex-escaped copy of the query; only the disabled debug line uses it.
-     $reginurl=$inurl;
-     $reginurl=~s/\?/\\?/g;
-     $reginurl=~s/\=/\\=/g;
-     $reginurl=~s/\:/\\:/g;
-     $reginurl=~s/\+/\\s/g;
-     $sleeptime=5;
-     #cprin("Sleeptime=[$sleeptime]\t Url=[$inurl]\t RegUrl=[$reginurl]\t country=[$countryset]\n",7);
-     sleep(2);
-     my $p75="-"x80;
-     my $ua = LWP::UserAgent->new;
-     $ua->timeout(20);
-     $ua->env_proxy;
-     $ua->agent("Mozilla/5.0");
-     #print "Start ...\n";
-     cprin("\t\t\t --=G o o g l e \t S c a n n e r=-- \n",5);
-     cprin("\t\t\t By xti9er \n",13);
-     cprin("$p75\n",7);
-     # Start time of this query; spider() names its log file after it.
-     $stime=time;
-     my $lstime=localtime();
-     cprin("\t\t\tStart at $lstime\n",13);
-     my $searchweb="http://www.google.com.hk";
-     my $starturl="$searchweb/search?num=100&complete=1&hl=zh-CN&cr=$countryset&newwindow=1&q=$inurl&start=0&sa=N/";
-     print "[Google] page:$starturl\n";
-     # get() always hands back a response object, so failure is detected
-     # with is_success rather than with "or" on the call itself.
-     my $response = $ua->get($starturl);
-     if(!$response->is_success)
-     {
-         cprin($response->status_line,5);
-         next;
-     }
-     # Private copy of the page chunks: spider() assigns to the global
-     # @getlog, which must not be the list being iterated here.
-     my @chunks=split(/href=/,$response->content);
-     for my $chunk (@chunks)
-     {
-         # [^<]* stops at the first "<", so digits from later markup in the
-         # same chunk cannot end up in the hit count.
-         next unless $chunk=~/id=resultStats>([^<]*)</;
-         my $total=$1;
-         $total=~s/\D//g;
-         $total=0 unless length $total;
-         my $ttpageno=int($total/100);
-         cprin("\t\t\t$ttpageno Google Pages To Read!\n$p75\n",5);
-         sleep(2);
-         # $startpage is a result offset (step 100) while $ttpageno counts
-         # pages, hence the *100 in the bound.
-         for(my $startpage=0;$startpage<=$ttpageno*100;$startpage+=100)
-         {
-             sleep($sleeptime);
-             cprin("Now Read The ".$startpage."th Page!\n--------------------------------------\n",13);
-             spider($startpage,$inurl);
-         }
-         # The hit count has been handled; do not rescan the other chunks.
-         last;
-     }
- }
- # spider(OFFSET, QUERY)
- # Fetch the Google result page for QUERY that starts at result number OFFSET
- # (100 results per page), print every result URL and append it to the log
- # file "$stime.log".
- # Reads $sleeptime and $stime (package globals) and $countryset.
- # Returns nothing.  When the page has no link to the following page the scan
- # is finished and control jumps to the file-level SPIDEREND label instead of
- # returning to the caller.
- sub spider
- {
-     my ($pageno,$inurl)=@_;
-     sleep($sleeptime);
-     # Lexical handle with three-argument open.  Failure paths use "return";
-     # the earlier "next" left the sub through the caller's loop.
-     my $logfile="$stime.log";
-     open(my $wlog,'>>',$logfile) or do { cprin($!,5); return; };
-     my $searchweb="http://www.google.com.hk";
-     my $searchurl="$searchweb/search?num=100&complete=1&hl=zh-CN&cr=$countryset&newwindow=1&q=$inurl&start=$pageno&sa=N/";
-     my $nextpageno=$pageno+100;
-     cprin("Now URL: $searchurl\n",7);
-     my $ua = LWP::UserAgent->new;
-     $ua->timeout(20);
-     $ua->env_proxy;
-     $ua->agent("Mozilla/5.0");
-     my $response = $ua->get($searchurl);
-     if(!$response->is_success)
-     {
-         print "Get page count faild!\n";
-         cprin($response->status_line,5);
-         close $wlog;
-         return;
-     }
-     # Private array: assigning to the global @getlog here would replace the
-     # list the caller is still looping over.
-     my @chunks=split(/href=/,$response->content);
-     my @sites;
-     my $nextpage=0;
-     for my $chunk (@chunks)
-     {
-         # A result link: href="http://..." target=_blank class=l
-         if($chunk=~m{^"http://(.*?)"\starget=_blank\sclass=l})
-         {
-             push(@sites,"http://$1");
-         }
-         # Link to the following result page.  The ampersand is accepted raw,
-         # as the entity "&amp;", or as the literal "&;" that the previous
-         # pattern required (presumably an "&amp;" mangled in transit).
-         if($chunk=~m{/search\?.*?q=.*?start=${nextpageno}&(?:amp;|;)?sa=N})
-         {
-             $nextpage++;
-         }
-     }
-     # Drop duplicate URLs while keeping first-seen order.
-     my %seen;
-     @sites=grep { !$seen{$_}++ } @sites;
-     for my $nowsite (@sites)
-     {
-         print "URL:$nowsite\n";
-         print {$wlog} $nowsite,"\n";
-     }
-     # Buffered write errors only surface at close time.
-     close($wlog) or cprin("Cannot close $logfile: $!",5);
-     if($nextpage==0)
-     {
-         cprin("\t\t\tThe End\n",13);
-         # Leaves the sub and every enclosing loop of the caller.
-         goto SPIDEREND;
-     }
-     return;
- }
- # Closing summary.  spider() jumps here with "goto SPIDEREND" after the last
- # result page; falling out of the main loop arrives here as well.
- SPIDEREND:
- $etime=time;
- # $stime is only set once a query has started; report zero elapsed time
- # instead of the seconds since the epoch when nothing was run.
- $ttime=defined($stime) ? $etime-$stime : 0;
- # Newline-terminated so the shell prompt does not land on the same line.
- print stime($ttime),"\n";
- # cprin(TEXT, COLOR)
- # Hand TEXT to Color::Output's cprint, wrapped as "\x03<COLOR>" . TEXT . "\n"
- # followed by "\x030".  A newline is always added after TEXT, so callers
- # whose TEXT already ends in "\n" get an extra blank line.
- # Returns whatever cprint returns.
- sub cprin
- {
-     # Lexicals: the arguments used to be stored in the package globals
-     # $str and $i.
-     my ($str,$i)=@_;
-     return cprint("\x03" . $i . "$str\n\x030");
- }
- # stime(SECONDS)
- # Format a duration given in seconds as "H hours M mins S secs".
- # Hours are not wrapped into days, so 90000 seconds reads "25 hours ...".
- sub stime
- {
-     my ($elapsed)=@_;
-     my $hours=int($elapsed/3600);
-     # Seconds left over once the whole hours are taken out.
-     my $left=$elapsed-$hours*3600;
-     my $mins=int($left/60);
-     my $secs=$left-$mins*60;
-     return "$hours hours $mins mins $secs secs";
- }
复制代码 |
|