- 论坛徽章:
- 1
|
本帖最后由 wxlfh 于 2012-11-03 13:52 编辑
回复 1# kofpet
- #如果600G文件中可能的url实在太多,以致根本不能放入内存中,那就来个不精确的排序
- #按行读取,超过500000个url,则把前2000名写入文件;然后再重复
- #最后在文件中统计,并输出前1000名
# Approximate top-N counting for a log file too large to hold in memory:
# read line by line; whenever more than 500_000 distinct URLs accumulate,
# spill the top 2000 to a scratch file and reset the hash; finally
# aggregate the scratch file and print the overall top 1000 URLs.
use strict;
use warnings;

open my $log, '<', 'your_log_file' or die $!;
# '+>' is the valid read-write-clobber mode; the original '>+' is not a
# recognized open() mode and would make open fail.
open my $fh, '+>', 'LOG_HASH.txt' or die $!;

my ( %log, $i );
my $max_key = 50_0000;   # spill threshold: max distinct URLs kept in memory
my $check   = 1_0000;    # re-check hash size every 10_000 parsed URLs
my $limit   = 1999;      # keep top 2000 per spill (slice 0 .. 1999)

while ( <$log> ) {
    chomp;
    my $url = farse_url( $_ );
    next unless $url;
    $i++;
    $log{$url}++;
    unless ( $i % $check ) {            # every 10_000 URLs, check the hash size
        if ( keys %log > $max_key ) {   # over 500_000 distinct URLs: spill top 2000
            write_file( [ ( sort { $log{$b} <=> $log{$a} } keys %log )[ 0 .. $limit ] ] );
            %log = ();
        }
        $i = 0;
    }
}
# Spill whatever remains (fewer than 500_000 distinct URLs) as well.
write_file( [ ( sort { $log{$b} <=> $log{$a} } keys %log )[ 0 .. $limit ] ] );
close $fh;
%log = ();

# Aggregate the scratch file: sum the per-spill counts for each URL.
open $log, '<', 'LOG_HASH.txt' or die $!;
while ( <$log> ) {
    chomp;
    $log{$1} += $2 if /^(\S+)\s=\s(\d+)/;
}
close $log;

# Print the final top 1000.  'grep defined' guards against the slice
# padding with undef when fewer than 1000 distinct URLs exist.
for ( grep { defined } ( sort { $log{$b} <=> $log{$a} } keys %log )[ 0 .. 999 ] ) {
    print "$_ => $log{$_}\n";
}
# Extract the URL from a single log line, or return undef when none is found.
# The original post left the body as placeholder dots ("farse your url and
# return") and returned an undeclared $url, which does not compile under
# 'use strict'.  This minimal version pulls the first http(s) token;
# NOTE(review): adapt the pattern to the real log layout before use.
sub farse_url {
    my $line = shift;
    my ($url) = $line =~ m{(https?://\S+)};
    return $url;
}
# Append one "url = count" line per key to the spill handle $fh
# (file-scoped lexical opened in the main script; counts come from %log).
# 'grep defined' is needed because callers build the list with a slice
# like [0 .. $limit], which pads with undef when the hash has fewer keys
# than the slice length — without the guard this printed bogus " = " lines.
sub write_file {
    my $keys = shift;
    print {$fh} "$_ = $log{$_}\n" for grep { defined } @{$keys};
}
复制代码 |
|