- 论坛徽章:
- 145
|
本帖最后由 jason680 于 2017-04-17 07:02 编辑
回复 8# bmne
>>可能我笨...
是的,很多问题是你的笨加上懒所产生的...
>>如果非要归类,应该是归到“特解”中
特解只能解决某些特别状况,也就是只能处理两个文档一大一小的情形,
且一定是小的文档先读...
注:通解为数据库.....但你一直不采用
---------------------------------------------------------------------
>>再解释下“特解”详细分类:
1、当a.txt大于b.txt时(即a是大文件 b是小文件)。混合后的输出结果有两种情况:
b.txt在后面 a.txt在前面混合输出 请给出命令
a.txt在后面 b.txt在前面混合输出 请给出命令
2、当b.txt大于a.txt时(即b是大文件 a是小文件)。混合后的输出结果有两种情况:
b.txt在后面 a.txt在前面混合输出 请给出命令
a.txt在后面 b.txt在前面混合输出 请给出命令
你认为是四种,其实只有两种
(结果(内容)相同,只是排列先后不一样...)
a1 a1
a2 b1
b1 c1
b2 a2
c1 b2
c2 c2
最后两种,只是在打印上有差别而已...
一是 print $0 a[n] # a1
一是 print a[n] $0 # 1a
-------------------------------------------------------
>> a=3GB b=10MB 或 a=10MB b=3GB 或 a=300MB b=60MB 等等...
实战中,最终输出结果<20GB 正常在10GB以下 大部分在2-4GB范围
最后一个问题最严重
你的 组合/混合 工具有很大的问题...
我用a,b文档不到M量级(K量级),组合结果3G
-rw-rw-r-- 1 jason jason 3000418650 Apr 16 21:04 50K500K <==约3G
(500K与500K预估近30G)
但你用的文档都是M量级,G量级,你却说组合结果<20G
你自己测测...就不说你了...
另外,文档内容是否重复,每一行长度大小,...就不提了,不然讨论不完...
注:程序combin.awk在最后
$ awk -vout=50K100K -f combin.awk 50k 100k
Read 50k ...
50k has 3026 lines
Read 100k ...
100k has 6019 lines
a b combine total:18213494 lines
21:01:05 start, total items:6019 ,outfile:50K100K
100.0% [======================================] 00:18
$ awk -vout=50K200K -f combin.awk 50k 200k
Read 50k ...
50k has 3026 lines
Read 200k ...
200k has 11981 lines
a b combine total:36254506 lines
21:01:31 start, total items:11981 ,outfile:50K200K
100.0% [======================================] 00:38
$ awk -vout=50K500K -f combin.awk 50k 500k
Read 50k ...
50k has 3026 lines
Read 500k ...
500k has 30120 lines
a b combine total:91143120 lines
21:02:46 start, total items:30120 ,outfile:50K500K
100.0% [======================================] 01:33
$ ls -l 50K*
-rw-rw-r-- 1 jason jason 599873115 Apr 16 21:01 50K100K <==近600M
-rw-rw-r-- 1 jason jason 1196939735 Apr 16 21:02 50K200K <==近1.2G
-rw-rw-r-- 1 jason jason 3000418650 Apr 16 21:04 50K500K <==约3G
------------文档内容不同(长度短,行数多),组合结果文档更大---------------------------------------------------
$ awk -vout=50K100Ks -f combin.awk 50ks 100ks
Read 50ks ...
50ks has 8541 lines
Read 100ks ...
100ks has 17072 lines
a b combine total:145811952 lines
21:31:21 start, total items:17072 ,outfile:50K100Ks
100.0% [======================================] 02:16
$ ls -l 50K100Ks 50ks 100ks
-rw-rw-r-- 1 jason jason 102401 Apr 16 21:27 100ks
-rw-rw-r-- 1 jason jason 1602983821 Apr 16 21:33 50K100Ks <==近1.6G
-rw-rw-r-- 1 jason jason 51206 Apr 16 21:27 50ks
$ ls -l 50K100K 50k 100k
-rw-rw-r-- 1 jason jason 102415 Apr 16 20:57 100k
-rw-rw-r-- 1 jason jason 599873115 Apr 16 21:01 50K100K <==近600M
-rw-rw-r-- 1 jason jason 51201 Apr 16 20:57 50k
-----------------------------------------------------------------------------------
$ cat combin.awk
# bar_set(num, msg): initialise the progress-bar state for `num` items and
# draw the bar once.
#   num - total number of items that will later be reported through bar()
#   msg - free text appended to the start-up line
# The parameters after `msg` are awk-style locals.  The script exits with
# status 1 when the terminal is narrower than 40 columns.
function bar_set(num, msg,    tput_cmd, cols, len, dot){
    _bar_sec_sys = systime();
    printf("%s start, total items:%s %s\n", strftime("%H:%M:%S"), num, msg);

    _bar_per_all = num;
    # Minimum item distance between two redraws, consulted by bar().
    _bar_stp = 1 + int(num / 20000);

    # Ask the terminal for its width; keep the 80-column default when
    # tput yields nothing or zero.
    tput_cmd = "tput cols";
    tput_cmd | getline cols;
    close(tput_cmd);
    _bar_wid = (cols != 0) ? cols : 80;

    # Output line layout: a 7-character percentage, the bracketed bar and
    # an elapsed-time field.  20 columns are reserved for everything that
    # is not the bar itself, so at least 40 columns are required.
    if (_bar_wid < 40) {
        print "The screen columns is too small(" _bar_wid ")";
        print "Please change it and more than 39" ;
        exit(1);
    }
    _bar_dot_all = _bar_wid - 20;

    # Spinner frames, selected by a counter taken modulo 4.
    _alv[0] = "/";
    _alv[1] = "*";
    _alv[2] = "\\";
    _alv[3] = "|";

    bar_make(1);
}
# bar_make(num): redraw the progress line for item number `num`.
#   Reads:  _bar_per_all (total items), _bar_dot_all (bar width in
#           characters), _bar_sec_sys (start time), _alv[] (spinner frames).
#   Writes: _bar_num, _bar_cnt, _bar_dot, _bar_dot_s, _bar_dot_e, _bar_sec,
#           _bar_sec_str, and one "\r"-prefixed line on standard output.
# The parameters after `num` are awk-style locals.
function bar_make(num,    n, pct, dot, per_str, alv_str, dot_str, sec){
    _bar_num = num;

    # A total of zero would be a fatal division by zero; treat it as
    # "complete".  Clamp overshoot (num larger than the total) so the bar
    # never grows wider than _bar_dot_all.
    pct = (_bar_per_all > 0) ? num / _bar_per_all * 100 : 100;
    if (pct > 100)
        pct = 100;

    # "%%" is the defined way to emit a literal percent sign; a bare "% "
    # at the end of a format is an incomplete conversion specification.
    per_str = sprintf("%5.1f%% ", pct);

    # Arithmetic on per_str uses its leading numeric part, i.e. the
    # percentage already rounded to one decimal.
    dot = int(per_str / 100 * _bar_dot_all);

    # Advance the spinner by one frame on every redraw.
    _bar_cnt = (_bar_cnt + 1) % 4;
    alv_str = _alv[_bar_cnt];

    # Rebuild the filled/empty segments only when the fill length changes.
    if (_bar_num == 1 || dot != _bar_dot) {
        _bar_dot = dot;
        _bar_dot_s = "";
        _bar_dot_e = "";
        for (n = 0; n < dot; ++n)
            _bar_dot_s = _bar_dot_s "=";
        # Starts at dot+1: one column is left for the spinner character.
        for (n = dot + 1; n < _bar_dot_all; ++n)
            _bar_dot_e = _bar_dot_e ".";
    }

    # No spinner once the displayed percentage reaches 100.
    if ((per_str + 0) == 100)
        alv_str = "";
    dot_str = _bar_dot_s alv_str _bar_dot_e;

    # Reformat the elapsed time only when the second counter changes.
    sec = systime() - _bar_sec_sys;
    if (_bar_num == 1 || sec != _bar_sec) {
        _bar_sec = sec;
        if (sec < 3600)
            _bar_sec_str = sprintf("%02d:%02d", int(sec / 60), sec % 60);
        else
            _bar_sec_str = sprintf("%02d:%02d:%02d", int(sec / 3600),
                                   int((sec % 3600) / 60), sec % 60);
    }

    printf("\r%s[%s] %s", per_str, dot_str, _bar_sec_str);
}
# bar(num): report that item `num` has been reached.  The line is redrawn
# only when at least _bar_stp items have passed since the last redraw.
function bar(num){
    if (num - _bar_num >= _bar_stp)
        bar_make(num);
}
# Main program.
# Usage: awk -v out=OUTFILE -f combin.awk A B
# Every line of A is kept in memory.  For each line of B, every stored line
# of A is written to OUTFILE (default "c.txt") with that line of B appended.

# _combin_shquote(s): wrap `s` in single quotes so the shell receives it
# verbatim; embedded single quotes are emitted as "'" between quoted parts.
function _combin_shquote(s,    parts, cnt, i, res){
    cnt = split(s, parts, "\047");
    res = "\047" parts[1] "\047";
    for (i = 2; i <= cnt; i++)
        res = res "\"\047\"" "\047" parts[i] "\047";
    return res;
}

BEGIN{
    total = 1;
}

# First record of every input file: report its line count as given by wc.
FNR == 1 {
    print " Read " FILENAME " ...";
    # The name is quoted and redirected so that file names containing
    # spaces or shell metacharacters are handled, and wc prints only the
    # number.  Note that wc counts newline characters, so a final line
    # without a trailing newline is not included.
    cmd = "wc -l < " _combin_shquote(FILENAME);
    if ((cmd | getline line) <= 0)
        line = 0;
    close(cmd);
    line += 0;
    total *= line;
    print " " FILENAME " has " line " lines";
}

# Records of the first file are stored; a_cnt is the number stored.
FNR == NR {
    a[++a_cnt] = $0;
    next;
}

# First record of a later file: pick the output name and start the bar.
FNR == 1 {
    if (out == "")
        out = "c.txt";
    print " a b combine total:" total " lines";
    # The progress code divides by the item count, so never hand it 0.
    bar_set((line > 0 ? line : 1), ",outfile:" out);
}

{
    bar(FNR);
    # do something by yourself
    # Iterate over exactly the stored lines (not up to NR of the current
    # file start), so extra input files cannot run past the end of a[].
    for (n = 1; n <= a_cnt; ++n)
        print a[n] $0 > out;
}

END{
    # Terminate the "\r"-style progress line.
    print "";
}
|
|