本帖最后由 yinyuemi 于 2011-09-02 09:24 编辑
- for((i=1;i<=1000;i++)); do echo '02,10,11,18,27,30
- 06,09,11,14,20,31
- 02,10,11,18,27,33
- 01,03,04,16,18,22
- 07,14,22,27,28,32
- 02,10,11,16,18,27
- 04,05,09,15,21,30
- 12,13,17,20,22,23
- 04,05,09,15,21,30' >>testfile; done
- wc -l testfile
- 9000 testfile
- 4#:
- time awk -F, '{s=0;m=1;for(i=1;i<=NF;i++)s+=$i^3;for(i=1;i<=NF;i++)m*=(!a[s-$i^3]++);if(m)print}' testfile
- 02,10,11,18,27,30
- 06,09,11,14,20,31
- 01,03,04,16,18,22
- 07,14,22,27,28,32
- 04,05,09,15,21,30
- 12,13,17,20,22,23
- real 0m0.154s
- user 0m0.154s
- sys 0m0.000s
- 10#:(ctrl-c 终止程序)
- time awk -F, '{T="";for(n=0;n++<NF;){T=T","a[$n];a[$n]=a[$n]","NR};split(T,t,",");f=0;for(n in t){if(t[n]=="")continue;if(gsub(t[n],"",T)>=5)f=1};if(f==0)print $0}' testfile
- 02,10,11,18,27,30
- 06,09,11,14,20,31
- 01,03,04,16,18,22
- 07,14,22,27,28,32
- 04,05,09,15,21,30
- 12,13,17,20,22,23
- real 10m20.540s
- user 10m18.173s
- sys 0m0.982s
- 12#:
- time awk -F, '{for(i=1;i<=l;i++){s=0;for(j=1;j<=NF;j++)if(index(a[i],$j))s++;if(s>=5)next}a[++l]=$0}1' testfile
- 02,10,11,18,27,30
- 06,09,11,14,20,31
- 01,03,04,16,18,22
- 07,14,22,27,28,32
- 04,05,09,15,21,30
- 12,13,17,20,22,23
- real 0m0.128s
- user 0m0.127s
- sys 0m0.001s
- 21#:
- time awk '
- function f(a,b){return "#"gensub(",","#"b"#","g",a)"#";}
- {e=0
- {for(i=1;i<NR;i++)
- if(split(f(a[i],""),x,f($0,"|"))>=6){e=1;break}
- }
- if(!e){a[NR]=$0;print $0}
- }' testfile
- 02,10,11,18,27,30
- 06,09,11,14,20,31
- 01,03,04,16,18,22
- 07,14,22,27,28,32
- 04,05,09,15,21,30
- 12,13,17,20,22,23
- real 0m0.841s
- user 0m0.837s
- sys 0m0.001s
- 4# waker兄的代码效率很高,不过只适用于数值型数据(它用各字段的立方 $i^3 求和来构造判重的键)
- 10# 通用性好,不过效率上要差很多(上面的测试运行了10分钟以上,最后用ctrl-c终止)
- 12# 如Tim兄所言,前提是数值都是2位的(index()做的是子串匹配,字段位数不一致时可能误判),在此前提下效率很高
- 21# 通用性上稍差,如果是文本去重复的,且文本中包含正则符号或","或"#",可能会有问题(10#的代码使用gsub可能也有类似问题,没测试),效率上比4#和12#的要差
复制代码 |