回复 16# expert1
# awk '
# Merge overlapping or directly adjacent regions that share the same ID ($5)
# and print each merged region as: >ID_E<count> $6 $1 $4 start end
# Fields used: $2 = start, $3 = end, $5 = ID; $1, $4 and $6 are passed through.
# Requires gawk (asorti).
{
    # Key = ID plus zero-padded start; sorting the keys as strings is meant
    # to group the records by ID and order them by start within one ID.
    k = sprintf("%s_%020d", $5, $2)        # for key
    # Two records with the same ID and start share one key; keep the one
    # with the larger end instead of letting the later record overwrite it.
    if ((k in c) && a[k,3]+0 >= $3+0) next
    for (n = 0; n++ < 6;) a[k,n] = $n      # store fields 1..6 under the key
    c[k]                                   # register the key for sorting
}
END {
    $5 = ""                                # so the first pass prints nothing
    t = asorti(c, s)                       # sort all data
    # Run one step past the last key: s[t+1] is empty, so the extra pass
    # prints the record that is still pending.
    for (n = 0; n++ < t+1;) {
        N = s[n]
        if ($5==a[N,5] && $3+1>=a[N,2]) {  # check overlay area
            a[N,2] = $2                    # keep the minimum value (start)
            if ($3+0 > a[N,3]+0)
                a[N,3] = $3                # keep the maximum value (end)
        }
        else if ($5)
            print ">"$5"_E"++C,$6,$1,$4,$2,$3;
        if ($5 != a[N,5]) C = 0            # reset count
        for (m = 0; m++ < 6;)              # save last data in $1, $2, $3, ...
            $m = a[N,m]
    }
}' FILE
for example
sorted data
refseq1 860921 861380 + NM_152486 SAMD11 <== last , $1,$2,...
refseq1 861102 861593 + NM_152486 SAMD11 <== Now , a[N,1], a[N,2],...
if($5==a[N,5]&&$3+1>=a[N,2])
$5==a[N,5] is true: both records have the same $5 (NM_152486)
$3+1>=a[N,2] is true: 861380+1 >= 861102
so the end of the last record reaches the start of the next one (Now), i.e. the two regions overlap,
and the start value of the "last" record is kept
by a[N,2]=$2;
Note: a[N,2]=$2 on its own is a bug: $3 is not checked for the maximum value, so the larger end of the two records has to be kept as well.
refseq1 879088 880161 + NM_152486 SAMD11 <== last , $1,$2,...
refseq1 935046 935752 - NM_021170 HES4 <== Now , a[N,1], a[N,2],...
if($5==a[N,5]&&$3+1>=a[N,2])
$5==a[N,5] is false,
so the "last" record is output together with its count
else if($5)
print ">"$5"_E"++C,$6,$1,$4,$2,$3;
the $5 values are different, so the count is reset
if($5!=a[N,5]) C=0; # reset count
|