求教字符串快速比较问题 - Chinaunix

#coding=utf-8
import os
import sys
class cmp:
    """Compare the name/value records stored in ``a.txt`` and ``b.txt``.

    Each input file is read as alternating non-blank lines: a record name
    followed by that record's value. ``run()`` reports, for every record of
    ``a.txt``, whether ``b.txt`` has the same name and the same value.
    """

    # Class-level fallbacks; __init__ rebinds both on every instance.
    adict = {}
    bdict = {}

    def __init__(self):
        # loadFile() signals failure with False. Fall back to an empty dict
        # so run() can always iterate and do membership tests instead of
        # crashing on a bool.
        self.adict = self.loadFile('a.txt') or {}
        self.bdict = self.loadFile('b.txt') or {}

    def loadFile(self, path):
        """Read *path* and return a dict mapping record names to values.

        Blank lines are dropped first, then the remaining lines are paired
        in order (name, value, name, value, ...). Pairing after the blank
        lines are removed keeps names and values aligned even when a blank
        line appears in the middle of the file. A trailing name without a
        value is ignored.

        Returns False when *path* is empty, does not exist, cannot be read,
        or the file contains no lines at all.
        """
        if not path or not os.path.exists(path):
            return False
        try:
            # The context manager closes the file even if reading fails.
            with open(path, 'r') as fp:
                lines = [line.strip() for line in fp]
        except (IOError, OSError, UnicodeError):
            return False
        if not lines:
            return False
        records = [line for line in lines if line]
        return dict(zip(records[0::2], records[1::2]))

    def run(self):
        """Print one status line for every record loaded from ``a.txt``."""
        for key, val in self.adict.items():
            if key not in self.bdict:
                print('key %s not exists in b.txt' % key)
            elif self.bdict[key] == val:
                print('key %s eqal in a.txt and b.txt' % key)
            else:
                print('key %s not eqal in a.txt and b.txt' % key)

    def __del__(self):
        # Drop the references to the loaded tables when the object goes away.
        self.adict = {}
        self.bdict = {}
if __name__ == '__main__':
    # Script entry point: load a.txt / b.txt and print the comparison report.
    comparer = cmp()
    comparer.run()

复制代码

D:\备份\cmp>python cmp.py
key @080404_HWI-EAS121_0001FC209NNAAXX_4_1_905_794 eqal in a.txt and b.txt
key @080404_HWI-EAS121_0001FC209NNAAXX_4_3_905_794 not exists in b.txt
key @080404_HWI-EAS121_0001FC209NNAAXX_4_1_931_319 not eqal in a.txt and b.txt
key @080404_HWI-EAS121_0001FC209NNAAXX_4_1_895_807 not eqal in a.txt and b.txt
key @080404_HWI-EAS121_0001FC209NNAAXX_4_1_906_759 eqal in a.txt and b.txt
D:\备份\cmp>

复制代码

from itertools import izip
def compare(a, b):
    """Return True when *a* and *b* differ at exactly one paired position.

    Positions are paired with ``izip``, so only the common prefix of the two
    sequences is examined. Zero mismatches, or two or more, give False.
    """
    mismatches = 0
    for left, right in izip(a, b):
        if left == right:
            continue
        mismatches += 1
        if mismatches == 2:
            # A second difference settles the answer; stop scanning.
            return False
    return mismatches == 1

复制代码

def compare(a, b):
    """Return the single differing pair of *a* and *b*, or None.

    Positions are paired with ``izip``. When exactly one pair differs, the
    result is the tuple ``(item_from_a, item_from_b)`` for that position.
    With no difference, or with two or more, the result is None.
    """
    found = None
    for pair in izip(a, b):
        if pair[0] != pair[1]:
            if found is not None:
                # Second difference: more than one mismatch, give up early.
                return None
            found = pair
    return found

复制代码

def cmp(ss, ds, n=31):
    """Return True when *ss* and *ds* differ at exactly one position.

    Only the first *n* positions are examined. *n* defaults to 31, the
    length that was previously hard-coded, so existing two-argument calls
    behave as before.

    Returns False when the strings are identical over those positions or
    when they differ at two or more of them.
    """
    mismatches = 0
    for i in range(n):
        # One-character slices are used instead of indexing: a position past
        # the end of a string yields '' rather than raising IndexError, so a
        # missing character counts as a mismatch against a present one.
        if ss[i:i + 1] != ds[i:i + 1]:
            mismatches += 1
            if mismatches > 1:
                return False
    return mismatches == 1

复制代码

# Two identical 31-character sample sequences used by the timing loop.
s1 = 'AATTCTAACATGAAAGTAGGAAAGATGTCAC'
s2 = 'AATTCTAACATGAAAGTAGGAAAGATGTCAC'


def cmp(ss, ds, n=31):
    """Return True when *ss* and *ds* differ at exactly one position.

    Only the first *n* positions are examined. *n* defaults to 31, the
    length that was previously hard-coded, so existing two-argument calls
    behave as before.

    Returns False when the strings are identical over those positions or
    when they differ at two or more of them.
    """
    mismatches = 0
    for i in range(n):
        # One-character slices are used instead of indexing: a position past
        # the end of a string yields '' rather than raising IndexError, so a
        # missing character counts as a mismatch against a present one.
        if ss[i:i + 1] != ds[i:i + 1]:
            mismatches += 1
            if mismatches > 1:
                return False
    return mismatches == 1
# Timing loop: run the comparison one million times, printing each result.
count = 1000000
while count > 0:
    print(cmp(s1, s2))
    count -= 1

复制代码

回复 hulnglei
如果你希望在100万个和100万个之间任何一对进行比较的话，需要考虑一下算法。例如可以将其中的一组建立一棵树，字符串的每个字符是一个节点，它的子节点是下一个字符，因此树的深度是字符串的长度。然后在此树中对另外一组的每个字符串进行搜索，当发现有两个节点不同时就可以停止搜索。这时可以省略后续所有节点的搜索，应该可以成数量级的加快计算速度。

如果你的字符串中的字符只有ATCG四个字符的话，并且长度是31的话，可以考虑使用两个32bit的长整型数保存其状态。这样可以通过异或位操作快速比较，当然还需要一个数比特1个数的函数，这些在C语言级别做的话应该是非常快的。
HyryStudio 发表于 2010-06-02 20:52

/* Fragment of a character-by-character string comparison. The enclosing
 * function and the declarations of s1 and s2 are not shown; presumably both
 * point into NUL-terminated strings -- TODO confirm. */
/* Advance both pointers while the current characters are equal and s1 has
 * not reached a 0 character. */
while (*s1 != 0 && *s1 == *s2)
s1++, s2++;
/* If either side is at a 0 character, return the difference of the two
 * current characters converted to unsigned char. */
if (*s1 == 0 || *s2 == 0)
return (unsigned char) *s1 - (unsigned char) *s2;
/* Otherwise return the difference of the first mismatching characters in
 * their declared element type. */
return *s1 - *s2;

复制代码

extern inline int strcmp(const char * cs,const char * ct)
{
register int __res __asm__("ax");
__asm__("cld\n"
"1:\tlodsb\n\t"
"scasb\n\t"
"jne 2f\n\t"
"testb %%al,%%al\n\t"
"jne 1b\n\t"
"xorl %%eax,%%eax\n\t"
"jmp 3f\n"
"2:\tmovl $1,%%eax\n\t"
"jb 3f\n\t"
"negl %%eax\n"
"3:"
:"=a" (__res):"D" (cs),"S" (ct):"si","di");
return __res;
}

复制代码

# Sample 31-character sequence and a lookup table that contains only it.
s = 'AATTCTAACATGAAAGTAGGAAAGATGTCAC'
d = {'AATTCTAACATGAAAGTAGGAAAGATGTCAC': 0}


def genstr(s, d):
    """Probe *d* for every single-character variant of *s*.

    For each of the first 31 positions, working from the last one down to
    the first, the character is replaced in turn by each letter of 'ATCG'
    that differs from the current one, and the resulting string is looked up
    in *d*. The lookup results are discarded and the function returns None;
    it only exercises the cost of the membership tests.
    """
    n = 31
    while n > 0:
        current = s[n - 1:n]
        for base in 'ATCG':
            if base != current:
                # `in` replaces dict.has_key(), which no longer exists in
                # Python 3 and is the slower spelling on Python 2.
                (s[0:n - 1] + base + s[n:]) in d
        n -= 1
# Timing loop: print the remaining iteration count, then probe all variants.
t = 1800000
while t != 0:
    print(t)
    genstr(s, d)
    t -= 1

复制代码

# Fragment of a method body (the enclosing def is not shown) that writes
# matches to out_3snp.txt. For every value in self.adict that is not itself a
# key of self.bdict, it substitutes one, two or three characters at positions
# 5..30 and reports variants that are keys of self.bdict but not of
# self.cdict.
# NOTE(review): the original indentation was lost, so the exact nesting of
# the loops, `break` statements and `else` branches cannot be confirmed from
# this text.
# NOTE(review): vala[i + 32] is read next to vala[i]; presumably vala holds a
# 31-character sequence followed by per-position data at offset 32 -- verify
# against the code that builds self.adict.
fo = file("out_3snp.txt",'w')
for keya,vala in self.adict.items():
# Values that already occur as a key of bdict need no substitution search.
if vala in self.bdict:
continue
else:
# First substitution: position i1, replacement character j1.
for i1 in range(5, 31):
for j1 in 'ACGT':
if vala[i1] == j1:
continue
else:
# b1 = first 31 characters of vala with position i1 replaced by j1.
b1= vala[0:i1]+j1+vala[i1+1:31]
for i2 in range(5,31):
if i2==i1:
# Single-substitution candidate: report it when it is in bdict and not in cdict.
if (b1 not in self.cdict) and (b1 in self.bdict):
print >> fo, keya, self.bdict[b1][0], vala[:31], i1+1, vala[i1+32]
break
else:
# Second substitution: position i2, replacement character j2, applied to b1.
for j2 in 'ATCG':
if vala[i2] == j2:
continue
else:
b2 = b1[0:i2]+j2+b1[i2+1:31]
for i3 in range(5, 31):
if i3==i1 or i3==i2:
# Double-substitution candidate: report it when it is in bdict and not in cdict.
if (b2 not in self.cdict) and (b2 in self.bdict):
# NOTE(review): the print statement below ends with a comma and its argument
# list appears to continue on the following line; this looks like a wrapped
# line -- confirm before running.
print >> fo, keya, self.bdict[b2][0], vala[:31], i1+1, vala[i1], j1, vala[i1+32],
i2+1, vala[i2], j2, vala[i2+32]
break
else:
# Third substitution: position i3, replacement character j3, applied to b2.
for j3 in 'ACGT':
if vala[i3] == j3:
continue
else:
b3 = b2[0:i3]+j3+b2[i3+1:31]
if (b3 not in self.cdict) and (b3 in self.bdict):
print >> fo, keya, self.bdict[b3][0], vala[:31], i1+1, vala[i1], j1, vala[i1+32], i2+1, vala[i2], j2, vala[i2+32], i3+1, vala[i3], j3, vala[i3+32]
-- INSERT --

复制代码