- 论坛徽章:
- 0
|
本帖最后由 hulnglei 于 2010-06-01 16:24 编辑
回复 2# t6760915
确实如此,我也只是估计!
两个输入文件都是这样格式:都是大概80M大小
@080404_HWI-EAS121_0001FC209NNAAXX_4_1_931_319
AATTCTAACATGAAAGTAGGAAAGATGTCAC
@080404_HWI-EAS121_0001FC209NNAAXX_4_1_906_759
AATTCACAGCATTGCAGTAGGTTTTTTATCT
@080404_HWI-EAS121_0001FC209NNAAXX_4_1_895_807
AATTCAACAACAGGACATTAATTGGAAACGG
@080404_HWI-EAS121_0001FC209NNAAXX_4_1_905_794
AATTCAGAGCGCCACCCCATGGTGCCTCCTG
上面是字符串名字,下面行就是字符串,我就是整个比较字符串
我的脚本:
#!/usr/bin/python
# Filename: read the fastq file from p1 and p1 , conpare the each two sequences# to find snp. I think if there is a snp, the nulcleotide among it is the same
import sys
class Fastq:
def __init__(self, name, seq):
self.name = name
self.seq = seq
def get_fastq(fh):
""" read a fastq entry from a file handle """
header = fh.readline()
# eof detection
if not header:
return None
# no fastq format
if header[0] != '@':
return None
seq = ""
line = fh.readline()
seq = line[:-1]
line = fh.readline()
while line:
if line[0] == '@':
fh.seek(-len(line), 1)
break
line = fh.readline()
return Fastq(header[1:-1], seq)
fip1 = open("seqp1.fasta")
fo = file("outsnp.txt",'w')
entryp1 = get_fastq(fip1)
while entryp1:
#... do what you have to do
fip2 = open("seqp2.fasta")
entryp2 = get_fastq(fip2)
while entryp2:
snplist = []
count = 0
for i in range(5, len(entryp1.seq)):
if entryp1.seq!=entryp2.seq:
count = count + 1
i= i+ 1
if count<2:
for j in range(5, len(entryp1.seq)):
if entryp1.seq[j]!=entryp2.seq[j]:
snplist.append(j)
snplist.append(entryp1.seq[j])
snplist.append(entryp2.seq[j])
j=j+1
print >> fo, entryp1.name, entryp2.name, snplist
entryp2 = get_fastq(fip2)
fip2.close()
print entryp1.name
entryp1 = get_fastq(fip1)
fip1.close()
fo.close() |
|