- 论坛徽章:
- 8
|
额,不好意思啊~我一学长给了我一个程序,可我一点都不懂python,只能大概明白它的意思,如果有大神愿意感兴趣的话,能不能试着把下面的这个python程序改成兼容性更好的perl呢,坐等大神出现!!!
p.s:也在perl区提出了,求关注~
下面是代码:- #usage: python cds_abs.py input_file output_file
- # the input file contains 2 columns
- #eg;
- #NM_000497.0 c.595+16G>T
- #
- import re
- import sys
- def usage():
- print '''
- python cds_abs.py input_file output_file
- '''
- if len(sys.argv)!=3:
- sys.exit(usage())
- import gzip
- fin=sys.argv[1]
- fout=sys.argv[2]
- f=open(fin,'r')
- o=open(fout,'w')
- ######
- def position(nm,ps):
- global ore
- cds=gzip.open(‘hg19.dbref.gz')
- # cds=open('hg19.dbref.sort')
- ps = int(ps)
- while 1:
- line=cds.readline()
- if not line:
- break
- nm1=re.search("NM_(\d+)",line.split('\t')[0])
- if not nm1:
- continue
- else:
- flag=line.split('\t')[2]
- global chr
- chr = line.split('\t')[1]
- nm1=nm1.group(1)
- if nm==nm1:
- while 1:
- pos=cds.readline()
- if not re.match("^C\d+",pos):
- continue
- else:
- cdspos=pos.split('\t')[1:3]
- if flag == '+':
- ore = '+'
- for num in range(int(cdspos[0]),int(cdspos[1])+1):
- ps-=1
- if ps==0:
- return (num,flag)
- else:
- ore = '-'
- for num in range(int(cdspos[1]),int(cdspos[0])-1,-1):
- ps-=1
- if ps==0:
- return (num,flag)
- cds.close()
- #########
- def result(nm,ps):
- fu = re.search("^-(\d+)",ps)
- if fu:
- # print fu.group(1)
- (res,flag)=position(nm,'1')
- if ore == '+':
- return (res-int(fu.group(1)))
- else:
- return (res+int(fu.group(1)))
- he=re.search("(\d+)\-(\d+)",ps)
- if he:
- (res,flag)=position(nm,he.group(1))
- if flag == '+':
- return (res-int(he.group(2)))
- else:
- return (res+int(he.group(2)))
- else:
- hehe=re.search("(\d+)\+(\d+)",ps)
- if hehe:
- (res,flag)=position(nm,hehe.group(1))
- if flag == '+':
- return res+int(hehe.group(2)),
- else:
- return res-int(hehe.group(2))
- else:
- (res,flag)=position(nm,ps)
- return res
- #res=position("000015",20)
- #print res
- ####
- #res=result("014020","205")
- #print res
- ###
- ########### for the finding of mutation in 3'UTR
- def utr3(nm, ps):
- global ore
- ps=int(ps)
- cds=gzip.open('hg19.dbref.gz')
- while 1:
- line=cds.readline()
- if not line:
- break
- nm1=re.search("NM_(\d+)",line.split('\t')[0])
- if not nm1:
- continue
- nm1 = nm1.group(1)
- if nm==nm1:
- global chr
- chr = line.split('\t')[1]
- flag = line.split('\t')[2]
- ut1 = line.split('\t')[5]
- ut2 = line.split('\t')[6]
- while 1:
- pos = cds.readline()
- if not re.match("^3U\d+",pos):
- continue
- utrpos = pos.split('\t')[1:3]
- if flag == '+':
- ore = '+'
- for num in range(int(utrpos[0]),int(utrpos[1])+1):
- ps -= 1
- if ps == 0:
- return num
- else:
- ore = '-'
- for num in range(int(utrpos[1]), int(utrpos[0])-1,-1):
- ps -= 1
- if ps == 0:
- return num
- for line in f.readlines():
- word=line.split('\t')[0:2]
- nm=re.search("NM_(\d+)",word[0])
- if nm:
- nm=nm.group(1)
- else:
- o.write("-\t-\t-\t-\t-\n")
- continue
- mut=re.search("c\.([0-9|\-|\+]+)_([0-9|\-|\+]+)",word[1])
- if mut:
- res=str(result(nm,mut.group(1)))
- res+='..'+str(result(nm,mut.group(2)))
- else:
- mut=re.search("c\.([0-9|\-|\+]+)",word[1])
- if mut:
- res=result(nm,mut.group(1))
- else:
- mut=re.search("c\.(\d+)",word[1])
- if mut:
- res=positon(nm,mut.group(1))
- else:
- ##### c.*111A>G : the 3'UTR region
- utr = re.search("c\.\*(\d+)",word[1])
- if utr:
- res = utr3(nm, utr.group(1))
- else:
- res="ERR"
- o.write(chr+"\t"+str(res)+'\t'+ore+"\t"+line)
- # delpos=re.search("(\d+)_(\d+)",word[1])
- # delpos=delpos.group(1)
- # a=delpos.group(1)
- # b=delpos.group(2)
- # res=position(nm,a)
- # res+=position(nm,b)
- # print res
复制代码 据说这个程序的逻辑是比较差的,遇到错误不能跳过,效率上也不怎么好。不过实现起来非常好用~
大神们,来吧来吧,嘻嘻,谢谢 |
|