- 论坛徽章:
- 0
|
读取一组数据,大约800W行,写了个脚本 运行速度太慢了,请各为高手指点一下,怎么优化
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- import os
- import sys
- import re
- import argparse
- import gzip
- from Bio import SeqIO
- from Bio.SeqRecord import SeqRecord
- def dict_fa(fa):
- Fa_dict = {}
- for seqrecord in SeqIO.parse(fa,'fastq'):
- Fa_dict[seqrecord.id] = str(seqrecord.seq)
- return Fa_dict
- def read_gz(f):
- with gzip.open(f) as R1:
- for line in R1:
- yield line
- def main():
- parser=argparse.ArgumentParser(description=__doc__,formatter_class=argparse.RawDescriptionHelpFormatter)
- parser.add_argument('-s','--seq',help='fasta file',dest='seq',required=True,type=str)
- args=parser.parse_args()
- fa_dict = dict_fa(args.seq)
- uniq_seq = set(fa_dict.values())
- for seq in uniq_seq:
- seq_count = list(fa_dict.values()).count(seq)
- for id in fa_dict.keys():
- if fa_dict[id] == seq:
- print(id,seq_count)
- if __name__=="__main__":
- main()
复制代码
|
|