- 论坛徽章:
- 5
|
{:2_172:}大神! 这个, 可能会快一点. 但是内存还是占用大了。而且,更大了。{:2_172:}{:2_172:}
不花内存, 内存小一点
scaffold16, scaffold35.... 存成一个, 一个文档
这样, 慢了. 非常久的时间- #!/usr/bin/perl
- die "Usage: perl $0 gff fq\n" unless @ARGV == 2;
- #@ARGV = qw/cd sca/;
- my ( $txt, $fIn ) = @ARGV;
- my %code = (
- 'TCA' => 'S', # Serine
- 'TCC' => 'S', # Serine
- 'TCG' => 'S', # Serine
- 'TCT' => 'S', # Serine
- 'TTC' => 'F', # Phenylalanine
- 'TTT' => 'F', # Phenylalanine
- 'TTA' => 'L', # Leucine
- 'TTG' => 'L', # Leucine
- 'TAC' => 'Y', # Tyrosine
- 'TAT' => 'Y', # Tyrosine
- 'TAA' => '_', # Stop
- 'TAG' => '_', # Stop
- 'TGC' => 'C', # Cysteine
- 'TGT' => 'C', # Cysteine
- 'TGA' => '_', # Stop
- 'TGG' => 'W', # Tryptophan
- 'CTA' => 'L', # Leucine
- 'CTC' => 'L', # Leucine
- 'CTG' => 'L', # Leucine
- 'CTT' => 'L', # Leucine
- 'CCA' => 'P', # Proline
- 'CCC' => 'P', # Proline
- 'CCG' => 'P', # Proline
- 'CCT' => 'P', # Proline
- 'CAC' => 'H', # Histidine
- 'CAT' => 'H', # Histidine
- 'CAA' => 'Q', # Glutamine
- 'CAG' => 'Q', # Glutamine
- 'CGA' => 'R', # Arginine
- 'CGC' => 'R', # Arginine
- 'CGG' => 'R', # Arginine
- 'CGT' => 'R', # Arginine
- 'ATA' => 'I', # Isoleucine
- 'ATC' => 'I', # Isoleucine
- 'ATT' => 'I', # Isoleucine
- 'ATG' => 'M', # Methionine
- 'ACA' => 'T', # Threonine
- 'ACC' => 'T', # Threonine
- 'ACG' => 'T', # Threonine
- 'ACT' => 'T', # Threonine
- 'AAC' => 'N', # Asparagine
- 'AAT' => 'N', # Asparagine
- 'AAA' => 'K', # Lysine
- 'AAG' => 'K', # Lysine
- 'AGC' => 'S', # Serine
- 'AGT' => 'S', # Serine
- 'AGA' => 'R', # Arginine
- 'AGG' => 'R', # Arginine
- 'GTA' => 'V', # Valine
- 'GTC' => 'V', # Valine
- 'GTG' => 'V', # Valine
- 'GTT' => 'V', # Valine
- 'GCA' => 'A', # Alanine
- 'GCC' => 'A', # Alanine
- 'GCG' => 'A', # Alanine
- 'GCT' => 'A', # Alanine
- 'GAC' => 'D', # Aspartic Acid
- 'GAT' => 'D', # Aspartic Acid
- 'GAA' => 'E', # Glutamic Acid
- 'GAG' => 'E', # Glutamic Acid
- 'GGA' => 'G', # Glycine
- 'GGC' => 'G', # Glycine
- 'GGG' => 'G', # Glycine
- 'GGT' => 'G', # Glycine
- );
- open( SCA, $fIn ) or die $!;
- my ( $key, %seqId );
- while (<SCA>) {
- chomp;
- /^>/ ? ( ($key) = /^>(\w+)/ ) : $seqId{$key} .= $_;
- }
- close SCA;
- open TXT, $txt or die $!;
- open OUT, '>', 'Sequence_extraction_part.fa' or die "don't worry";
- open AA, '>', 'aa_extraction_part.fa' or die "be happy";
- while (<TXT>) {
- my ( $chr, $s, $e, $p ) = ( split /\s+/, $_, 6 )[ 0, 3, 4, 2 ];
- next if $p ne 'CDS';
- my $l = $e - $s + 1;
- extract( $chr, $s, $l, $_ );
- }
- sub extract {
- my ( $SequenceId, $start, $len, $m ) = @_;
- unless ( exists $seqId{$SequenceId} ) {
- print "$SequenceId\t$start\t$start+$len\terro\n";
- return;
- }
- my $Seq = substr $seqId{$SequenceId}, $start - 1, $len;
- my $aa;
- while ( $Seq =~ /(.{3})/g ) {
- $aa .= $code{$1} || bad_codon($1);
- }
- my ( @OUT, @AA ); # cache 快了, 占内存
- my ( $lenseq, $lenaa ) = ( length($Seq), length($aa) );
- for ( my $j = 0 ; $j * 60 < $lenseq ; $j++ ) {
- push @OUT, substr( $Seq, $j * 60, 60 ), "\n";
- }
- for ( my $i = 0 ; $i * 60 < $lenaa ; $i++ ) {
- push @AA, substr( $aa, $i * 60, 60 ), "\n";
- }
- print OUT ">$m", @OUT;
- print AA ">$m", @AA;
- }
- close OUT;
- close AA;
- sub bad_codon {
- my $bad = shift;
- print qq[Bad codon "$bad"!!\n];
- exit;
- }
- __DATA__
复制代码 |
|