- 论坛徽章:
- 0
|
本帖最后由 321wangke321 于 2014-09-11 15:01 编辑
use strict;
use warnings;
use common;
###perl get_promoter.pl F:\perl software\promoter extraction\Prunus_persica_v1.0_scaffolds gene.fa F:\perl software\promoter extraction\glycolysis.txt Prunus_persica_v1.0_scaffolds.fa
open GFF,"$ARGV[0]";
my %hash_gene=();
while(<GFF>)
{
chomp;
next unless /gene/;
my ($gene_name)=(split(/\t/,$_))[9-1];
$gene_name=~/ID=(.*?);/;
$gene_name=$1;
$hash_gene{$gene_name}=$_;
}
my %genome_hash=&create_hash_($ARGV[2]);
open TF,$ARGV[1];
while(<TF>)
{
chomp;
my ($id,$start,$strand)=(split(/\t/,$hash_gene{$_}))[1-1,4-1,7-1];
&get_genome_seq(\%genome_hash,$id,$strand,$start-1000,$start+200,$_."promoter");
}
sub get_genome_seq
{
my($genome_hash,$id,$strand,$start,$end,$fasta_title)=@_;
if(defined($fasta_title))
{
if($strand eq "+")
{
print ">",$fasta_title,"\n";
print substr($$genome_hash{$id},$start-1,$end-$start+1),"\n";
}
else
{
print ">",$fasta_title,"\n";
print &reverse_com(substr($$genome_hash{$id},$start-1,$end-$start+1)),"\n";
}
}
else
{
if($strand eq "+")
{
print ">",$id,"_",$start,"_",$end,"_",$strand,"\n";
print substr($$genome_hash{$id},$start-1,$end-$start+1),"\n";
}
else
{
print ">",$id,"_",$start,"_",$end,"_",$strand,"\n";
print &reverse_com(substr($$genome_hash{$id},$start-1,$end-$start+1)),"\n";
}
}
}
sub create_hash_
{
my ($file)=@_;
my %hash=();
open FILE, $file;
my $title;
while(<FILE>)
{
chomp;
if(/>(.*)/)
{
$title=$1;
}
else
{
$hash{$title}.=$_;
}
}
close FILE;
return %hash;
}
sub reverse_com
{
my ($seq)=@_;
$seq= reverse $seq;
$seq=~tr/ATCGatcg/TAGCtagc/;
return $seq;
}
程序的目的是:根据queryID文件的ID(如ppa000003m.g)找到gene annotation的左边对应列数字(31458157),分别加-1000,加200后搜genome sequence,得到序列。这是找人帮忙写的,但是在他电脑上能运行,但是在我电脑就不能正常了。
附件是运行的脚本和文件。
麻烦帮忙看看,我是新手。谢谢。
|
|