- 论坛徽章:
- 8
|
本帖最后由 huang6894 于 2015-01-26 13:20 编辑
回复 2# 1002279625
- my($fa, $seq) = @ARGV;
- my @all_seq;
- open IN,"< $seq" or die "$!";
- while(<IN>){
- chomp;
- if(/^\[([ATCG]+)\]([ATCG]+)$/){
- map{push @all_seq,$_.$2}(split //,$1);
- }elsif(/^([ATCG]+)\[([ATCG]+)\]([ATCG]+)$/){
- map{push @all_seq,$1.$_.$3}(split //,$2);
- }elsif(/^([ATCG]+)\[([ATCG]+)\]$/){
- map{push @all_seq,$1.$_}(split //,$2);
- }else{
- push @all_seq,$_;
- }
- }
- close IN;
- $/ = '>';
- open IN1,"< $fa" or die "$!";
- <IN1>;
- while(<IN1>){
- chomp;
- my ($key, @seq) = split /\n/;
- my ($gene) = $key =~/^([^\|]+)(\s+)?|/;
- my $s = join("",@seq);
- foreach my $key2(@all_seq){
- my ($pos, $now) = (0, -1);
- until ( $pos eq -1 ) {
- $pos = index( $s, $key2, $now + 1 );
- $now = $pos;
- print $gene."\t".$pos."\t".$key2."\n" unless $pos < 0;
- }
- }
- }
- close IN2;
- $/ = "\n";
复制代码 结果:- AT1G50920.1 0 GGTTGGGTC
- AT1G50920.1 31 GGTTGGGTC
- AT1G15970.1 112 GCTTGGGTC
- AT1G75120.1 15 CGTTGGGTC
- AT1G75120.1 36 GGGTGGGTC
复制代码 如果数据量小也就罢了,太大,可以考虑用kmp算法,或者直接blat |
|