- 论坛徽章:
- 0
|
本帖最后由 picbhan 于 2013-03-28 19:58 编辑
- #!/usr/bin/perl
- use strict;
- use warnings;
- my (%ref, %data);
- open my $in , '<', 'refGene.txt'
- or die "Can't open 'refGene.txt' for reading. $!\n";
- while (<$in>) {
- chomp;
- my @t = split;
- push @{$ref{$t[1]}}, [@t];
- }
- close $in;
- open $in , '<', 'lncipedia_location.txt'
- or die "Can't open 'lncipedia_location.txt' for reading. $!\n";
- while (<$in>) {
- chomp;
- my @t = split;
- push @{$data{$t[1]}}, [@t];
- }
- close $in;
- open my $ref_include_inc , '>', 'ref_include_inc.txt'
- or die "Can't create 'ref_include_inc.txt' for writing. $!\n";
- open my $inc_include_ref , '>', 'inc_include_ref.txt'
- or die "Can't create 'inc_include_ref.txt' for writing. $!\n";
- open my $non_include , '>', 'non_include.txt'
- or die "Can't create 'non_include.txt' for writing. $!\n";
- open my $overlap , '>', 'overlap.txt'
- or die "Can't create 'overlap.txt' for writing. $!\n";
- open my $to , '>', 'ref_sorted.txt' or die "Can't create for writing. $!\n";
- while (my ($chr, $v) = each %ref) {
- # sort region using start sites, if equal, using end sites
- my @ref_data = sort {$a->[2] <=> $b->[2] or $a->[3] <=> $b->[3]} @$v;
- # output sorted refGenes
- print {$to} join("\t", @$_) . "\n" for @ref_data;
- # if no such chromosome in inc, ignore and try next chr
- next unless exists $data{$chr};
- # sort region using start sites, if equal, using end sites
- my @inc_data = sort {$a->[2] <=> $b->[2] or $a->[3] <=> $b->[3]} @{$data{$chr}};
- my @store; # use to store some ref regions, since inc regions may overlapped
- for (@inc_data) {
- my @id = @$_;
- if (@ref_data) {
- my $result; # one of ref_include_inc, inc_include_ref, overlap or undef
- while (@ref_data) {
- my @rd = @{+shift @ref_data};
- # igore genes on the left side of inc region
- next if $rd[3] < $id[2];
- # record each overlapped genes
- push @store, [@rd];
- # stop when this gene is on the right side of inc region
- last if $rd[2] > $id[3];
- # this gene must overlap with this inc region
- if ($id[2] >= $rd[2] && $id[3] <= $rd[3]) {
- $result = 'ref_include_inc';
- }
- elsif ($rd[2] >= $id[2] && $rd[3] <= $id[3]) {
- $result = 'inc_include_ref';
- }
- else {
- $result = 'overlap';
- }
- }
- # output result
- if ($result) {
- if ($result eq 'ref_include_inc') {
- print {$ref_include_inc} join("\t", @id), "\n";
- }
- elsif ($result eq 'inc_include_ref') {
- print {$inc_include_ref} join("\t", @id), "\n";
- }
- else {
- print {$overlap} join("\t", @id), "\n";
- }
- }
- else { # if $result is undef, means no overlapped genes
- print {$non_include} join("\t", @id), "\n";
- }
-
- # put this genes back into @ref_data, use to check next inc region
- unshift @ref_data, @store;
- @store = ();
- }
- else {
- # no regions in ref left, print all regions left in inc to non_include
- print {$non_include} join("\t", @id), "\n";
- next;
- }
- }
- delete $data{$chr};
- }
- while (my ($chr, $v) = each %data) {
- my @inc_data = @$v;
- for (@inc_data) {
- print {$non_include} join("\t", @$_), "\n";
- }
- }
- close $to;
- close $ref_include_inc;
- close $inc_include_ref;
- close $non_include;
- close $overlap;
复制代码 |
|