#!/usr/local/bin/perl -w # # localitzador # # filtering snps overlapping genomic features # # USAGE: # localitzador exp_snps.txt exp_exon.gff > exp_snp_seq.tbl 2> exp_snp_seq.err # # jplanas/bmorancho - UPF - 2002/02/28 # use strict; # use Data::Dumper; # # Setting VARS my (%SNPS,%COORDS,%snps_in_gene,$snp_file,$seq_file); my $PROG = "$0"; %SNPS = (); %COORDS = (); my $donor_len = 3; my $acceptor_len = 3; # # MAIN &parse_argvs(); &read_snps(); &read_seqs(); # print STDERR Data::Dumper->Dump([ \%SNPS, \%COORDS ],[ qw( *SNPS *COORDS ) ] ); &sort_snps(); &sort_seqs(); # print STDERR Data::Dumper->Dump([ \%SNPS, \%COORDS ],[ qw( *SNPS *COORDS ) ] ); &analyze_data(); &print_results(); exit(0); # # FUNCTIONS sub parse_argvs() { if (scalar(@ARGV) != 2) { print "USAGE: $PROG \n"; exit(1); } ($snp_file,$seq_file) = @ARGV; } # parse_argvs sub read_snps() { open(SNP_FILE, "< $snp_file") || do { print "$PROG: no es pot carregar $snp_file\n"; exit(1); }; print STDERR "### READING SNPS FILE\n"; my $c = 0; while () { my (@f,$snpchr,$snpstart,$snpend,$snpid); next if /^\s*$/o; # ens saltem linies en blanc # next if /^#/o; # tan sols si apareixen linies comentades al fitxer &counter(++$c); s/(\r)?\n//o; # treiem "\n" @f = split /\s+/o, $_; # supossem que cada registre tan sols te 4 camps... # $blockid = $f[0]; ($snpchr = uc($f[1])) =~ s/^CHR//o ; $snpstart = $f[2]; $snpend = $f[3]; $snpid = $f[4]; push @{ $SNPS{$snpchr} }, [ $snpstart,$snpend,$snpid ]; }; # while SNPS &endcounter($c); close(SNP_FILE); } # read_snps sub read_seqs() { open(SEQS, "< $seq_file") || do { print "$PROG: no es pot carregar $seq_file\n"; exit (1); }; print STDERR "### READING SEQUENCE FILE\n"; my $c = 0; while () { my (@f,$chr,$exonstart,$exonend,$geneid); next if /^\s*$/o; # ens saltem linies en blanc # next if /^#/o; # tan sols si apareixen linies comentades al fitxer &counter(++$c); s/(\r)?\n//o; # treiem "\n" @f = split /\s+/o, $_; # supossem que cada registre tan sols te 4 camps... ($chr = uc($f[0])) =~ s/^CHR//o ; $exonstart = $f[3]; $exonend = $f[4]; $geneid = $f[8]; push @{ $COORDS{$chr}{$geneid} }, [ $exonstart,$exonend ]; }; # while SNPS &endcounter($c); close(SEQS); } # read_seqs sub sort_snps() { my @chr = keys %SNPS; my $i = 0; print STDERR "### SORTING SNPS DATA\n"; while ($i < scalar(@chr)) { # my $ary = $SNPS{$chr[$i]} [$j][0..2]; my $ary = \@{ $SNPS{$chr[$i]} }; print STDERR "# Chromosome $chr[$i] : sorting ". scalar(@{ $ary })." SNPs...\n"; if (scalar(@{ $ary }) == 1) { $i = $i + 1; next; }; @{ $ary } = map { $_->[2] } sort { $a->[0] <=> $b->[0] # ordenem per acceptor || $a->[1] <=> $b->[1] } # ordenem per donor map { [ $_->[0], $_->[1], $_ ] } @{ $ary }; $i = $i + 1; }; } sub sort_seqs() { my @chr = keys %COORDS; my $i = 0; print STDERR "### SORTING GENOMIC DATA\n"; while ($i < scalar(@chr)) { # my $ary = $SNPS{$chr[$i]} [$j][0..2]; my $ary = \%{ $COORDS{$chr[$i]} }; my @gen = keys %{ $ary }; my $j = 0; print STDERR "# Chromosome $chr[$i] : sorting ". scalar(@gen)." genes...\n"; while ($j < scalar(@gen)) { my $arry = \@{ $ary->{$gen[$j]} }; if (scalar(@{ $arry }) == 1) { $j = $j + 1; next; }; @{ $arry } = map { $_->[2] } sort { $a->[0] <=> $b->[0] # ordenem per acceptor || $a->[1] <=> $b->[1] } # ordenem per donor map { [ $_->[0], $_->[1], $_ ] } @{ $arry }; $j = $j + 1; }; $i = $i + 1; }; } sub analyze_data() { my @chr = keys %COORDS; my $i = 0; print STDERR "### COMPARING SNPs COORDS AGAINST GENOMIC COORDS\n"; while ($i < scalar(@chr)) { # my $ary = $SNPS{$chr[$i]} [$j][0..2]; my $ary = \%{ $COORDS{$chr[$i]} }; my @gen = keys %{ $ary }; my $j = 0; my $chrid = $chr[$i]; my $c = 0; print STDERR "# Analyzing chromosome $chrid (". scalar(@gen)." genes)\n"; while ($j < scalar(@gen)) { my $arry = \@{ $ary->{$gen[$j]} }; my $k = 0; my $genid = $gen[$j]; my $exon_num = scalar(@{ $arry }); while ($k < $exon_num) { my ($ex_ini,$ex_end); &counter(++$c); $ex_ini = $arry->[$k][0]; $ex_end = $arry->[$k][1]; if ($exon_num == 1) { &checkcoords($chrid,$genid,$ex_ini,$ex_end,'exon'); last; }; if ($k > 0) { &checkcoords($chrid,$genid,($arry->[$k-1][1] + $donor_len), ($ex_ini - $acceptor_len),'intron'); }; if ($k == 0) { # check first exon &checkcoords($chrid,$genid,$ex_ini, ($ex_end - $donor_len),'exon'); &checkcoords($chrid,$genid,($ex_end - $donor_len), ($ex_end + $donor_len),'donor'); } elsif ($k == ($exon_num - 1)) { # check last exon &checkcoords($chrid,$genid,($ex_ini + $acceptor_len), $ex_end,'exon'); &checkcoords($chrid,$genid,($ex_ini - $acceptor_len), ($ex_ini + $acceptor_len),'acceptor'); } else { # check internal exon &checkcoords($chrid,$genid,($ex_ini + $acceptor_len), ($ex_end - $donor_len),'exon'); &checkcoords($chrid,$genid,($ex_ini - $acceptor_len), ($ex_ini + $acceptor_len),'acceptor'); &checkcoords($chrid,$genid,($ex_end - $donor_len), ($ex_end + $donor_len),'donor'); }; # if first/last/internal $k = $k + 1; }; # while $k $j = $j + 1; }; # while $j &endcounter($c); print STDERR "# Chromosome $chrid done: ".scalar(@gen). " genes - $c exons were checked...\n"; $i = $i + 1; }; # while $i } # read_seqs sub checkcoords() { my ($chrid, $genid, $ini, $end, $sec) = @_; my $i = 0; if ($ini >= $end) { return }; my $ary = \@{ $SNPS{$chrid} }; while ($i < scalar(@{ $ary })) { if ($ini > $ary->[$i][1]) { $i = $i + 1; next; }; if ($end < $ary->[$i][0]) { last; }; if (! defined($snps_in_gene{$chrid}) && ! defined($snps_in_gene{$chrid}{$genid}) && ! defined($snps_in_gene{$chrid}{$genid}{$sec})) { @{ $snps_in_gene{$chrid}{$genid}{$sec} } = (); }; push @{ $snps_in_gene{$chrid}{$genid}{$sec} }, $ary->[$i][2]; $i = $i + 1; }; # while return; } # checkcoords sub print_results() { my @chr = keys %snps_in_gene; my $n = 0; print STDERR "### HERE IS WHAT YOU GET... ISN'T IT ???\n"; while ($n < scalar(@chr)) { my $chrom = \%{ $snps_in_gene{$chr[$n]} }; my $m = 0; my @gens = keys %{ $chrom }; while ($m < scalar(@gens)) { &separador(); my $snps = \%{ $chrom->{$gens[$m]} }; foreach my $key (qw/ intron exon acceptor donor /) { if (! defined($snps->{$key})) { @{ $snps->{$key} } = (); }; print STDOUT "$chr[$n] $gens[$m] $key ". scalar(@{ $snps->{$key} }). " @{ $snps->{$key} } \n"; }; # foreach $key $m = $m + 1; }; # while $m &separador(); $n = $n + 1; }; # while $n } # print_results sub separador() { print STDOUT ("-" x 80)."\n"; } sub counter() { my $n = shift; if ($n % 100 != 0) { print STDERR "."; } else { print STDERR ". [$n]\n"; }; } sub endcounter() { my $n = shift; if ($n % 100 != 0) { print STDERR " [$n]\n"; }; }