#!/usr/bin/perl -w use strict; ############################################# # Authors: Marta Massanella & Ingrid Pallàs # # Date: March 2005 # ############################################# if ( scalar(@ARGV) < 1 ) { print STDERR "findsnp.pl \n"; exit(1); } open ( MYFILE, "<$ARGV[0]" ); my @entry = ; # @entry contains the sequences from Blast Result. close ( MYFILE ); ######################################## # MY VALUES # ######################################## my $i; my $j; my @J = (0,0); my $jj; my $l; my $n; my $o; my $name; my $percent1; my $flag; my @temp = (); my @query = (); my @sbjct = (); my @result = (); my @SNP = (); my @name2; my @startseq = (); my @percent = (); sub processResult(); ### IUPAC Ambiguity Codes ### my %code= ("u" => "t", "k" => "c/t", "m" => "a/c", "s" => "c/g", "w" => "a/t", "r" => "a/g", "y" => "c/t", "b" => "c/g/t", "d" => "a/g/t", "h" => "a/c/t", "v" => "a/c/g", "n" => "a/c/g/t", "x" => "a/c/g/t"); #################################################################### # PROGRAM BODY: FIND SNPs IN SECIS SEQUENCES # #################################################################### ### Scan the whole input comparing all the sequences and saving the comparison in a vector result ### $i = 0; # $i is an index which allows us to scan the whole input using an iterative composition while ( $i < scalar(@entry) ) { @temp = (); $entry[$i] = lc($entry[$i]); # Change all capital letters to small letters. if ( $entry[$i] =~ /^(>.*?)\s/ ) { # Searching the id for each seq & saving them in @name2 to recognize which seq # contains a SNP. $name = $1; push (@name2, $name); } if ( $entry[$i] =~ /identities.*?(\d+)\%/o ) { # Searching & saving the percent of identity(@percent) between query & sbjct seq # to be able to select the ones which are 95-99% identics. $percent1 = $1; push(@percent,$percent1); } if ( $entry[$i] =~ /score\s+/o ) { # To delimit every seq is used the word 'score' localized in the input file,then &processResult(); # the sequences are processed (see processResult function) & the vectors cleaned @query = (); @sbjct = (); $flag = 0; # This flag is used to identify the first nucleotide of each query seq when # $flag=0 is the begining of the seq and $flag=1 means that the seq continues. @J = (0, 0); # This meter (@J) is used to count the positions of whole nucleotides from both } # query & sbjct seq to be able to apply the IUPAC Ambiguity Code. if ($entry[$i] =~ /^query:/o) { # Looking for query sequences in the entry file, in order to extract them later # in a vector. $entry[$i] =~ s/^query:\s+(\d+)\s+//o; # The regular expression erases the elements typed before the nucleotides, but # the number of the first nucleotide of query seq is kept in @startseq only if if ($flag == 0) { # it is the begining of the seq ($flag=0). push(@startseq,$1); } $flag = 1; @temp = split( //, $entry[$i] ); # The query seq is splitted in a temporal vector in order to have a nucleotide # in each position of the vector. $j = 0; while ( $temp[$j] !~ /\s/o ) { # The temporal vector is scanned and each nucleotide is pushed in query vector, # until there is a space in @temp. push (@query,$temp[$j]); $jj = $J[0]; if (defined($code{$query[$jj]})) { # While the scanning is running, it is check if there is a letter from the # Ambiguity Code and it is replaced by equivalent letters . $query[$jj] = $code{$query[$jj]}; } $j = $j + 1; $J[0]++; } } elsif ( $entry[$i] =~ /^sbjct:/o ) { # The same procedure is applied for Subject Sequences. $entry[$i] =~ s/^sbjct:\s+\d+\s+//o; @temp = split ( //, $entry[$i] ); $j = 0; while ( $temp[$j] !~ /\s/o ) { push ( @sbjct, $temp[$j] ); $jj = $J[1]; if (defined($code{$sbjct[$jj]})) { $sbjct[$jj] = $code{$sbjct[$jj]}; } $j = $j + 1; $J[1]++; } } $i = $i + 1; } if ( scalar(@query) > 0 && scalar(@sbjct) > 0 ) { # The last sequences are processed in this alternative composition. &processResult(); } my $seqpos; my @seqpos2; my $w; my %finalresult; my %countSNP; my $s=0; $n = 0; while ( $n < scalar(@result) ) { #$n is an index which allows us to scan the whole input using iterative composition @SNP = split(/;/,$result[$n]); # Each position of Result vector is splitted in order to have in a new vector for # each sequence where each position of the new vector is the results of the $o = 0; # comparison. while ($o < scalar(@SNP) ) { if ($SNP[$o] !~ /^[actg\-0\/\s]+$/o ) { # If there is an element in the vector that we are not looking for, the program # will give you an error in order to have better results. $SNP[$o] = $SNP[$o]." ERROR in the sequence"; } unless ($SNP[$o] eq 0 ) { # If there is an element different from '0' in the vector, it is kept which seq my $b = $n + 1; # is ($n+1) and the position of the difference, the SNP, ($startseq + $o) my $c = $startseq[$n] + $o; if ( ( $percent[$n] > 94 ) && ( $percent[$n] < 100 ) ) { # For this study, only the seqs with 95-99% of identity are # analysed, here it is the selection of these ones. $seqpos = $c.$SNP[$o]; # The SNPs positions ($c) are concatenated with the nucleotide # of query seq and the change/s found in sbjct sequence.All these defined($countSNP{$seqpos}) || ($countSNP{$seqpos} = 0); #results are stored in %countSNP,where it is counted how many $countSNP{$seqpos} = $countSNP{$seqpos} + 1; # times a specific SNP has appeared in a specific position. # Moreover,all the results are pushed in a vector and afterwards push (@seqpos2,$seqpos); # a new hash is created (%finalresult) in order to have together # all the SNPs of each seq. $finalresult{$name2[$n]} = [@seqpos2]; } } $o = $o + 1; } $n = $n + 1; @seqpos2 = (); # Finally the vector with the results is cleaned. } open(FILE, ">infosecis.txt"); # All the information obtained is sent to a new file. The file # will contain the following information: my @seqs = keys(%finalresult); foreach my $key (@seqs) { print FILE "Sequence $key : @{ $finalresult{$key} }\n"; # 1.All the sequences with 95-99% of identity, with their id and } # all their SNPs (position, query nucleotide and change/s). my @poskeys = keys(%countSNP); foreach my $SNPs (@poskeys) # 2.All the SNPs(query nucleotide & change/s & their positions) { # counting how many times they appear in all seqs analyzed. print FILE "SNP frequence: $SNPs ; $countSNP{$SNPs}\n"; } close(FILE); print "The Results are in infosecis.txt file\n"; ####################################################### # FUNCTIONS # ####################################################### sub processResult() { # This function is to compare Query and Sbjct sequences and to obtain # a vector with all the results (@result).We are comparing each position if ( scalar(@query) > 0 && scalar(@sbjct) > 0 ) { # of Query and Sbjct seq scanning @query & @ @sbct:if they have the same my $k = 0; # nucleotide at the same position, we introduce a '0' in the @result. If my $part = ''; # not, we introduce the nucleotides from both sequences. while ( $k < scalar(@query) ) { if ( $query[$k] ne $sbjct[$k] ) { $part = $part."$query[$k]$sbjct[$k];"; } else { $part = $part."0;"; } $k = $k + 1; } push(@result,$part); } }