#!/usr/bin/perl -w

use strict;
use Getopt::Std;

#####VARIABLE DECLARATION######

my %matriu = ();  		#Hash where we will store the PWM that is being analyzed
my $seq;		  		#$seq will store temporarily the sequence that is being analyzed
#some iterators
my $i;
my $j;
my $x;
my $m=0; #matrix iterator
my $f;
my $consenseq = ""; #The sequence that obtains the max score with a PWM will be stored here
my $tipomat = 0  ; 	#0=relative matrix 1=log likelihood standard 2-Log likelihood sequence
my %opt;			#This hash will contain the options chosen by the user
my $n_params;		#This scalar will contain the nē of parameters the user has typed in the command prompt
my $cutoff= 0.85; #default cutoff value
my $nummat; #number of matrix analyzed
my @nombres=();	#sequence names will be stored here
my @conseq;		#For calculation reasons, $consenseq will be deposited first in this array
my @maxscore;	#This array will contain the values obtained applying the PWM to $consenseq
my @secuencias;	#The FASTA sequences read by function readfasta() will be stored here
my @found;		#This array will contain a switch for every PWM, in order to show an output or don't, depending on the results obtained for every matrix
my @scoresfin = (); 	#the scores of each reading frame on each sequence will be stored on this array. 
my @frecuencias = (); 	#This variable will store the frequences of the relative matrix
my %matrices;	#If the input PWM file contains many matrices, they will be stored here, and put properly on %matriu when needed

#Let's start getting the options
(getopts('vsmn:c:ho',\%opt)) or die("Problems reading options, try again, please\n");
$n_params = scalar(@ARGV);

#We'll put first the option -h, just in case the user needs some help
if (exists ($opt{h})){
        print "\n********************ProMotif Usage and Options*********************";
        print "\npromotif -options transfac_matrix_file FASTA_sequence_file";
        print "\n\nOPTIONS:";
        print "\n\t-c X.X\tCutoff value";
        print "\n\t-v \t Verbose";
        print "\n\t-m \t Show matrix information (it will halt the program before finding matches between seqs and PWM)";
        print "\n\t-s \t Show sequence information (as with -m,it will halt the program before finding matches between seqs and PWM)";
        print "\n\t-n\tChooses type of output matrix. 0-relative matrix 1-loglikelihood matrix (standard nucleotide proportion) 2-loglikelihood matrix (nucleotide proportion based on sequence content)";
        print "\n\t-o\tMakes an html output";
        print "\n\t-h\tShows this help";
        print "\n\n";
        exit;
}

#if parameters are insufficient, just go out and give the user another chance to read better the help explanations...
if ($n_params < 2 ) {
        &print_error("Syntax error in the parameters and/or options!! Type ProMotif -h for a explanative help");

        }

if (exists($opt{c}))
{
        $cutoff=$opt{c}; #if the user wants a treshold to be set, let's apply it
}
if (exists($opt{o}) && exists($opt{m})){print "\nOptions -m and -o are incompatible because no PWM vs sequence comparison is done when -m is set on. Please run these options separately";exit;}
if (exists($opt{n}))
{
        if ($opt{n} > 2 ||$opt{n} < 0 ) {
                &print_error ("-m parameter must be 0,1 or 2"); #if user wants loglikelihood matrix, this option must be set
                exit;
        }else {
                $tipomat = $opt{n};

        }
}

#Let's read the FASTA file
@secuencias = &readfasta ($ARGV[1],\@nombres); #we read the FASTA file as typed in paramter $ARGV[1] and put the sequence names on @nombres. The returned value (read sequences) will be stored in @secuencias

if (exists($opt{s})) {
	&seqinfo (@secuencias);  #if user just wanted some G+C and lenght information, don't make him wait until some PWM matches are found. Show the info and go back to shell...
	if (!exists($opt{m})) {exit;}
}


#We read the matrix and find the frequences
$nummat = &readpromat ($ARGV[0],\%matrices) ;  #we read the PWM file as typed in paramter $ARGV[0] and put the PWM's on %matrices. The returned value (number of PWM's in file) will be stored in $nummat


&open_html if (exists ($opt{o})); #if the user wants a fancy html output, now, before the loops have begun, is the moment to create its <html>,<head> tags and other head elements...

#The next loop will go through each matrix read by function readpromat()
for ($m=0;$m < $nummat;$m++){
	$found[$m]=0; #We initialize variable @found for this matrix (it will tell if any hit has been found with this matrix)
	print_mess ("Aplying matrix $m");
    for ($x=0;$x < scalar(@{$matrices{0}{A}});$x++){
    $matriu{A}[$x] = $matrices{$m}{A}[$x];
    $matriu{C}[$x] = $matrices{$m}{C}[$x];	#All operations will be done with %matriu, so let's copy the actual PWM to %matriu
    $matriu{T}[$x] = $matrices{$m}{T}[$x];
    $matriu{G}[$x] = $matrices{$m}{G}[$x]; 
	}

    &frecuencias (\%matriu,\@frecuencias);  #let's find the frequences in %matriu and put results on @frecuencias
    &relmat (\%matriu);	#We calculate the relative matrix 

    #CONSENSUS SEQUENCE: We find the matrix with the highest score
	@conseq ="";
	$consenseq = ""; #we initialise @conseq and $consenseq to avoid piling up different consensus sequences...
    @conseq = &consens(%matriu); #we find the sequence that obtains the maximum score for %matriu
    print_mess ("Consensus sequence :@conseq"); #and we print it
    for ($i=0;$i<scalar(@conseq);$i++){
        $consenseq= $consenseq . $conseq[$i]; #we put it on its proper place: $consenseq
    	}

    @maxscore = &aplicmat($consenseq,$tipomat,%matriu); #We calculate the score of the consensus matrix
    &print_mess ("Applying matrix $m to consensus sequence. Please wait");
    if ($tipomat == 1) {        #if user wants a likelihood matrix, calculate the log ratio too (funct.logmat)
        &logmat(\%matriu);
        } elsif ($tipomat == 2) {
            @frecuencias = ();
        &logmat2 (\%matriu,\@frecuencias);
        }
        
       	if (exists($opt{m})) {
		&printmat ($m,%matriu);	#if user wanted a PWM information, show it
	}
	if (!exists($opt{m})) {  #if user wanted a PWM information, he surely does not want to wait until all the PWM matching operations are done, so don't do it if -m option is set on

print "\n\nMATRIX\tSTART\tEND\tSCORE\tSEQUENCE\t\tSEQ.NO";
    for ($j=0;$j<scalar(@secuencias);$j++){  #for every sequence,
        $seq= $secuencias[$j];				 #put it on $seq
        print_mess ("Secuence $j:$nombres[$j]");	#show its name on verbose mode
        @scoresfin= &aplicmat ($seq,$tipomat,%matriu);	#and apply the actual PWM to it
        &print_mess ("Applying matrix to problem sequence $j. Please wait");
        for ($i=0;$i<scalar(@scoresfin);$i++){
        $scoresfin[$i] = $scoresfin[$i]/$maxscore[0];	#normalize results with the max score
        }

        $f= &output ($seq,$cutoff,$m,$j,@scoresfin);   #display the output results for this sequence and matrix. If there are succesful results, $f will know it...
        if ($f && !$found[$m]){$found[$m]=1;}		   #if results were found and they were'nt notified before, now its time to switch $found[$m] ON
    	&mat2html ($seq,$cutoff,$m,$j,@scoresfin) if (exists ($opt{o})); #if -o is set on, make the .html output
    }
    if ($found[$m]==0) {print "\nNo matches found in matrix $m.Try with a lower treshold\n";} #if no match was found, inform the user
}


}
&close_html if (exists ($opt{o})); #let's close the .html
##############################################################
##############################################################
#############            FUNCTIONS         ###################
##############################################################
##############################################################

# NAME: readfasta
# AIM: given a FASTA sequence file, readfasta will read the sequences and store it on an array
# PARAMETERS:
        #first- A DNA sequence file in FASTA format
        #Second- The array reference into which the sequence names will stored
# RETURNS: Sequence containing all the read sequences

sub readfasta {
    my ($file,$nombre) = @_;

    if (!open(FASTAFILE,"< $file")) {
        print_error ("\nFunction readfasta error!!! Input FASTA file \"$file\" not found; Unable to continue...\n");
        
    }

    my @seq=(); #Here we will store provisionally the sequence that is actually being readed
    my @secuencia = (); #Array were all the sequences will be finally stored

    my $s = 0; #switch: ON=> some sequence is appearing on the file
    my $i = 0; #iteration variable
    my $n = 0; #sequence number counter


    while (<FASTAFILE>){
	    
 		
        if ($s == 1 && m/\>(.+)/){
                $i = 0;
                $n = $n + 1;
                $secuencia[$n]="";
                @seq=();
                $$nombre[$n] = $1;


        }
        if ($s == 0 && m/\>(.+)/) {
                $s = 1;
                $$nombre[$n] = $1;
 
            }
 
 
        if ($s == 1 && m/\A(\w+)/g) {
	        tr/a-z/A-Z/;
                $seq[$i]= $1;
                
                $secuencia[$n]= join("",@seq);
                $i = $i +1;
                
              
        }


 

    }
    close FASTAFILE;
    print_mess ("FASTA file \"$file\" read.$i sequences found");
    
        return @secuencia;
 
}
 
 
# NAME: readpromat
# AIM: given a matrix file and a hash reference, readpromat will read the nucleotide proportions at each position of the PWM and put it in the hash
# PARAMETERS:
        #first- A PWM matrix in TRANSFAC format (http://www.gene-regulation.com/pub/databases/transfac/doc/matrix1.html for further information)
        #Second- The hash reference into which the data will stored
# RETURNS: Lenght of the read matrix

sub readpromat {
    my ($file,$href) = @_;
    my $j=0;

        my @nucleotids=();           # Nucleotide appearance order will be filed here
        my $trobatp0=0;     # This variable will act as a switch: 0 means that no p0 ocurrence has been found on matrix. 1 means that p0 has been found
        my $posicio=0;            # Current matrix position

    if (!open(MATFILE,"< $file")) {
        &print_error ("Function readmat error!!! Input matrix file  $file not found; Unable to continue...\n");
        exit;
    }

        while (<MATFILE>) {

        if ($trobatp0 == 0 &&  m/\APO\s+(\w+)\s+(\w+)\s+(\w+)\s+(\w+)/) #This instruction ignores all the PWM information previous to nucleotide proportion information (through "$trobatp0" switch) and retains the order of nucleotide appearance.
        {
        $nucleotids[0] = $1;
        $nucleotids[1] = $2;
        $nucleotids[2] = $3;  #we retain the nucleotide order of appearance in the PWM file
        $nucleotids[3] = $4;
        &print_mess ("Nucleotide entries found! $1 $2 $3 $4");
        $trobatp0 = 1; #We've found the start of the residue proportion, so let's turn the switch ON
        &print_mess ("Compatible matrix Found! reading PWM");
    
}

                if ($trobatp0 == 1 && m/\A\d+\s+([\-\d\.]+)\s+([\-\d\.]+)\s+([\-\d\.]+)\s+([\-\d\.]+)/) {
				#if PWM information has begin to appear and we found four digits sepparated by spaces, store that information
		    
                $$href{$j}{$nucleotids[0]}[$posicio] = $1;
                $$href{$j}{$nucleotids[1]}[$posicio] = $2;
                $$href{$j}{$nucleotids[2]}[$posicio] = $3;
                $$href{$j}{$nucleotids[3]}[$posicio] = $4;
              
		$posicio = $posicio + 1;

    } #This bucle reads all the PWM values and puts all information in hash %matpesos.

                if ($trobatp0 == 1 && m/\AXX/) {
                $trobatp0 = 0;
                &print_mess ("Matrix $j has been readed!!");
		    $j=$j+1;
		$posicio=0;
        } #When XX appears, the PWM is over, so let's turn trobatp0 switch OFF


        }

   
return $j; #we return the number of read matrix
}

# NAME: consens
# AIM: given a matrix file, function consens() finds a sequence that obtains the maximum score for that PWM
# PARAMETERS:
        #first- A PWM matrix in TRANSFAC format (http://www.gene-regulation.com/pub/databases/transfac/doc/matrix1.html for further information)
        
# RETURNS: the maximum score sequence

sub consens
{
    my %mat = @_;
    my @sec;
    my $longmat = scalar(@{$matriu{A}}); #we find the matrix lenght to stop the loop
    my $maxval;
    my $i=0;
    print_mess ("Finding consensus sequence...");

        for ($i=0;$i<$longmat;$i++){
            $maxval = $matriu{A}[$i];
            $sec[$i] = "A";   
            if ($matriu{C}[$i] > $maxval) {$sec[$i] = "C"; $maxval = $matriu{C}[$i];}
                if ($matriu{T}[$i] > $maxval) {$sec[$i] = "T"; $maxval = $matriu{T}[$i];} #if a base residue has better result than the maximum score obtained until that moment, retain it
                if ($matriu{G}[$i] > $maxval) {$sec[$i] = "G"; $maxval = $matriu{G}[$i];}
                }
                return @sec;

}

# NAME: aplicmat
# AIM: Given a raw sequence and a PWM matrix, aplicmat assigns a score to every nucleotide residue and stores it in an array
# PARAMETERS:
        #FIRST: raw DNA sequence
        #SECOND: PWM matrix
        #THIRD: Reference of the array where the scores will be stored
# RETURNS: Nothing

sub aplicmat{
        my ($dnaseq, $tipomat, %mat)= @_;
        my @punt;
        my $start=0; #this variable indicates the point in which the iteration must begin
        my $n; # reading iteration variable (within the pwm)
        my $i; # reading iteration variable (within the sequence)
        my $longmat =  scalar(@{$mat{A}}); #we calculate matrix lenght
        my $longseq ; #sequence lenght will be stored here

########part one: applying matrix to sequence and storing results into @punt
        my @seq = split(//,$dnaseq);

        $longseq = scalar(@seq);
        for ($start= 0; $start <= $longseq-$longmat; $start++){ #for each position in the sequence, until there is sequence enough to compare...
                $n=0;

                for ($i=$start; $i < $start+$longmat; $i++){
                        if ($seq[$i] =~ /[ACTG]/) {   
                                $punt[$start][$n] = $mat{$seq[$i]}[$n]; #we find the score of that residue on the actual PWM

                                }else { $punt[$start][$n] = 0; }		#if nucleotide residue letter is unknown, just score it with a zero
                                $n=$n+1;
                        }
        }

#########Part two: Calculating total score for each reading frame


        my @sc;   #it will store the score for each start reading frame
        my $q = 0;
        my $r = 0;


        if ($tipomat == 0) {   #if user wanted a relative matrix... we calculate it
                for ($q = 0; $q < scalar(@punt); $q++) {
                                                $sc[$q]=0;
                        for ($r = 0; $r < scalar(@{$punt[$q]})  ; $r++) {

                                if ($punt[$q][$r] eq "0") {$sc[$q] = -900;      }
                                else{$sc[$q] = $sc[$q] + $punt[$q][$r]; }

                        }
                }
        }

        elsif ($tipomat == 2||$tipomat == 1) {	#and if he wanted a log matrix, we calculate it too...
                for ($q = 0; $q < scalar(@punt); $q++) {

                        $sc[$q]=0;
                        for ($r = 0; $r < scalar(@{$punt[$q]})  ; $r++) {
                                if ($punt[$q][$r] eq "-inf") {
                                        $sc[$q] = -900;
                                }
                                else{
                                        $sc[$q] = $sc[$q] + $punt[$q][$r];

                                }
                        }

                }
        }

return @sc; #@sc contains now all the scores for that PWM-sequence comparison, so return it to main function
        }




# NAME: frecuencias
# AIM: Read the absolute PWM and calculate the frequency of each nucleotide in whole sequences. (it is used to calculate the loglikelihood matrix type 2, in which the frequences are not standard, but the frequences of given PWM)
# PARAMETERS: 
	#first-the reference of the PWM. 
	#second-reference were data will be stored.
# RETURNS:nothing

sub frecuencias {
        my ($href,$fref) = @_;
        my $posicio = 1; #we initialise all the variables
        my $A = $$href{A}[0];
        my $C = $$href{C}[0];
        my $G = $$href{G}[0];
        my $T = $$href{T}[0];

        while ($posicio < scalar(@{$$href{A}}) ) {
                $A = $A + $$href{A}[$posicio];
                $C = $C + $$href{C}[$posicio];		#we count all the A's,C's,G's and T's in the PWM
                $G = $G + $$href{G}[$posicio];
                $T = $T + $$href{T}[$posicio];
                $posicio++;
        }
        my $ntotal = $A + $C + $G + $T;

        $$fref[0] = $A / $ntotal;
        $$fref[1] = $C / $ntotal;					#and we split it by the total number of nucleotides
        $$fref[2] = $G / $ntotal;
        $$fref[3] = $T / $ntotal;

}





# NAME: relmat
# AIM: Give a PWM, relmat transforms it to a relative matrix.
# PARAMETERS: 
	#first- PWM obtained in readpromat function. 
	#second- number of positions of the sequences analysed in Transfac matrix

# RETURNS: NOTHING

sub relmat {

        my ($href) = @_;
        my $posicio = 0;
        my $elements = 0;

         #number of sequences contained in the PWM

        &print_mess ("Calculating relative matrix");
        while ($posicio < scalar(@{$$href{A}})) { #this bucle reads the PWM, converts it in a relative matrix, and put the result in the similar matrix.
        $elements = $$href{A}[$posicio] + $$href{C}[$posicio] + $$href{G}[$posicio] + $$href{T}[$posicio]; #Due to irregularities in some PWM matrices, we calculate total number of elements at every position to avoid errors
        $$href{A}[$posicio] = $$href{A}[$posicio] / $elements;
        $$href{C}[$posicio] = $$href{C}[$posicio] / $elements;
        $$href{G}[$posicio] = $$href{G}[$posicio] / $elements;
        $$href{T}[$posicio] = $$href{T}[$posicio] / $elements;

        $posicio++;

        } 



}



# NAME: logmat
# AIM: given a relative matrix, logmat transforms it in a log-likelihood matrix, using standard nucleotide proportions (0.25 each)
# PARAMETERS: 
	#first-the reference of the PWM. 
	#second-number of positions of the sequences analysed in Transfac matrix.
# RETURNS:Nothing



sub logmat {
        my ($href)=@_;
        my $posicio = 0;
        &print_mess ("Calculating log-likelihood matrix");
        while ($posicio < scalar(@{$$href{A}})) {

                if ($$href{A}[$posicio] == 0) {
                        $$href{A}[$posicio] = "-inf";
                }
                elsif ($$href{C}[$posicio] == 0) {
                        $$href{C}[$posicio] = "-inf";
                }
        elsif ($$href{G}[$posicio] == 0) {
                        $$href{G}[$posicio] = "-inf";
                }
        elsif ($$href{T}[$posicio] == 0) {
                        $$href{T}[$posicio] = "-inf";
                }
        else {
                $$href{A}[$posicio] = log($$href{A}[$posicio] / 0.25);
                $$href{C}[$posicio] = log($$href{C}[$posicio] / 0.25);
                $$href{G}[$posicio] = log($$href{G}[$posicio] / 0.25);
                $$href{T}[$posicio] = log($$href{T}[$posicio] / 0.25);
        }

        $posicio++;


    }


}



# NAME: logmat2
# AIM: given a relative matrix, logmat transforms it in a log-likelihood matrix (using the nucleotide appearance frequence given by frecuencias() )
# PARAMETERS: 
	#first-the reference of the PWM. 
	#second-number of positions of the sequences analysed in Transfac matrix.
# RETURNS:Nothing


sub logmat2 {
        my ($href,$fref) = @_;
        my $posicio = 0;
        &print_mess ("Calculating log-likelihood matrix");

        while ($posicio < scalar(@{$$href{A}})) {

                if ($$href{A}[$posicio] == 0) {
                        $$href{A}[$posicio] = "-inf";
                }
                elsif ($$href{C}[$posicio] == 0) {
                        $$href{C}[$posicio] = "-inf";
                }
        elsif ($$href{G}[$posicio] == 0) {
                        $$href{G}[$posicio] = "-inf";
                }
        elsif ($$href{T}[$posicio] == 0) {
                        $$href{T}[$posicio] = "-inf";
                }
        else {
                $$href{A}[$posicio] = log($$href{A}[$posicio] / $$fref[0]);
                $$href{C}[$posicio] = log($$href{C}[$posicio] / $$fref[1]);
                $$href{G}[$posicio] = log($$href{G}[$posicio] / $$fref[2]);
                $$href{T}[$posicio] = log($$href{T}[$posicio] / $$fref[3]);
        }

        $posicio++;

    }



}

# NAME: print_mess
# AIM: print a message in STDERR 
# PARAMETERS: 
	#first-the message to be printed
# RETURNS:Nothing
sub print_mess
{
        my @mess = @_;

        print STDERR ("\t: @mess \n") if (exists($opt{v}));
}

# NAME: print_error
# AIM: print a message in STDERR  and exits the program
# PARAMETERS: 
	#first-the message to be printed
# RETURNS:Nothing
sub print_error
{
        my @mess = @_;

        print STDERR ("\t: @mess \n") and exit();
}

# NAME: open_html
# AIM: starts a .html file named outmatrix.html to be completed with mat2html() and close_html
# PARAMETERS: None
# RETURNS:Nothing
sub open_html{
open(OUTFILE,"> outmatrix.html"); #we open the file for write-mode
print OUTFILE "<html>\n<head>\n<title>ProMotif output file</title>\n<body>"; #HTML main part generated... let's get into the body part...
print OUTFILE "\n<body bgcolor=\"#FFFFFF\">\n<div align=\"center\"> \n <p><img src=\"logo1.gif\"></div></p>"; #Background and ProMotif Logo printed... ready for some nucleotide, score and position information!
}

# NAME: mat2html
# AIM: After open_html use, mat2hml puts in a .html file the results of aplicmat() and gives a table and a graphical output
# PARAMETERS: 
	#first-The sequence analyzed
	#second-The treshold ("cutoff") value above which the results will be considered and printed
	#third- Number of matrix that is being applied to the sequence
	#fourth- an array, output of aplicmat(), containing the scores of every reading frame of the sequence
# RETURNS:Nothing

sub mat2html{
my ($dnaseq, $treshold, $nmat,$nseq,@scores) = @_;
my @seqvec=""; #we initialize @seqvec, who will contain the sequence for analysis purposes
@seqvec= split (//,$dnaseq); #we put the seq in an array
my $longseq = scalar (@seqvec); #calculate the seq lenght
my $longmat = scalar(@{$matriu{A}});	#calculate the PWM lenght
my $i;	#iteration variable
my $n=0;	#this variable will count the point of the sequence we are printing
my $found=0; #switch that turns ON when any result above the treshold is found
my $titulo=0;	#switch that turns ON when the title is printed once
my $seqpos=0;   #reminder of the position of the seq we're currently (someway like $n)
my $color;		#RGB code for every color will be stored here
my $value;		#it will show the edges of every color in the legend
print OUTFILE "\n<p><font size=\"2\" face=\"Arial\"><b>Matrix $nmat.Sequence $nombres[$nseq]</b></font>\n\</p>";
#we will start every round of mat2html printing the name of the sequence


for ($i=0;$i<scalar(@scores);$i++){
   
        if ($scoresfin[$i] >= $treshold) {
	$found=1;
	if ($found && !$titulo){ #if some results are found, but there is no title stil, print it
		print OUTFILE "\n\n<table border=\"0\">\n<tr align=\"center\">\n<td bgcolor=\"#6699FF\"><font face=\"Arial\" size=\"2\" color=\"#FFFFFF\"><b>Matrix.no</b></font></td>";
		print OUTFILE "\n<td bgcolor=\"#6699FF\"><font face=\"Arial\" size=\"2\" color=\"#FFFFFF\"><b>Start</b></font></td>";
		print OUTFILE "\n<td bgcolor=\"#6699FF\"><font face=\"Arial\" size=\"2\" color=\"#FFFFFF\"><b>End</b></font></td>";
		print OUTFILE "\n<td bgcolor=\"#6699FF\"><font face=\"Arial\" size=\"2\" color=\"#FFFFFF\"><b>Score</b></font></td>";
		print OUTFILE "\n<td bgcolor=\"#6699FF\"><font face=\"Arial\" size=\"2\" color=\"#FFFFFF\"><b>Seq fragment</b></font></td>";
		$titulo=1;
	}
        $f=$i+scalar(@{$matriu{A}}); #we assign $f to the final position of the matrix-pwm comparision
        
        #TABLE OF RESULTS
        print OUTFILE "\n<tr><td align= \"center\"><font face= \"Arial\"> $nmat </font></td>";
        print OUTFILE "\n<td align= \"center\"><font face= \"Arial\">$i</font></td>";
        print OUTFILE "\n<td align= \"center\"><font face= \"Arial\">$f</font></td>";
        printf OUTFILE ("\n<td align= \"center\"><font face= \"Arial\">%.2f</font></td>\n<td><font size= \"2\" face= \"Courier New\">",$scoresfin[$i]);
        for ($seqpos=$i;$seqpos < $f; $seqpos++){
	        print OUTFILE "$seqvec[$seqpos]";
        }
       print OUTFILE "</font></td></tr>";
        
        
     
    }

}
#COLOR LEGEND
print OUTFILE ("\n</table>\n<br>");
print OUTFILE ("\n<table border = \"1\"><tr>");
if ($found){
for ($i=0;$i < 12;$i++){
	$color=color(0.45 + 0.05 * $i); #we show every color

print OUTFILE ("\n<td bgcolor=\"$color\">\&nbsp\;</td>"); 
	}
	print OUTFILE ("\n</tr>\n<tr>");
	for ($i=0;$i < 12;$i++){
		$value=0.45 + 0.05 * $i; #we print every value
	print OUTFILE ("\n<td><font size=\"1\" face=\"Arial\">$value</font></td>"); 
	}
	print OUTFILE ("\n</tr>");
	}
print OUTFILE "\n</table><br>";

#GRAPHICAL OUTPUT
if ($found) {print OUTFILE "\n<table>";}
#if any result has been found, start a table. If don't, tell the user that no matches have been found
if (!$found) {print OUTFILE "\n<font size=\"2\" face=\"Arial\">No matches found</font>";}

for ($i=0;$i<$longseq;$i++){ #print the sequence
    if ($found) {print OUTFILE "\n<td><font size=\"2\" face=\"Courier New\">$seqvec[$i]</font></td>";}
        }
        
print OUTFILE "\n</tr>\n<tr> ";
my $lng=scalar(@scores);

for ($i=0;$i< $longseq-$longmat;$i++){
    
        if ($scores[$i] > $treshold) {
                $n=0;
          
                while ($n < $i){print OUTFILE "\n<td bgcolor=\"#FFFFFF\"</td>" ;$n++;}
						$color=color($scoresfin[$i]); #we call color() with the result score to be converted to color
                        while ($n < $i+$longmat){print OUTFILE "\n<td bgcolor=\"$color\">\&nbsp\;</td>" ;
                        $n++; #while there is sequence match, print it with the proper color
                        
                        }
                        while ($n < $longseq){print OUTFILE "\n<td bgcolor=\"FFFFFF\">\&nbsp\;</td>" ;$n++;} #now that the match has been coloured, left blank spaces until the end of the sequence
                        print OUTFILE "\n</tr>";}
                }
                print OUTFILE "\n</table>"; #close the table
}

# NAME: closehtml
# AIM: closes the outmatrix.html file
# PARAMETERS: none
# RETURNS:Nothing
sub close_html{
print OUTFILE "\n</body>";
print OUTFILE "\n</html>";
close (OUTFILE);
print STDERR "\n\t:outmatrix.html generated\n"; #close everything and inform the user that the new matrix has been created

}



# NAME: output
# AIM: Gives an output in the command shell of the results found on an aplicmat() usage
# PARAMETERS: 
	#first-The sequence analyzed
	#second-The treshold ("cutoff") value above which the results will be considered and printed
	#third- Number of matrix that is being applied to the sequence
	#fourth-Number of sequence that is currently being tested
	#fifth- an array, output of aplicmat(), containing the scores of every reading frame of the sequence
# RETURNS:Nothing
sub output{
my ($seq,$treshold,$nmat,$nseq,@scoresfin) =@_;
my @seqvec2 = split (//,$seq); #we put $seq on array @seqvec2
my $longseq = scalar(@seqvec2);	#we measure the sequence
my $seqturno;	#here we will store the actual nucleotide being readed
my $p;
my $found=0;	#succesful results switch $found ON
my $longmat = scalar(@{$matriu{A}});
my $titulo=0;   #title switch. It's turned on when a table heading has been printed



for ($i=0;$i<scalar(@scoresfin);$i++){
   
        $seqturno="";
        for ($p=$i;$p < $i+$longmat;$p++){
                $seqturno = $seqturno.$seqvec2[$p]; #we store the fragment of the sequence where the match has ocurred in $seqturno
        }
        if ($scoresfin[$i] >= $treshold) {
	    
        $f=$i+scalar(@{$matriu{A}});
        printf ("\n$nmat\t$i\t$f\t%.2f\t$seqturno\t\t$nseq",$scoresfin[$i]); #we show the results in a table
        $found = 1;
    }
}
if (!$found) {print_mess ("No score higher than the cutoff treshold in sequence!! ($treshold)\n");}
return $found;
}

# NAME: Seqinfo
# AIM: Shows information on a DNA sequences array (lenght, G+C content)
# PARAMETERS: 
	#first-The array containing the sequences to be analyzed
# RETURNS:Nothing
sub seqinfo {
	my (@seq ) = @_;
	my %secu;
	my %sec;
	
	my $i = 0;
	my $j = 0; 
	my $l = 0; #sequence lenght
	my $n = 0; #g+c counter
	my $p = 0; #g+c content in %

	

	
	print "\nSEQUENCE INFORMATION";
	print "\n Seq\tlenght\tnēC+G\t  %";
	while ($i < scalar (@seq)) {
	
		@{$secu{$i}} = split(//,$seq[$i]);
		$l = scalar(@{$secu{$i}});
		$n = 0;
		$j = 0;
		while ($j < scalar (@{$secu{$i}})) {
			if( $secu{$i}[$j] eq 'C' || $secu{$i}[$j] eq 'G'){
				$n = $n +1;
			} 	
			$j++;		
		}	
		$p = ($n / scalar(@{$secu{$i}}))*100 ;
		print "\n $i\t $l\t $n\t";	
		printf ("%.2f\t",$p);
		$i++;
		
	}
	print "\n";
	
}	

# NAME: printmat
# AIM: given a PWM, it prints it
# PARAMETERS: 
	#first-The number of matrix that is being printed
	#second-The PWM to be printed
# RETURNS:Nothing

sub printmat {
	my ($m,%mat) = @_;
	my $i = 0;
	my $longmat = scalar(@{$matriu{A}});
	print "\n\n####Matrix $m####";
	print "\nA\tC\tG\tT\t\n";	#we print the matrix heading (A C G T)
	for ($i=0;$i<$longmat;$i++) {
		
		printf ("%.2f\t",$matriu{A}[$i]);
		printf ("%.2f\t",$matriu{C}[$i]);
		printf ("%.2f\t",$matriu{G}[$i]);	#we print the values
		printf ("%.2f\t",$matriu{T}[$i]);
		print "\n";
	}
	
	
	
}


# NAME: color
# AIM: given a decimal number, it returns the RGB code of that number in a default scale
# PARAMETERS: 
	#first-the number to be turned to the color scale
	# RETURNS: the RGB code in hex of that color
sub color{
my $value = $_[0];
my @colores=("#FFFFB4","#FFFFB4","#FFFF90","#FFFF6C","#FFFF48","#FFFF24","#FFFF00","#FFCC00","#FF9900","#FF6600","#FF0000","#CC0000");
my $tag="#FFFFB4";
my $i;
my $flo;

for ($i=0;$i < 12; $i++){
	$flo= 0.45 + 0.05 * $i;
	if ($value >= $flo) {$tag= $colores[$i];}
}
return $tag;

}
