#! /usr/bin/perl -w
use strict;
use Getopt::Std;

###########################################################################################
###########################################################################################
###########################################################################################
#                                                                                         
#     PROMFINDER: PROMOTER MOTIFS FINDER.                             
#                                                                                         
#     CREATION DATE: March 2003                                                         
#    
#     AUTHORS: Alexandra BALLESTER     e-mail: alexandra.ballester01@campus.upf.es
#              Patricia GORDILLO       e-mail: patricia.gordillo01@campus.upf.es
#
# This program is FREE SOFTWARE. You can distribute it and/or modify it. 
#
# The program is aimed to be a powerful tool to find promoter regions in DNA sequences (FASTA format) 
# using as much Weight Matrixes (TRANSFAC) as desired.
#
#
#
#
#
#
#
############################################################################################
#########################  GLOBAL VARIABLES  DECLARATION ####################################
############################################################################################

my %secuencias = ();
my %matrices = ();
my $nummat;
my @posicion;
my $seqname;
my @tot_consenso;
my @max; #consenso
my %total;
my %prin_vent;
my %pmax;#cada secuencia
my $thold;
my %opt;   #variables para getopts
my $param_one; #variables para getopts
my $param_two; #variables para getopts
my $n_params;  #variables para getopts
my @consens; #Es el vector en el cual, en cada posicion, hallaremos el string correspondiente a la secuencia consenso
my $hit; #Esta variable nos servira para imprimir por pantalla que no hemos encontrado ningun hit.
my $hitsequence; #Esta variable sirve para imprimir en el output el hit para cada secuencia/matriz.
my $hitparcial; #Esta variable va resumiendo nucleotido a nucleotido el hit para cada secuencia/matriz.
my $counter_pinta; 


############################################################################################
######################## READING PROGRAM PARAMETERS AND OPTIONS ############################
############################################################################################
#######################################################################################


             #####################################################
             #                  reading options
             #####################################################

(getopts('vmst:',\%opt)) or die("Problems reading options, try again, please\n");      

print_mess("Loading options: done! ");

              #####################################################
              #                 reading parameters
              #####################################################
print_mess("Reading parameters....");

$n_params = scalar(@ARGV);


# if there are more than two files, quit the program
($n_params == 2) or die (print_error("Wrong number of parameters ($n_params)! Want to try again?\n"));
($param_one) = $ARGV[0];
($param_two) = $ARGV[1];


# write on the screen the name of the parameters one and two
print_mess ("First argument is... ",$param_one,"\n");
print_mess ("Second argument is... ",$param_two,"\n");
print_mess("Parameters LOADED! ");

my ($fastafile,$matrixfile) = @ARGV;

unless (-e $fastafile && -r $fastafile) {
print STDERR "## HEY I can't access to or read $fastafile\n"; #Comprobamos que el fichero de secuencias existe y se puede leer
exit(1);
}

unless (-e $matrixfile && -r $matrixfile) {
print STDERR "## HEY I can't access to or read  $matrixfile\n"; #Comprobamos que el fichero de matrices existe y se puede leer
exit(1);
}


####################################################################################### 
######################## OPENING OF OUTFILES ##########################################
#######################################################################################

# Opening for writing (>).

open (OUTFILE,">outfile.html");
open (OUTGRAF,">graphical.html");

print OUTFILE "<html> <head> <title> PROMFINDER  </title>
</head> <BODY    TEXT color ='0000FF'    BGCOLOR='FFFACD' link='black' vlink='black' alink='black'><br><br><font color='0000ff'> <font size='10'><center><blockquote><b><font color='909000'> PROMFINDER: TRANSCRIPTIONAL FACTORS MOTIFS SCANNING.</font></b></font></center><br><p></blockquote><font color='8B0000'><font size='6'> RESULTS. <hr> </font></font><br><p>\n";

print OUTGRAF "<html> <head> <title> PROMFINDER  </title>
</head> <BODY    TEXT color ='0000FF'    BGCOLOR='FFFACD' link='black' vlink='black' alink='black'><br><br><font color='0000ff'> <font size='10'><center><blockquote><b><font color='909000'> PROMFINDER: TRANSCRIPTIONAL FACTORS MOTIFS SCANNING.</font></b></font></center><br><p></blockquote><font color='8B0000'><font size='6'> GRAPHICAL RESULTS. <hr> </font></font><br><p>\n";

############################################################################################
###########################         MAIN LOOP     ##########################################
############################################################################################

if (exists($opt{t})){
&opt_t()
}
&leer_fichero_fasta($fastafile);
&pasa_seq2vector();
if (exists($opt{s}) && !exists($opt{m}))  {
    &opt_s() ;
    exit ();
}
if (exists($opt{s}) && exists($opt{m}))  {
&opt_s() 
}

&leer_fichero_matriz($matrixfile);
if (exists($opt{m})){
&opt_m();
}
&printa_results_matriz();

&pasa_fasta_x_matriz();


#####################################################
# Finish successfully the program execution
#####################################################
print_mess("Everything well done. Exit!\n");
exit(0);

####################################################################################################################################################
################################################    SUBROUTINES   ##################################################################################
####################################################################################################################################################


######################################################## READING FASTA FILE ########################################################################
sub leer_fichero_fasta() {

my $file = $_[0];
my $seqname;

open(FASTA,"< $file") || die("### No puedo abrir $file...");

print_mess("Reading $file file....");

while (<FASTA>) {

    next if /^\s*$/o; #Saltamos las lineas vacias

    chomp;

    if ($_ =~ m/^>([^\s]+).*$/o) { #capturamos el identificador de la secuencia
	$seqname = $1;
	$secuencias{$seqname} = "";
	next;
    };

    $secuencias{$seqname} = $secuencias{$seqname} . $_ ;
}; # while

close(FASTA);

if (scalar(keys %secuencias) == 0) { #comprobamos que el fichero introducido contiene informacion de secuencia/-s

    print STDERR "## HEY $file no te cap secuencia\n";
    
    exit(1);
};


print_mess("Dealing with $file file information....");

} # leer_fichero_fasta


######################################################## GET SEQUENCES INTO ARRAYS #######################################################################


sub pasa_seq2vector() {

my $k;

foreach $k (keys %secuencias) {


    $secuencias{$k} = [ split //o, uc($secuencias{$k}) ]; #pasamos la secuencia a un vector que contendra un nucleotido en mayusculas por posicion 
  

}; # foreach


} # pasa_seq2vector


################################## READING MATRIXES FILE & NORMALIZATION   #######################################################################
######################## FINDING CONSENSUS SEQUENCE AND ITS VALUE FOR EACH MATRIX ################################################################

sub leer_fichero_matriz() {

my $matriz = $_[0];

open(MATRIZ,"< $matriz") || die("### No puedo abrir $matriz..."); #comprobamos que el fichero introducido contiene informacion de matriz/-ces

print_mess("Reading $matriz file....");
print_mess("Rearrange of  $matriz file information....");
print_mess("Finding consensus sequences for each matrix .....");
print_mess("Calculating scores for consensus sequences.....");


my $m= 0;
my @nucleotidos;
my $suma = 0;
my @consenso;
my $comp = " ";

$nummat = 0;


while (<MATRIZ>) {


    next if /^\s*$/o; #Saltamos las lineas vacias

    if ($_ =~ m/\AP[0O]\s+(\w+)\s+(\w+)\s+(\w+)\s+(\w+)/) { #identificamos el inicio de la matriz

	# restamos una unidad a $posicion[$nummat] porque al final de la iteracion cuando ha
        # leido la ultima fila, se suma 1 al valor de la variable; cuando este valor ya era el  
	# correspondiente al numero de filas de la matriz.
	$consens[$nummat]="";
	$posicion[$nummat]--;

	if ($nummat-1 >= 0) {
	$tot_consenso[$nummat-1] = @consenso;
    }
	
		$nummat = $nummat +1;
		$posicion[$nummat] = 0;
		
		$nucleotidos[0] = $1;            #Capturamos los nucleotidos que encabezan la matriz
		$nucleotidos[1] = $2;
		$nucleotidos[2] = $3;
		$nucleotidos[3] = $4;

		@consenso= " ";
		$max[$nummat-1] = 0;
		next
		};
    
    
    if ($_=~ m/\A\d+\s+([\-\d\.]+)\s+([\-\d\.]+)\s+([\-\d\.]+)\s+([\-\d\.]+)/) {

	$matrices{$nummat}{$nucleotidos[0]}[$posicion[$nummat]] = $1;
	$matrices{$nummat}{$nucleotidos[1]}[$posicion[$nummat]] = $2;    #Capturamos los valores numericos para cada nucleotido/posicion
	$matrices{$nummat}{$nucleotidos[2]}[$posicion[$nummat]] = $3;
	$matrices{$nummat}{$nucleotidos[3]}[$posicion[$nummat]] = $4;
	
	
	$suma = $matrices{$nummat}{$nucleotidos[0]}[$posicion[$nummat]] + $matrices{$nummat}{$nucleotidos[1]}[$posicion[$nummat]] + $matrices{$nummat}{$nucleotidos[2]}[$posicion[$nummat]] + $matrices{$nummat}{$nucleotidos[3]}[$posicion[$nummat]];
	
	$matrices{$nummat}{$nucleotidos[0]}[$posicion[$nummat]] = $matrices{$nummat}{$nucleotidos[0]}[$posicion[$nummat]] / $suma;
	$matrices{$nummat}{$nucleotidos[1]}[$posicion[$nummat]] = $matrices{$nummat}{$nucleotidos[1]}[$posicion[$nummat]] / $suma;   #Normalizamos la matriz
	$matrices{$nummat}{$nucleotidos[2]}[$posicion[$nummat]] = $matrices{$nummat}{$nucleotidos[2]}[$posicion[$nummat]] / $suma;
	$matrices{$nummat}{$nucleotidos[3]}[$posicion[$nummat]] = $matrices{$nummat}{$nucleotidos[3]}[$posicion[$nummat]] / $suma;
	
################ Obtaining Consensus sequence and its value for each matrix #####################################################################################

	if ( $matrices{$nummat}{$nucleotidos[0]}[$posicion[$nummat]] >$matrices{$nummat}{$nucleotidos[1]}[$posicion[$nummat]] && $matrices{$nummat}{$nucleotidos[0]}[$posicion[$nummat]] > $matrices{$nummat}{$nucleotidos[2]}[$posicion[$nummat]] && $matrices{$nummat}{$nucleotidos[0]}[$posicion[$nummat]] > $matrices{$nummat}{$nucleotidos[3]}[$posicion[$nummat]]) {
	    
	    
	    $consens[$nummat-1] = $consens[$nummat-1]."A";
	    $consenso[$posicion[$nummat]] = "A";
	    $max[$nummat-1] = $max[$nummat-1] +$matrices{$nummat}{$nucleotidos[0]}[$posicion[$nummat]];
	    
	}
	
	
	if ( $matrices{$nummat}{$nucleotidos[1]}[$posicion[$nummat]] >$matrices{$nummat}{$nucleotidos[0]}[$posicion[$nummat]] && $matrices{$nummat}{$nucleotidos[1]}[$posicion[$nummat]] > $matrices{$nummat}{$nucleotidos[2]}[$posicion[$nummat]] && $matrices{$nummat}{$nucleotidos[1]}[$posicion[$nummat]] > $matrices{$nummat}{$nucleotidos[3]}[$posicion[$nummat]]) {

	    $consens[$nummat-1] = $consens[$nummat-1]."C";
	    $consenso[$posicion[$nummat]] = "C";	
	    $max[$nummat-1] = $max[$nummat-1] + $matrices{$nummat}{$nucleotidos[1]}[$posicion[$nummat]];
	}

	
	if ( $matrices{$nummat}{$nucleotidos[2]}[$posicion[$nummat]] >$matrices{$nummat}{$nucleotidos[0]}[$posicion[$nummat]] && $matrices{$nummat}{$nucleotidos[2]}[$posicion[$nummat]] > $matrices{$nummat}{$nucleotidos[1]}[$posicion[$nummat]] && $matrices{$nummat}{$nucleotidos[2]}[$posicion[$nummat]] > $matrices{$nummat}{$nucleotidos[3]}[$posicion[$nummat]]) {

	    $consens[$nummat-1] = $consens[$nummat-1]."G";
	    $consenso[$posicion[$nummat]] = "G";
	    $max[$nummat-1] = $max[$nummat-1] +  $matrices{$nummat}{$nucleotidos[2]}[$posicion[$nummat]];
	}

	
	if ( $matrices{$nummat}{$nucleotidos[3]}[$posicion[$nummat]] >$matrices{$nummat}{$nucleotidos[0]}[$posicion[$nummat]] && $matrices{$nummat}{$nucleotidos[3]}[$posicion[$nummat]] > $matrices{$nummat}{$nucleotidos[1]}[$posicion[$nummat]] && $matrices{$nummat}{$nucleotidos[3]}[$posicion[$nummat]] > $matrices{$nummat}{$nucleotidos[2]}[$posicion[$nummat]]) {

	    $consens[$nummat-1] = $consens[$nummat-1]."T";
	    $consenso[$posicion[$nummat]] = "T";
	    $max[$nummat-1] = $max[$nummat-1] + $matrices{$nummat}{$nucleotidos[3]}[$posicion[$nummat]];
	}

	if (scalar (@consenso) == $posicion[$nummat]) {
	     $consenso[$posicion[$nummat]] = "X";
	     $consens[$nummat-1] = $consens[$nummat-1]."X";

	     if ( $matrices{$nummat}{$nucleotidos[0]}[$posicion[$nummat]] >=  $matrices{$nummat}{$nucleotidos[1]}[$posicion[$nummat]]) {

		 $comp = $matrices{$nummat}{$nucleotidos[0]}[$posicion[$nummat]];
	     }
		 else {
		     $comp = $matrices{$nummat}{$nucleotidos[1]}[$posicion[$nummat]];
		 }
	     		

	     if ( $matrices{$nummat}{$nucleotidos[1]}[$posicion[$nummat]] < $matrices{$nummat}{$nucleotidos[2]}[$posicion[$nummat]]) {
		 $comp = $matrices{$nummat}{$nucleotidos[2]}[$posicion[$nummat]];
} 

	     
	     if ($matrices{$nummat}{$nucleotidos[2]}[$posicion[$nummat]] < $matrices{$nummat}{$nucleotidos[3]}[$posicion[$nummat]]){
		 $comp = $matrices{$nummat}{$nucleotidos[3]}[$posicion[$nummat]];
} 

 	     
	     $max[$nummat-1] = $max[$nummat-1] + $comp;
	 }
	
  }
    $posicion[$nummat] ++;

    next;
	

next if ($_ =~ m/\AXX\s+(\d+)/);  #Indicamos al programa que salte la linea si coincide con la expresion regular de fin de matriz


} #while


# restamos una unidad aqui por la misma razon que en el caso anterior pero este solo 
# afecta a la ultima de las matrices a la que no se le resta arriba, puesto que tras
# la ultima matriz no vuelve a haber un match para inicio de nueva matriz.

$posicion[$nummat]--;


$tot_consenso[$nummat - 1] = @consenso;

print_mess("$matriz file READ! --> There are $nummat matrix/-es");
print_mess("Information REARRANGED!");
print_mess("Consensus sequences obtained and stored!");
print_mess("Scores STORED!");


}#leer_fichero_matriz


################################## SEQUENCES EVALUATION USING EACH MATRIX ##################################################################


sub pasa_fasta_x_matriz() {
    my @bgcolor;
    $bgcolor[2] = "FF0000";
    $bgcolor[1] = "00FF00";
    $bgcolor[0] = "0000FF";
    $bgcolor[3] = "FF7F50";
    $bgcolor[4] = "808080";


    print OUTFILE "<br>\n"; 
    print OUTFILE "<br>\n"; 
    print OUTFILE "<u>RESULTS OBTAINED FROM CANDIDATE SEQUENCES</u>\n"; 
    print OUTFILE "<br>\n"; 
    print OUTFILE "<br>\n"; 
    print OUTFILE "<center><table bgcolor= 'dda0dd' border=2 cellspacing=2 cellpadding=2 width=80% ><th align='center' width=18%> SEQUENCE<th align='center' width=15%> MATRIX<th align='center' width=15%> FIRST POSITION <th align='center' width=15%> LAST POSITION <th align='center' width=20%> SCORE <th align='center' width=40%> HITSEQUENCE </table>\n"; 
   
   
    my $pinta_sec;
    my $k;
    my $n;
    my $p;
    my $av_sec;
    my $v;
    $hit = 0;

    print_mess("\n");
    print_mess("Processing sequence/-s with matrix/-es......");


    foreach $k (keys %secuencias) {
	
	   $pinta_sec = 0;
	print OUTGRAF "<table cellpadding='5'>\n";
	print OUTGRAF "<td align = 'right'><u>Sequence $k</u></td><tr>\n";
	
	while ($pinta_sec <  (scalar (@{$secuencias{$k}}))){
	    print OUTGRAF "<td width='2%' >$secuencias{$k}[$pinta_sec]</td>\n";
	    $pinta_sec ++;
	}#while
	   print OUTGRAF "<tr>\n";
	   print OUTGRAF "</table>\n";
	   print OUTGRAF "<br>\n";
	
	
	foreach $n (keys %matrices){  
	    
	    
	    $pmax{$k}[$n] =0;
	    $av_sec = 0;
	    $total{$k}[$n] = 0;
	    $pinta_sec = 0;
	    print OUTGRAF "<table cellpadding='5'>\n";
	    print OUTGRAF "<td align = 'right'> Matrix $n </td><tr>\n";
	    
	    while ($av_sec < (scalar(@{$secuencias{$k}}) - $posicion[$n])) {
		$v = 0;
		$p = 0;
		$hitparcial = " ";
		
		while ($v < $posicion[$n]) {
		    
		    $p = $p + ($matrices{$n}{$secuencias{$k}[$av_sec+$v]}[$v]/$max[$n-1]);
		    $hitparcial = $hitparcial.$secuencias{$k}[$av_sec+$v];
		    $v = $v + 1;
		}
		$hitsequence = $hitparcial;
		
		$thold = 0.8;
		if (exists($opt{t})) {
		    $thold = $opt{t};
		}
		
		if ($p>=$thold){
		    
		    print "HIT FOUND !!!!!!!!!!!!!!!!!!!!!\n";
		    print "SEQUENCE ID  \t\t HIT \t\t\t SCORE \t\t\t MATRIX\n";
		    print "-----------------------------------------------------------------------------------\n";
		    print "$k \t $hitsequence \t $p \t $n\n";
		    print "\n";
		   
		    
		    print OUTFILE "<center><table border=2 cellspacing=2 cellpadding=2 width=80% ><th align='center' width=15%> $k <th align='center' width=15%> $n <th align='center' width=15%> $av_sec <th align='center' width=15%>".( $av_sec+$v)." <th align='center' width=20%> $p <th align='center' width=40%> $hitsequence </table></center>\n"; 
		    
		    
		    $hit ++;
		    
		}
		
		
		if ($p >= $thold){
			
		#    print OUTGRAF "<tr>\n";
		    $pinta_sec = 0;
		    $counter_pinta = 0;
		   # while ($pinta_sec <= $av_sec+$v){

		    while ($pinta_sec <= $av_sec+$v){


			if ($pinta_sec < $av_sec) {
			    print OUTGRAF "<td width='2%'>&nbsp;</td>\n";
			}
			if ($pinta_sec == $av_sec){
			    
			    while ($counter_pinta <= $posicion[$n]){
				print OUTGRAF "<td width='2%' bgcolor='$bgcolor[$n]'>&nbsp;</td>\n";
				$counter_pinta ++;
			    }
			}
			

			$pinta_sec ++;
		    }#while	
		    }
		

		$total{$k}[$n] = $total{$k}[$n] + $p;  
		
		if ($pmax{$k}[$n] < $p){
		    $pmax{$k}[$n] = $p;
		    $prin_vent{$k}[$n] = $av_sec;
		    
		}#if del pmax
		    
		    
		    $av_sec = $av_sec + 1; 
		
	    }
	    
	    print OUTGRAF "</table>\n";  
	    
	} #foreach $nummat
	
	
    }#foreach sec
	
	if ($hit == 0){
	    print "NO HIT FOUND!!!!!\n";
	}
    
       
    print "Used THRESHOLD VALUE = $thold\n";
    print "\n";
    
    print "TWO HTML FILES HAVE BEEN GENERATED WITH RESULTS !!!! RESULTS: outfile.html GRAPHICAL VIEW: graphical.html (IN CURRENT DIRECTORY)\n";
    print "HOPE THEY'LL WORTH YOU!\n";


} #pasa_fasta_x_matriz


################################## PRINTING RESULTS #######################################################################


sub printa_results_matriz() {
    my $k;
    my $n;
    my $contador = 0;
    
    
    print OUTFILE "<U>CONSENSUS SEQUENCES OBTAINED FROM EACH MATRIX</U>\n";   
    print OUTFILE "<br>\n";  
    print OUTFILE "<center><table bgcolor='d3d3d3' border=2 cellspacing=2 cellpadding=2 width=80% ><th align='center' width=10%> MATRIX<th align='center' width=50%> CONSENSUS SEQUENCE<th align='center' width=70%> CONSENSUS SEQUENCE VALUE<tr></table>\n"; 
    
    
    while ($contador < $nummat){
	
	
	print OUTFILE "<table  border=2 cellspacing=2 cellpadding=2 width=80% ><td width=10% align='center'>".($contador+1)." <td width=50% align='center'>".($consens[$contador])."<td width=70% align='center'> $max[$contador]<tr></table>\n"; #print de las secuencias consenso en el fichero html
	$contador ++;
    }
    
    print OUTFILE "<i><p><font size=-1><blockquote>Consensus Sequences table. When appearing symbol X means that two or more nucleotides have equal scores in the same matrix being this score the maximum value for the position.</blockquote></font></i>\n"; 
    print OUTFILE "</center>\n"; 
    
    
} #printa_results_matriz   


#######################################################################################
#############################  GETOPTS #############################################
#######################################################################################


              #####################################################
                 #  doing things according to the options selected
              #####################################################

sub opt_m {
    if (exists($opt{m})) {
	# doing things for the "m" option
	print "OPTION m: selected!\n";
	print "\n";
	print "_____________INFORMATION ABOUT MATRIXES______________\n";
	print "\n";
	print "number of matrixes contained in $matrixfile = $nummat\n";
	print "\n";
	print "\t _____CONSENSUS SEQUENCES_______\n";
	my $r=0;
	while ($r < $nummat) {
	    print "matrix ".($r+1)." consensus sequence: ".($consens[$r])."  associated value: $max[$r]\n";
	    $r ++;
	}
    }
}#opt_m
    
    sub opt_s {
	if (exists($opt{s})) {
	    # doing things for the "s" option
	print "OPTION s: selected!\n";
	print "\n";
	print "\n";
	print "_____________INFORMATION ABOUT SEQUENCES______________\n";
	my $k;
	foreach $k (keys %secuencias){
	    
	    my  $numgc = 0;
	    my  $congc = 0;
	    
	    while ($congc < (scalar (@{$secuencias{$k}}))){
		
		if ($secuencias{$k}[$congc] eq "G" || $secuencias{$k}[$congc] eq "C") {
		    $numgc ++;
		}
		$congc ++;
	    }
	    print "For sequence--> $k:\n";
	    print "The content of G and C nucleotides is $numgc and its lenght is ".(scalar (@{$secuencias{$k}}))."\n";
	    print "So the percentage of G+C is ".($numgc/scalar (@{$secuencias{$k}}))."\n";
	    print "\n";
	    
	    
	}
	print_mess("\n");
    } 
    }#opt_s
    
    sub opt_t{
	if (exists($opt{t})) {
# doing things for the "t" option (the associated value is at $opt{c}
	    $thold = $opt{t};
	    
	    print_mess("OPTION t: selected and associated value is $opt{t}!");
	}
	else {
	    $thold = 0.8;
	    print_mess("treeshold value is 0.8 if you want to run the program with another treeshold call it: promfinder.pl -c treeshold_value sequences_file matrixes_file")
	    }
    }#opt_t
    
############ Subroutines to print messages (info) or errors(break the execution)
    
# INFO: print information about different stages of the processing
# The info is always send to the Standard Error!
# option "v" (verbose) allows to see the messages on the screen
    sub print_mess
{
    my @mess = @_;
    
    print STDERR ("\t: @mess \n") if (exists($opt{v}));
}

# ERROR: print message and quit the program
sub print_error
{
    my @mess = @_;
    
    print STDERR ("\t: @mess \n") and exit();
}


################################################################################################################################################
################################################################################################################################################
######################################################   THE  END     ##########################################################################
################################################################################################################################################