#!/usr/bin/perl -w

use strict;

if (scalar(@ARGV) < 3){
    print "provatab.pl sequences.fa exonerate.gff species.txt\n";     #a file with the sequences we have, exonerate results,
    exit(1);                                                            #list of generous (it was the best way 
}                                                                       #according to the way we had the names of the sequences)

my $sequencia = $ARGV[0];
my $fitxerexonerate = $ARGV[1];
my $especies = $ARGV[2];

######we read the sequences file and save it in a hash#####

if (!open (SEQ,"< $sequencia")){                           
    print "provatab.pl:impossible obrir $sequencia\n";
    exit(1);
}

my %seq;                                                 #the hash will have the name of the sequence as the key
my $nom;                                                 #and the sequence as the value. 
my $seq;                              
my @cap; my %cap;                                        #this hash will have the location of the gene.
while (<SEQ>){                                         
    chomp ($_);
    if ($_ =~ m/\A\>(\w+).*\Z/){
	@cap = split;                                  #in the 4th position of the array we'll have the location.
	$nom = $1;
	$cap{$nom} = $cap[3];                            
	$seq{$nom}="";
    }
   
    else {
   	$seq{$nom} = $seq{$nom}.$_;
    }
  
}

close(SEQ);

my @keys = keys(%seq);

####we read the species file and we save them in an array####

if (!open (ESP, "< $especies")){
    print "provatab.pl:impossible obrir $especies\n";
    exit(1);
}

my $i=0; my @especies;
while (<ESP>){
    $especies[$i]=$_;
    chomp($especies[$i]);
    $i=$i+1;
}
close(ESP);


####### we compare the list of species with the keys of the hash to know which ortologous gene we found#######

print "##especies\torthologous\tlocation\n";
$i=0; my @trobat_ortoleg;my @paraules;my $espgen;
while ($i<scalar@especies){
    ($trobat_ortoleg[$i], $espgen) = ortoleg(\@keys,$especies[$i]);        #the ortoleg function is below, after "FUNCTIONS"
    if ($trobat_ortoleg[$i] !~ /not/){                                     #if we found the gene, we'll print the location
	print "$trobat_ortoleg[$i]\t$cap{$espgen}\n";
    }
    else{
	print "$trobat_ortoleg[$i]\n";
    }
    $i= $i +1;
}
print "\n";

#### now, we're going to read the exonerate file####
if (!open(EXONERATE,"< $fitxerexonerate")){
    print "provatab.pl: impossible obrir $fitxerexonerate\n";
exit(1);
}


####we read the whole file and save the gff lines####
my @linia;                                               ##this array will have the elements of a gff line
my @resultat;                                            ##this is a matrix whith the things we're interested in. Each line is an intron
my $j;my @gff;my @ultlinia;
my $donor; my $start; my $longitud; my $start_reverse; my $end_reverse;
while (<EXONERATE>){                                      ##we pick up the gff lines and save them in an array
	if (/\Avulgar/){
	    push @ultlinia, $_;
	}
    if (/exonerate:coding2genome/ && $_ !~ /^\#/){                            
	push @gff, $_;
    }
    }
close(EXONERATE);

######now, we'll work on the gff part to get the parts we are interested in#####

$resultat[0][0]= "name";  $resultat[0][1]="intron start end"; $resultat[0][2]="donor site"; $resultat[0][3]= "acceptor site"; $resultat[0][4] = "5'"; $resultat[0][5]= "3'"; $resultat[0][6]= "strand";   #### the first line of the matrix, has the titles.

my $k= 0; $i=0; $j=1;
while ($k < scalar(@gff)){
  
   @linia = split(/\s+/,$gff[$k]);  #we save the line in an array.
    
                                    #We'll construct a matrix with information about the introns: name, start, end, donor, termini
   if ($linia[2] eq "splice5" && $linia[6] eq "\+"){    ##when we have a line with splice5 and positive strand
   $resultat[$j][$i]= $linia[0];                        #the name
   $i=$i+1;
   $donor= substr($seq{$linia[0]}, $linia[3]-5, 14);    #the donor sequence. We save it for the moment,we want it in another place 
   $start= $linia[12];                                  #place in the matrix. The same for the termini
   }
   if ($linia[2] eq "splice5" && $linia[6] eq "\-"){    ##the same for the negative strand 
   $resultat[$j][$i]= $linia[0];
   $i=$i+1;
   $longitud= longitud ($seq{$linia[0]});
   $start_reverse = $longitud - $linia[3]+1;                 #we change the coordinates, look for the sequence, and save the reverse comp
   $donor = substr($seq{$linia[0]}, $start_reverse-10, 14);   
   $donor =~ tr/ACTGactgNn/TGACtgacNn/;  
   $donor = reverse ($donor);
   $start= $linia[12];                                      
   }
   if ($linia[2] eq "intron"){
	$resultat[$j][$i] = "$linia[3],$linia[4]";             ###start, end of the intron 
	$i= $i + 1;
   }
   if ($linia[2] eq "splice3" && $linia[6] eq "\+"){                    ####the same with the splice site
       $resultat[$j][$i] = $donor;
       $i= $i + 1;
       $resultat[$j][$i] = substr($seq{$linia[0]}, $linia[4]-40, 45);
       $i= $i + 1;
       $resultat[$j][$i] = $start;
       $i = $i + 1;
       $resultat[$j][$i] = $linia[12];
       $i = $i +1;
       $resultat[$j][$i] = $linia[6];
       $j = $j + 1;
       $i = 0;
  }
   if ($linia[2] eq "splice3" && $linia[6] eq "\-"){
       $resultat[$j][$i] = $donor;
       $i= $i + 1;
       $longitud = longitud ($seq{$linia[0]});
       $end_reverse = $longitud - $linia[4]+1;
       $resultat[$j][$i] = substr($seq{$linia[0]}, $end_reverse-6, 45);
       $resultat[$j][$i] =~ tr/ACTGactgNn/TGACtgacNn/;
       $resultat[$j][$i] = reverse ($resultat[$j][$i]);
       $i= $i + 1;
       $resultat[$j][$i] = $start;
       $i = $i + 1;
       $resultat[$j][$i] = $linia[12];
       $i = $i +1;
       $resultat[$j][$i] = $linia[6];
       $j = $j + 1;
       $i = 0;
  } 
 $k = $k+1;  
}



print "##orthologous introns found\n";$i=0;
while ($i<scalar(@resultat)){                                            ####we print the matrix
    $j=0;
    while ($j<scalar(@{$resultat[$i]})){
	print "$resultat[$i][$j]";
	print "\t";
	$j=$j+1;
    }
    print "\n";
    $i=$i+1;
}


print "\n";$k=0;
while ($k<scalar(@ultlinia)){                                ##here we work on the line starting with "vulgar" in the output of exonerate
    my @vulgar = split (/\s+/, $ultlinia[$k]);               
    $j=0; my $e=0;
    while ($j<scalar(@resultat)){
	if ($vulgar[5] eq $resultat[$j][0]){                ##we compare the names with the ones we have in the matrix 
	    $e=1;
	}
	$j++;
    }
    if ($vulgar[4] eq "\-"){
	print "##query in the negative strand\n$vulgar[5]\n";       ##if the query is in the negative strand, we want to know it
    }
	if($e == 0){                                        ##if we didn't find an orthologous intron we look for an alignment
	my $coverage = $vulgar[11]/120*100;
	print "##found alignment,not intron\n$vulgar[5]\tcoverage:\t$coverage%\n";  
    }
    $k++;
}


###################FUNCTIONS###################

sub ortoleg{                                                 #this is to compare list of species we searched in and the genes we have
my @keys = @{$_[0]};                                         #we compare only the 3 first characters because they are always 
my $nom  = $_[1];                                            #in our files. Using more letters we'll have problems. Maybe it's not pretty
my $nomgentallat; my $especietallat; my $espgen;               #but it works
my $i= 0;                                                    ###we'll see how to print this.
my $ortoleg = "$nom\tnot found orthologous gene";
while ($i<scalar(@keys)){
    $nomgentallat= substr($keys[$i],0,3);                                
    $especietallat= substr($nom,0,3);                                    
    if ($nomgentallat eq $especietallat){
	$ortoleg = "$nom\tfound orthologous gene";
	$espgen = $keys[$i];
    }
    $i= $i + 1;
}

return $ortoleg, $espgen;
}



sub longitud{
    my @sequencia = split(//,$_[0]);                          ###just to know the length of the sequence
    return scalar(@sequencia);
}

#################################

####we'll save the donor and acceptor sequence in fasta format and run geneid and the program to look for branch points###
if (scalar(@resultat)>1){
  open (FASTADON, ">donors.fa");
    $j = 1;
  while ($j<scalar(@resultat)){
    print FASTADON ">$resultat[$j][0]\n$resultat[$j][2]\n";        ####we save the name of the sequence and the donor sequence
    $j++;
}
close (FASTADON);

open (FASTAACC, ">acceptors.fa");                                  ####the same with the acceptor sequences
$j = 1;
while ($j<scalar(@resultat)){
    print FASTAACC ">$resultat[$j][0]\n$resultat[$j][3]\n";
    $j++;
}
close (FASTAACC);

`./u12tools/geneid -G -do -P ./u12tools/paramnous.param donors.fa > geneid_donors.gff`;       ##looking for donors with geneid

`./u12tools/geneid -G -ao -P ./u12tools/paramnous.param acceptors.fa > geneid_acceptors.gff`; ##looking for acceptors with geneid

`./u12tools/wmd.pl ./u12tools/LD404Burge27BS13.plain acceptors.fa > branchos`;                ##looking for branch sites

if (!open GFFDON, "<geneid_donors.gff"){                                                ##we'll parse the geneid output for donors
    print "impossible obrir geneid_donors.gff\n";
    exit(1);
}

my @gff2;
while (<GFFDON>){
 if ($_ !~ /\A\#/){
	push @gff2, $_;
    }
}
close (GFFDON);

my @sites;                ##this matrix will have the donor and acceptor types and score and the branch sequence and its position
$sites[0][0]= "name";$sites[0][1]= "donor prediction";$sites[0][2]="donor score";$sites[0][3]="acceptor prediction";$sites[0][4]="acceptor score";$sites[0][5]="branch sequence";$sites[0][6]="position of branch point (in intron)";
 $i=0;$j=1; @linia = "";
while ($i<scalar(@gff2)){                                       ##we read the lines and save the name, type of intron predicted and score
    @linia = split(/\t/, $gff2[$i]);
    if ($linia[3] == 4 && $linia[6] eq "\+"){                   ##4 is the position of the end of the exon
	$sites[$j][0] = $linia[0];
	$sites[$j][1] = $linia[2];
	$sites[$j][2] = $linia[5];
	$j++;
    }
   $i++;
}

if (!open GFFACC, "<geneid_acceptors.gff"){
    print "impossible obrir geneid_acceptors.gff\n";
    exit(1);
}

my @gff3;
while (<GFFACC>){
    if ($_ !~ /\A\#/){
	push @gff3, $_;
    }
}
close (GFFACC);

$i=0;@linia = "";my @sites_no;my $p=0;                           ##we read the output for the acceptors
while($i<scalar(@gff3)){
    @linia = split(/\t/, $gff3[$i]);
    if ($linia[3] == 40 && $linia[6] eq "\+"){                  ##the end of intron is position 40 and strand must be +
	my $type_acc = substr ($linia[2],9);                    ##to compare the types of donor and acceptor predicted
	$j = 0; my $e=0;                                        ##if they are the same we'll put the acceptor in the same line of the
	    while ($j < scalar(@sites)){                        ##matrix than the donor
		my $type_don = substr ($sites[$j][1],6);                    
		if ($linia[0] eq $sites[$j][0] && $type_acc eq $type_don){          
       		    $sites[$j][3] = $linia[2];
		    $sites[$j][4] = $linia[5];
		    $e = 1;
		}
		$j++
		}
	     if($e == 0){
		    $sites_no[$p] = "$linia[0]\t$linia[2]\t$linia[5]";         ##if they are not the same, we'll save them in an array
		    $p++;
		}  
   }
	$i++;
}

if (!open BRANCH, "<branchos"){
    print "impossible obrir branchos\n";
    exit(1);
}

my @branchos_no;my $q=0;my $posbranch;
while (<BRANCH>){
    @linia = split(/\t/, $_);
    $j=0;my $e=0;
    while ($j<scalar(@sites)){                                        ##with the branch points we do the same, but we only 
	if ($linia[0] eq $sites[$j][0]){                              ##compare the names
	    $sites[$j][5] = $linia[3];                                 ##this is to know the position of branch point:
	    $sites[$j][6] = -(41 - ($linia[2]+10));                                ##$linia[2] has the start of the consensus, and 10 
	    $e=1;                                                     ##is the offset to the branch point. 41 is the start of exon
	}
	$j++;
    }
     if ($e==0){
	$posbranch = $linia[2]+10-41;
	$branchos_no[$q] = "$linia[0]\t$posbranch\t$linia[3]";
	$q++;
    }
}
    close (BRANCH);

print "\n##donor and acceptor sites predictions in geneid, branch point sequence and position of it in the intron.\n\n"; 
$i=0;
while ($i<scalar(@sites)){                                            ####we print the matrix
    $j=0;
    while ($j<scalar(@{$sites[$i]})){
	print "$sites[$i][$j]";
	if (!$sites[$i][$j]){                                         ##we compared acceptors and branch points to donors, so we
	    print "\t";                                                 ##can have the position of acceptors empty.
	}
	print "\t";
	$j=$j+1;
    }
    print "\n";
    $i=$i+1;
}

if (scalar(@sites_no)>0){
print "\n##acceptors predicted with geneid without a donor of the same type\n";
}
$i=0;
while ($i<scalar(@sites_no)){
    print "$sites_no[$i]\n";
    $i++;
}

if (scalar(@branchos_no)>0){
print "\n##branch sites without a donor pair\n";
}
$i=0;
while ($i<scalar(@branchos_no)){
    print "$branchos_no[$i]\n";
    $i++;
}
}
