#!/usr/bin/perl -w
use strict;

# provatab.pl
#
# Usage: provatab.pl sequences.fa exonerate.gff species.txt
#   sequences.fa   FASTA file with the sequences we have
#   exonerate.gff  exonerate results
#   species.txt    list of names, one per line (apparently genus names, chosen
#                  to fit the way the sequences were named -- TODO confirm)
if (scalar(@ARGV) < 3) {
    print "provatab.pl sequences.fa exonerate.gff species.txt\n";
    exit(1);
}

my $sequencia       = $ARGV[0];
my $fitxerexonerate = $ARGV[1];
my $especies        = $ARGV[2];

###### Read the sequences file and save it in hashes ######
open(my $seq_fh, '<', $sequencia) or do {
    print "provatab.pl:impossible obrir $sequencia\n";
    exit(1);
};

my %seq;    # sequence name => sequence text (all its lines concatenated)
my $nom;    # name of the FASTA record currently being read
my $seq;
my @cap;    # whitespace-separated words of the current header line
my %cap;    # sequence name => 4th word of its header line (the gene location)

while (my $line = <$seq_fh>) {
    chomp($line);
    if ($line =~ m/\A\>(\w+).*\Z/) {
        # Header line: the name is the run of word characters after '>'.
        $nom       = $1;
        @cap       = split(' ', $line);
        $cap{$nom} = $cap[3];
        $seq{$nom} = "";
    }
    else {
        # Sequence line: append it to the record opened by the last header.
        $seq{$nom} = $seq{$nom} . $line;
    }
}
close($seq_fh);
my @keys = keys(%seq);

#### Read the species file and save its lines in an array ####
open(my $esp_fh, '<', $especies) or do {
    print "provatab.pl:impossible obrir $especies\n";
    exit(1);
};

my $i = 0;    # general-purpose counter, reused by the later sections
my @especies;
while (my $line = <$esp_fh>) {
    chomp($line);
    $especies[$i] = $line;
    $i = $i + 1;
}
close($esp_fh);

####### Compare the list of species with the keys of the hash to know
####### which orthologous genes were found
print "##especies\torthologous\tlocation\n";
$i = 0;
my @trobat_ortoleg;
my @paraules;
my $espgen;

# NOTE(review): this copy of the script is damaged at this point. The text
# that survives is
#     while ($i){ ##we pick up the gff lines and save them in an array
# so everything between the condition of the species loop and the header of
# the loop that reads the exonerate file is missing. That includes the loop
# body that compared @especies with the keys of %seq and printed the table
# rows. That loop is NOT reconstructed here; restore it from an intact copy.
#
# The declarations below are not in the surviving text either, but the code
# that follows uses these variables without `my` under `use strict`, so they
# must have been declared in the missing span.
my @ultlinia;    # exonerate output lines that start with "vulgar"
my @gff;         # exonerate GFF lines (exonerate:coding2genome, not comments)
my @resultat;    # intron matrix; row 0 holds the column titles
my @linia;
my $j;
my ($donor, $start, $longitud, $start_reverse, $end_reverse);

# Opening the exonerate file: inferred from the surviving close(EXONERATE) and
# from $fitxerexonerate; the error message follows the two above -- TODO
# confirm against an intact copy.
open(my $exonerate_fh, '<', $fitxerexonerate) or do {
    print "provatab.pl:impossible obrir $fitxerexonerate\n";
    exit(1);
};

## Pick up the gff lines and save them in an array (lines are kept with
## their trailing newline).
while (my $line = <$exonerate_fh>) {
    if ($line =~ /\Avulgar/) {
        push @ultlinia, $line;
    }
    if ($line =~ /exonerate:coding2genome/ && $line !~ /^\#/) {
        push @gff, $line;
    }
}
close($exonerate_fh);

###### Now work on the gff part to get the parts we are interested in ######
#### The first row of the matrix holds the column titles.
$resultat[0] = [
    "name",
    "intron start end",
    "donor site",
    "acceptor site",
    "5'",
    "3'",
    "strand",
];
# Walk the exonerate GFF lines collected in @gff and fill @resultat with one
# row per intron. $j is the row being filled and $i the next column to fill.
# Columns (titles are stored in row 0):
#   0 name            - field 0 of the splice5 line
#   1 "start,end"     - fields 3 and 4 of the intron line
#   2 donor site      - 14 bases cut from the sequence around the splice5 line
#   3 acceptor site   - 45 bases cut from the sequence around the splice3 line
#   4 5'              - field 12 of the splice5 line
#   5 3'              - field 12 of the splice3 line
#   6 strand          - field 6 of the splice3 line
# The column positions only line up if the lines of each intron arrive in the
# order splice5, intron, splice3 -- presumably exonerate's output order;
# TODO confirm.
# NOTE(review): longitud() is not defined in the visible part of this file;
# it is used as if it returned the length of the sequence -- confirm.
my $k = 0;
$i = 0;
$j = 1;
while ($k < scalar(@gff)) {
    @linia = split(/\s+/, $gff[$k]);    # the line as whitespace-separated fields

    if ($linia[2] eq "splice5" && $linia[6] eq "\+") {
        ## splice5 on the positive strand
        $resultat[$j][$i] = $linia[0];    # the name
        $i++;
        # Donor sequence and 5' value are only remembered here; they are
        # written into the row when the matching splice3 line is reached.
        $donor = substr($seq{$linia[0]}, $linia[3]-5, 14);
        $start = $linia[12];
    }
    elsif ($linia[2] eq "splice5" && $linia[6] eq "\-") {
        ## the same for the negative strand
        $resultat[$j][$i] = $linia[0];
        $i++;
        # Convert the coordinate (length - position + 1), cut the sequence
        # there, and keep its reverse complement.
        $longitud      = longitud ($seq{$linia[0]});
        $start_reverse = $longitud - $linia[3]+1;
        $donor         = substr($seq{$linia[0]}, $start_reverse-10, 14);
        $donor =~ tr/ACTGactgNn/TGACtgacNn/;
        $donor = reverse ($donor);
        $start = $linia[12];
    }
    elsif ($linia[2] eq "intron") {
        $resultat[$j][$i] = "$linia[3],$linia[4]";    # start,end of the intron
        $i++;
    }
    elsif ($linia[2] eq "splice3" && $linia[6] eq "\+") {
        ## splice3 on the positive strand: finish the row
        $resultat[$j][$i] = $donor;
        $i++;
        $resultat[$j][$i] = substr($seq{$linia[0]}, $linia[4]-40, 45);
        $i++;
        $resultat[$j][$i] = $start;
        $i++;
        $resultat[$j][$i] = $linia[12];
        $i++;
        $resultat[$j][$i] = $linia[6];
        $j++;      # next intron goes into the next row ...
        $i = 0;    # ... starting again at column 0
    }
    elsif ($linia[2] eq "splice3" && $linia[6] eq "\-") {
        ## splice3 on the negative strand: finish the row
        $resultat[$j][$i] = $donor;
        $i++;
        # Same coordinate conversion and reverse complement as for the
        # negative-strand donor.
        $longitud    = longitud ($seq{$linia[0]});
        $end_reverse = $longitud - $linia[4]+1;
        $resultat[$j][$i] = substr($seq{$linia[0]}, $end_reverse-6, 45);
        $resultat[$j][$i] =~ tr/ACTGactgNn/TGACtgacNn/;
        $resultat[$j][$i] = reverse ($resultat[$j][$i]);
        $i++;
        $resultat[$j][$i] = $start;
        $i++;
        $resultat[$j][$i] = $linia[12];
        $i++;
        $resultat[$j][$i] = $linia[6];
        $j++;
        $i = 0;
    }

    $k = $k+1;
}

print "##orthologous introns found\n";
$i = 0;
# NOTE(review): the statement below is damaged in this copy of the file -- the
# text between `$i` and `1` appears to be missing. It is kept byte-for-byte
# (it continues on the next line of the file); restore it from an intact copy.
while ($i1){ open (FASTADON, 
">donors.fa"); $j = 1; while ($j$resultat[$j][0]\n$resultat[$j][2]\n"; ####we save the name of the sequence and the donor sequence $j++; } close (FASTADON); open (FASTAACC, ">acceptors.fa"); ####the same with the acceptor sequences $j = 1; while ($j$resultat[$j][0]\n$resultat[$j][3]\n"; $j++; } close (FASTAACC); `./u12tools/geneid -G -do -P ./u12tools/paramnous.param donors.fa > geneid_donors.gff`; ##looking for donors with geneid `./u12tools/geneid -G -ao -P ./u12tools/paramnous.param acceptors.fa > geneid_acceptors.gff`; ##looking for acceptors with geneid `./u12tools/wmd.pl ./u12tools/LD404Burge27BS13.plain acceptors.fa > branchos`; ##looking for branch sites if (!open GFFDON, "){ if ($_ !~ /\A\#/){ push @gff2, $_; } } close (GFFDON); my @sites; ##this matrix will have the donor and acceptor types and score and the branch sequence and its position $sites[0][0]= "name";$sites[0][1]= "donor prediction";$sites[0][2]="donor score";$sites[0][3]="acceptor prediction";$sites[0][4]="acceptor score";$sites[0][5]="branch sequence";$sites[0][6]="position of branch point (in intron)"; $i=0;$j=1; @linia = ""; while ($i){ if ($_ !~ /\A\#/){ push @gff3, $_; } } close (GFFACC); $i=0;@linia = "";my @sites_no;my $p=0; ##we read the output for the acceptors while($i){ @linia = split(/\t/, $_); $j=0;my $e=0; while ($j0){ print "\n##acceptors predicted with geneid without a donor of the same type\n"; } $i=0; while ($i0){ print "\n##branch sites without a donor pair\n"; } $i=0; while ($i