Predictor2.0.pl

Predictor.pl Version 2.0



#!/usr/bin/perl -w

# Predictor.pl, version 2.0.
#
# Reads a multi-FASTA input file, splits it into individual records, writes
# each record in turn to the scratch file mid.fa and runs bin/geneid on it.
# The standard output of every geneid run is appended to the output file
# chosen by the user; its standard error is appended to t.error.
#
# Usage: ./predictor.pl <input file> <output file> <parameter file>

use strict;
use warnings;

# Clear the screen.
print `clear`;

# Three arguments are required: input file, output file and parameter file.
# If fewer are given, show the usage message and exit.
if (scalar(@ARGV) < 3) {
    print "Error de format: ./predictor.pl [nom fitxer entrada] [nom fitxer sortida] [ruta d'acc-bés del fitxer de paràmetres]\n";
    print "\n";
    exit(1);
}

my ($entrada, $sortida, $param) = @ARGV;

# Open the input file for reading; report the problem and exit if that fails.
my $dades;
if (!open($dades, '<', $entrada)) {
    print "predictor.pl: impossible obrir $entrada\n";
    print "\n";
    exit(1);
}

# Read the whole input file into a single string, then release the handle:
# nothing below reads from it again.
my $seq_linial = do { local $/; <$dades> };
$seq_linial = "" unless defined $seq_linial;
close($dades);

# Split on "newline followed by >" so that each element holds one record.
# The separator is consumed by split, so every element except the first has
# lost its leading ">" (the first keeps it when the file starts with ">").
my @cdna = split(/\n>/, $seq_linial);

my $e = 0;    # number of the sequence being processed (1-based)

for my $i (0 .. $#cdna) {

    # Put back the ">" that split removed from every record but the first.
    my $registre = $i == 0 ? $cdna[$i] : ">$cdna[$i]";

    # The sequence ID names the matching .gff file. Only spaces and tabs are
    # allowed between ">" and the ID, so an empty header line can never pick
    # up the first line of the sequence by mistake.
    # NOTE(review): the ID is taken to be the first whitespace-delimited word
    # of the header line -- confirm this matches how the .gff files are named.
    my ($id) = $registre =~ /^>[ \t]*(\S+)/;

    # A chunk without a header (for example blank lines before the first
    # ">") is not a FASTA record, so there is nothing to hand to geneid.
    if (!defined $id) {
        warn "predictor.pl: s'omet un bloc sense identificador FASTA\n";
        next;
    }

    $e = $e + 1;

    # Write the record to the scratch file and close it before geneid runs,
    # so that the data is flushed to disk.
    open(my $mig, '>', 'mid.fa')
        or die "predictor.pl: impossible obrir mid.fa: $!\n";
    print {$mig} $registre
        or die "predictor.pl: impossible escriure mid.fa: $!\n";
    close($mig)
        or die "predictor.pl: impossible escriure mid.fa: $!\n";

    print "llegint sequencia $e\n";    # progress message

    # Run geneid on mid.fa. Its output is appended to the output file, so an
    # earlier prediction already stored there is kept. Every user-supplied
    # value is quoted: the shell must see each one as a single word.
    my $ordre = sprintf(
        'bin/geneid -vP %s -R %s mid.fa >> %s 2>> t.error',
        shell_quote($param),
        shell_quote("$id.gff"),
        shell_quote($sortida),
    );
    system($ordre);

    # A failed run is reported but does not stop the remaining sequences.
    if ($? != 0) {
        warn "predictor.pl: geneid ha fallat per a la sequencia $e (estat $?)\n";
    }
}

# Remove the scratch file; it does not exist when no record was processed.
if (-e 'mid.fa') {
    unlink('mid.fa')
        or warn "predictor.pl: impossible esborrar mid.fa: $!\n";
}

print "*************************** DONE ****************************\n";

# shell_quote($text)
#
# Returns $text wrapped in single quotes so that a POSIX shell treats it as
# one literal word. A single quote inside $text is written as '\'' (close
# the quoted string, emit an escaped quote, reopen the quoted string).
sub shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}