Predictor.pl

Predictor.pl Version 1.0


#!/usr/bin/perl -w

# predictor.pl - run geneid on every sequence of a multi-FASTA file.
#
# Usage: ./predictor.pl <input file> <output file> <geneid parameter file>
#
# The input file is read in full and split into records at every "\n>"
# boundary. Each record is written in turn to the scratch file mid.fa and
# bin/geneid is run on it. geneid's standard output is appended to the output
# file (an existing file is extended, not overwritten) and its standard error
# is appended to t.error. The scratch file is removed at the end.

use strict;
use warnings;

# Wrap a string in single quotes so the shell treats it as one literal word,
# even if it contains spaces or shell metacharacters.
sub shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# Clear the screen.
print `clear`;

# All three arguments (input, output, parameter file) are mandatory; check
# before touching @ARGV so we never work with undefined file names.
if (scalar(@ARGV) < 3) {
    print "Error de format: ./predictor.pl [nom fitxer entrada] [nom fitxer sortida] [ruta d'acc-bés al fitxer de parametres] \n";
    print "\n";
    exit(1);
}

my ($entrada, $sortida, $param) = @ARGV;

# Scratch file handed to geneid, rewritten for every sequence.
my $scratch = 'mid.fa';

# Open the input for reading; report and exit if that is not possible.
my $dades;
if (!open($dades, '<', $entrada)) {
    print "predictor.pl: impossible obrir $entrada\n";
    print "\n";
    exit(1);
}

# Slurp the whole input into one string. It starts out truly empty so the
# first header keeps its ">" in column 0.
my $seq_linial = do { local $/; <$dades> };
$seq_linial = '' unless defined $seq_linial;
close($dades);

# Split at each "newline followed by >". The separator is consumed, so every
# record except the first has lost its leading ">".
my @cdna = split(/\n>/, $seq_linial);

my $e = 0;    # Number of the sequence being processed (1-based).

for my $i (0 .. $#cdna) {

    # Skip whitespace-only records (e.g. produced by a leading blank line):
    # there is nothing for geneid to predict on.
    next if $cdna[$i] !~ /\S/;
    $e++;

    # Restore the ">" removed by split on every record but the first.
    my $record = $i == 0 ? $cdna[$i] : ">$cdna[$i]";

    open(my $mig, '>', $scratch)
        or die "predictor.pl: impossible escriure $scratch: $!\n";
    print {$mig} $record;
    print "llegint sequencia $e\n";

    # Close before running geneid so the data is flushed to disk.
    close($mig)
        or die "predictor.pl: impossible escriure $scratch: $!\n";

    # The redirections need a shell, so the user-supplied paths are quoted.
    my $command = sprintf(
        'bin/geneid -vP %s %s >> %s 2>> t.error',
        shell_quote($param), $scratch, shell_quote($sortida)
    );
    system($command) == 0
        or warn "predictor.pl: geneid ha fallat a la sequencia $e (estat $?)\n";
}

# Remove the scratch file; it does not exist if no sequence was processed.
if (-e $scratch) {
    unlink($scratch)
        or warn "predictor.pl: impossible esborrar $scratch: $!\n";
}

print "*************************** DONE ****************************\n";