Predictor.pl Version 2.0


#!/usr/bin/perl -w
#
# predictor.pl - run geneid on every sequence of a multi-FASTA file.
#
# Usage: ./predictor.pl <input fasta> <output file> <geneid parameter file>
#
# The input file is split into individual FASTA records.  Each record is
# written to a scratch file (mid.fa) and geneid is run on it; the prediction
# is appended to the output file and geneid's stderr is appended to t.error.
# For every record geneid is also given "-R <id>.gff", where <id> is the
# record's header line without the leading ">".
#
# NOTE(review): the original command line contained `-R $id,".gff"`, which
# the shell receives as "<id>,.gff".  The original comments talk about "the
# names of gff files" matching the sequence ID, so this version passes
# "<id>.gff" -- confirm against the actual file naming.

use strict;
use warnings;

# Scratch FASTA file handed to geneid, one sequence at a time.
my $TMP_FASTA = 'mid.fa';
# geneid executable, relative to the current working directory.
my $GENEID    = 'bin/geneid';
# File that collects geneid's stderr across all runs.
my $ERROR_LOG = 't.error';

# Clear the screen.
print `clear`;

# All three arguments are mandatory: input, output and parameter file.
if (scalar(@ARGV) < 3) {
    print "Error de format: ./predictor.pl [nom fitxer entrada] [nom fitxer sortida] [ruta d'acc-bés del fitxer de paràmetres]\n";
    print "\n";
    exit(1);
}

my ($entrada, $sortida, $param) = @ARGV;

# Slurp the whole input file; it is split into records below.
my $dades;
if (!open($dades, '<', $entrada)) {
    print "predictor.pl: impossible obrir $entrada\n";
    print "\n";
    exit(1);
}
my $seq_linial = do { local $/; <$dades> };
$seq_linial = '' unless defined $seq_linial;
close($dades);

# Splitting on "\n>" removes the ">" from every record except the first one
# (which has no preceding newline), so it is put back for the others.
my @cdna = split(/\n>/, $seq_linial);

for my $i (0 .. $#cdna) {
    my $e      = $i + 1;    # 1-based sequence number shown to the user
    my $record = $i == 0 ? $cdna[$i] : ">$cdna[$i]";

    print "llegint sequencia $e\n";

    # Only the first chunk can lack a header (e.g. text or a blank line
    # before the first ">"); there is no ID to build the gff name from.
    my $id = header_id($record);
    if (!defined $id) {
        warn "predictor.pl: sequencia $e sense capcalera '>', s'omet\n";
        next;
    }

    write_fasta($TMP_FASTA, $record);
    run_geneid($param, "$id.gff", $TMP_FASTA, $sortida);
}

# Remove the scratch file; it does not exist when no record was processed.
if (-e $TMP_FASTA) {
    unlink($TMP_FASTA)
        or warn "predictor.pl: impossible esborrar $TMP_FASTA: $!\n";
}

print "*************************** DONE ****************************\n";

# Return the header line of a FASTA record without the leading ">" and
# without the line terminator, or nothing if the record has no header.
sub header_id {
    my ($record) = @_;
    return unless $record =~ /\A>([^\r\n]*)/;
    return $1;
}

# Write one FASTA record to $path, replacing any previous content.  A final
# newline is added when the record lacks one (split() consumed it).
sub write_fasta {
    my ($path, $record) = @_;
    open(my $out, '>', $path)
        or die "predictor.pl: impossible obrir $path: $!\n";
    print {$out} $record;
    print {$out} "\n" unless $record =~ /\n\z/;
    close($out)
        or die "predictor.pl: impossible tancar $path: $!\n";
    return;
}

# Run geneid on $fasta, appending stdout to $out_file and stderr to the
# error log.  A failing run is reported but does not stop the batch.
sub run_geneid {
    my ($param_file, $gff_file, $fasta, $out_file) = @_;

    # The redirections need a shell, so every value that comes from the
    # user or from the input file is quoted before being interpolated.
    my $cmd = join(' ',
        $GENEID, '-vP', shell_quote($param_file),
        '-R',    shell_quote($gff_file),
        shell_quote($fasta),
        '>>',    shell_quote($out_file),
        '2>>',   shell_quote($ERROR_LOG),
    );

    my $status = system($cmd);
    if ($status == -1) {
        warn "predictor.pl: impossible executar $GENEID: $!\n";
    }
    elsif ($status != 0) {
        warn "predictor.pl: $GENEID ha fallat (estat " . ($status >> 8)
            . "), vegeu $ERROR_LOG\n";
    }
    return;
}

# Quote a string so a POSIX shell treats it as one literal word.
sub shell_quote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}