#!/usr/bin/perl -w ### Aquest programa ha estat creat per Marina Reixachs, Gemma Vidal i Laura Martinez prenent com a referencia el programa creat el curs 2013-2014 per Jan Huertas et al. i amb la colaboracio d'Alexandros Pittis, tutor del grup. ### Aquest script permet identificar si el conjunt de querys donades es troben en el genoma d'Equus przewalskii. use strict; #Per a executar el programa cal tenir els fitxers de families querys (multifasta) en una carpeta "Inputs", les querys separades que duguin per nom l'ID de la base de dades en una carpeta "Inputs_separats" i les U canviades per X. El programa s'ha d'executar des de la carpeta "Inputs". my $query; $query = $ARGV[0]; chomp $query; unless (-e "../Outputs_blast/") { system ("mkdir ../Outputs_blast/"); } unless (-e "../Outputs_fastafetch/") { system ("mkdir ../Outputs_fastafetch/"); } unless (-e "../Outputs_subseq/") { system ("mkdir ../Outputs_subseq/"); } #Blastall #Aquest programa creara un blast simplificat (m 9 per saber a que equival cada columna) amb tota la informacio nomes dels hits amb un e-value menor a 1e-3 dins d'una carpeta amb el nom de cada familia de proteines. #Per agilitzar el proces, si el blast ja existeix no es fara. if (-e "$query.fasta") { print "The query exists!\n"; if (-e "../Outputs_blast/${query}.blast") { print "This blast has already been done!\n"; } else { print "Executing $query blast... Wait for it!\n"; system ("blastall -e 1e-3 -m 9 -p tblastn -i $query.fasta -d /cursos/BI/genomes/vertebrates/2014/Equus_przewalskii/genome.fa -o ../Outputs_blast/${query}.blast"); print "${query}.blast exists at ../Outputs_blast\n"; } } else { print "The query does not exist!\n"; die; } #Index #L'index es crea nomes si encara no s'ha creat. if (-e "/cursos/BI/genomes/vertebrates/2014/Equus_przewalskii/genome.index") { print "The index exists!\n"; } else { print "Creating index... Wait for it!\n"; system ("fastaindex /cursos/BI/genomes/vertebrates/2014/Equus_przewalskii/genome.fa /cursos/BI/genomes/vertebrates/2014/Equus_przewalskii/genome.index"); print "The index has been created!\n"; } #Fastafetch #Per tal de llegir els noms de cada hit, es crea un document amb nomes aquests (noms.txt). Es llegira aquest document i es fara servir cada nom per crear l'arxiu fetch amb nomes aquell hit. En aquest punt tambe s'emmagatzemara la llargada de cada contig, que sera utilitzada en fer el subseq. #Si el fetch ja existeix, no es creara. system ("cut ../Outputs_blast/${query}.blast -s -f 2 | sort -u > ../Outputs_blast/${query}_noms.txt"); my $i = 0; open (NOMSBLAST,"<../Outputs_blast/${query}_noms.txt"); my @v = ; close (NOMSBLAST); my %contig_length; for (@v) { chomp; my $contig = $_; unless (-e "../Outputs_fastafetch/${_}.fa") { system ("fastafetch /cursos/BI/genomes/vertebrates/2014/Equus_przewalskii/genome.fa /cursos/BI/genomes/vertebrates/2014/Equus_przewalskii/genome.index '$_' > '../Outputs_fastafetch/${_}.fa' "); $i = $i + 1; } unless (exists ($contig_length{$contig})) { my $cl = 0; open (FASTA,"<../Outputs_fastafetch/${_}.fa"); while() { chomp; next if ($_ =~ m/^>/); my @nucl = split(//, $_); $cl = $cl + scalar(@nucl); } $contig_length{$contig} = $cl; } } print "$i files created!\n"; #Fastasubseq my %hits; my %bestqueries; my %bestpvalue; my $start_pos = 0; my $stop_pos = 0; my $start; my $length; my @blast; my $hit; #Per a cada hit li assignem la query que tingui un menor E-value. open (BLASTALL, "<../Outputs_blast/${query}.blast"); while() { chomp; next if ($_ =~ m/^#/); @blast = split("\t",$_); if (exists($bestqueries{$blast[1]})) { if ($blast[10] < $bestpvalue{$blast[1]}) { $bestqueries{$blast[1]} = $blast[0]; $bestpvalue{$blast[1]} = $blast[10]; } } else { $bestqueries{$blast[1]} = $blast[0]; $bestpvalue{$blast[1]} = $blast[10]; } #En aquest cas llegim l'arxiu generat per blast i agafem els valors de les columnes 8 i 9, corresponents a posicio d'inici i final del hit. #Aquest pas estableix les posicions d'inici i final, considerant si el sentit es positiu o negatiu. if ($blast[8] < $blast[9]) { $start_pos = $blast[8]; $stop_pos = $blast[9]; } else { $start_pos = $blast[9]; $stop_pos = $blast[8]; } if ( exists $hits{$blast[1]} and exists $hits{$blast[1]}{$blast[0]} ) { #actualitza les posicions start i stop if ($hits{$blast[1]}{$blast[0]}{"START"} > $start_pos) { $hits{$blast[1]}{$blast[0]}{"START"} = $start_pos; } if ($hits{$blast[1]}{$blast[0]}{"STOP"} < $stop_pos) { $hits{$blast[1]}{$blast[0]}{"STOP"} = $stop_pos; } } else { #inicialitza les posicions d'inici i final $hits{$blast[1]}{$blast[0]}{"START"} = $start_pos; $hits{$blast[1]}{$blast[0]}{"STOP"} = $stop_pos; } } close (BLASTALL); print "$_ $bestqueries{$_} $bestpvalue{$_}\n" for keys %bestqueries; my $end; unless (-e "../Outputs_subseq/${query}/") { system ("mkdir ../Outputs_subseq/${query}/"); } #Nomes es tenen en compte els hits amb un E-value menor o igual a 1e-10. Per a cada hit es prenen les posicions de inici i final de la millor query amb 1000 nucleotids més upstream i downstream. #En cas que la posicio d'inici resultant sigui negativa es prendra com a start la posicio 1. #En cas que la llargada resultant sigui superior a la llargada del contig s'utilitzara la llargada total del contig menys la posicio de start. foreach my $contig (keys %hits) { if ($bestpvalue{$contig} <= 1e-10) { my $bestquery = $bestqueries{$contig}; $start_pos = $hits{$contig}{$bestquery}{"START"}; $stop_pos = $hits{$contig}{$bestquery}{"STOP"}; $start = $start_pos - 1000; if ($start <= 0) { $start = 1; } if (($stop_pos + 1000) > $contig_length{$contig}) { $length = $contig_length{$contig} - $start; } else { $length = ($stop_pos - $start) + 1000; } $end = $start + $length; system ("fastasubseq '../Outputs_fastafetch/${contig}.fa' $start $length > '../Outputs_subseq/${query}/${contig}_${bestquery}_subseq.fa' "); } } print "Subseq files created!\n"; print "Exonerate in process... Wait for it!\n"; #EXONERATE #primer creem totes les carpetes necessaries. unless (-e "../Exonerate/gff/${query}") { system ("mkdir -p ../Exonerate/gff/${query}"); } unless (-e "../Exonerate/cDNAgff/${query}") { system ("mkdir -p ../Exonerate/cDNAgff/${query}"); } unless (-e "../Exonerate/cDNAfa/${query}") { system ("mkdir -p ../Exonerate/cDNAfa/${query}"); } unless (-e "../Exonerate/aa_gen/${query}") { system ("mkdir -p ../Exonerate/aa_gen/${query}"); } #seguidament executem totes les comandes de l'exonerate per cadascun dels arxius de la carpeta subseq my $t = 0; opendir (SUBSEQDIR, "../Outputs_subseq/${query}"); while (my $file = readdir(SUBSEQDIR)) { next if ($file =~ m/^\./); chomp $file; print "$file\n"; my $protein_file = $file; $protein_file =~ s/\_subseq.fa//g; chomp $protein_file; my @fields = split /_/, $protein_file; my $contig = $fields[0]; my $bestquery = $fields[1]; print "$contig $bestquery\n"; system ("exonerate -m p2g --showtargetgff -q '../Inputs_separats/${bestquery}_2.0.fasta' -t '../Outputs_subseq/${query}/${file}' > '../Exonerate/gff/${query}/${protein_file}.gff' "); system ("egrep -w exon '../Exonerate/gff/${query}/${protein_file}.gff' > '../Exonerate/cDNAgff/${query}/${protein_file}_cDNA.gff' "); system ("fastaseqfromGFF.pl '../Outputs_subseq/${query}/${file}' '../Exonerate/cDNAgff/${query}/${protein_file}_cDNA.gff' > '../Exonerate/cDNAfa/${query}/${protein_file}_cDNA.fa' "); system ("fastatranslate -F 1 '../Exonerate/cDNAfa/${query}/${protein_file}_cDNA.fa' > '../Exonerate/aa_gen/${query}/${protein_file}_1_aa_gen.fa' "); } print "Exonerates done!\n"; print "T-coffee in progress... Wait for it!\n"; unless (-e "../tcoffee/${query}/") { system ("mkdir -p ../tcoffee/${query}"); } #tcoffee my $j = 0; opendir (TRANSLATE, "../Exonerate/aa_gen/${query}/"); while (my $tfile = readdir(TRANSLATE)) { next if ($tfile =~ m/^\./); chomp $tfile; my $protein_tfile = $tfile; $protein_tfile =~ s/\_aa_gen.fa//g; chomp $protein_tfile; my @fields = split /_/, $protein_tfile; my $contig = $fields[0]; my $bestquery = $fields[1]; unless (-e "../tcoffee/${query}/${protein_tfile}_tcoffee.fa") { system ("t_coffee ../Inputs_separats/${bestquery}_2.0.fasta ../Exonerate/aa_gen/${query}/$tfile > ../tcoffee/${query}/${protein_tfile}_tcoffee.fa"); } $j = $j + 1; } print "\n"; print "$j tcoffee created!\n"; print "\n"; print "T-coffees created! Now it's time to run Genewise\n"; unless (-e "../Genewise/${query}") { system ("mkdir -p ../Genewise/${query}") } #genewise #igual que en l'exonerate, es genera el genewise per tots els arxius de la carpeta subseq opendir (SUBSEQDIR, "../Outputs_subseq/${query}"); while (my $arxiu = readdir(SUBSEQDIR)) { next if ($arxiu =~ m/^\./); chomp $arxiu; my $protein_arxiu = $arxiu; $protein_arxiu =~ s/\_subseq.fa//g; chomp $protein_arxiu; my @camps = split /_/, $protein_arxiu; my $scaffold = $camps[0]; my $millorquery = $camps[1]; system ("genewise -cdna -pep -pretty -gff -both '../Inputs_separats/${millorquery}_2.0.fasta' '../Outputs_subseq/${query}/${arxiu}' > '../Genewise/${query}/${protein_arxiu}_genewise.fa' "); } print "Genewise created! If there's something wrong with Exonerate predictions use pep sequence from Genewise to run again T-coffee\n"; print "\n";