Selenoproteïnes en Leishmania braziliensis

 
#!/usr/bin/perl -w
 #=====================================================================================================#
 # This script takes as input a list of ids (one id per line, but can have more than 1 ID file)	       #
 # and reads a sequence file containing these sequences. It will return				       #
 # one file per id list with the sequences of that list (-o flag)				       #
 # Or print them all to STDOUT  (default)							       #
 # If sure that names in id and FASTA file identical (*sure*) use -i flag			       #
 # USAGE : retrieveseqs.pl SEQUENCE_FILE ID_FILE(S)                                                    #
 #=====================================================================================================#


# USAGE : retrieveseqs.pl SEQUENCE_FILE ID_FILE(S)

use strict;
use Data::Dumper;
use Getopt::Std;
my %opts;
getopts('hVvoisndf',\%opts);

&usage unless ($ARGV[0] && $ARGV[1]);

my %seqname = ();
my %SEQS    = ();
my %seqlist = ();
my @tlist   = ();
my @notfound;
my $output = $opts{o} || undef;
my $identical = $opts{i} || 0;
my $sfile = $opts{s}|| undef;
my $names = $opts{n}||undef;
my $verbose = $opts{v}|| undef;
my $V = $opts{V}|| undef;
my %found;
my $filename;
my $got=0;
my $count;
my ($SEQFILE,@qtfile) = @ARGV unless $names;
my $debug = $opts{d} || undef;
my $fast = $opts{f} || undef;
&usage() if $opts{h};


$verbose = 1 if $V;

if($names)
{
    $SEQFILE = $ARGV[0];
    @qtfile = split(/\s+/, $ARGV[1]);
    my $id; 
    foreach my $name(@qtfile)
    {
	if($fast)
	{
	    $name =~  /^([^\s]*)/;
	    $id = $1;

	}
	else
	{
	    $id = $name;
	}
	defined(%{ $SEQS{$id} }) ||                           ## If doesn't exist, create "%{ $SEQS{$id}" foreach id with 2 keys
	    (%{ $SEQS{$id} } = ( TYPE => [], SEQ => '' ));    ## one a list of the sequence ids (list) the other the seq
	# next;
    }
    push @{ $SEQS{$id}{TYPE} }, $id;
    
}
else
{
    if ((my $k = @qtfile) > 1) 
    {
	foreach my $file (@qtfile) {  # foreach file with list of names
	    if ($file =~ /\.gz$/) 
	    {
		open(F,"zcat $file |") || &usage; 
	    } 
	    else 
	    {
		open(F,"< $file")|| &usage;
	    };
	    my $query;
	    while () 
	    { 
		next if /^\s+$/;
		my $id; 
		chomp;
		if($fast)
		{
		    /^([^\s]*)/;
		    $id = $1;
		}
		else
		{
		    ($id = $_) =~ s/\s*$//og;    # remove trailing space
		}
		push @tlist, $id;
		defined(%{ $SEQS{$id} }) ||                           ## If doesn't exist, create "%{ $SEQS{$id}" foreach id with 2 keys
		    (%{ $SEQS{$id} } = ( TYPE => [], SEQ => '', FNAME => $file ));    ## one a list of the sequence ids (list) the other the seq
		$. == 1 && do {                                    
		    $query = $id;  # $query= FIRST id in file
		    
		};
		push @{ $SEQS{$file}{TYPE} }, $id;
	    };
	    close(F);
	};
    }
    else{
	$filename = $qtfile[0];
	if ($filename =~ /\.gz$/) {
		open(F,"zcat $filename |") || &usage; 
	    } else {
		open(F,"< $filename")|| &usage;
	    };
	    my $query;
	    while () { 
		next if /^\s+$/;
		my $id; 
		chomp;	
		if($fast)
		{
		    /^([^\s]*)/;
		    $id = $1;
		    
		}
		else
		{
		    ($id = $_) =~ s/\s*$//og;
		}
		push @tlist, $id;
		defined(%{ $SEQS{$id} }) ||                           ## If doesn't exist, create "%{ $SEQS{$id}" foreach id with 2 keys
		    (%{ $SEQS{$id} } = ( TYPE => [], SEQ => ''));    ## one a list of the sequence ids (list) the other the seq
		$. == 1 && do {                                    
		    $query = $id;  # $query= FIRST id in file
		};


	    }
    
    }
}
@tlist = @qtfile if $names;
my @list_of_names = @tlist; ## so that I can pop found names from tlist and then retrieve all anmes again
my $number_of_names = @tlist;
$number_of_names = @qtfile if $names;
#&debug("tlist : @tlist");

if($fast)
{
    &fast_seqs(); 
    print STDERR "@notfound were not found\n" if $notfound[0];
    print STDERR "($got of $number_of_names found)\n";
    exit(0);
}
else{
    $identical == 1 ? &get_identical_seqs() : &get_seqs();
}
&output();
print STDERR "@notfound were not found\n" if $notfound[0];


#################################################################
#################################################################
sub fast_seqs
{
    
    my ($seq_name, $seq, $isseq, $hit);
    $isseq = 0;
    if ($SEQFILE =~ /\.gz$/) {
	open(F,"zcat $SEQFILE |");
    } else {
	open(F,"< $SEQFILE") || die("cannot open $SEQFILE : $!\n");
    };
    
    $count=1;
    my $want=0;
    while() 
    {
	

	if(/>/ && $got == $number_of_names){
	    print STDERR "($got of $number_of_names found)\n" if $verbose;
	    exit(0);
	    }
	my ($l,$t);
	next if /^\s*$/o;
	chomp;
	$l = $_;

	if($l =~ /^>([^\s]*)\s?/o)
	{
	    $want=0;
	    $count++;
	    print STDERR "." if $count % 100==0 && $verbose;;
 	    print STDERR "[$count ($got found)]\n" if $count % 10000 ==0 && $verbose;
	    $t = $1; 
	    if(defined($SEQS{$t}))
	    {
		$got++;
		$want=1;
	
		print "$_\n";
		next;
	    }
	    else{ $want=0;}
	}
	else
	{
	
	    if($want==1)
	    {
		print "$_\n";
	    }
	    else
	    {
		next;
	    }
	}
	     
    }
}
#################################################################
#################################################################
sub get_seqs
{
    
    my ($seq_name, $seq, $isseq, $hit);
    $isseq = 0;
    if ($SEQFILE =~ /\.gz$/) {
	open(F,"zcat $SEQFILE |");
    } else {
	open(F,"< $SEQFILE") || die("cannot open $SEQFILE : $!\n");
    };
    
    $count=1;
    while() 
    {
	
	&output() if $got ==  $number_of_names;
	my ($l,$t);
	next if /^\s*$/o;
	chomp;
	$l = $_;

	$l =~ /^>((.*?)(\s+.*)?)$/o && do 
	{
	    @tlist = ();
	    map{push @tlist, $_ unless $found{$_}} @list_of_names;
	    $count++;
	    print STDERR "." if $count % 100 ==0 && $verbose; 
	    print STDERR "[$count]\n" if $count % 10000 ==0 && $verbose; 
	    $t = $2;               # $t = sequence name (defined further down)
	    $isseq && do 
	    {
		($hit) = ( grep { &do_test($_,$seq_name)} @tlist);
			(defined($hit) && exists($SEQS{$hit})) && do     
		{## if curr seq is desired and its hash is defined
		     $SEQS{$hit}{SEQ} = $seq;                 ## get the id's sequence
		     $found{$hit}++;
		     $got++;
		     $seqname{$hit} = $seq_name;
		     print STDERR "!" if $V;
		     
		 };
	    };
	    $seq_name = $t; 
	    $seq = '';
	    $isseq = 1;
	    next;                                             ## next moves to sequence line 
	};
	$seq .= $l;                                           ## $seq now gets sequence ( because of next above)
    };
    
    close(F);
    $isseq && do                                             ## get last sequence
    {
	($hit) = grep { &do_test($_,$seq_name) } @tlist;
	(defined($hit) && exists($SEQS{$hit})) && do     
		{## if curr seq is desired and its hash is defined
		     $SEQS{$hit}{SEQ} = $seq;                 ## get the id's sequence
		     $found{$hit}++;
		     $got++;
		     $seqname{$hit} = $seq_name;
		     print STDERR "!" if $V;
		     
		 };
    };

}
#################################################################
#################################################################
sub get_identical_seqs
{
    my ($seq_name, $seq, $isseq, $hit);
    $isseq = 0;
    if ($SEQFILE =~ /\.gz$/) {
	open(FILE,"zcat $SEQFILE |");
    } else {
	open(FILE,"< $SEQFILE");
    };
    
     $count=1;
    
    while() {

	&output() if $got ==  $number_of_names;
	my ($l,$t);
	next if /^\s*$/o;
	chomp;
	$l = $_;
	$l =~ /^>((.*?)(\s+.*)?)$/o && do {
	    next unless exists($SEQS{$1});
	    @tlist = ();
	    map{push @tlist, $_ unless $found{$_}} @list_of_names;
	    $count++;
	    print STDERR "." if $count % 100 ==0 && $verbose; 
	    print STDERR "[$count]\n" if $count % 10000 ==0 && $verbose;
	    $t = $1;               # $t = sequence name (defined further down)
	    $isseq && do {	
		exists($SEQS{$seq_name}) && do { 
		    $SEQS{$seq_name}{SEQ} = $seq; 
		    $seqname{$seq_name} = $seq_name;
		    print STDERR "!" if $V;
		    $got++;   
		}
	    };
	    $seq_name = $t; # quotemeta($t);                        ## quotemeta will backslash all nonstd chars, so $t = $seq_name
	    $seq = '';
	    $isseq = 1;
	    next;                                             ## next moves to sequence line 
	};
	$seq .= $l;                                           ## $seq now gets sequence ( because of next above)
    }
    
    close(FILE);
    $isseq && do {                                            ## get last sequence
	exists($SEQS{$seq_name}) && do { $SEQS{$seq_name}{SEQ} = $seq; }
    }; print STDERR "[$count]\n"; 
}
######################################################################################
######################################################################################
sub output
{
    print STDERR "[$count ($got/$number_of_names found]\n" if $verbose;   
    if ($output)                                 ## if we want one outfile per id infile
    {              
	
	foreach my $file (@qtfile)
	{
	    if($names)
	    {
		foreach my $Q (@qtfile)
		{
		    open(F,">$Q.fa");
		    my ($S,$l,$i);
		    $S = $SEQS{$Q}{SEQ};                           ## $S = sequence
		    $l = length($S);
		    $i = 0;
		    print F ">$Q\n";
		    while($i<$l)                                   ## convert to FASTA
		    {
			print F substr($S,$i,60) . "\n"; 
			$i=$i+60;
		    }
		}
	    }
	    else
	    {
		open(F,">$file\.fa");                                  ## create one outfile per id infile
		foreach my $name (@{ $SEQS{$file}{TYPE} }) {              ## foreach id in this file
		    my ($S,$l,$i);
		    $S = $SEQS{$name}{SEQ};                               ## $S = sequence
		    $l = length($S);
		    $i = 0;
		    if($SEQS{$name}{SEQ}) {  print F ">$seqname{$name}\n";}	
		    
		    ## convert to FASTA
		    while($i<$l)  
		    {
			
			print F substr($S,$i,60) . "\n"; 
			$i=$i+60;
		    }
		};
		close(F);
	    }
	}
    }
    elsif($sfile)
    {
	foreach my $name (@list_of_names)
	{
	    $name =~ /^(.*?)\s/;
	    my $q = $1 || $name;
	    open(F, ">$q.fa");
	    my ($S,$l,$i);
	    $S = $SEQS{$name}{SEQ};                            ## $S = sequence
	    $l = length($S);
	    $i = 0;
	    if($SEQS{$name}{SEQ}) {  print F ">$seqname{$name}\n";}
	    while($i<$l)  
	    {
		print F substr($S,$i,60) . "\n"; 
		$i=$i+60;
	    }
	    
	    close(F);
	}
	
    }
    elsif($names)
    {
	foreach my $Q (@qtfile)
	{
	    my ($S,$l,$i);
	    $S = $SEQS{$Q}{SEQ};                           ## $S = sequence
	    $l = length($S);
	    $i = 0;
	    print STDOUT ">$Q\n";
	    while($i<$l)                                   ## convert to FASTA
	    {
		print STDOUT substr($S,$i,60) . "\n"; 
		$i=$i+60;
	    }
	}
    }
    else{  ## print all sequences to STDOUT (usefull when only one file with ids)
		foreach my $name (@list_of_names)
		{
		  
		    my ($S,$l,$i);
		    $S = $SEQS{$name}{SEQ};                           ## $S = sequence
		    $l = length($S);
		    $i = 0;
		    if($SEQS{$name}{SEQ}) {  print STDOUT ">$seqname{$name}\n";}
		    else{push @notfound, $name}
		    
		    while($i<$l)                                   ## convert to FASTA
		    {
			print STDOUT substr($S,$i,60) . "\n"; 
			$i=$i+60;
		    }
		}
	}
    exit(0);
}



sub do_test() { ### A trick of pep's to check whether the current id is longer than the hash key or not and then search for the shortest in the longest
    my ($id_passed,$seqname,$ll,$seqnamel,$o);

    ($id_passed,$seqname) = @_;
    $ll = length($id_passed);
    $seqnamel = length($seqname);
    my ($target,$query) = ($ll > $seqnamel) ? ($id_passed,$seqname) : ($seqname,$id_passed);
      $o = (($target =~ /$query/i) ? $seqname : undef);
    &debug("id : $id_passed, seq : $seqname, l : $ll, seqn :$seqnamel");
    &debug("o:$o") if $o;
      return $o
}


sub debug
{
    if ($debug)
    {
	print STDERR "@_\n";
    }
}


sub usage
{
    print STDERR < 
    -v : verbose output, print a progress indicator (a "." for every 1000 sequences processed)
    -V : as above but a "!" for every desired sequence found.
    -f : fast, takes first characters of name "(/^([^\\s]*)/)" given until the first space as the search string
         make SURE that those chars are UNIQUE.
    -i : use when the ids in the id file are EXACTLY identical
         to those in the FASTA file
    -h : Show this help and exit.
    -o : will create one fasta file for each of the id files
    -s : will create one fasta file per id
    -n : means that the last arguments (after the sequence file)
         passed are a QUOTED list of the names desired.
EndOfHelp

die("\n*** A minimum of two input files is required : $! ***\n\n");
}


#==========================Known Bugs=========================================#
# When 2 different seqs have v. similar names, script gets confused	      #
# eg MMSELP-I and MMSELP-II, will only return MMSELP-I use -i to avoid	      #
#=============================================================================#