#!/usr/bin/perl -w
#=====================================================================================================#
# This script takes as input a list of ids (one id per line, but can have more than 1 ID file) #
# and reads a sequence file containing these sequences. It will return #
# one file per id list with the sequences of that list (-o flag) #
# Or print them all to STDOUT (default) #
# If sure that names in id and FASTA file identical (*sure*) use -i flag #
# USAGE : retrieveseqs.pl SEQUENCE_FILE ID_FILE(S) #
#=====================================================================================================#
# USAGE : retrieveseqs.pl SEQUENCE_FILE ID_FILE(S)
use strict;
use Data::Dumper;
use Getopt::Std;
my %opts;
getopts('hVvoisndf',\%opts);
&usage unless ($ARGV[0] && $ARGV[1]);
my %seqname = ();
my %SEQS = ();
my %seqlist = ();
my @tlist = ();
my @notfound;
my $output = $opts{o} || undef;
my $identical = $opts{i} || 0;
my $sfile = $opts{s}|| undef;
my $names = $opts{n}||undef;
my $verbose = $opts{v}|| undef;
my $V = $opts{V}|| undef;
my %found;
my $filename;
my $got=0;
my $count;
my ($SEQFILE,@qtfile) = @ARGV unless $names;
my $debug = $opts{d} || undef;
my $fast = $opts{f} || undef;
&usage() if $opts{h};
$verbose = 1 if $V;
if($names)
{
$SEQFILE = $ARGV[0];
@qtfile = split(/\s+/, $ARGV[1]);
my $id;
foreach my $name(@qtfile)
{
if($fast)
{
$name =~ /^([^\s]*)/;
$id = $1;
}
else
{
$id = $name;
}
defined(%{ $SEQS{$id} }) || ## If doesn't exist, create "%{ $SEQS{$id}" foreach id with 2 keys
(%{ $SEQS{$id} } = ( TYPE => [], SEQ => '' )); ## one a list of the sequence ids (list) the other the seq
# next;
}
push @{ $SEQS{$id}{TYPE} }, $id;
}
else
{
if ((my $k = @qtfile) > 1)
{
foreach my $file (@qtfile) { # foreach file with list of names
if ($file =~ /\.gz$/)
{
open(F,"zcat $file |") || &usage;
}
else
{
open(F,"< $file")|| &usage;
};
my $query;
while ()
{
next if /^\s+$/;
my $id;
chomp;
if($fast)
{
/^([^\s]*)/;
$id = $1;
}
else
{
($id = $_) =~ s/\s*$//og; # remove trailing space
}
push @tlist, $id;
defined(%{ $SEQS{$id} }) || ## If doesn't exist, create "%{ $SEQS{$id}" foreach id with 2 keys
(%{ $SEQS{$id} } = ( TYPE => [], SEQ => '', FNAME => $file )); ## one a list of the sequence ids (list) the other the seq
$. == 1 && do {
$query = $id; # $query= FIRST id in file
};
push @{ $SEQS{$file}{TYPE} }, $id;
};
close(F);
};
}
else{
$filename = $qtfile[0];
if ($filename =~ /\.gz$/) {
open(F,"zcat $filename |") || &usage;
} else {
open(F,"< $filename")|| &usage;
};
my $query;
while () {
next if /^\s+$/;
my $id;
chomp;
if($fast)
{
/^([^\s]*)/;
$id = $1;
}
else
{
($id = $_) =~ s/\s*$//og;
}
push @tlist, $id;
defined(%{ $SEQS{$id} }) || ## If doesn't exist, create "%{ $SEQS{$id}" foreach id with 2 keys
(%{ $SEQS{$id} } = ( TYPE => [], SEQ => '')); ## one a list of the sequence ids (list) the other the seq
$. == 1 && do {
$query = $id; # $query= FIRST id in file
};
}
}
}
@tlist = @qtfile if $names;
my @list_of_names = @tlist; ## so that I can pop found names from tlist and then retrieve all anmes again
my $number_of_names = @tlist;
$number_of_names = @qtfile if $names;
#&debug("tlist : @tlist");
if($fast)
{
&fast_seqs();
print STDERR "@notfound were not found\n" if $notfound[0];
print STDERR "($got of $number_of_names found)\n";
exit(0);
}
else{
$identical == 1 ? &get_identical_seqs() : &get_seqs();
}
&output();
print STDERR "@notfound were not found\n" if $notfound[0];
#################################################################
#################################################################
sub fast_seqs
{
my ($seq_name, $seq, $isseq, $hit);
$isseq = 0;
if ($SEQFILE =~ /\.gz$/) {
open(F,"zcat $SEQFILE |");
} else {
open(F,"< $SEQFILE") || die("cannot open $SEQFILE : $!\n");
};
$count=1;
my $want=0;
while()
{
if(/>/ && $got == $number_of_names){
print STDERR "($got of $number_of_names found)\n" if $verbose;
exit(0);
}
my ($l,$t);
next if /^\s*$/o;
chomp;
$l = $_;
if($l =~ /^>([^\s]*)\s?/o)
{
$want=0;
$count++;
print STDERR "." if $count % 100==0 && $verbose;;
print STDERR "[$count ($got found)]\n" if $count % 10000 ==0 && $verbose;
$t = $1;
if(defined($SEQS{$t}))
{
$got++;
$want=1;
print "$_\n";
next;
}
else{ $want=0;}
}
else
{
if($want==1)
{
print "$_\n";
}
else
{
next;
}
}
}
}
#################################################################
#################################################################
sub get_seqs
{
my ($seq_name, $seq, $isseq, $hit);
$isseq = 0;
if ($SEQFILE =~ /\.gz$/) {
open(F,"zcat $SEQFILE |");
} else {
open(F,"< $SEQFILE") || die("cannot open $SEQFILE : $!\n");
};
$count=1;
while()
{
&output() if $got == $number_of_names;
my ($l,$t);
next if /^\s*$/o;
chomp;
$l = $_;
$l =~ /^>((.*?)(\s+.*)?)$/o && do
{
@tlist = ();
map{push @tlist, $_ unless $found{$_}} @list_of_names;
$count++;
print STDERR "." if $count % 100 ==0 && $verbose;
print STDERR "[$count]\n" if $count % 10000 ==0 && $verbose;
$t = $2; # $t = sequence name (defined further down)
$isseq && do
{
($hit) = ( grep { &do_test($_,$seq_name)} @tlist);
(defined($hit) && exists($SEQS{$hit})) && do
{## if curr seq is desired and its hash is defined
$SEQS{$hit}{SEQ} = $seq; ## get the id's sequence
$found{$hit}++;
$got++;
$seqname{$hit} = $seq_name;
print STDERR "!" if $V;
};
};
$seq_name = $t;
$seq = '';
$isseq = 1;
next; ## next moves to sequence line
};
$seq .= $l; ## $seq now gets sequence ( because of next above)
};
close(F);
$isseq && do ## get last sequence
{
($hit) = grep { &do_test($_,$seq_name) } @tlist;
(defined($hit) && exists($SEQS{$hit})) && do
{## if curr seq is desired and its hash is defined
$SEQS{$hit}{SEQ} = $seq; ## get the id's sequence
$found{$hit}++;
$got++;
$seqname{$hit} = $seq_name;
print STDERR "!" if $V;
};
};
}
#################################################################
#################################################################
sub get_identical_seqs
{
my ($seq_name, $seq, $isseq, $hit);
$isseq = 0;
if ($SEQFILE =~ /\.gz$/) {
open(FILE,"zcat $SEQFILE |");
} else {
open(FILE,"< $SEQFILE");
};
$count=1;
while() {
&output() if $got == $number_of_names;
my ($l,$t);
next if /^\s*$/o;
chomp;
$l = $_;
$l =~ /^>((.*?)(\s+.*)?)$/o && do {
next unless exists($SEQS{$1});
@tlist = ();
map{push @tlist, $_ unless $found{$_}} @list_of_names;
$count++;
print STDERR "." if $count % 100 ==0 && $verbose;
print STDERR "[$count]\n" if $count % 10000 ==0 && $verbose;
$t = $1; # $t = sequence name (defined further down)
$isseq && do {
exists($SEQS{$seq_name}) && do {
$SEQS{$seq_name}{SEQ} = $seq;
$seqname{$seq_name} = $seq_name;
print STDERR "!" if $V;
$got++;
}
};
$seq_name = $t; # quotemeta($t); ## quotemeta will backslash all nonstd chars, so $t = $seq_name
$seq = '';
$isseq = 1;
next; ## next moves to sequence line
};
$seq .= $l; ## $seq now gets sequence ( because of next above)
}
close(FILE);
$isseq && do { ## get last sequence
exists($SEQS{$seq_name}) && do { $SEQS{$seq_name}{SEQ} = $seq; }
}; print STDERR "[$count]\n";
}
######################################################################################
######################################################################################
sub output
{
print STDERR "[$count ($got/$number_of_names found]\n" if $verbose;
if ($output) ## if we want one outfile per id infile
{
foreach my $file (@qtfile)
{
if($names)
{
foreach my $Q (@qtfile)
{
open(F,">$Q.fa");
my ($S,$l,$i);
$S = $SEQS{$Q}{SEQ}; ## $S = sequence
$l = length($S);
$i = 0;
print F ">$Q\n";
while($i<$l) ## convert to FASTA
{
print F substr($S,$i,60) . "\n";
$i=$i+60;
}
}
}
else
{
open(F,">$file\.fa"); ## create one outfile per id infile
foreach my $name (@{ $SEQS{$file}{TYPE} }) { ## foreach id in this file
my ($S,$l,$i);
$S = $SEQS{$name}{SEQ}; ## $S = sequence
$l = length($S);
$i = 0;
if($SEQS{$name}{SEQ}) { print F ">$seqname{$name}\n";}
## convert to FASTA
while($i<$l)
{
print F substr($S,$i,60) . "\n";
$i=$i+60;
}
};
close(F);
}
}
}
elsif($sfile)
{
foreach my $name (@list_of_names)
{
$name =~ /^(.*?)\s/;
my $q = $1 || $name;
open(F, ">$q.fa");
my ($S,$l,$i);
$S = $SEQS{$name}{SEQ}; ## $S = sequence
$l = length($S);
$i = 0;
if($SEQS{$name}{SEQ}) { print F ">$seqname{$name}\n";}
while($i<$l)
{
print F substr($S,$i,60) . "\n";
$i=$i+60;
}
close(F);
}
}
elsif($names)
{
foreach my $Q (@qtfile)
{
my ($S,$l,$i);
$S = $SEQS{$Q}{SEQ}; ## $S = sequence
$l = length($S);
$i = 0;
print STDOUT ">$Q\n";
while($i<$l) ## convert to FASTA
{
print STDOUT substr($S,$i,60) . "\n";
$i=$i+60;
}
}
}
else{ ## print all sequences to STDOUT (usefull when only one file with ids)
foreach my $name (@list_of_names)
{
my ($S,$l,$i);
$S = $SEQS{$name}{SEQ}; ## $S = sequence
$l = length($S);
$i = 0;
if($SEQS{$name}{SEQ}) { print STDOUT ">$seqname{$name}\n";}
else{push @notfound, $name}
while($i<$l) ## convert to FASTA
{
print STDOUT substr($S,$i,60) . "\n";
$i=$i+60;
}
}
}
exit(0);
}
sub do_test() { ### A trick of pep's to check whether the current id is longer than the hash key or not and then search for the shortest in the longest
my ($id_passed,$seqname,$ll,$seqnamel,$o);
($id_passed,$seqname) = @_;
$ll = length($id_passed);
$seqnamel = length($seqname);
my ($target,$query) = ($ll > $seqnamel) ? ($id_passed,$seqname) : ($seqname,$id_passed);
$o = (($target =~ /$query/i) ? $seqname : undef);
&debug("id : $id_passed, seq : $seqname, l : $ll, seqn :$seqnamel");
&debug("o:$o") if $o;
return $o
}
sub debug
{
if ($debug)
{
print STDERR "@_\n";
}
}
sub usage
{
print STDERR <
-v : verbose output, print a progress indicator (a "." for every 1000 sequences processed)
-V : as above but a "!" for every desired sequence found.
-f : fast, takes first characters of name "(/^([^\\s]*)/)" given until the first space as the search string
make SURE that those chars are UNIQUE.
-i : use when the ids in the id file are EXACTLY identical
to those in the FASTA file
-h : Show this help and exit.
-o : will create one fasta file for each of the id files
-s : will create one fasta file per id
-n : means that the last arguments (after the sequence file)
passed are a QUOTED list of the names desired.
EndOfHelp
die("\n*** A minimum of two input files is required : $! ***\n\n");
}
#==========================Known Bugs=========================================#
# When 2 different seqs have v. similar names, script gets confused #
# eg MMSELP-I and MMSELP-II, will only return MMSELP-I use -i to avoid #
#=============================================================================#