#!/usr/bin/perl -w
use strict;
use Getopt::Std;
my %switches;
getopts("h",\%switches);
if (exists($switches{h})) {
print "fastaseqfromGFF.pl is a Perl script to cut out sequences from a fasta file\n",
"where these sequences are defined by the coordinates contained in a GFF file\n\n";
print "syntax: fastaseqfromGFF.pl [-h] \n\n";
print "arguments: is the the file containing the sequence in FASTA format\n",
" is the file containing the coordinates in GFF format\n";
print "author: robert.castelo\@upf.edu\n";
exit(1);
}
if (scalar(@ARGV) < 2) {
print STDERR "fastaseqfromGFF.pl [-h] \n";
exit(1);
}
if (!open(FHFASTA, $ARGV[0])) {
print STDERR "fastaseqfromGFF.pl: impossible to open $ARGV[0]\n";
exit(1);
}
if (!open(FHGFF, $ARGV[1])) {
print STDERR "fastaseqfromGFF.pl: impossible to open $ARGV[1]\n";
exit(1);
}
my $defline=;
my $seq="";
while () {
chomp;
$seq = $seq . $_;
}
close(FHFASTA);
my $strand="+";
my @gff;
my $i=0;
while () {
my @v=split(/\t/, $_);
$gff[$i][0] = $v[3]; # store start
$gff[$i][1] = $v[4]; # store end
$gff[$i][2] = substr($seq, $v[3]-1, $v[4]-$v[3]+1); # store sequence
if ($v[6] eq "-" || $v[6] eq -1) {
$strand="-";
}
$i = $i + 1;
}
close(FHGFF);
# sort the GFF lines by start and end position
@gff = sort{${$a}[0] <=> ${$b}[0] ?
${$a}[0] <=> ${$b}[0] :
${$a}[1] <=> ${$b}[1]}(@gff);
my $l=scalar(@gff);
my $subseq="";
$i=0;
while ($i < $l) {
$subseq = $subseq . $gff[$i][2];
$i = $i + 1;
}
if ($strand eq "-") {
$subseq=~tr/acgtACGT/tgcaTGCA/;
$subseq=reverse($subseq);
}
print $defline;
$i=0;
while ($i < length($subseq)) {
print substr($subseq,$i,60),"\n";
$i = $i + 60;
}