fastaseqfromGFF.pl

#!/usr/bin/perl -w

use strict;
use Getopt::Std;

my %switches;

getopts("h",\%switches);

if (exists($switches{h})) {
  print "fastaseqfromGFF.pl is a Perl script to cut out sequences from a fasta file\n",
        "where these sequences are defined by the coordinates contained in a GFF file\n\n";
  print "syntax: fastaseqfromGFF.pl [-h]  \n\n";
  print "arguments:  is the the file containing the sequence in FASTA format\n",
        "            is the file containing the coordinates in GFF format\n";
  print "author: robert.castelo\@upf.edu\n";
  exit(1);
}

if (scalar(@ARGV) < 2) {
  print STDERR "fastaseqfromGFF.pl [-h]  \n";
  exit(1);
}

if (!open(FHFASTA, $ARGV[0])) {
  print STDERR "fastaseqfromGFF.pl: impossible to open $ARGV[0]\n";
  exit(1);
}

if (!open(FHGFF, $ARGV[1])) {
  print STDERR "fastaseqfromGFF.pl: impossible to open $ARGV[1]\n";
  exit(1);
}

my $defline=;
my $seq="";

while () {
  chomp;
  $seq = $seq . $_;
}
close(FHFASTA);

my $strand="+";
my @gff;
my $i=0;

while () {
  my @v=split(/\t/, $_);

  $gff[$i][0] = $v[3]; # store start
  $gff[$i][1] = $v[4]; # store end
  $gff[$i][2] = substr($seq, $v[3]-1, $v[4]-$v[3]+1); # store sequence

  if ($v[6] eq "-" || $v[6] eq -1) {
    $strand="-";
  }

  $i = $i + 1;
}
close(FHGFF);

# sort the GFF lines by start and end position
@gff = sort{${$a}[0] <=> ${$b}[0] ?
            ${$a}[0] <=> ${$b}[0] :
            ${$a}[1] <=> ${$b}[1]}(@gff);


my $l=scalar(@gff);
my $subseq="";

$i=0;
while ($i < $l) {
  $subseq = $subseq . $gff[$i][2];
  $i = $i + 1;
}

if ($strand eq "-") {
  $subseq=~tr/acgtACGT/tgcaTGCA/;
  $subseq=reverse($subseq);
}

print $defline;

$i=0;
while ($i < length($subseq)) {
  print substr($subseq,$i,60),"\n";
  $i = $i + 60; 
}