#!/usr/bin/perl -w use strict; use Getopt::Std; my %switches; getopts("h",\%switches); if (exists($switches{h})) { print "fastaseqfromGFF.pl is a Perl script to cut out sequences from a fasta file\n", "where these sequences are defined by the coordinates contained in a GFF file\n\n"; print "syntax: fastaseqfromGFF.pl [-h] \n\n"; print "arguments: is the the file containing the sequence in FASTA format\n", " is the file containing the coordinates in GFF format\n"; print "author: robert.castelo\@upf.edu\n"; exit(1); } if (scalar(@ARGV) < 2) { print STDERR "fastaseqfromGFF.pl [-h] \n"; exit(1); } if (!open(FHFASTA, $ARGV[0])) { print STDERR "fastaseqfromGFF.pl: impossible to open $ARGV[0]\n"; exit(1); } if (!open(FHGFF, $ARGV[1])) { print STDERR "fastaseqfromGFF.pl: impossible to open $ARGV[1]\n"; exit(1); } my $defline=; my $seq=""; while () { chomp; $seq = $seq . $_; } close(FHFASTA); my $strand="+"; my @gff; my $i=0; while () { my @v=split(/\t/, $_); $gff[$i][0] = $v[3]; # store start $gff[$i][1] = $v[4]; # store end $gff[$i][2] = substr($seq, $v[3]-1, $v[4]-$v[3]+1); # store sequence if ($v[6] eq "-" || $v[6] eq -1) { $strand="-"; } $i = $i + 1; } close(FHGFF); # sort the GFF lines by start and end position @gff = sort{${$a}[0] <=> ${$b}[0] ? ${$a}[0] <=> ${$b}[0] : ${$a}[1] <=> ${$b}[1]}(@gff); my $l=scalar(@gff); my $subseq=""; $i=0; while ($i < $l) { $subseq = $subseq . $gff[$i][2]; $i = $i + 1; } if ($strand eq "-") { $subseq=~tr/acgtACGT/tgcaTGCA/; $subseq=reverse($subseq); } print $defline; $i=0; while ($i < length($subseq)) { print substr($subseq,$i,60),"\n"; $i = $i + 60; }