#!/usr/bin/perl
#
# fastamaker.pl - collapse a small RNA sequence file into non-redundant FASTA.
#
# The input is either a plain list of sequences (one per line, called 'SIGN'
# format below) or a FASTA file.  Sequences whose length is outside
# [--minlen, --maxlen] are discarded, the rest are upper-cased, U is turned
# into T, and identical records are merged.  The result is printed to STDOUT
# as FASTA; diagnostics go to STDERR.

use strict;
use warnings;
use Getopt::Long;

my $myprompt = <<'END_USAGE';

Usage: perl fastamaker.pl Sequence_File [--minlen=17] [--maxlen=24] [--keepheader] > output.fas

    Sequence_File: your input small RNA sequence file.
    --minlen: minimum length (10-30) of sequences which are treated.
              A sequence will be discarded if it is shorter than this value.
    --maxlen: maximum length (10-35) of sequences which are treated.
              A sequence will be discarded if it is longer than this value.
    --keepheader: keep original header information if your input is FASTA format.

Example:
    perl fastamaker.pl my_old_file.fas --minlen=21 --maxlen=24 --keepheader > my_new_file.fas

END_USAGE

my $help;
my $keepheader;
my $minlen = 17;
my $maxlen = 24;

my $myresult = GetOptions(
    "help|h"       => \$help,
    "keepheader|k" => \$keepheader,
    "minlen=i"     => \$minlen,
    "maxlen=i"     => \$maxlen,
);

# Show the usage text on a bad option, on --help, or when no input file is given.
if ( !$myresult || defined $help || !defined $ARGV[0] ) {
    print STDERR $myprompt;
    exit(1);
}

# Both bounds must lie in their documented ranges and must not cross.
if (   $minlen < 10
    || $minlen > 30
    || $maxlen < 10
    || $maxlen > 35
    || $minlen > $maxlen )
{
    print STDERR "Please check your minlen and/or maxlen.\n";
    exit(1);
}

my $filename = $ARGV[0];
if ( !-e $filename ) {
    print STDERR "$filename is not exists.\n";
    exit 1;
}

# Guess whether the file is in FASTA format.
my $filetype = checkFileType($filename);

if ( $filetype eq 'FASTA' && defined $keepheader ) {

    # FASTA input and the user asked for the headers to be kept.
    convertFASTA( $filename, $minlen, $maxlen );
}
else {

    # Plain sequence list, or FASTA whose headers are not wanted.
    convertSIG( $filename, $minlen, $maxlen );
}

# _sequence_key($line, $minlen, $maxlen)
#
# Validate one already-trimmed input line and turn it into the key under
# which it is stored.  Returns undef when the line contains anything other
# than A/T/C/G/U (any case), blanks or tabs, or when its length without
# whitespace is outside [$minlen, $maxlen].
sub _sequence_key {
    my ( $line, $minlen, $maxlen ) = @_;

    return unless $line =~ /^[ATCGU \t]+$/i;

    $line =~ s/\s+//g;
    my $length = length $line;
    return if $length < $minlen || $length > $maxlen;

    $line =~ tr/a-z/A-Z/;
    $line =~ tr/U/T/;    # RNA -> DNA alphabet

    # NOTE(review): every accepted sequence is cut down to its first $minlen
    # bases, so reads that differ only after that position are merged.  This
    # is kept exactly as the script has always behaved; confirm it is intended.
    return substr( $line, 0, $minlen );
}

# convertSIG($filename, $minlen, $maxlen)
#
# Read $filename, ignore blank lines and FASTA header lines, and count how
# often each accepted sequence occurs.  Prints one record per distinct
# sequence to STDOUT in the form ">S<n> <count>\n<sequence>\n".
# Dies if the file cannot be opened.
sub convertSIG {
    my ( $filename, $minlen, $maxlen ) = @_;

    open( my $fh, '<', $filename )
      or die "Cannot open $filename: $!\n";

    my %count_of;
    while ( my $line = <$fh> ) {
        next if $line =~ /^\s*$/;    # ignore blank lines
        chomp $line;
        $line =~ s/^\s+|\s+$//g;     # trim both ends
        next if $line =~ /^>/;       # headers are not wanted here

        my $seq = _sequence_key( $line, $minlen, $maxlen );
        next unless defined $seq;
        $count_of{$seq}++;
    }
    close($fh);

    # Keys are sorted so that the output is the same on every run; plain hash
    # iteration order changes between runs.
    my $i = 1;
    for my $myseq ( sort keys %count_of ) {
        print ">S${i} $count_of{$myseq}\n${myseq}\n";
        $i++;
    }
    return;
}

# convertFASTA($filename, $minlen, $maxlen)
#
# Read a FASTA file and keep, for each accepted sequence, the header of every
# record it came from (joined with "; ").  A header applies only to the line
# immediately after it.  Prints one record per distinct sequence to STDOUT in
# the form ">S<n> <headers>\n<sequence>\n".
# Dies if the file cannot be opened.
sub convertFASTA {
    my ( $filename, $minlen, $maxlen ) = @_;

    open( my $fh, '<', $filename )
      or die "Cannot open $filename: $!\n";

    my %header_of;
    my $header = "";
    while ( my $line = <$fh> ) {
        next if $line =~ /^\s*$/;    # ignore blank lines
        chomp $line;
        $line =~ s/^\s+|\s+$//g;     # trim both ends

        if ( $line =~ /^>/ ) {
            $line =~ s/^>//g;
            $header = $line;         # remember the header for the next line
            next;
        }

        my $seq = _sequence_key( $line, $minlen, $maxlen );
        if ( defined $seq ) {
            if ( exists $header_of{$seq} ) {
                $header_of{$seq} = $header_of{$seq} . "; " . $header;
            }
            else {
                $header_of{$seq} = $header;
            }
        }

        # Whatever this line was, the pending header has now been consumed.
        $header = "";
    }
    close($fh);

    # Keys are sorted so that the output is the same on every run; plain hash
    # iteration order changes between runs.
    my $i = 1;
    for my $myseq ( sort keys %header_of ) {
        print ">S${i} $header_of{$myseq}\n${myseq}\n";
        $i++;
    }
    return;
}

# checkFileType($filename)
#
# Guess the input format from the first two non-blank lines.  Returns 'FASTA'
# when the first one starts with ">" followed by a word character and the
# second one consists of letters only; returns 'SIGN' otherwise.  'SIGN' is
# the common plain format of small RNA sequences, one sequence per line:
#
#   ATCGATCG
#   ATGGATCG
#   ACCGATCG
#
# Dies if the file cannot be opened.
sub checkFileType {
    my ($filename) = @_;

    open( my $fh, '<', $filename )
      or die "Cannot open $filename: $!\n";

    my @lines;
    while ( my $line = <$fh> ) {
        next if $line =~ /^\s*$/;
        chomp $line;
        $line =~ s/\s+//g;
        push( @lines, $line );
        last if @lines >= 2;    # only the first two lines are inspected
    }
    close($fh);

    my $filetype = "SIGN";
    $filetype = "FASTA"
      if ( @lines >= 2
        && $lines[0] =~ /^>\w/
        && $lines[1] =~ /^[a-zA-Z]+$/ );
    return $filetype;
}