#!/usr/bin/perl
####################################
#
# TEclass2txt -f html-file -o text-file
#
# Reads TEclass output saved as an HTML file and writes the table rows to a
# text file as tab-delimited values, preceded by a fixed header line.
#
# The input file is only read; line endings (CR, LF, CRLF) are discarded in
# memory, so the file on disk is never modified.
#
# Authors: C. Grunau, D. Roquis, 28 March 2011
#####################################

use diagnostics;
use strict;
use warnings;
use Carp;
use FileHandle;
use File::Path;
use File::Basename;
use Getopt::Std;

my $USAGE = "Usage: TEclass2txt -f html-file -o text-file";

# Column titles written as the first line of the output file.
my $header = "No\tID\tResult\tORFs\tStrand\tSVM classif (4mer)\tSVM classif (5mer)\tLVQ classif\tRF classif\n";
my $footer = "";

# One complete table row, from its opening tag to its closing tag.
# NOTE(review): the original delimiters were lost (the start delimiter was a
# single space and the end delimiter was never declared, which does not
# compile under "use strict"). Table-row tags are assumed here because the
# original comment says the step copies "all the table rows" -- confirm
# against real TEclass output.
my $ROW_RE = qr{<tr\b[^>]*>.*?</tr>}is;

# read_flat_text($path)
#   Returns the content of $path as one string with every CR and LF removed.
#   Croaks if the file cannot be opened or closed.
sub read_flat_text {
    my ($path) = @_;

    open(my $in, '<', $path)
        or croak(sprintf("Cannot open file \"%s\": %s", $path, $!));
    my $text = do { local $/; <$in> };
    close($in)
        or croak(sprintf("Cannot close open file \"%s\": %s", $path, $!));

    $text = "" unless defined $text;

    # Mac (CR), PC (CRLF) and UNIX (LF) line endings are all dropped, so the
    # whole document becomes a single line regardless of its origin.
    $text =~ tr/\r\n//d;

    return $text;
}

# html_rows_to_tsv($html)
#   Extracts every table row from $html and returns them as text with one
#   row per line and cells separated by single tabs.
sub html_rows_to_tsv {
    my ($html) = @_;

    # Keep only the table rows; everything outside them is discarded.
    my $rows = join "", $html =~ m{($ROW_RE)}g;

    # Tighten " - " to "-" so such values are not split by the whitespace
    # collapsing below.
    $rows =~ s/ - /-/g;

    # The end of the last cell of a row becomes the line break.
    $rows =~ s{</td>\s*</tr>}{\n}gi;

    # Every remaining tag becomes a tab. Excluding ">" inside the tag keeps
    # a literal ">" in cell text from swallowing the text before it.
    $rows =~ s{<[^<>]+>}{\t}g;

    $rows =~ s/[\t ]+/\t/g;      # Collapse runs of tabs/spaces into 1 tab
    $rows =~ s/[\t ]*$//gm;      # Remove trailing spaces/tabs on each line
    $rows =~ s/^[\t ]*//gm;      # Remove leading spaces/tabs on each line

    # NOTE(review): the original comments describe underscores as
    # placeholders for whitespace inside a cell, but no step that inserts
    # them is present. As written, this turns every underscore in the data
    # into a space -- confirm that is intended.
    $rows =~ s/_/ /g;

    return $rows;
}

# write_text($path, $content)
#   Writes $content to $path, overwriting any existing file.
#   Croaks if the file cannot be opened, written or closed.
sub write_text {
    my ($path, $content) = @_;

    open(my $out, '>', $path)
        or croak(sprintf("Cannot open file \"%s\": %s", $path, $!));
    print {$out} $content
        or croak(sprintf("Cannot write to file \"%s\": %s", $path, $!));
    # Buffered write errors only surface at close, so it must be checked.
    close($out)
        or croak(sprintf("Cannot close open file \"%s\": %s", $path, $!));

    return;
}

#
# Command line parsing
#
my %opt;
getopts('f:o:', \%opt) or croak($USAGE);
for my $required (qw(f o)) {
    croak($USAGE)
        unless defined $opt{$required} && length $opt{$required};
}

#
# Read the HTML, convert the table rows, add header and footer, write out
#
my $text       = read_flat_text($opt{f});
my $extraction = html_rows_to_tsv($text);

write_text($opt{o}, $header . $extraction . $footer);