#!/usr/bin/perl
####################################
#
# TEclass2txt -f html-file -o text-file
#
# Reads TEclass output from an HTML file and writes it to a tab-delimited text file
# Authors: C. Grunau, D. Roquis, 28 mars 2011
#####################################
use diagnostics;
use strict;
use Carp;
use FileHandle;
use File::Path;
use File::Basename;
use Getopt::Std;
# ---------------------------------------------------------------------------
# Script body: read the HTML report named by -f, pull out every table row,
# strip the markup, and write the rows (one per line, tab-separated cells,
# preceded by a fixed column header) to the file named by -o.
# ---------------------------------------------------------------------------

# Delimiters of the table rows that are extracted from the HTML text.
# NOTE(review): the original literals were damaged (a newline-only $start and
# no declaration of $end at all, which cannot compile under "use strict").
# Plain "<tr>" / "</tr>" is assumed because the clean-up below consumes
# "</td></tr>" -- confirm against real TEclass output.
my $start = "<tr>";
my $end   = "</tr>";

# Column header written as the first line of the output file.
my $header = join(
    "\t",
    'No',                 'ID',                 'Result',
    'ORFs',               'Strand',             'SVM classif (4mer)',
    'SVM classif (5mer)', 'LVQ classif',        'RF classif'
) . "\n";
my $footer = "";

# Package variables filled in by Getopt::Std::getopts().
our ( $opt_f, $opt_o );

#
# Command line parsing
#
getopts('f:o:');
croak("Usage: TEclass2txt -f html-file -o text-file")
    unless defined $opt_f && defined $opt_o;

#
# Read the input file into one single-line string
#
# Replace Mac and PC CR/CRLF by UNIX LF, in place in the input file.
# The list form of system() bypasses the shell, so file names containing
# spaces or shell metacharacters are passed through verbatim; "--" stops a
# file name starting with "-" from being parsed as a perl switch.
system( $^X, '-pi', '-e', 's/\r/\n/g', '--', $opt_f ) == 0
    or carp( sprintf( "Could not normalise line endings in \"%s\"", $opt_f ) );

open( my $in, '<', $opt_f )
    or croak( sprintf( "Cannot open file \"%s\"", $opt_f ) );
my $text = "";
while ( my $line = <$in> ) {
    # Drop every line terminator (also any CR left over if the in-place
    # normalisation above failed) so that rows spanning several lines can be
    # matched as one string.
    $line =~ tr/\r\n//d;
    $text .= $line;
}
close($in)
    or croak( sprintf( "Cannot close open file \"%s\"", $opt_f ) );

# Copy all the table rows found in the text, delimiters included.
my $extraction = "";
while ( $text =~ /\Q$start\E(.*?)\Q$end\E/g ) {
    $extraction .= $start . $1 . $end;
}

# Turn the rows into plain text: one row per line, cells separated by tabs.
$extraction =~ s/\ \-\ /\-/gms;          # " - " becomes "-"
# NOTE(review): the next pattern (">", one whitespace, a literal backslash,
# ">" and "</tr>") looks garbled; it is kept unchanged because its intended
# form cannot be recovered from this file.
$extraction =~ s/\>\s\\>\<\/tr>/\n/g;
$extraction =~ s/\<\/td>\<\/tr>/\n/g;    # end of a row becomes end of line

# Remove all the remaining HTML tags and replace them by tabulations
$extraction =~ s/\<[^\<]+\>/\t/gs;
$extraction =~ s/[\t ]+/\t/gs;     # Replace duplicate whitespace mid-string with 1 tab
$extraction =~ s/[\t ]*$//gms;     # Remove ending spaces/tabs
$extraction =~ s/^[\t ]*//gms;     # Remove starting spaces/tabs
# NOTE(review): no step in this script inserts underscores, so this also
# rewrites underscores that were already present in the input -- confirm
# that this is intended.
$extraction =~ s/_/\ /gms;         # Replace the underscores with spaces

#Removes all the headers if the input was from a concatenation
#$extraction =~ s/No\.\sID\sResult\sORFs\sStrand\sSVM\sclassif\.\s\(4mer\)\sSVM\sclassif\.\s\(5mer\)\sLVQ\tclassif\.\sRF\sclassif\.\n//g;

# Write header, rows and footer. Opening with '>' truncates an existing
# file, so no separate "empty the file first" pass is needed.
open( my $out, '>', $opt_o )
    or croak( sprintf( "Cannot open file \"%s\"", $opt_o ) );
print {$out} $header . $extraction . $footer;
# Buffered write errors only surface at close, so check it.
close($out)
    or croak( sprintf( "Cannot close open file \"%s\"", $opt_o ) );