#!/usr/bin/perl
####################################
#
# censor2txt -f html-file -o text-file
#
# Reads Censor output as an HTML file up to the "Masked Sequence" section
# and writes the result table to a text file as tab-delimited text.
# Authors: C. Grunau, D. Roquis, 28 mars 2011
#####################################

use diagnostics;
use strict;
use warnings;
use Carp;
use FileHandle;
use File::Path;
use File::Basename;
use Getopt::Std;

# Text that opens each chunk to extract, and text that closes it.
my $ROW_START = "smr_";
my $ROW_END   = "</tbody>";

# Text that ends the region of interest.  The input is flattened to a single
# line before matching, so this marker must not contain line breaks.
# NOTE(review): the marker previously started with two newlines and could
# never match the flattened text; confirm against real Censor output that the
# bare heading text is the intended delimiter.
my $SECTION_END = "Masked Sequence";

# First line of the output file (column titles) and optional trailing text.
my $HEADER = "Name\tFrom\tTo\tLink\tName\tFrom\tTo\tClass\tDir\tSim\tPos\tScore\n";
my $FOOTER = "";

my $USAGE = "Usage: censor2txt -f html-file -o text-file";

# Convert the HTML text of a Censor report into tab-delimited rows.
#
# Parameters:
#   $html - complete content of the HTML file (any line-ending convention).
# Returns:
#   The extracted rows as one string, one row per line, fields separated by
#   tabs, WITHOUT the column header.  Returns "" when no row is found.
sub censor_html_to_text {
    my ($html) = @_;
    $html = "" unless defined $html;

    # Flatten the document: drop every CR and LF (Mac, PC and UNIX line
    # endings alike) so that '.' in the patterns below can span former lines.
    $html =~ tr/\r\n//d;

    # Keep the text from a row start up to the "Masked Sequence" marker.
    # When several such regions exist, the last one wins.
    my $section = "";
    while ($html =~ /\Q$ROW_START\E(.*?)\Q$SECTION_END\E/g) {
        $section = $ROW_START . $1 . $SECTION_END;
    }

    # Collect every "row start ... row end" chunk of the selected region.
    my $rows = "";
    while ($section =~ /(\Q$ROW_START\E.*?\Q$ROW_END\E)/g) {
        $rows .= $1;
    }

    # Remove the tags around hyperlinks.  The pattern must be explicit: an
    # empty pattern in s/// silently reuses the last successful regex, which
    # here would replace every extracted row by a tab.
    $rows =~ s{</?a\b[^>]*>}{\t}gi;

    # Replace the end of a table row by a newline character.
    $rows =~ s{</td></tr></tbody>}{\n}g;

    # Replace all remaining HTML tags by tabs.
    $rows =~ s/<[^<]+>/\t/gs;

    # Collapse every run of tabs/spaces into a single tab.  Note that this
    # also turns a single space inside a field into a tab.
    $rows =~ s/[\t ]+/\t/gs;

    $rows =~ s/[\t ]*$//gms;    # Remove spaces/tabs at the end of each line
    $rows =~ s/^[\t ]*//gms;    # Remove spaces/tabs at the start of each line

    return $rows;
}

# Read the whole file $path and return its content as one string.
# Croaks when the file cannot be opened or closed.
sub read_whole_file {
    my ($path) = @_;

    # Three-argument open: the file name can never be taken for a mode or a
    # command.
    open(my $in, '<', $path)
        or croak(sprintf("Cannot open file \"%s\"", $path));
    my $content = do { local $/; <$in> };
    close($in)
        or croak(sprintf("Cannot close open file \"%s\"", $path));

    return defined $content ? $content : "";
}

# Write $content to the file $path, overwriting any existing file.
# Croaks when the file cannot be opened, written or closed.
sub write_whole_file {
    my ($path, $content) = @_;

    open(my $out, '>', $path)
        or croak(sprintf("Cannot open file \"%s\"", $path));
    print {$out} $content
        or croak(sprintf("Cannot write to file \"%s\"", $path));
    # Buffered write errors only surface at close, so check it as well.
    close($out)
        or croak(sprintf("Cannot close open file \"%s\"", $path));

    return;
}

# Command-line entry point: parse -f/-o, convert, write the result.
sub main {
    my %opt;
    getopts('f:o:', \%opt) or croak($USAGE);
    croak($USAGE) unless defined $opt{f} && defined $opt{o};

    # Line endings are normalised in memory; the input file itself is left
    # untouched (no external "perl -pi" call, hence no shell involved).
    my $rows = censor_html_to_text(read_whole_file($opt{f}));
    carp(sprintf("No result table found in \"%s\"", $opt{f})) if $rows eq "";

    # Add the header and the footer.
    write_whole_file($opt{o}, $HEADER . $rows . $FOOTER);

    return;
}

# Run only when executed as a script, not when loaded from another file.
main() unless caller;