Perl - HTML

back

This program takes the output from Microsoft Word HTML export, and allows some minor customisation of the final output ...

#!/Perl

use HTML::Parser ();
use Data::Dump ();

my $program = "stripms";
## user feature variables ##
my $dodebug = 1;
##my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/My Webs/moon-01.htm";
my $definp = "C:/HOMEPAGE/P26/compgr.htm";
my $defout = "C:/HOMEPAGE/P26/temphtml.htm";
my $deflog = "temphtml.txt"; # output log file ... more if $dodebug = 1!
my $defskip = "tempskip.txt"; # view what has been REJECTED, DELETED, CHOPPED
my $WEBVERS = "P26.2005.05.04";
my $addcode = "";

my $clearhtml = 1; # clear HTML attributes
my $clearop = 1; # clear MS o:p paragraph thingy
my $clearpatts = 1; # clear paragraph attributes
my $clearhstyl = 1; # no SYTLE statment in head - include through file, if required ... *TBD*
my $cleartdsty = 0; # clear TD attributes
my $fiximg = 1; # modify the IMG tag
my $clearhlink = 1; # clear a LINK REL statement
my $clearspan = 1; # remove all SPAN tags
my $cleardiv = 1; # remove all DIV tags
my $clearmeta = 1; # remove META (head) tag

## BODY actions
my $clearbsyle = 0; # no BODY attributes
my $fixblstyle = 1; # modify body language, if given
my $deflang = 'EN-AU'; # use English (Austrlian)

## program variables ##
my $WHITE_PATTERN2 = "^[ \t\n\r]*\$"; # spacey if ($var ~= /$WHITE_PATTERN2/o ) { ...}
my ($FH, $HH, $CH); # run log, html and strip log ...
my $doout = 1; # do the OUTPUT, but can be off'ed ...
my $inpfil = "";
my $subok = 0;

my $inhtml = 0; # in document
my $inpara = 0; # in paragraph tag
my $inhead = 0; # processing header
my $inbody = 0; # body processing
my $instyle = 0; # style processing

my $start_time = time();

open $HH, ">$defout" or die "No HTML output file ... [$defout]!\n";
open $FH, ">$deflog" or die "No OUT LOG file ...\n";
open $CH, ">$defskip" or die "No SKIP file ...\n";

my $p = HTML::Parser->new(api_version => 3);

$p->handler(default => \&hand, "event, line, column, text, tagname, attr");

# $p->parse_file(@ARGV ? shift : die "No input given ....\n");
parse_args(@ARGV);
# if we did NOT get an INPUT file, what to DO ...
if ( !length($inpfil) ) {
if ($dodebug) {
$inpfil = $definp;
} else {
die "No input file given ...\n";
}
}
print "$program: Started on " . localtime($start_time) ; ### . " in $cwdir ...\n" if $shwtm;

$p->parse_file($inpfil);

close $FH; # log file output
close $HH; # ouput HTML file
close $CH; # log of discarded items

system $defout; # run the HTML file

if ($subok) {
open $HH, "<$defout" or die "No HTML input file ... [$defout]!\n";
open $FH, "<$inpfil" or die "No re-open of the source ... [$inpfil]!\n";
my @infil = <$HH>; # slurp the file, from the disk
my @outfil = <$FH>; # slurp the original source
close $FH; # log file output
close $HH; # ouput HTML file
open $HH, ">$defout" or die "No HTML input file ... [$defout]!\n";
open $FH, ">$inpfil" or die "No re-open of the source ... [$inpfil]!\n";
print $HH @outfil;
print $FH @infil;
close $FH; # log file output
close $HH; # ouput HTML file
}

## Event table
## ["S", $tag, $attr, $attrseq, $text]
## ["E", $tag, $text]
## ["T", $text, $is_data]
## ["C", $text]
## ["D", $text]
## ["PI", $token0, $text]
sub hand {
my($event, $line, $column, $text, $tagname, $attr) = @_;
my $typ = uc(substr($event,0,1)); ## get TYPE
my @d = "$typ L$line C$column";
#substr($text, 40) = "..." if length($text) > 40;
push(@d, $text);
push(@d, $tagname) if defined $tagname;
push(@d, $attr) if $attr;
my $otxt = Data::Dump::dump(@d);
#print $FH Data::Dump::dump(@d), "\n";
#print Data::Dump::dump(@d), "\n";
print "$otxt\n";
# now process the data ...
my $locout = 1; # one time only output flag
my $i;
my $tag = '*NO_TAG*';
if (defined $tagname) {
$tag = uc($tagname);
}

# Event table
########################################################################
if ($typ eq 'S') { # START OF TAG, and possible ATTRIBUTES
## ["S", $tag, $attr, $attrseq, $text]
if ($tag eq 'HTML') {
$inhtml = 1;
if ($clearhtml) {
$text = '<html>';
}
if (defined $addcode) { ## = "";
$text .= "\n";
$text .= "$addcode";
}
} elsif ($tag eq 'P') {
$inpara = 1;
if ($clearpatts) {
print $FH "Paragraph from $text to <p> ...\n";
$text = "<p>";
}
} elsif ($tag eq 'HEAD') {
$inhead = 1;
} elsif ($tag eq 'BODY') {
$inbody = 1;
if ($clearbsyle) { # no BODY attributes
$text = '<body>';
} elsif ($fixblstyle) { # modify body language, if given
# use my $deflang = 'EN-AU'; # use English (Austrlian)
my %att = %$attr; # copy the HASH, to do modifications
$i = 0;
prt( "Checking BODY attrib ...\n" );
foreach $key (keys %att) {
prt ( "Checking attrib $key ...\n" );
if ($key eq 'lang') {
prt ( "Found lang=$key ...\n" );
if ($att{$key} ne $deflang) {
prt ( "Modifying 'lang=$att{$key} to [$deflang] ...\n" );
$att{$key} = $deflang;
}
$i++;
}
}
if ($i) { # ok, change output, re-run to build new HTML
$text = "<$tag"; # start tag again
while (($key,$value) = each %att) {
$text .= " $key=$value";
}
$text .= '>'; # close tag
}
}
} elsif ($tag eq 'STYLE') {
$instyle =1;
if ($clearhstyl) {
# in head - close out S style to E sytle
if ($inhead) {
$doout = 0; # CLOSE output
}
}
} elsif ($tag eq 'SPAN') {
if ($clearspan) { # remove all SPAN tags
$locout = 0;
}
} elsif ($tag eq 'O:P') {
if ($clearop) { # clear MS o:p paragraph thingy
$locout = 0;
}
} elsif ($tag eq 'LINK') {
if ($clearhlink) {
$locout = 0;
}
} elsif ($tag eq 'DIV') {
if ($cleardiv) {
$locout = 0;
}
} elsif ($tag eq 'TD') {
if ($cleartdsty) {
$text = '<td>';
}
} elsif ($tag eq 'IMG') {
if ($fiximg) { # modify the IMG tag
my %att = %$attr; # copy the HASH, to do modifications
$i = 0;
foreach $key (keys %att) {
if ($key eq 'v:shapes') {
print $FH "Deleting attrib $key ...\n";
delete $att{$key}; # remove this MS reference
$i++;
}
}
if ($i) {
$text = '<IMG'; # start IMG tag again
while (($key,$value) = each %att) {
$text .= " $key=$value";
}
$text .= '>'; # close IMG tag
}
}
} elsif ($tag eq 'META') {
if ($clearmeta) { # remove META (head) tag
$locout = 0;
}
}
########################################################################
} elsif ($typ eq 'E') {
## ["E", $tag, $text]
if ($tag eq 'HTML') {
$inhtml = 0;
} elsif ($tag eq 'P') {
$inpara = 0;
} elsif ($tag eq 'HEAD') {
$inhead = 0;
} elsif ($tag eq 'BODY') {
$inbody = 0;
} elsif ($tag eq 'STYLE') {
$instyle = 0;
if ($clearhstyl) {
# in head - close out S style to E sytle
if ($inhead) {
$doout = 1; # OPEN output
$locout = 0; # but NOT for this style one
}
}
} elsif ($tag eq 'SPAN') {
if ($clearspan) { # remove all SPAN tags
$locout = 0;
}
} elsif ($tag eq 'O:P') {
if ($clearop) { # clear MS o:p paragraph thingy
$locout = 0;
}
} elsif ($tag eq 'LINK') {
if ($clearhlink) {
$locout = 0;
}
} elsif ($tag eq 'DIV') {
if ($cleardiv) {
$locout = 0;
}
}
} elsif ($typ eq 'T') {
## ["T", $text, $is_data]
} elsif ($typ eq 'C') {
## ["C", $text]
$locout = 0; # toss all CODE
} elsif ($typ eq 'D') {
## ["D", $text]
} elsif ($typ eq 'P') {
## ["PI", $token0, $text]
}
### end event table ###########################################################

if ($text =~ /$WHITE_PATTERN2/o) {
print $CH "ws[$otxt]\n";
print $CH "ws[$text]\n";
} else {
if ($doout && $locout) {
print $FH "$otxt\n";
print $HH "$text\n";
} else {
print $CH "$otxt\n";
print $CH "$text\n";
}
}
}

sub parse_args {
my (@av) = @_; # get stack
while (@av) {
my $arg = uc($av[0]);
if ($arg =~ /^-/) {
if ($arg eq '-V') {
print "Version: 0.0.1 - May 2005\n";
} elsif (($arg eq '-H') || ($arg eq '-?')) {
die "stripms infile [options]\n";
} else {
die "ERROR: Unknown option [$arg]\n";
}
} else {
if (length($inpfil)) {
die "ERROR: Can not handle two input files ...\n";
}
$inpfil = $arg;
if ( !(-f $inpfil) ) {
die "ERROR: Can NOT locate file [$inpfil] ...\n";
}

}
shift @av;
}
}

sub prt {
if ($dodebug) {
print $FH @_;
}
}

# EOF

back