#!/user/bin/perl
# AIM: To strip HTML from a file using HTML::Strip
# 08/08/2014 - Small update - Due to CSPAN failed to compile due to missing strcasecmp in Windows
# had to download source - HTML-Strip-1.06.tar.gz - unpacked and did 'fix' and zipped to
# HTML-Strip-1.06.zip
# SRC: F:\Projects\HTML-Strip-1.06 - upshtml-strip.bat - hhtml-strip.bat
# See 'setup.txt' for the compile, test and install of this module, then use HTML::Strip; compile
# Also had to reduce HiRes to just those used
# and it ran...
###use strict;
use Carp;
use HTML::Strip;
use Time::HiRes qw( gettimeofday tv_interval );
### use Time::HiRes qw( usleep ualarm gettimeofday tv_interval nanosleep );
my ($t0, $t1, $elapsed);
$t0 = [gettimeofday];
##my $definp = 'C:/HOMEPAGE/P26/browser1.htm';
my $definp = 'C:/HOMEPAGE/P26/favorites.htm';
my $html_file = shift || $definp;
my $out_file = 'tempstr.txt';
my ($OF);
my $msg = '';
sub trim_leading($) {
my ($ln) = shift;
$ln = substr($ln,1) while ($ln =~ /^\s/); # remove all LEADING space
return $ln;
}
sub trim_tailing($) {
my ($ln) = shift;
$ln = substr($ln,0, length($ln) - 1) while ($ln =~ /\s$/g); # remove all TRAILING space
return $ln;
}
sub trim_ends($) {
my ($ln) = shift;
$ln = trim_tailing($ln); # remove all TRAINING space
$ln = trim_leading($ln); # remove all LEADING space
return $ln;
}
sub trim_all {
my ($ln) = shift;
$ln =~ s/\n/ /gm; # replace CR (\n)
$ln =~ s/\r/ /gm; # replace LF (\r)
$ln =~ s/\t/ /g; # TAB(s) to a SPACE
$ln = trim_ends($ln);
$ln =~ s/\s{2}/ /g while ($ln =~ /\s{2}/); # all double space to SINGLE
return $ln;
}
die "ERROR: Can not locate input file $html_file - $! - " if ! -f $html_file;
open $OF, ">$out_file" || die "ERROR: Could not create OUT file! - $! -\n";
$msg = "Processing file $html_file ...\n";
print $OF $msg;
# get the file data
my $raw_html = read_file_f( $html_file );
# create 'stripper' ...
my $hs = HTML::Strip->new();
# parse it ...
my $clean_text = $hs->parse( $raw_html );
$hs->eof;
# remove any MULTIPLE space
while( $clean_text =~ / /gm ) {
$clean_text =~ s/ / /gm;
}
# remove any LEADING line spaces
while( $clean_text =~ /^ /gm ) {
$clean_text =~ s/^ //gm;
}
# remove any STARTING CR,LF pairs
while( $clean_text =~ /^\r\n/ ) {
$clean_text =~ s/^\r\n//;
}
# remove any DOUBLE CR/LF pairs
# this does NOT seem to do the job????
while( $clean_text =~ /\r\n\r\n/gm ) {
$clean_text =~ s/(\r\n\r\n)/\r\n/gm;
}
my @lines = split("\n", $clean_text);
$msg = "Now split into " . scalar @lines . " lines ...\n";
print $msg;
my ($line,$len);
my @nlines = ();
foreach $line (@lines) {
chomp $line;
$line = trim_all($line);
$len = length($line);
next if ($len == 0);
push(@nlines,$line);
}
### $clean_text = join("\n",@nlines);
##print "$clean_text\n";
print $OF $clean_text;
$t1 = [gettimeofday];
$elapsed = tv_interval ( $t0, $t1 );
$msg = "$0 processing took $elapsed seconds ...\n";
print $msg;
print $OF $msg;
# @lines = @nlines; # split("\n", $clean_text);
@lines = split("\n", $clean_text);
$msg = "Now split into " . scalar @lines . " lines ...\n";
print $msg;
print $OF $msg;
my @words = ();
my @la;
foreach $line (@lines) {
chomp $line;
@la = split(' ', $line);
push(@words, @la);
}
$msg = "Now split into " . scalar @words . " words ...\n";
print $msg;
print $OF $msg;
$t2 = [gettimeofday];
$elapsed = tv_interval ( $t1, $t2 );
$msg = "$0 processing took $elapsed seconds ...\n";
print $msg;
print $OF $msg;
my $word;
my %HWords = ();
my $newwds = 0;
my $oldwds = 0;
foreach $word (@words) {
if( exists $HWords{$word} ) {
$HWords{$word}++;
$oldwds++;
} else {
$HWords{$word} = 1; # start count
###print $OF "$word\n";
$newwds++;
}
}
$msg = "Got $newwds new words, and $oldwds repeats...\n";
print $msg;
print $OF $msg;
foreach $word (keys %HWords) {
$msg = $word . ' count = ' . $HWords{$word} . "\n";
print $OF $msg;
}
$t3 = [gettimeofday];
$elapsed = tv_interval ( $t2, $t3 );
$msg = "$0 output took $elapsed seconds ...\n";
print $msg;
print $OF $msg;
$msg = "See $out_file for the results ...\n";
print $msg;
print $OF $msg;
close $OF;
system $out_file;
sub read_file_f {
my( $file_name, %args ) = @_ ;
my $buf ;
my $buf_ref = $args{'buf_ref'} || \$buf ;
my $mode = O_RDONLY ;
$mode |= O_BINARY if $args{'binmode'} ;
local( *FH ) ;
sysopen( FH, $file_name, $mode ) or
carp "Can't open $file_name: $!" ;
my $size_left = -s FH ;
while( $size_left > 0 ) {
my $read_cnt = sysread( FH, ${$buf_ref},
$size_left, length ${$buf_ref} ) ;
unless( $read_cnt ) {
carp "read error in file $file_name: $!" ;
last ;
}
$size_left -= $read_cnt ;
}
# handle void context (return scalar by buffer reference)
return unless defined wantarray ;
# handle list context
return split m|?<$/|g, ${$buf_ref} if wantarray ;
# handle scalar context
return ${$buf_ref} ;
}
# eof - striphtml03.pl