#!/user/bin/perl # AIM: To strip HTML from a file using HTML::Strip # 08/08/2014 - Small update - Due to CSPAN failed to compile due to missing strcasecmp in Windows # had to download source - HTML-Strip-1.06.tar.gz - unpacked and did 'fix' and zipped to # HTML-Strip-1.06.zip # SRC: F:\Projects\HTML-Strip-1.06 - upshtml-strip.bat - hhtml-strip.bat # See 'setup.txt' for the compile, test and install of this module, then use HTML::Strip; compile # Also had to reduce HiRes to just those used # and it ran... ###use strict; use Carp; use HTML::Strip; use Time::HiRes qw( gettimeofday tv_interval ); ### use Time::HiRes qw( usleep ualarm gettimeofday tv_interval nanosleep ); my ($t0, $t1, $elapsed); $t0 = [gettimeofday]; ##my $definp = 'C:/HOMEPAGE/P26/browser1.htm'; my $definp = 'C:/HOMEPAGE/P26/favorites.htm'; my $html_file = shift || $definp; my $out_file = 'tempstr.txt'; my ($OF); my $msg = ''; sub trim_leading($) { my ($ln) = shift; $ln = substr($ln,1) while ($ln =~ /^\s/); # remove all LEADING space return $ln; } sub trim_tailing($) { my ($ln) = shift; $ln = substr($ln,0, length($ln) - 1) while ($ln =~ /\s$/g); # remove all TRAILING space return $ln; } sub trim_ends($) { my ($ln) = shift; $ln = trim_tailing($ln); # remove all TRAINING space $ln = trim_leading($ln); # remove all LEADING space return $ln; } sub trim_all { my ($ln) = shift; $ln =~ s/\n/ /gm; # replace CR (\n) $ln =~ s/\r/ /gm; # replace LF (\r) $ln =~ s/\t/ /g; # TAB(s) to a SPACE $ln = trim_ends($ln); $ln =~ s/\s{2}/ /g while ($ln =~ /\s{2}/); # all double space to SINGLE return $ln; } die "ERROR: Can not locate input file $html_file - $! - " if ! -f $html_file; open $OF, ">$out_file" || die "ERROR: Could not create OUT file! - $! -\n"; $msg = "Processing file $html_file ...\n"; print $OF $msg; # get the file data my $raw_html = read_file_f( $html_file ); # create 'stripper' ... my $hs = HTML::Strip->new(); # parse it ... my $clean_text = $hs->parse( $raw_html ); $hs->eof; # remove any MULTIPLE space while( $clean_text =~ / /gm ) { $clean_text =~ s/ / /gm; } # remove any LEADING line spaces while( $clean_text =~ /^ /gm ) { $clean_text =~ s/^ //gm; } # remove any STARTING CR,LF pairs while( $clean_text =~ /^\r\n/ ) { $clean_text =~ s/^\r\n//; } # remove any DOUBLE CR/LF pairs # this does NOT seem to do the job???? while( $clean_text =~ /\r\n\r\n/gm ) { $clean_text =~ s/(\r\n\r\n)/\r\n/gm; } my @lines = split("\n", $clean_text); $msg = "Now split into " . scalar @lines . " lines ...\n"; print $msg; my ($line,$len); my @nlines = (); foreach $line (@lines) { chomp $line; $line = trim_all($line); $len = length($line); next if ($len == 0); push(@nlines,$line); } ### $clean_text = join("\n",@nlines); ##print "$clean_text\n"; print $OF $clean_text; $t1 = [gettimeofday]; $elapsed = tv_interval ( $t0, $t1 ); $msg = "$0 processing took $elapsed seconds ...\n"; print $msg; print $OF $msg; # @lines = @nlines; # split("\n", $clean_text); @lines = split("\n", $clean_text); $msg = "Now split into " . scalar @lines . " lines ...\n"; print $msg; print $OF $msg; my @words = (); my @la; foreach $line (@lines) { chomp $line; @la = split(' ', $line); push(@words, @la); } $msg = "Now split into " . scalar @words . " words ...\n"; print $msg; print $OF $msg; $t2 = [gettimeofday]; $elapsed = tv_interval ( $t1, $t2 ); $msg = "$0 processing took $elapsed seconds ...\n"; print $msg; print $OF $msg; my $word; my %HWords = (); my $newwds = 0; my $oldwds = 0; foreach $word (@words) { if( exists $HWords{$word} ) { $HWords{$word}++; $oldwds++; } else { $HWords{$word} = 1; # start count ###print $OF "$word\n"; $newwds++; } } $msg = "Got $newwds new words, and $oldwds repeats...\n"; print $msg; print $OF $msg; foreach $word (keys %HWords) { $msg = $word . ' count = ' . $HWords{$word} . "\n"; print $OF $msg; } $t3 = [gettimeofday]; $elapsed = tv_interval ( $t2, $t3 ); $msg = "$0 output took $elapsed seconds ...\n"; print $msg; print $OF $msg; $msg = "See $out_file for the results ...\n"; print $msg; print $OF $msg; close $OF; system $out_file; sub read_file_f { my( $file_name, %args ) = @_ ; my $buf ; my $buf_ref = $args{'buf_ref'} || \$buf ; my $mode = O_RDONLY ; $mode |= O_BINARY if $args{'binmode'} ; local( *FH ) ; sysopen( FH, $file_name, $mode ) or carp "Can't open $file_name: $!" ; my $size_left = -s FH ; while( $size_left > 0 ) { my $read_cnt = sysread( FH, ${$buf_ref}, $size_left, length ${$buf_ref} ) ; unless( $read_cnt ) { carp "read error in file $file_name: $!" ; last ; } $size_left -= $read_cnt ; } # handle void context (return scalar by buffer reference) return unless defined wantarray ; # handle list context return split m|?<$/|g, ${$buf_ref} if wantarray ; # handle scalar context return ${$buf_ref} ; } # eof - striphtml03.pl