htmltools.pl to HTML.

index -|- end

Generated: Tue Jun 8 17:27:00 2010 from htmltools.pl 2010/03/31 43.5 KB.

#!/Perl
# NAME: htmltools.pl
# AIM: HTML tools - utility functions - 2006-08-26
# to include this, must declare @hrefs and @imgs if
# 20100312 - collecthrefs - remove trailing '"', if present
# collecthrefs( $txt, 1 ) or collectimgs( $txt, 1 ) resp. called
# set added 2009-06-24
# inline_clean_paras($) inline_clean_td($) remove_empty_paras($) remove_doctype($)
# and clean_line_with_wrap($) - and added $htmldbg8 to kill debug outputs

my $htmtdbg1 = 0;
my $htmtdbg2 = 0;
my $htmtdbg3 = 0;
my $htmtdbg4 = 0;
my $htmtdbg5 = 0;
my $htmtdbg6 = 0;
my $htmtdbg7 = 0;   # show acquired <body background="something"...>
my $htmldbg8 = 0; # remove some debug from 2009-06-24 set added

sub set_htools_dbg($) {
    my ($val) = @_;
    $htmtdbg1 = $val; $htmtdbg2 = $val; $htmtdbg3 = $val; $htmtdbg4 = $val;
    $htmtdbg5 = $val; $htmtdbg6 = $val; $htmtdbg7 = $val; $htmldbg8 = $val;
}

sub set_htools_dbg_on()  { set_htools_dbg(1); }
sub set_htools_dbg_off() { set_htools_dbg(0); }

my @tools_htm = ();

sub trimbothends {
   my ($txt) = shift;
   while ($txt =~ /^\s/) {
      $txt = substr($txt,1);
   }
   while ($txt =~ /\s$/) {
      $txt = substr($txt,0,length($txt)-1);
   }
   return $txt;
}

sub tag2newline { # ($txt2,'td');
   my ($txt, $tag) = @_;
   my $len = length($txt);
   my $ntxt = '';
   my $i;
   my $ch = '';
   my $ft = '';
   my $lcnt = 0;
   for ($i = 0; $i < $len; $i++ ) {
      $ch = substr($txt,$i,1);
      if ($lcnt && ($ch eq '<')) {
         $ft = $ch;
         $i++;
         for ( ; $i < $len; $i++ ) {
            $ch = substr($txt,$i,1);
            $ft .= $ch;
            if ($ch eq '>') {
               if ($ft =~ /^<$tag/i) {
                  $ft = "\n".$ft;
               }
               last;
            }
         }
         $ntxt .= $ft;
      } else {
         $ntxt .= $ch;
         if ($ch eq "\n") {
            $lcnt = 0;
         } else {
            $lcnt++;
         }
      }
   }
   return $ntxt;
}

sub comments2newline($) { # ($txt2);
   my ($txt) = shift;
   my $len = length($txt);
   my $ntxt = '';
   my $i;
   my $ch = '';
   my $ft = '';
   my $lcnt = 0;
   for ($i = 0; $i < $len; $i++ ) {
      $ch = substr($txt,$i,1);
      if ($lcnt && ($ch eq '<')) {
         $ft = $ch;   # set start
         $i++;
         if ($i < $len) {
            $ch = substr($txt,$i,1);
            $ft .= $ch;
            #if (($ct eq '!')||($ch eq '?')) {
            if ($ch eq '!') {
               $ft = "\n".$ft;
            }
         }
         $ntxt .= $ft;
      } else {
         $ntxt .= $ch;
         if ($ch eq "\n") {
            $lcnt = 0;
         } else {
            $lcnt++;
         }
      }
   }
   return $ntxt;
}

# strip a tag completely ...
# from <tag. ... to ... </tag>
sub striptag {
   my ($txt, $tag) = @_;
   my $len = length($txt);
   my $ntxt = '';
   my $ch = '';
   my $ftag = '';
   my $nline = '';
   my $i = 0;
   my $intag = 0;
   ###prt("Processing $len chars for $tag ...\n");
   for ( ; $i < $len; $i++) {
      $ch = substr($txt, $i, 1);
      if ($intag) {
         if ($ch eq "<") {
            ###prt("Got begin < ...\n");
            $i++;
            $ftag = '';
            for ( ; $i < $len; $i++ ) {
               $ch = substr($txt, $i, 1);
               if ($ch eq '>') {
                  last;
               } else {
                  $ftag .= $ch;
               }
            }
            ###prt("Got tag [$ftag] ...\n");
            ###if (lc($ftag) eq lc($tag)) {
            if (lc(substr($ftag,1)) eq lc($tag)) {
               $intag = 0;
            }
         } 
      } else {
         if ($ch eq "<") {
            ###prt("Got begin < ...\n");
            $i++;
            $ftag = '';
            for ( ; $i < $len; $i++ ) {
               $ch = substr($txt, $i, 1);
               if (($ch eq '>')||($ch eq ' ')) {
                  last;
               } else {
                  $ftag .= $ch;
               }
            }
            ###prt("Got tag [$ftag] ...\n");
            if (lc($ftag) eq lc($tag)) {
               if ($ch eq ' ') {
                  $i++;
                  for ( ; $i < $len; $i++ ) {
                     $ch = substr($txt, $i, 1);
                     if ($ch eq '>') {
                        last;
                     }
                  }
               }
               $intag = 1;
            } else {
               $ntxt .= '<'.$ftag.$ch;
            }
         } else {
            $ntxt .= $ch;
         }
      }
   }
   return $ntxt;
}

sub return_tag {
   my ($txt, $tag) = @_;
   my $len = length($txt);
   my $ntxt = '';
   my $ch = '';
   my $ftag = '';
   my $nline = '';
   my $i = 0;
   my $intag = 0;
   ###prt("Processing $len chars for $tag ...\n");
   for ( ; $i < $len; $i++) {
      $ch = substr($txt, $i, 1);
      if ($intag) {
         if ($ch eq "<") {
            ###prt("Got begin < ...\n");
            $i++;
            $ftag = '';
            for ( ; $i < $len; $i++ ) {
               $ch = substr($txt, $i, 1);
               if ($ch eq '>') {
                  last;
               }
               $ftag .= $ch;
            }
            ###prt("Got tag [$ftag] ...\n");
            ###if (lc($ftag) eq lc($tag)) {
            if (lc(substr($ftag,1)) eq lc($tag)) {
               $intag = 0;
               return $ntxt;
            }
            $ntxt = '';
            $ch = '';
         } 
         $ntxt .= $ch;
      } else {
         if ($ch eq "<") {
            ###prt("Got begin < ...\n");
            $i++;
            $ftag = '';
            for ( ; $i < $len; $i++ ) {
               $ch = substr($txt, $i, 1);
               if (($ch eq '>')||($ch eq ' ')||($ch =~ /\s/)) {
                  last;
               }
               $ftag .= $ch;
            }
            ###prt("Got tag [$ftag] ...\n");
            if (lc($ftag) eq lc($tag)) {
               if (($ch eq ' ')||($ch =~ /\s/)) {
                  $i++;
                  for ( ; $i < $len; $i++ ) {
                     $ch = substr($txt, $i, 1);
                     if ($ch eq '>') {
                        last;
                     }
                  }
               }
               $intag = 1;
            }
         }
      }
   }
   return $ntxt;
}

sub dropcomments { # strip_comments - strip comments - comment strip
   my ($txt) = shift;
   my $ntxt = '';
   my $len = length($txt);
   my $ch = '';
   my $pch1 = '';
   my $pch2 = '';
   my $i = 0;
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($txt, $i, 1);
      if ($ch eq '<') {
         if ((($i + 3) < $len)&&
            (substr($txt, $i+1, 3) eq '!--')) {
            $i += 2;
            $pch1 = '';
            $pch2 = '';
            for ( ; $i < $len; $i++) {
               $ch = substr($txt, $i, 1);
               if (($ch eq '>')&&($pch1 eq '-')&&($pch2 eq '-')) {
                  last;
               }
               $pch2 = $pch1;
               $pch1 = $ch;
            }
         } else {
            $ntxt .= $ch;
         }
      } else {
         $ntxt .= $ch;
      }
   }
   return $ntxt;
}

sub dropcomments_from_array {
   my (@arr) = @_;
   my $txt = '';
   foreach my $ln (@arr) {
      chomp $ln;
      $txt .= ' {=*==*=} ' if (length($txt));
      $txt .= $ln;
   }
   $txt = dropcomments( $txt );
   @arr = split( / \{=\*==\*=\} /, $txt );
   return @arr;
}

# Collect HREF anchors from a TEXT stream
# 25/07/2007 - Skip over comments <!-- to -->
sub collecthrefs {
   my ($txt,$del) = @_;
   my $ntxt = '';
   my $len = length($txt);
   my $ch = '';
   my $hrf = '';
   my $i;
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($txt,$i,1);
      if ($ch eq '<') {
         $hrf = $ch;
         $i++;
         for ( ; $i < $len; $i++) {
            $ch = substr($txt,$i,1);
            $hrf .= $ch;
            # 25/07/2007 watch OUT for COMMENTS - skip these
            if ($ch eq '-') {
               if ($hrf eq '<!--') {
                  # we have START of a COMMENT - YUK!!!
                  $i++;   # move to NEXT
                  for ( ; $i < $len; $i++) {
                     $ch = substr($txt,$i,1);
                     $hrf .= $ch;
                     if ($ch eq '>') {
                        if ($hrf =~ /-->$/) {
                           last;
                        }
                     }
                  }
               }
            }
            if ($ch eq '>') {
               last;
            }
         }

         if ($hrf =~ /^<a\s/i) {
            if ($del == 0) {
               $ntxt .= $hrf; # no delete - add the text
            }
            prt("Got anchor [$hrf] ...\n") if ($htmtdbg3);
            #if ($hrf =~ /href=["']*(\S+)["']?./im) {
            # 20100312 - allow spaces before inverted commas
            if ($hrf =~ /href=\s*["']*(\S+)["']?./im) {
               $hrf = $1;
               $hrf =~ s/"$//; # 20100312 - remove trailing '"'
               push(@hrefs,$hrf);
               push(@tools_htm,$hrf);
               prt("Got [$hrf] ...\n") if ($htmtdbg2);
            }
         } elsif ($hrf =~ /^<\/a>$/i) {
            if ($del == 0) {
               $ntxt .= $hrf;
            }
         } else {
            $ntxt .= $hrf;
         }
      } else {
         $ntxt .= $ch;
      }
   }
   prt( "Collected ". scalar @hrefs . " HREF ...\n" ) if ($htmtdbg2);

   return $ntxt;
}

# Collect HREF anchors from a TEXT stream
# 25/07/2007 - Skip over comments <!-- to -->
sub collect_hrefs {
   my ($txt) = shift;
   my $len = length($txt);
   my $ch = '';
   my $hrf = '';
   my @hrarr = ();
   my $i;
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($txt,$i,1);
      if ($ch eq '<') {
         $hrf = $ch;   # start a tag
         $i++;
         for ( ; $i < $len; $i++) {
            $ch = substr($txt,$i,1);
            $hrf .= $ch;
            # 25/07/2007 watch OUT for COMMENTS - skip these
            if ($ch eq '-') {
               if ($hrf eq '<!--') {
                  # we have START of a COMMENT - YUK!!!
                  $i++;   # move to NEXT
                  for ( ; $i < $len; $i++) {
                     $ch = substr($txt,$i,1);
                     $hrf .= $ch;
                     if ($ch eq '>') {
                        if ($hrf =~ /-->$/) {
                           last;
                        }
                     }
                  }
               }
            }
            if ($ch eq '>') {
               last;
            }
         }

         if ($hrf =~ /^<a\s/i) {
            prt("Got anchor [$hrf] ...\n") if ($htmtdbg3);
            if ($hrf =~ /href=(["']?\S+["']?)./im) {
               $hrf = $1;
               $hrf =~ s/"//g;
               $hrf =~ s/'//g;
               push(@hrarr,$hrf);
               prt("Got [$hrf] ...\n") if ($htmtdbg2);
            }
         }
      }
   }
   prt( "Collected ". scalar @hrarr . " HREF ...\n" ) if ($htmtdbg2);
   return @hrarr;
}


sub collecthrefs_nearly_ok {
   my ($txt,$del) = @_;
   my $ntxt = '';
   my $len = length($txt);
   my $ch = '';
   my $hrf = '';
   my $i;
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($txt,$i,1);
      if ($ch eq '<') {
         $hrf = $ch;
         $i++;
         for ( ; $i < $len; $i++) {
            $ch = substr($txt,$i,1);
            $hrf .= $ch;
            if ($ch eq '>') {
               last;
            }
         }

         if ($hrf =~ /^<a\s/i) {
            if ($del == 0) {
               $ntxt .= $hrf; # no delete - add the text
            }
            prt("Got anchor [$hrf] ...\n") if ($htmtdbg3);
            if ($hrf =~ /href=["'](\S+)["']./i) {
               $hrf = $1;
               push(@hrefs,$hrf);
               push(@tools_htm,$hrf);
               prt("Got [$hrf] ...\n") if ($htmtdbg2);
            }
         } elsif ($hrf =~ /^<\/a>$/i) {
            if ($del == 0) {
               $ntxt .= $hrf;
            }
         } else {
            $ntxt .= $hrf;
         }
      } else {
         $ntxt .= $ch;
      }
   }
   prt( "Collected ". scalar @hrefs . " HREF ...\n" ) if ($htmtdbg2);

   return $ntxt;
}

sub collect_anchors {
   my ($txt,$del) = @_;
   my $ntxt = '';
   my $len = length($txt);
   my $ch = '';
   my $hrf = '';
   my $i;
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($txt,$i,1);
      if ($ch eq '<') {
         $hrf = $ch;
         $i++;
         for ( ; $i < $len; $i++) {
            $ch = substr($txt,$i,1);
            $hrf .= $ch;
            # 25/07/2007 watch OUT for COMMENTS - skip these
            if ($ch eq '-') {
               if ($hrf eq '<!--') {
                  # we have START of a COMMENT - YUK!!!
                  $i++;   # move to NEXT
                  for ( ; $i < $len; $i++) {
                     $ch = substr($txt,$i,1);
                     $hrf .= $ch;
                     if ($ch eq '>') {
                        if ($hrf =~ /-->$/) {
                           last;
                        }
                     }
                  }
               }
            }
            if ($ch eq '>') {
               last;
            }
         }

         if ($hrf =~ /^<a\s/i) {
            prt("Got anchor [$hrf] ...\n") if ($htmtdbg3);
            push(@hrefs,$hrf);
            push(@tools_htm,$hrf);
            if ($del == 0) {
               $ntxt .= $hrf; # no delete - add the text
            }
         } elsif ($hrf =~ /^<\/a>$/i) {
            if ($del == 0) {
               $ntxt .= $hrf;
            }
         } else {
            $ntxt .= $hrf;
         }
      } else {
         $ntxt .= $ch;
      }
   }
   prt( "Collected ". scalar @tools_htm . " anchors ...\n" ) if ($htmtdbg2);

   return $ntxt;
}


sub ret_anchor_array {
   my ($txt) = shift;
   @tools_htm = ();
   collect_anchors( $txt, 0 );
   return @tools_htm;
}


sub ret_hrefs_array {
   my ($txt) = shift;
   @tools_htm = ();
   collecthrefs( $txt, 0 );
   return @tools_htm;
}

sub collectimgs {
   my ($txt,$del) = @_;
   my $ntxt = '';
   my $len = length($txt);
   my $ch = '';
   my $hrf = '';
   my $i;
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($txt,$i,1);
      if ($ch eq '<') {
         $hrf = $ch;
         $i++;
         for ( ; $i < $len; $i++) {
            $ch = substr($txt,$i,1);
            $hrf .= $ch;
            if ($ch eq '>') {
               last;
            }
         }

         if ($hrf =~ /^<img\s+/i) {
            if ($del == 0) {
               $ntxt .= $hrf;
            }
            prt("Got [$hrf] ...\n") if ($htmtdbg3);
            if ($hrf =~ /src=["']*(\S+)['"]*.*/i) {
               $hrf = $1;
               $hrf =~ s/['"]$//;
               push(@imgs,$hrf);
               prt("Got [$hrf] ...\n") if ($htmtdbg2);
            }
         } else {
            $ntxt .= $hrf;
         }
      } else {
         $ntxt .= $ch;
      }
   }
   return $ntxt;
}

# return image array
# BUT NOT ONLY
# <img border="0" src="[images/construc.gif]" width="87" height="87" alt="under construction">
# BUT ALSO 
# <link rel="stylesheet" type="text/css" href="[home.css]">
# AND
# <link rel="shortcut icon" href="[favicon.ico]">
# AND 
# <script language="JavaScript" type="text/javascript" src="[fgtoc.js]">
# AND
# <body background="[clds4.jpg]" ...>
# AND
# <applet code="[TimerClass.class]"
#        width="90"
#        height="20">
sub ret_imgs_array {
   my ($txt) = shift;
   my @ims = ();
   my $len = length($txt);
   my $ch = '';
   my $hrf = '';
   my $i;
   my $lnum = 1;
   my $cnum = 0;
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($txt,$i,1);
      $cnum++;
      if ($ch eq '<') {
         ### prt( "htmltools:$lnum:$cnum: Start TAG ...\n" ) if ($htmtdbg6);
         $hrf = $ch;
         $i++;
         for ( ; $i < $len; $i++) {
            $ch = substr($txt,$i,1);
            $cnum++;
            if ($ch eq "\n") {
               $hrf .= ' ';
               $lnum++;
               $cnum = 0;
            } else {
               $hrf .= $ch;
            }
            if ($ch eq '>') {
               last;
            } elsif ($ch eq '-') {
               if ($hrf eq '<!--') {
                  prt( "htmltools:$lnum:$cnum: Entered a COMMENT - get to comment end ...\n" ) if ($htmtdbg6);
                  $i++;
                  $hrf = '';
                  for ( ; $i < $len; $i++) {
                     $ch = substr($txt,$i,1);
                     $cnum++;
                     if ($ch eq "\n") {
                        $hrf = '';
                        $lnum++;
                        $cnum = 0;
                     } else {
                        $hrf .= $ch;
                     }
                     if ($ch eq '>') {
                        if ($hrf =~ /-->$/) {
                           prt( "htmltools:$lnum:$cnum: End COMMENT ...[$hrf]\n" ) if ($htmtdbg6);
                           $hrf = '';
                           last;   # out of inner inner
                        }
                     }
                  }
                  $hrf = '';
                  last;   # out of inner
               }
            }
         }
         ### prt( "htmltools:$lnum:$cnum: [$hrf]\n" );
         if ($hrf =~ /^<img\s+/i) {
            prt("htmltools:$lnum:$cnum: Got [$hrf] ...\n") if ($htmtdbg3);
            if ($hrf =~ /src=\s*["']*(\S+)['"]*.*/i) {
               $hrf = $1;
               $hrf =~ s/['"]$//;
               push(@ims,$hrf);
               prt("htmltools:$lnum:$cnum: Got IMG SRC [$hrf] ...\n") if ($htmtdbg4);
            } else {
               prt( "WARNING: htmltools:$lnum:$cnum: IMG sans source [$hrf]\n" );
            }
         } elsif ($hrf =~ /<link\s+/i) {
            ###if ($hrf =~ /href=["']*([\w\.]+)['"]*.*/i) {
            if ($hrf =~ /href=["']*(\S+)['"]*.*/i) {
               $hrf = $1;
               $hrf =~ s/>$//;
               $hrf =~ s/['"]$//;
               push(@ims,$hrf);
               prt("htmltools:$lnum:$cnum: Got LINK HREF [$hrf] ...\n") if ($htmtdbg4);
            }
         } elsif ($hrf =~ /<script\s+/i) {
            if ($hrf =~ /src=["']*(\S+)['"]*.*/i) {
               $hrf = $1;
               $hrf =~ s/>$//;
               $hrf =~ s/['"]$//;
               push(@ims,$hrf);
               prt("htmltools:$lnum:$cnum: Got SCRIPT SRC [$hrf] ...\n") if ($htmtdbg4);
            }
            #else {
            #   prt( "WARNING: htmltools: SCRIPT sans SRC [$hrf]\n" );
            #}
         } elsif ($hrf =~ /^<body\s+(.*)>$/i) {
            $hrf = $1;
            if ($hrf =~ /background=["']*(\S+)['"]*.*/i) {
               $hrf = $1;
               $hrf =~ s/>$//;
               $hrf =~ s/['"]$//;
               push(@ims,$hrf);
               prt("htmltools:$lnum:$cnum: Got body background [$hrf] ...\n") if ($htmtdbg7);
            }
            #else {
            #   prt( "WARNING: htmltools: body sans background [$hrf]\n" );
            #}
         } elsif ($hrf =~ /^<applet\s+(.*)>$/i) {
            $hrf = $1;
            if ($hrf =~ /code=["']*(\S+)['"]*.*/i) {
               $hrf = $1;
               $hrf =~ s/>$//;
               $hrf =~ s/['"]$//;
               push(@ims,$hrf);
               prt("htmltools:$lnum:$cnum: Got applet code [$hrf] ...\n") if ($htmtdbg7);
            }
            #else {
            #   prt( "WARNING: htmltools: applet sans code [$hrf]\n" );
            #}
         }
      }
      if ($ch eq "\n") {
         $lnum++;
         $cnum = 0;
      }
   }

   if ($htmtdbg5) {
      $i = scalar @ims;
      prt( "Returning $i IMG/OTHER items ...\n" );
      foreach $hrf (@ims) {
         prt( "$hrf " );
      }
      prt("\n");
   }
   return @ims;
}

# just remove a <tag>, and </tag> ...
# but leave the stuff between
sub removetag {
   my ($txt, $tg) = @_;
   my $ntxt = '';
   my $len = length($txt);
   my $ch = '';
   my $hrf = '';
   my $i;
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($txt,$i,1);
      if ($ch eq '<') {
         $hrf = $ch;
         $i++;
         for ( ; $i < $len; $i++) {
            $ch = substr($txt,$i,1);
            $hrf .= $ch;
            if ($ch eq '>') {
               last;
            }
         }
         if ($hrf =~ /^<$tg\s/i) {
         } elsif ($hrf =~ /^<$tg>$/i) {
         } elsif ($hrf =~ /^<\/$tg>$/i) {
         } else {
            $ntxt .= $hrf;
         }
      } else {
         $ntxt .= $ch;
      }
   }
   return $ntxt;
}

sub remove_script {
   my ($txt) = shift;
   my $dbgsc = 0;   # only if a LOT of noise wanted
   my $tg = 'script';
   my $ntxt = '';
   my $len = length($txt);
   my $ch = '';
   my $hrf = '';
   my $i;
   my $insc = 0;
   my $quot = '';
   my $pch = '';
   my $qtxt = '';
   my $lstl = '';
   my $lnum = 1;
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($txt,$i,1);
      if ($insc) {
         if ( (($ch eq '"')||($ch eq "'")) && ( $pch ne "\\" ) ) {
            $quot = $ch;
            $qtxt = $ch;
            $pch = $ch;
            prt( "$lnum:$i: Begin QUOTES [$ch] ...[$lstl]\n" ) if ($dbgsc);
            $i++;
            for ( ; $i < $len; $i++) {
               $ch = substr($txt,$i,1);
               if (($ch eq $quot) && ( $pch ne "\\" )) {
                  $qtxt .= $ch;
                  prt( "$lnum:$i: End QUOTES [$quot] [$qtxt]\n" ) if ($dbgsc);
                  last;
               } elsif ($ch eq "\n") {
                  prt( "$lnum:$i: End QUOTES ON NEW LINE [$qtxt]\n" ) if ($dbgsc);
                  last;
               }
               $pch = $ch;
               $qtxt .= $ch;
            }
         } elsif (($ch eq '*')&&($pch eq '/')) {
            prt( "$lnum:$i: Begin /* comment ... [$lstl]\n" ) if ($dbgsc);
            $qtxt = "$pch$ch";
            $pch = $ch;
            $i++;
            for ( ; $i < $len; $i++) {
               $ch = substr($txt,$i,1);
               if (($ch eq '/')&&($pch eq '*')) {
                  prt( "$lnum:$i: End /* comment ... [$qtxt]\n" ) if ($dbgsc);
                  last;
               }
               $pch = $ch;
               $qtxt .= $ch;
            }
         } elsif (($ch eq '/')&&($pch eq '/')) {
            prt( "$lnum:$i: Begin // comment ...[$lstl]\n" ) if ($dbgsc);
            $qtxt = "$pch$ch";
            $pch = $ch;
            $i++;
            for ( ; $i < $len; $i++) {
               $ch = substr($txt,$i,1);
               if ($ch eq "\n") {
                  prt( "$lnum:$i: End comment ... [$qtxt]\n" ) if ($dbgsc);
                  last;
               }
               $pch = $ch;
               $qtxt .= $ch;
            }
         } elsif ($ch eq '<') {
            $hrf = $ch;
            $lstl .= $ch;
            $i++;
            prt( "$lnum:$i: Being tag ... [$lstl]\n" ) if ($dbgsc);
            if ($i < $len) {
               $ch = substr($txt,$i,1);
               if ($ch =~ /[\w\/!]/) {   # if alphanumeric, or '/' or '!'
                  for ( ; $i < $len; $i++) {
                     $ch = substr($txt,$i,1);
                     $hrf .= $ch if ($ch ne "\n");
                     if ($ch eq '>') {
                        prt( "$lnum:$i: End tag ... [$hrf]\n" ) if ($dbgsc);
                        last;
                     } elsif ($hrf eq '<!--') {
                        prt( "$lnum:$i: Skip comment tag ... [$hrf]\n" ) if ($dbgsc);
                        last;
                     }
                     if ($ch eq "\n") {
                        $lstl = '';
                     } else {
                        $lstl .= $ch;
                     }
                  }
                  if ($hrf =~ /<\/$tg>/i) {
                     prt( "$lnum:$i: End $tg [$hrf]\n" ) if ($dbgsc);
                     $insc = 0;
                  }
               } else {
                  prt( "$lnum:$i: Non-alphanumeric follows - assume NOT tag ...\n" ) if ($dbgsc);
               }
            }
         }

         $pch = $ch;
         if ($ch eq "\n") {
            $lstl = '';
            $lnum++;
         } else {
            $lstl .= $ch;
         }
      } else {
         if ($ch eq '<') {
            $hrf = $ch;
            $i++;
            for ( ; $i < $len; $i++) {
               $ch = substr($txt,$i,1);
               $hrf .= $ch;
               if ($ch eq '>') {
                  last;
               }
               $lnum++ if ($ch eq "\n");
            }
            if ($hrf =~ /^<$tg\s+/i) {
               prt( "$lnum:$i: Begin $tg sp [$hrf]\n" ) if ($dbgsc);
               $insc = 1;
               $pch = '';
            } elsif ($hrf =~ /^<$tg>$/i) {
               prt( "$lnum:$i: Begin $tg [$hrf]($i)\n" ) if ($dbgsc);
               $insc = 1;
               $pch = '';
            } else {
               $ntxt .= $hrf;
            }
         } else {
            $ntxt .= $ch;
         }
         $lnum++ if ($ch eq "\n");
      }
   }
   return $ntxt;
}

sub removefont {
   my ($txt) = shift;
   my $ntxt = removetag($txt,'font');
   return $ntxt;
}

sub removetagattrib {
   my ($txt, $tag) = @_;
   my $ntxt = '';
   my $len = length($txt);
   my $ch = '';
   my $hrf = '';
   my $i;
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($txt,$i,1);
      if ($ch eq '<') {
         $hrf = $ch;
         $i++;
         for ( ; $i < $len; $i++) {
            $ch = substr($txt,$i,1);
            $hrf .= $ch;
            if ($ch eq '>') {
               last;
            }
         }
         if ($hrf =~ /^<$tag>$/i) {
            $ntxt .= $hrf;
         } elsif ($hrf =~ /^<$tag\s+/i) {
            ###prt("Removing $tag attrib [$hrf]\n");
            $ntxt .= substr($hrf,0,length($tag)+1).'>';
         } else {
            $ntxt .= $hrf;
         }
      } else {
         $ntxt .= $ch;
      }
   }
   return $ntxt;
}

# add 2009-06-24
sub remove_table_attribs {
   my ($txt) = shift;
   my $ntxt = removetagattrib($txt,'table');
   return $ntxt;
}

sub removetdattrib {
   my ($txt) = shift;
   my $ntxt = removetagattrib($txt,'td');
   return $ntxt;
}

sub removetrattrib {
   my ($txt) = shift;
   my $ntxt = removetagattrib($txt,'tr');
   return $ntxt;
}

sub substitutions { # ($txt2);
   my ($txt) = shift;
   $txt =~ s/&nbsp;/ /gm;
   $txt =~ s/&amp;/&/gm;
   return $txt;
}

sub trimblanklines {
   my ($txt) = shift;
   my $len = length($txt);
   my $ntxt = '';
   my $ln = '';
   my $ch = '';
   my $i = 0;
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($txt,$i,1);
      if (($ch eq "\n")||($ch eq "\r")) {
         if (length($ln)) {
            if ($ln =~ /\S+/) {   # if got NOT space
               $ln = trimbothends($ln);
               if (length($ln)) {
                  $ntxt .= $ln . $ch;
               }
            }
         }
         $ln = '';
      } else {
         $ln .= $ch;
      }
   }
   if (length($ln)) {
      if ($ln =~ /\S+/) {
         $ln = trimbothends($ln);
         if (length($ln)) {
            $ntxt .= $ln;
         }
      }
   }
   return $ntxt;
}

sub trimblanks {
   my ($txt) = shift;
   my $len = length($txt);
   my $ntxt = '';
   my $ln = '';
   my $ch = '';
   my $i = 0;
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($txt,$i,1);
      if (($ch eq "\n")||($ch eq "\r")) {
         if (length($ln)) {
            if ($ln =~ /\S+/) {   # if got NOT space
               ###$ln = trimbothends($ln);
               ###if (length($ln)) {
                  $ntxt .= $ln . $ch;
               ###}
            }
         }
         $ln = '';
      } else {
         $ln .= $ch;
      }
   }
   if (length($ln)) {
      if ($ln =~ /\S+/) {
         ###$ln = trimbothends($ln);
         ###if (length($ln)) {
            $ntxt .= $ln;
         ###}
      }
   }
   return $ntxt;
}


sub trimblanklines_OK_maybe {
   my ($txt) = shift;
   my $len = length($txt);
   my $ntxt = '';
   my $ln = '';
   my $ch = '';
   my $i = 0;
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($txt,$i,1);
      if ($ch eq "\n") {
         if (length($ln)) {
            if ($ln =~ /\S+/) {
               while ($ln =~ /^\s/) {
                  $ln = substr($ln,1);
               }
               if (length($ln)) {
                  $ntxt .= $ln . $ch;
               }
            }
         }
         $ln = '';
      } else {
         $ln .= $ch;
      }
   }
   if (length($ln)) {
      if ($ln =~ /\S?/) {
         $ntxt .= $ln;
      }
   }
   return $ntxt;
}

sub triminlinetd {
   my ($txt) = shift;
   my $len = length($txt);
   my $ntxt = '';
   my $ln = '';
   my $ch = '';
   my $lt = '';
   my $nlt = '';
   my $nln = '';
   my $i = 0;
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($txt,$i,1);
      if ($ch eq "\n") {
         if ($ln =~ /.*<td.*>(.*)<\/td>/i) {
            $lt = $1;   # get text between <td>...</td>
            # $nlt =~ s/\s//g; this removes ALL spaces - NOT GOOD!
            $nlt = trimbothends($lt);
            if (length($nlt)) {
               ###prt("Got inline <td>...</td> - [$ln] [$lt] [$nlt]...\n");
               $nln = $ln;
               $nln =~ s/$lt/$nlt/;
               ###prt("New line [$nln]...\n");
               $ln = $nln;
            }
         }
         $ntxt .= $ln.$ch;
         $ln = '';
      } else {
         $ln .= $ch;
      }
   }
   if (length($ln)) {
      if ($ln =~ /\S?/) {
         $ntxt .= $ln;
      }
   }
   return $ntxt;
}

# strip from '<?' to '?>', excluding within quotes
sub strip_php_script {
   my ($txt) = shift;
   my $ntxt = '';
   my $max = length($txt);
   my $pch = '';
   my $inphp = 0;
   my $inquote = '';
   for (my $i = 0; $i < $max; $i++) {
      my $ch = substr($txt,$i,1);
      if ($inphp) {
         ##print "Should be END PHP ...\n" if (($ch eq '>')&&($pch eq '?'));
         if (length($inquote)) {
            # wating for end of QUOTE
            if ( ($ch eq $inquote) && ($pch ne "\\") ) {
               #print "End of QUOTE\n";
               $inquote = '';
            }
         } else {
            if ( (($ch eq '"')||($ch eq "'")) && ($pch ne "\\") ) {
               #print "Start of QUOTE\n";
               $inquote = $ch;
            } 
            if (($ch eq '>')&&($pch eq '?')) {
               $inphp = 0;
               #print "End of PHP ...\n";
            }
         }
         $pch = $ch;
         next;
      } else {
         if (($ch eq '?') && ($pch eq '<')) {
            $ntxt = substr($ntxt, 0, length($ntxt) - 1);
            #print "Start of PHP ...\n";
            $inphp = 1;
            next;
         }
      }
      $pch = $ch;
      $ntxt .= $ch;
   }
   return $ntxt;
}

sub drop_php_from_array {
   my (@arr) = @_;
   my $txt = '';
   foreach my $ln (@arr) {
      chomp $ln;
      $txt .= ' {=*==*=} ' if (length($txt));
      $txt .= $ln;
   }
   $txt = strip_php_script( $txt );
   @arr = split( / \{=\*==\*=\} /, $txt );
   my $lnc = scalar @arr;
   for (my $i = 0; $i < $lnc; $i++) {
      $arr[$i] .= "\n";
   }
   return @arr;
}


sub htmlexpand {
   my ($rtxt) = shift;
   my $tlen = length($rtxt);
prt( "len=$tlen - Add STYLE TAG to new line ...\n") if ($htmtdbg1);
$rtxt = tag2newline($rtxt,'style');
$tlen = length($rtxt);
prt( "len=$tlen - Add TABLE TAG to new line ...\n") if ($htmtdbg1);
$rtxt = tag2newline($rtxt,'table');
$tlen = length($rtxt);
prt( "len=$tlen - Add TR TAG to new line ...\n") if ($htmtdbg1);
$rtxt = tag2newline($rtxt,'tr');
$tlen = length($rtxt);
prt( "len=$tlen - Add TH TAG to new line ...\n") if ($htmtdbg1);
$rtxt = tag2newline($rtxt,'th');
$tlen = length($rtxt);
prt( "len=$tlen - Add TD TAG to new line ...\n") if ($htmtdbg1);
$rtxt = tag2newline($rtxt,'td');
$tlen = length($rtxt);
prt( "len=$tlen - Add SCRIPT TAG to new line ...\n") if ($htmtdbg1);
$rtxt = tag2newline($rtxt,'script');
$tlen = length($rtxt);
prt( "len=$tlen - Add PRE TAG to new line ...\n") if ($htmtdbg1);
$rtxt = tag2newline($rtxt,'pre');
$tlen = length($rtxt);
prt( "len=$tlen - Returned from htmlexpand ...\n" ) if ($htmtdbg1);
   return $rtxt;
}

sub dropdoctype {
   my ($txt) = shift;
   my $tlen = length($txt);
    my $pch = '';
    my $ch = '';
    my $rtxt = '';
    for (my $i = 0; $i < $tlen; $i++) {
        $ch = substr($txt,$i,1);
        if ($ch eq '<') {
            $pch = substr($txt,$i);
            if ($pch =~ /^<!DOCTYPE\s+/i) {
                ###prt( "Got DOCTYPE ...\n" );
                $i++;   # move to next
                for (; $i < $tlen; $i++) {
                    $ch = substr($txt,$i,1);
                    if ($ch eq '>') {
                        $ch = '';
                        last;
                    }
                }
            }
        }
        $rtxt .= $ch;
    }
    return $rtxt;
}

sub html_clean_up1 {
   my ($rtxt) = shift;
   my $tlen = length($rtxt);
prt( "len=$tlen - Drop DOCTYPE <!DOCTYPE... > ...\n");
$rtxt = dropdoctype($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Drop comments <!--...--> ...\n");
$rtxt = dropcomments($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Strip <HEAD>...</HEAD> tag ...\n");
$rtxt = striptag($rtxt, 'HEAD');
$tlen = length($rtxt);
prt( "len=$tlen - Strip <script>...</script> tag ...\n");
$rtxt = striptag($rtxt,'script');
$tlen = length($rtxt);
prt( "len=$tlen - Strip <noscript>...</noscript> tag ...\n");
$rtxt = striptag($rtxt,'noscript');
$tlen = length($rtxt);
prt( "len=$tlen - Strip <SELECT>...</SELECT> tag ...\n");
$rtxt = striptag($rtxt,'select');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <font ...> tags ...\n");
$rtxt = removefont($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Remove <b> tags ...\n");
$rtxt = removetag($rtxt,'b');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <tt> tags ...\n");
$rtxt = removetag($rtxt,'tt');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <nobr> tags ...\n");
$rtxt = removetag($rtxt,'nobr');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <span> tags ...\n");
$rtxt = removetag($rtxt,'span');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <div> tags ...\n");
$rtxt = removetag($rtxt,'div');
$tlen = length($rtxt);
if ($rtxt =~ /<strong>/) {
    prt( "len=$tlen - Remove <strong> tags ...\n");
    $rtxt = removetag($rtxt,'strong');
    $tlen = length($rtxt);
}
prt( "len=$tlen - Remove <ul> tags ...\n");
$rtxt = removetag($rtxt,'ul');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <u> tags ...\n");
$rtxt = removetag($rtxt,'u');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <h1> tags ...\n");
$rtxt = removetag($rtxt,'h1');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <h2> tags ...\n");
$rtxt = removetag($rtxt,'h2');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <li> tags ...\n");
$rtxt = removetag($rtxt,'li');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <br> tags ...\n");
$rtxt = removetag($rtxt,'br');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <html> tags ...\n");
$rtxt = removetag($rtxt,'html');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <body> tags ...\n");
$rtxt = removetag($rtxt,'body');
$tlen = length($rtxt);
prt( "len=$tlen - Remove p attributes ...\n");
$rtxt = removetagattrib($rtxt,'p');
$tlen = length($rtxt);
prt( "len=$tlen - Remove th attributes ...\n");
$rtxt = removetagattrib($rtxt,'th');
$tlen = length($rtxt);
prt( "len=$tlen - Remove tr attributes ...\n");
$rtxt = removetrattrib($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Remove td attributes ...\n");
$rtxt = removetdattrib($rtxt);
$tlen = length($rtxt);
$rtxt = trimblanklines($rtxt);
$tlen = length($rtxt);
    return $rtxt;
}

sub htmlcleanall {
   my ($rtxt) = shift;
   my $tlen = length($rtxt);
prt( "len=$tlen - Drop comments <!--...--> ...\n");
$rtxt = dropcomments($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Strip <HEAD>...</HEAD> tag ...\n");
$rtxt = striptag($rtxt, 'HEAD');
$tlen = length($rtxt);
prt( "len=$tlen - Strip <script>...</script> tag ...\n");
$rtxt = striptag($rtxt,'script');
$tlen = length($rtxt);
prt( "len=$tlen - Strip <noscript>...</noscript> tag ...\n");
$rtxt = striptag($rtxt,'noscript');
$tlen = length($rtxt);
prt( "len=$tlen - Strip <SELECT>...</SELECT> tag ...\n");
$rtxt = striptag($rtxt,'select');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <font ...> tags ...\n");
$rtxt = removefont($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Remove <b> tags ...\n");
$rtxt = removetag($rtxt,'b');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <tt> tags ...\n");
$rtxt = removetag($rtxt,'tt');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <nobr> tags ...\n");
$rtxt = removetag($rtxt,'nobr');
$tlen = length($rtxt);
prt( "len=$tlen - Remove <span> tags ...\n");
$rtxt = removetag($rtxt,'span');
$tlen = length($rtxt);
prt( "len=$tlen - Remove th attributes ...\n");
$rtxt = removetagattrib($rtxt,'th');
$tlen = length($rtxt);
prt( "len=$tlen - Remove tr attributes ...\n");
$rtxt = removetrattrib($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Remove td attributes ...\n");
$rtxt = removetdattrib($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Delete <a...> & </a>\n");
$rtxt = collecthrefs($rtxt,1);
$tlen = length($rtxt);
prt( "len=$tlen - Delete <img...>\n");
$rtxt = collectimgs($rtxt,1);
$tlen = length($rtxt);
prt( "len=$tlen - Do substitutions ...\n");
$rtxt = substitutions($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Trim blank lines ...\n");
$tlen = length($rtxt);
$rtxt = trimblanklines($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Trim inline td ...\n");
$rtxt = triminlinetd($rtxt);
$tlen = length($rtxt);
prt( "len=$tlen - Returned from htmlcleanall ...\n");
   return $rtxt;
}

# added 18/07/2008
sub get_tag_attr_array {
    my ($tag) = shift;
    my ($i, $i2, $ch, $ln, @arr, $tg, $spc, $com, $incom, $pch, $nch);
    $tg = '';
    $ln = length($tag);
    @arr = ();
    $spc = 0;
    $incom = 0;
    $pch = '';
    my $indent = '    ';
    for ($i = 0; $i < $ln; $i++) {
        $i2 = $i + 1;
        $ch = substr($tag,$i,1);
        $nch = '';
        $nch = substr($tag,$i2,1) if ($i2 < $ln);
        $tg .= $ch; # add it to the tag
        if ($incom) {
            if ($ch eq $com) {
                $incom = 0;
            }
        } else {
            if (($ch eq '"')||($ch eq "'")) {
                $com = $ch;
                $incom = 1;
            } elsif (($ch =~ /^\s$/) && ($nch ne '/') && ($nch ne '>')) {
                $spc++;
                if ($spc > 1) {
                    if (length($tg)) {
                        $tg = $indent.$tg if (@arr);
                        push(@arr,$tg);
                    }
                    $tg = '';
                }
            }
        }
        $pch = $ch;
    }
    if (length($tg)) {
        $tg = $indent.$tg if (@arr);
        push(@arr,$tg);
    }
    return @arr;
}


sub split_tag_attrs {
    my ($tag) = shift;
    my @arr = ();
    if ($tag =~ /\s+/) {
        # there is a chance it has more than ONE attribute
        @arr = get_tag_attr_array($tag);
    } else {
        push(@arr,$tag);
    }
    return @arr;
}

sub array_tags2newline {
    my (@arr) = @_;
    my @narr = ();
    my ($ch, $len);
    my ($ln, $i, $pre, $lc, $l, $tag, $intag);
    my $maxtag = 60;
    $pre = '';
    $lc = scalar @arr;
    $intag = 0;
    $tag = '';
    for ($l = 0; $l < $lc; $l++) {
        $ln = $arr[$l]; # get LINE
        $len = length($ln); # and its LENGTH
        for ($i = 0; $i < $len; $i++) {
            # process char by char
            $ch = substr($ln,$i,1);
            if ($intag) {
                # seek END of tag
                $tag .= $ch;
                if ($ch eq '>') {
                    if ($tag =~ /^<.+>$/) {
                        # got WHOLE tag
                        if (length($tag) > $maxtag) {
                            push(@narr, split_tag_attrs($tag));
                        } else {
                            push(@narr,$tag);
                        }
                    } else {
                        push(@narr,$tag);
                    }
                    $tag = '';
                    $intag = 0;
                }
            } else {
                # seek start of TAG
                if ($ch eq '<') {
                    if (length($pre)) {
                        if ($pre =~ /^\s+$/) {
                            # is all space - dump it
                        } else {
                            push(@narr,$pre);
                        }
                    }
                    $pre = ''# clear anything before
                    $tag = $ch; # START tag
                    $intag = 1; # and now IN A TAG
                } else {
                    $pre .= $ch;
                }
            }
        }
        # done LINE, so add this tag
        push(@narr,$tag) if length($tag);
        $tag = '';
    }
    return @narr;
}

# set added 2009-06-24
# inline_clean_paras($) inline_clean_td($) remove_empty_paras($) remove_doctype($)

sub clean_line_with_wrap {
   my ($t, $m) = @_;
   my $ln = length($t);
   my $nl = '';
   my $nll = 0;
   for (my $j = 0; $j < $ln; $j++) {
      my $c = substr($t,$j,1);
      if ($c =~ /\n/) {
         if (length($nl)) {
            my $j2 = $j + 1;
            if ($j2 < $ln) {
               my $c2 = substr($t,$j2,1); # get first after
               if (($c2 eq '.')||($c2 eq ':')||($c2 eq ',')||($c2 =~ /\s/)) {
                  $c = ''; # kill CR
               } else {
                  $c = ' '; # conv to space
               }
            } else {
               $c = ''; # last, kill CR
            }
         } else {
            $c = ''; # first, kill CR
         }
      }
      if (length($c)) {
         $nll++;
         if ($nll > $m) {
            if ($c =~ /\s/) {
               $c = "\n";
               $nll = 0;
            }
         }
         $nl .= $c;
      }
   }
   return $nl;
}

# take things like
#<p>
#params
#returns one value,
#the symbolic constant identifying the RGB destination blend
#function. The initial value is
#GL_ZERO
#.
#See
#glBlendFunc
#and
#glBlendFuncSeparate
#.
#</p>
# and return
#<p>
#params returns one value, the symbolic constant identifying the RGB destination blend
#function. The initial value is GL_ZERO. See glBlendFunc and glBlendFuncSeparate.
#</p>
sub inline_clean_paras {
   my ($tx,$mx) = @_;
   my $len = length($tx);
   my $ntxt = '';
   my @wlist = ();
   my ($i, $ch, $intg, $inp, $ino, $tag, $bp, $ep, $wt, $tlen);
   my ($btxt, $etxt);
   $intg = 0;
   $inp = 0;
   $ino = 0;
   $tag = '';
   $bp = 0;
   $ep = 0;
   prt( "Doing [$len] chars for P... max line $mx\n" ) if ($htmldbg8);
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($tx,$i,1);
      if ($intg) {
         if ($ch eq '>') {
            #prt( "$i: End tag\n" );
            $intg = 0;
            if ($tag =~ /^p$/i) {
               $inp = 1;
               $ino = 0;
               $bp = $i + 1;
               #prt( "$bp: Begin p [$tag]\n" );
            } elsif ($tag =~ /^\/p$/i) {
               if ($inp) {
                  $inp = 0;
                  $ep = $i - 4;
                  #prt( "$ep: End p [$tag][$bp - $ep]\n" );
               } else {
                  prt( "$ep: End p [$tag] NOT inp!\n" ) if ($htmldbg8);
               }
            } else {
               $ino = 1;
               #prt( "$i: Other [$tag]\n" );
            }
            $tag = '';
         } else {
            $tag .= $ch;
         }
      } else {
         if ($ch eq '<') {
            $tag = '';
            $intg = 1;
            #prt( "$i: Start tag\n" );
         }
      }
      $ntxt .= $ch;
      $tlen = $ep - $bp;
      if (($bp > 0) && ($tlen > 0)) {
         $wt = substr($tx,$bp,$tlen);
         #prt( "Work:$bp:$ep:$tlen: [$wt]\n" );
         # exit 1;
         push(@wlist, [$bp,$ep]);
         $ep = 0;
         $bp = 0;
      }
   }
   $len = scalar @wlist;
   prt( "Done [$i] chars. Got $len work items...\n" ) if ($htmldbg8);
   for ($i = $len - 1; $i >= 0; $i--) {
      $bp = $wlist[$i][0];
      $ep = $wlist[$i][1];
      $tlen = $ep - $bp;
      $wt = substr($tx,$bp,$tlen);
      $btxt = substr($tx,0,$bp);
      $etxt = substr($tx,$ep+1);
      #prt( "Work:$bp:$ep:$tlen: [$wt]\n" );
      $wt = clean_line_with_wrap($wt,$mx);
      #prt( "AWork:$bp:$ep:$tlen: [$wt]\n" );
      $tx = $btxt.$wt.$etxt;
      $btxt = substr($btxt,length($btxt)-3) if (length($btxt) > 3);
      $etxt = substr($etxt,0,4) if (length($etxt) > 4);
      #prt( "Bgn:[$btxt] End:[$etxt]\n" );
   }
   $len = length($tx);
   prt( "P Returning [$len] chars...\n" ) if ($htmldbg8);
   return $tx;
}


# added 2009-06-24
sub inline_clean_td {
   my ($tx,$mx) = @_;
   my $len = length($tx);
   my $ntxt = '';
   my @wlist = ();
   my ($i, $ch, $intg, $inp, $ino, $tag, $bp, $ep, $wt, $tlen);
   my ($btxt, $etxt);
   $intg = 0;
   $inp = 0;
   $ino = 0;
   $tag = '';
   $bp = 0;
   $ep = 0;
   prt( "Doing [$len] chars for TD... max line $mx\n" ) if ($htmldbg8);
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($tx,$i,1);
      if ($intg) {
         if ($ch eq '>') {
            #prt( "$i: End tag\n" );
            $intg = 0;
            if ($tag =~ /^td$/i) {
               $inp = 1;
               $ino = 0;
               $bp = $i + 1;
               #prt( "$bp: Begin TD [$tag]\n" );
            } elsif ($tag =~ /^\/td$/i) {
               if ($inp) {
                  $inp = 0;
                  $ep = $i - 5;
                  #prt( "$ep: End TD [$tag][$bp - $ep]\n" );
               } else {
                  prt( "$ep: End td [$tag] NOT inp!\n" ) if ($htmldbg8);
               }
            } else {
               $ino = 1;
               #prt( "$i: Other [$tag]\n" );
            }
            $tag = '';
         } else {
            $tag .= $ch;
         }
      } else {
         if ($ch eq '<') {
            $tag = '';
            $intg = 1;
            #prt( "$i: Start tag\n" );
         }
      }
      $ntxt .= $ch;
      $tlen = $ep - $bp;
      if (($bp > 0) && ($tlen > 0)) {
         $wt = substr($tx,$bp,$tlen);
         #prt( "Work:$bp:$ep:$tlen: [$wt]\n" );
         # exit 1;
         push(@wlist, [$bp,$ep]);
         $ep = 0;
         $bp = 0;
      }
   }
   $len = scalar @wlist;
   prt( "Done [$i] chars. Got $len work items...\n" ) if ($htmldbg8);
   for ($i = $len - 1; $i >= 0; $i--) {
      $bp = $wlist[$i][0];
      $ep = $wlist[$i][1];
      $tlen = $ep - $bp;
      $wt = substr($tx,$bp,$tlen);
      $btxt = substr($tx,0,$bp);
      $etxt = substr($tx,$ep+1);
      #prt( "Work:$bp:$ep:$tlen: [$wt]\n" );
      $wt = clean_line_with_wrap($wt,$mx);
      #prt( "AWork:$bp:$ep:$tlen: [$wt]\n" );
      $tx = $btxt.$wt.$etxt;
      $btxt = substr($btxt,length($btxt)-3) if (length($btxt) > 3);
      $etxt = substr($etxt,0,4) if (length($etxt) > 4);
      #prt( "Bgn:[$btxt] End:[$etxt]\n" );
   }
   $len = length($tx);
   prt( "TD Returning [$len] chars...\n" ) if ($htmldbg8);
   return $tx;
}


# added 2009-06-24 - will remove <p>( |\n)+</p>
sub remove_empty_paras {
   my ($t) = shift;
   my $l = length($t);
   my ($c, $p, $p2, $c2, $tx);
   my $nt = '';
   my $it = 0;
   my $tg = '';
   $tx = '';   # start no text in para
   for ($p = 0; $p < $l; $p++) {
      $c = substr($t,$p,1);
      if ($it) {
         if ($c eq '>') {
            $it = 0;
            if ($tg =~ /^p$/) {
               # got para start
               $p2 = $p + 1; # bump to next
               $tx = '';   # start text accumulation
               for (; $p2 < $l; $p2++) {
                  $c2 = substr($t,$p2,1);
                  if ($c2 eq '<') { # reached NEXT tag
                     if ($tx =~ /\S/) {
                        # has NON space, so leave it
                     } else {
                        # all spacey stuff up to thsi next tag
                        $tg = '';
                        $p2++;   # move on, and get this tag
                        for (; $p2 < $l; $p2++) {
                           $c2 = substr($t,$p2,1);
                           if ($c2 eq '>') {
                              if ($tg =~ /^\/p$/) {
                                 $p = $p2; # SUCCESS - skip it
                                 $c = ''; # kill anything to add
                                 # and have already add '<p', so
                                 $nt = substr($nt,0,length($nt)-2);
                              }
                              last; # out of 2nd inner loop
                           } else {
                              $tg .= $c2;
                           }
                        } # second inner loop on finding next end of tag
                     }
                     last# out of 1st inner loop
                  } else {
                     $tx .= $c2; # accumulate text BETWEEN tags
                  }
               }  # for - inner loop on finding <P> tag
            }  # if is <p>
            $tg = '';
         } else {
            $tg .= $c;
         }
      } elsif ($c eq '<') {
         $it = 1; # tag commencing
         $tg = ''; # so start with nothing
      }
      $nt .= $c;
   }
   return $nt;
}

# added 2009-06-24 - only checks initial lines
sub remove_doctype {
   my ($tx) = shift;
   my $l = length($tx);
   for (my $p = 0; $p < $l; $p++) {
      my $c = substr($tx,$p,1);
      if ($c eq '<') {
         my $dt = ''; # start possible doctype accumulation
         $p++; # bump to next
         for (; $p < $l; $p++) {
            $c = substr($tx,$p,1);
            if ($c eq '>') {  # reached end of tag
               if ($dt =~ /^!DOCTYPE\s+/i) {
                  $p++;
                  return substr($tx,$p);
               }
               last; # not doctype - end of search
            } else {
               $dt .= $c;
            }
         }
         last;
      } elsif ($c =~ /\S/) {
         last; # not '<' or space - end of search
      }
   }
   return $tx;
}

1;
# eof - htmltools.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional