Generated: Tue Feb 2 17:54:42 2010 from htmlparse02.pl 2008/07/24 4.4 KB.
#!/perl -w # NAME: htmlparse.pl # AIM: use strict; use warnings; use HTML::Parser; require 'logfile.pl' or die "Unable to load logfile.pl ...\n"; # log file stuff my ($LF); my $pgmname = $0; if ($pgmname =~ /\w{1}:\\.*/) { my @tmpsp = split(/\\/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = "temp.$pgmname.txt"; open_log($outfile); prt( "$0 ... Hello, World ...testing HTML::Parser ...\n" ); my @accum = (); my ($p); my $type = 1; #1; # 5; #3; #2; my @array = (); my $scnt = 0; if ($type == 1) { # Create parser object $p = HTML::Parser->new( api_version => 3, start_h => [\&start_sub, "self, tagname, attr, attrseq, text"], end_h => [\&end_sub, "self, tagname, text"], text_h => [\&text_sub, "self, text"], process_h => [\&process_sub, "self, token0, text"], marked_sections => 1 ); } #my $p = HTML::Parser->new( api_version => 3, # marked_sections => 1, # ); # Parse document text chunk by chunk # $p->parse($chunk1); # $p->parse($chunk2); #... # $p->eof; # signal end of document if ($type == 2) { # THIS FAILS ???? ################# $p = HTML::Parser->new( api_version => 2 ); #$p = HTML::Parser->new(); #$p->handler(start => "start_sub", "self, tagname, attr, attrseq, text"); #$p->handler(end => "end_sub", "self, tagname, text"); #$p->handler(text => "text_sub", "self, text, is_cdata"); #$p->handler(process => "process_sub", "self, token0, text"); $p->handler(start => [\&start_sub, "self, tagname, attr, attrseq, text"]); $p->handler(end => [\&end_sub, "self, tagname, text"]); $p->handler(text => [\&text_sub, "self, text, is_cdata"]); $p->handler(process => [\&process_sub, "self, token0, text"]); $p->handler(comment => sub { my($self, $tokens) = @_; for (@$tokens) {$self->comment($_);}}, "self, tokens"); $p->handler(declaration => sub { my $self = shift; $self->declaration(substr($_[0], 2, -1));}, "self, text"); } if ($type == 3) { $p = HTML::Parser->new(); # Event is one of text, start, end, declaration, comment, process or default. $p->handler(start => \@accum, '"S", attr, attrseq, text' ); # The array elements will be ['S', \%attr, \@attr_seq, $text]. } if ($type == 5) { $p = HTML::Parser->new(api_version => 3, handlers => { text => [\@array, "event,text"], comment => [\@array, "event,text"] } ); } # Parse directly from file $p->parse_file('favorites.htm'); # or # open(F, "foo.html") || die; # $p->parse_file(*F); if ($type == 3) { $scnt = scalar @accum; prt( "$type: Got $scnt start events ...\n" ); } if ($type == 5) { $scnt = scalar @array; prt( "$type: Got $scnt text, comment events ...\n" ); for (my $k = 0; $k < $scnt; $k++) { my $ev = $array[$k][0]; my $tx = $array[$k][1]; my $ttx = trim_all($tx); if (length($ttx)) { prt( "$ev - $tx ...\n" ); } else { prt( "$ev - blank ...\n" ); } } } close_log($outfile,1); exit(0); sub start_sub { my($self, $tagname, $attr, $attrseq, $origtext) = @_; my $attrs = ''; my ($ky, $msg, $cnt); foreach $ky (keys %$attr) { $attrs .= ' '.$ky.'="'.$$attr{$ky}.'"'; } $msg = "$type:1: start tag=<$tagname"; $msg .= $attrs if length($attrs); $msg .= '>'; prt( "$msg\n" ); $attrs = ''; $cnt = 0; foreach $ky (@$attrseq) { $cnt++; $attrs .= '|' if length($attrs); $attrs .= $ky; } if (length($attrs) && ($cnt > 1)) { $msg = "$type:2: start tag=$tagname"; $msg .= ' - attr order = ['.$attrs.']'; prt( "$msg\n" ); } prt( "$type:3: start ot[$origtext]\n" ) if ($cnt > 1); } sub end_sub { my($self, $tagname, $origtext) = @_; prt( "$type: end tag=$tagname ot[$origtext] ...\n" ); } sub text_sub { my($self, $origtext) = @_; my $ttxt = trim_all($origtext); if (length($ttxt)) { prt( "$type: text ot[$origtext] ...\n" ); } else { prt( "$type: text blank ...\n" ); } } sub process_sub { prt( "$type: process called ...\n" ); } # eof