Generated: Tue Feb 2 17:54:23 2010 from chkintlinks.pl 2008/11/28 10.7 KB.
#!/perl -w # NAME: chkintlinks.pl # AIM: Given a HTML file, check for <a name="abc"...> and <a href="#abc"...> # are consistent # 13/11/2008 - geoff mclane - http://geoffair.net/mperl use strict; use warnings; use File::Basename; ##require 'logfile.pl' or die "Unable to load logfile.pl ...\n"; require 'fgutils.pl' or die "Unable to load fgutils.pl ...\n"; # log file stuff my ($LF); my $pgmname = $0; if ($pgmname =~ /\w{1}:\\.*/) { my @tmpsp = split(/\\/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = "temp.$pgmname.txt"; open_log($outfile); my $in_file = "C:\\HOMEPAGE\\GA\\mperl\\perl_hash.htm"; ###my $in_file = "C:\\HOMEPAGE\\GA\\mperl\\perl_ref.htm"; ###my $in_file = "C:\\HOMEPAGE\\GA\\fg\\fgfs-045.htm"; ###my $in_file = "C:\\HOMEPAGE\\GA\\fg\\fgall.htm"; ##my $in_file = "C:\\HOMEPAGE\\GA\\fg"; ##my $in_file = "C:\\HOMEPAGE\\GA\\fg\\fgfs-045b.htm"; ##my $in_file = "C:\\HOMEPAGE\\GA\\fg\\fgfs-044.htm"; my $resursive = 1; my @warnings = (); my $done_files = 0; my $loadlog = 0; prt( "$0 ... Hello, processing $in_file ...\n" ); if (-f $in_file) { my %h = process_file($in_file); } elsif (-d $in_file) { process_directory($in_file,$resursive); } else { prt("WARNING: $in_file is NOT file or folder ... check name, location!\n"); } if (($done_files > 2)||(scalar @warnings > 10)) { show_warnings(0); $loadlog = 1; } close_log($outfile,$loadlog); exit(0); ###################################### ###### SUBS ONLY ###### sub is_my_file_type { my ($fil) = shift; if ($fil =~ /\.htm$/i) { return 1; } elsif ($fil =~ /\.html$/i) { return 1; } return 0; } sub html_to_lines { my ($rlm, @lns) = @_; my $intag = 0; my $text = ''; # gather TEXT between tags my @nlines = (); my ($fln, $ln, $ch, $pch, $nch, $len, $i, $i2, $tag, $xml, $dnx); my ($lnnm, $lnb, $nlnm); my ($ppch, $incomm); my $show_comm_dbg = 0; $pch = ''; $ppch = ''; $nch = ''; $tag = ''; $xml = ''; $dnx = 0; $lnnm = 0; $nlnm = 0; $lnb = 0; $incomm = 0; $text = ''; # start NO TEXT foreach $fln (@lns) { chomp $fln; $ln = trim_all($fln); $len = length($ln); $lnnm++; # count another xml line for ($i = 0; $i < $len; $i++) { $i2 = $i + 1; $ch = substr($ln,$i,1); $nch = (($i2 < $len) ? substr($ln,$i2,1) : ' '); if ($intag) { # on first GREATER THAN - SPACE $tag .= $ch; if ($ch eq '>') { if ( $incomm ) { prt("$lnnm: potential end of XML tag pch=$pch ppch=$ppch\n") if ($show_comm_dbg); if (($pch eq '-') && ($ppch eq '-')) { $nlnm++; push(@nlines,$tag); ### prt( "push(\@xlnmap, [ $nlnm, $lnb, $lnnm ]); # each NEW line has BEGIN and END\n" ); $$rlm{$nlnm} = "$lnb-$lnnm"; # each NEW line has BEGIN and END $tag = ''; $intag = 0; $xml = ''; $incomm = 0; prt( "$lnnm: Exit comment [$ln]\n" ) if ($show_comm_dbg); } } else { $nlnm++; push(@nlines,$tag); ### prt( "push(\@xlnmap, [ $nlnm, $lnb, $lnnm ]); # each NEW line has BEGIN and END\n" ); $$rlm{$nlnm} = "$lnb-$lnnm"; # each NEW line has BEGIN and END $tag = ''; $intag = 0; $xml = ''; $incomm = 0; } } } else { if ($ch eq '<') { if (length($text)) { $nlnm++; push(@nlines,$text); $$rlm{$nlnm} = "$lnb-$lnnm"; # each NEW line has BEGIN and END $text = ''; } $tag = $ch; # start a tag line $intag = 1; # signal in a tag $xml = ''; $dnx = 0; $lnb = $lnnm; # set the BEGIN xml line if ($nch eq '!') { # but watch out for <!DOCTYPE ...> if ($ln =~ /<!--/) { prt( "$lnnm: Entering comment [$ln]\n" ) if ($show_comm_dbg); $incomm = 1; } } } else { $text .= $ch; } } $ppch = $pch; $pch = $ch; } # done a line - this is like a SPACE if ($intag && length($tag)) { $tag .= ' ' if !($tag =~ /(=|\s)$/); } } prtw("WARNING: Exit STILL in comment!\n") if ($incomm); if (length($tag)) { prtw("WARNING: xml re-lining error! Left pending tag [$tag]\nin $in_file file ...\n"); } return @nlines; } sub process_file { my ($fil) = shift; my ($in_name,$in_dir) = fileparse($fil); my (@attribs, %atthash, %lnmap); my (@lines, $xlncnt, $lnnum, $line, $tag, $xln); my @names = (); my @hrefs = (); my @lhrefs = (); my %names_hash = (); my %hrefs_hash = (); my %lhrefs_hash = (); my ($name, $href, $nmcnt, $hrcnt, $lrcnt, $msg); my $dbg_name = 0; my $dbg_href = 0; my %hash = (); if (open INF, "<$fil") { %lnmap = (); @lines = <INF>; close INF; $xlncnt = scalar @lines; @lines = html_to_lines(\%lnmap, @lines); write2file(join("\n",@lines),'tempxml.txt'); $lnnum = scalar @lines; $msg = "Processing $lnnum, from $xlncnt lines, from $fil ...\n"; $lnnum = 0; prt( "$msg" ); foreach $line (@lines) { chomp $line; $lnnum++; $xln = $lnmap{$lnnum}; #prt( "line $xln: $line\n" ); @attribs = space_split($line); $tag = $attribs[0]; if ($tag && length($tag)) { %atthash = (); # clear HASH - only if NOT a comment <!-- ... --> if ($tag =~ /^</) { %atthash = array_2_hash_on_equals(@attribs) if !($tag =~ /^<!--/); } if ($tag =~ /<a/i) { ###or if ($line =~ /^<a\s+(.+)>/i) { if (defined $atthash{'name'}) { $name = $atthash{'name'}; $name =~ s/>$//; $name = strip_both_quotes($name); if (is_in_array($name,@names)) { prtw("WARNING: Name $name already in NAME array ($fil:$xln)...\n" ); } else { push(@names,$name); $msg = "$xln: Name = [$name]"; $names_hash{$name} = $msg; prt( "$msg\n" ) if ($dbg_name); } } if (defined $atthash{'href'}) { $href = $atthash{'href'}; $href =~ s/>$//; $href = strip_both_quotes($href); if (is_in_array($href,@hrefs)) { ### prtw("WARNING: HREF $href already in HREF array ...\n" ); } else { push(@hrefs,$href); $msg = "$xln: HREF = [$href]"; $hrefs_hash{$href} = $msg; prt( "$msg\n" ) if ($dbg_href); } } #prt( "$line\n" ); } } } $nmcnt = scalar @names; $hrcnt = scalar @hrefs; foreach $href (@hrefs) { if ($href =~ /^#.+/) { $name = $href; push(@lhrefs, substr($href,1)); $lhrefs_hash{$href} = $hrefs_hash{$name}; # copy the info } } $lrcnt = scalar @lhrefs; prt( "Got $nmcnt names, and $hrcnt HREF. $lrcnt local, entries ...\n" ); $hrcnt = 0; foreach $href (@lhrefs) { if ( !is_in_array($href,@names) ) { $name = '#'.$href; $msg = $lhrefs_hash{$name}; prtw("WARNING: Got local HREF of [$href], but not NAME! ($msg)\n"); $hrcnt++; } } if ($hrcnt) { prtw("WARNING: Got $hrcnt local ref with no NAME anchor in $in_name!\n"); } elsif ($lrcnt) { prt("All $lrcnt local references point to a NAME.\n" ); } else { ###prtw("WARNING: NO local references found.\n" ); } $hrcnt = 0; foreach $name (@names) { if ( !is_in_array($name,@lhrefs)) { $hrcnt++; } } if ($hrcnt) { prtw("WARNING: Got $hrcnt ANCHOR names, with NO internal links in $in_name ...\n"); foreach $name (@names) { if ( !is_in_array($name,@lhrefs)) { $msg = $names_hash{$name}; prtw("NOTE: NO HREF for [$name]! ($msg)\n"); } } } } else { prtw("WARNING: Unable to open $fil!\n" ); } $hash{'NAMES'} = { %names_hash }; $hash{'HREFS'} = { %hrefs_hash }; $hash{'LOCAL'} = { %lhrefs_hash }; $done_files++; return %hash; } sub process_directory { my ($inf, $rec) = shift; prt( "Processing $inf folder ...\n" ); if ( opendir( DIR, $inf ) ) { my @files = readdir(DIR); closedir DIR; foreach my $fil (@files) { if (($fil eq ".")||($fil eq "..")) { next; } my $ff = $inf."\\".$fil; if ( -d $ff ) { process_directory( $ff ) if ($rec); } else { if (is_my_file_type($fil)) { my %h = process_file( $ff ); } } } } } sub prtw { my ($tx) = shift; $tx =~ s/\n$// if ($tx =~ /\n$/); prt("$tx\n"); push(@warnings,$tx); } sub show_warnings { my ($dbg) = shift; if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS ...\n" ); foreach my $line (@warnings) { prt("$line\n" ); } prt("\n"); } elsif ($dbg) { prt("\nNo warnings issued.\n\n"); } } # eof - chkintlinks.pl