Generated: Sun Apr 15 11:46:42 2012 from showhrefs.pl 2011/11/21 21.3 KB.
#!/perl -w # NAME: showhrefs.pl (also see competing gethrefs??.pl) # AIM: Given a HTML file, extract, and show HREF (anchor) entries. # 21/11/2011 - Add -l = load log, -a = show all # 20/10/2010 - Added alpha sort, and consider -d <dir> to delete those in this directory. # 14/03/2010 - Lots of IMPROVEMENTS, including batch running... ie parse_arg(list of files) # 2010/03/22 - strip '#down' off file name before checking # 20100312 - Some tidying... # 29/07/2007 - geoff mclane - http://geoffair.net/mperl/index.htm use strict; use warnings; use File::Basename; use Socket; use LWP::Simple; unshift(@INC, 'C:/GTools/perl'); require 'logfile.pl' or die "Unable to load logfile.pl ...\n"; require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n"; # log file stuff my ($LF); my $pgmname = $0; if ($pgmname =~ /(\\|\/)/) { my @tmpsp = split(/(\\|\/)/,$pgmname); $pgmname = $tmpsp[-1]; } my $perl_root = 'C:\GTools\perl'; my $outfile = $perl_root."\\temp.$pgmname.txt"; open_log($outfile); my $VERS = "0.0.3 2011-11-21"; # from gethrefs02.pl use constant { HRT_UNKNOWN => 0, HRT_LOCAL => 1, HRT_LINK => 2, HRT_SCRIPT => 4, HRT_FILE => 8, HRT_BASE => 16 }; # constants - from chklink1.pl my $HR_REM = 1; my $HR_LOC = 2; my $HR_FIL = 3; my $HR_FTP = 4; my $HR_MT = 5; my $HR_JAV = 6; my %type2name = ( $HR_REM => 'remote', $HR_LOC => 'local', $HR_FIL => 'file', $HR_FTP => 'fpt', $HR_MT => 'mailto', $HR_JAV => 'java' ); sub get_type_href($) { my $typ = shift; if (defined $type2name{$typ}) { return $type2name{$typ}; } return "$typ not listed"; } # here my $H_HTTP = 1; my $H_HTTPS = 2; my $H_FTP = 3; my $H_MAILTO = 4; my $H_JAVA = 5; my $H_FILE = 5; my $H_LOCAL = 6; my $H_OTHER = 7; # options my $g_base_dir = "C:\\HOMEPAGE\\GA\\"; my @in_files = (); #my @in_files = qw ( climate\climate-01.htm ); #my @in_files = qw( misc/flags/index.htm ); #my @in_files = qw( home2.htm ); #my @in_files = qw ( cgi\index.htm ); #my @in_files = qw ( travel\maroc\maroc-slide4.htm ); ##my $g_base_dir = "C:\\HOMEPAGE\\HOM\\test4\\"; ##my @in_files = qw(collections.htm limited-edition.htm groom-center.htm product-lines.htm); ##my $g_in_file = 'collections.htm'; ##my $g_in_file = 'C:\HOMEPAGE\HOM\test4\limited-edition.htm'; ##my $g_in_file = 'C:\HOMEPAGE\HOM\test4\groom-center.htm'; ##my $g_in_file = 'C:\HOMEPAGE\HOM\test4\product-lines.htm'; my $load_log = 0; my $min_href = 45; my $def_out_file = $perl_root."\\tempnohrefs.htm"; my $remove_hrefs = 0; my $do_alpha_sort = 1; my $show_local_href = 0; my $show_ips = 0; # debug my $dbg38 = 0; #prt( "[dbg38] Got [$hr2] = [$txt] [$fil]\n" ) if ($dbg38); my $debug_on = 0; my @def_files = qw( unix\git.htm ); # my @def_files = qw( mperl\src\index.htm ); # global program variables my $g_in_file = ''; my @hrefs = (); my @anchors = (); my $g_title = ''; my $g_hcnt = 0; my $g_acnt = 0; #my $hrf = ''; #my $hfile = ''; my $g_dirname = $g_base_dir; #my $had_menu = 0; my $g_filename = ''; my @html_ext = qw( .htm .html .shtml .php ); my @warnings = (); # DEBUG my $dbg_03 = 0; my $verbosity = 0; sub VERB1() { return ($verbosity >= 1); } sub VERB2() { return ($verbosity >= 2); } sub VERB5() { return ($verbosity >= 5); } sub VERB9() { return ($verbosity >= 9); } sub show_warnings($) { my ($val) = @_; if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS...\n" ); foreach my $itm (@warnings) { prt("$itm\n"); } prt("\n"); } else { ### prt( "\nNo warnings issued.\n\n" ); } } sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg); } show_warnings($val); close_log($outfile,$load_log); exit($val); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } sub get_file_title($) { my ($inf) = shift; my ($IN); my $tit = ''; if (open $IN, "<$inf") { my @lines = <$IN>; close $IN; ###my $lc = scalar @lines; ###prt( "Processing $lc lines from $inf ...\n" ); $tit = return_tag( join( '', @lines ), 'title' ); $tit =~ s/\n/ /gm; $tit = trim_all($tit); } else { ###prt( "ERROR: Failed to open $inf ... $! ...\n" ); $tit = "<open failed on $inf>"; } return $tit; } sub get_IPAddress($$$) { my ($nm,$rip,$show) = @_; my @addr = gethostbyname($nm); my $cnt = 0; if( !@addr ) { prt( "Can't resolve [$nm]: $!\n" ) if ($show); return 0; } @addr = map { inet_ntoa($_) } @addr[4 .. $#addr]; foreach my $k (@addr) { ${$rip} = $k if ($cnt == 0); $cnt++; prt( "$cnt: $nm resolves to IP [$k]\n" ) if ($show); } return $cnt; } sub process_file($) { my ($inf) = shift; if (open INF, "<$inf") { my @lines = <INF>; close INF; my $lc = scalar @lines; prt( "\nProcessing $lc lines from $inf ...\n" ); my $ft = join( '', @lines ); $g_title = return_tag( $ft, 'title' ); $g_title =~ s/\n/ /gm; $g_title = trim_all($g_title); my $ntxt = remove_script( $ft ); $ntxt = dropcomments($ntxt); # write2file($ntxt,"tempnew.txt"); $ntxt = trimblanklines($ntxt); @hrefs = ret_hrefs_array( $ntxt ); @anchors = ret_anchor_array( $ntxt ); if ($remove_hrefs) { $ntxt = collecthrefs($ntxt,1); write2file($ntxt,$def_out_file); prt("Written $def_out_file file, without anchors...\n"); } } else { prt( "ERROR: Failed to open [$inf] ... $! ...\n" ); return 0; } return 1; } sub anchor_href($) { my ($txt) = shift; my $len = length($txt); my $ch = ''; my $pch = ' '; my $tag = ''; for (my $i = 0; $i < $len; $i++) { $ch = substr($txt, $i, 1); if ((lc($ch) eq 'h')&&($pch =~ /\s/)) { $pch = substr($txt, $i); if ($pch =~ /^href=/i) { $tag = substr($txt, ($i+5)); $tag = trim_all($tag); if ( $tag =~ /^['"]/ ) { $pch = substr($tag,0,1); $tag = substr($tag,1); } else { $pch = ' '; } my $ind = index($tag, $pch); if ($ind != -1) { $tag = substr($tag,0,$ind); } return $tag; } } $pch = $ch; } return $tag; } ######################################################### # Passed an array of extensions, # check if this is one of them? ######################################################### sub file_has_my_ext($$) { my ($fil, $rexts) = @_; my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); my $lcext = lc($ext); foreach my $ex (@{$rexts}) { return 1 if (lc($ex) eq $lcext); } return 0; } sub get_href_type($) { my ($src) = shift; if ($src =~ /^http:/i) { #push(@httprefs, [$src, $fil, $lnnos] ); return $H_HTTP; # remote HREF } elsif ($src =~ /^https:/i) { return $H_HTTPS; # remote HREF #push(@httpsrefs, [$src, $fil, $lnnos] ); } elsif ($src =~ /^ftp:/i) { #push(@ftprefs, [$src, $fil, $lnnos] ); return $H_FTP; # remote HREF } elsif ($src =~ /^mailto:/i) { #push(@mtrefs, [$src, $fil, $lnnos] ); return $H_MAILTO; # remote HREF } elsif ( $src =~ /^javascript:/i ) { return $H_JAVA; # a JAVASCRIPT HREF } elsif ($src =~ /^file:/i) { return $H_FILE; # remote HREF } elsif ( substr($src,0,1) eq '#') { # local in page HREF return $H_LOCAL; } else { my $ind = index($src,'#'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $ind = index($src,'?'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $src =~ s/\/$//; if (length($src)) { return $H_OTHER; } } return 0; } sub get_href_type_name($) { my ($src) = shift; my $typ = get_href_type($src); if ($typ == $H_HTTP) { # ($src =~ /^http:/i) return "$H_HTTP: remote HREF (http)"; } elsif ($typ == $H_HTTPS) { # ($src =~ /^https:/i) return "$H_HTTPS: remote HREF (https)"; } elsif ($typ == $H_FTP) { # ($src =~ /^ftp:/i) return "$H_FTP: remote HREF (ftp)"; } elsif ($typ == $H_MAILTO) { # ($src =~ /^mailto:/i) { return "$H_MAILTO: remote HREF (mailto)"; } elsif ($typ == $H_JAVA) { # if ($src =~ /^javascript:/i ) { return "$H_JAVA: a JAVASCRIPT HREF"; } elsif ($src =~ /^file:/i) { return "$H_FILE: a FILE HREF"; } return "$H_JAVA: a ???? HREF CHECKME"; } elsif ($typ == $H_LOCAL) { # ( substr($src,0,1) eq '#') return "$H_LOCAL: infile link"; # (".substr($src,1).")"; } elsif ($typ == $H_OTHER) { return "$H_OTHER: other (local) link"; } return "0: UNCASED [$src] CHECKME!"; } sub mycmp_decend0 { my $off = 0; return 1 if (${$a}[$off] < ${$b}[$off]); return -1 if (${$a}[$off] > ${$b}[$off]); return 0; } sub mycmp_decend1 { return 1 if (${$a}[1] gt ${$b}[1]); return -1 if (${$a}[1] lt ${$b}[1]); return 0; } sub get_anchor_hash_ref_lc($$$) { my ($fank,$fil,$dbg) = @_; my %hash = (); my ($ank,$len,$i,$ch,$pc,$hr2,$txt); my ($lchr2); if ($fank =~ /<a\s+(.+)>$/) { $ank = trim_all($1); $len = length($ank); $ch = ''; $hr2 = ''; for ($i = 0; $i < $len; $i++) { $pc = $ch; $ch = substr($ank,$i,1); if ($ch =~ /\w/) { $hr2 .= $ch; # accumulate \w chars - alphanumeric, including _ } elsif (length($hr2)) { if (($ch ne '=') && ($ch =~ /\s/)) { $i++; for (; $i < $len; $i++) { $ch = substr($ank,$i,1); last if ($ch eq '='); last if !($ch =~ /\s/); } } if ($ch eq '=') { # found our equal sign $i++; # move on... for (; $i < $len; $i++) { $ch = substr($ank,$i,1); last if ($ch =~ /('|")/); last if !($ch =~ /\s/); } if (($ch eq '"')||($ch eq "'")) { $pc = $ch; $i++; # move on... $txt = ''; for (; $i < $len; $i++) { $ch = substr($ank,$i,1); last if ($ch eq $pc); $txt .= $ch; } if ($ch eq $pc) { $lchr2 = lc($hr2); $hash{$lchr2} = $txt; prt( "[dbg38] Got [$hr2] = [$txt] [$fil]\n" ) if ($dbg38); } else { prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO END INVERTED COMMA! ($pc) [$fil]\n"); pgm_exit(1,"") if ($dbg); } } else { if (($ch =~ /\w/) && (($hr2 =~ /name/i)||($hr2 =~ /id/i))) { # accept these WITHOUT inverted comma $txt = $ch; $i++; # MOVING ON for (; $i < $len; $i++) { $ch = substr($ank,$i,1); last if !($ch =~ /\w/); $txt .= $ch; } $lchr2 = lc($hr2); $hash{$lchr2} = $txt; prt( "Got [$hr2] = [$txt] - no inverted commas! [$fil]\n" ); } else { prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO START INVERTED COMMA! [$fil]\n"); pgm_exit(1,"") if ($dbg); } } } else { prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO EQUAL SIGN! [$fil]\n"); pgm_exit(1,"") if ($dbg); } $hr2 = ''; } } } return \%hash; } sub get_URL($$) { my ($href,$rurl) = @_; my $url = ''; if ($href =~ /^http:\/\//) { $url = substr($href,7); } elsif ($href =~ /^https:\/\//) { $url = substr($href,8); } if (length($url)) { my @arr = split(/(\/|\?|\&)/,$url); ${$rurl} = $arr[0]; return 1; } return 0; } sub get_URI($$) { my ($href,$rurl) = @_; my $url = ''; if ($href =~ /^http:\/\//) { $url = substr($href,7); } elsif ($href =~ /^https:\/\//) { $url = substr($href,8); } if (length($url)) { my $ind = index($url,'?'); if ($ind > 0) { $url = substr($url,0,$ind-1); } ${$rurl} = $url; return 1; } return 0; } sub clean_URL($) { my ($href) = @_; my $ind = index($href,'?'); $href = substr($href,0,$ind-1) if ($ind > 0); return $href; } sub get_page($$$) { my ($url,$rtxt,$show) = @_; prt( "Fetching: $url\n" ) if ($show); my $txt = get($url); if ($txt && length($txt)) { prt( "$txt\n" ) if ($show); ${$rtxt} = $txt; return 1; } prt( "FAILED to get URL $url ...\n" ) if ($show); return 0; } sub process_in_files($) { my ($ra) = @_; # (\@in_files); my ($hrtyp,$hrt,$i,@slist,$ff,$msg,$tmp,@arr); my ($j,$ank,@arr2,$file,$rhrf,$hrf,$hfile); my ($nm,$tst,$fnd,$fullanc,$url,$ip,$page,$curl); foreach $hfile (@{$ra}) { $g_in_file = "$g_base_dir$hfile"; ($g_filename, $g_dirname) = fileparse($g_in_file); next if !process_file( $g_in_file ); $g_hcnt = scalar @hrefs; $g_acnt = scalar @anchors; prt( "Got $g_hcnt HREF... $g_acnt anchors... title=\"$g_title\"\n" ); @arr2 = (); for ($i = 0; $i < $g_acnt; $i++) { $hrf = $anchors[$i]; $hrf =~ s/\n/ /gm; $hrf = trim_all($hrf); $file = anchor_href($hrf); # IF ANY $hrt = get_href_type($hfile); #$rhrf = get_anchor_hash_ref_lc($hrf,$file,1); $rhrf = get_anchor_hash_ref_lc($hrf,$file,0); # 0 1 2 3 push(@arr2, [$hrt, $hrf, $file, $rhrf]); } @arr = (); for ($i = 0; $i < $g_hcnt; $i++) { $fnd = -1; $hrf = $hrefs[$i]; $hrf =~ s/\n/ /gm; $hrf = trim_all($hrf); $hrt = get_href_type($hrf); $hrtyp = get_href_type_name($hrf); # try to find ANCHOR for ($j = 0; $j < $g_acnt; $j++) { $ank = $arr2[$j]; if (length(${$ank}[2])) { # a HREF, does it have a 'href' $rhrf = ${$ank}[3]; # extract HASH $nm = ${$rhrf}{'href'}; # extract value if ($hrf eq $nm) { $fnd = $j; # 0 1 2 3 # push(@arr2, [$hrt, $hrf, $file, $rhrf]); $fullanc = ${$ank}[1]; last; } } } if ($hrt == 6) { # infile link - find the anchor $tmp = "NF"; $tst = $hrf; $tst =~ s/^#//; for ($j = 0; $j < $g_acnt; $j++) { $ank = $arr2[$j]; if (length(${$ank}[2]) == 0) { # not a HREF, does it have a 'name' $rhrf = ${$ank}[3]; # extract HASH if (defined ${$rhrf}{'name'}) { $nm = ${$rhrf}{'name'}; # extract value if ($tst eq $nm) { $tmp = "ok"; last; } } } } $hrtyp .= " $tmp ($fnd)"; } elsif ($hrt == 7) { # local file link my $fn = $hrf; my $ind = index($fn,'#'); if ($ind > 0) { $fn = substr($fn,0,$ind); } $ff = "$g_dirname$fn"; if (-f $ff) { $hrtyp .= " ok ($fnd)" ; } else { $tmp = $ff.".txt"; if (-f $tmp) { $hrtyp .= " ok (with .txt) ($fnd)"; } else { $hrtyp .= " NF [$ff] ($fnd)"; if ($fnd != -1) { $hrtyp .= "[$fullanc]"; } } } } # 0 1 2 3 4 push(@arr, [$hrt, $hrf, $hrtyp, $fnd, $hrt]); } if ($do_alpha_sort) { prt("Moment, doing alpha sort...\n"); @arr = sort mycmp_decend1 @arr; } prt("Moment, doing type sort...\n"); @slist = sort mycmp_decend0 @arr; prt("List up to $g_hcnt HREF (sorted by type)"); if (!$show_local_href) { prt(" excluding local, in file references."); } prt("\n"); for ($i = 0; $i < $g_hcnt; $i++) { $hrf = $slist[$i][1]; $hrtyp = $slist[$i][2]; $hrt = $slist[$i][4]; if (!$show_local_href) { next if ($hrt == $H_LOCAL); } $hrf .= ' ' while (length($hrf) < $min_href); prt( "$hrf [$hrtyp] " ); if ($show_ips) { if (($hrt == $H_HTTP)||($hrt == $H_HTTPS)) { $hrf = $slist[$i][1]; $url = ''; $ip = ''; $page = ''; if (get_URL($hrf,\$url) && length($url)) { if (get_IPAddress($url,\$ip,0)) { prt("IP=$ip "); if (get_page($url,\$page,0)) { prt("page ok"); } else { $curl = clean_URL($hrf); if (get_page($curl,\$page,0)) { prt("page ok"); } else { prt("page failed"); } } } else { prt("IP failed"); } } } } prt("\n"); } #prt( "\nList $g_acnt anchors \n" ); #$had_menu = 0; #foreach $hrf (@anchors) { #$hrf =~ s/\n/ /gm; #$hrf = trim_all($hrf); #$hfile = anchor_href($hrf); #if (file_has_my_ext("$g_dirname$hfile", \@html_ext)) { #$g_title = get_file_title( "$g_dirname$hfile" ); #} else { #$g_title = "<not html>"; #} ####prt( "$hrf ($hfile) $g_title\n" ); #prt( "[$hrf] [$hfile], title=[$g_title]\n" ); # if ($had_menu); #if ($hfile eq './') { #$had_menu = 1; #} #} } } ### MAIN PROCESS ### parse_args(@ARGV); process_in_files(\@in_files); pgm_exit(0,"Normal exit"); ######################## sub give_help { prt("$pgmname Version $VERS\n"); prt("$pgmname [options] file [file ...]\n"); prt("The input file name(s) will be processed, and a list of HREF found output.\n"); prt("options:\n"); prt(" -h or -? = this help, and exit 0\n"); # prt(" -d <dir> = Delete HREFS which are in this directory. ^dir for starts with this directory.\n"); prt(" -a = Show ALL hrefs, including in-file links.\n"); prt(" -l = Load log at end.\n"); prt(" -o <out> = set the output file name. Implies -r.\n"); prt(" -r = remove HREFS, and write to file [$def_out_file]\n"); prt(" -s = Show IP and page fetch.\n"); } sub need_arg { my ($arg,@av) = @_; pgm_exit(1,"ERROR: Option [$arg] needs foloowing argument! Try -h for valid options.\n") if (!@av); } sub parse_args { my (@av) = @_; my $cnt = 0; my @arr = (); my ($sarg,$ch); while (@av) { $cnt++; my $arg = $av[0]; if ($arg =~ /^-/) { # what options? $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); $ch = substr($sarg,0,1); if (($ch eq 'h')||($ch eq '?')) { give_help(); exit(0); } elsif ($ch eq 'a') { $show_local_href = 1; prt("Set to show ALL hrefs, including local, in file links.\n"); } elsif ($ch eq 'l') { $load_log = 1; prt("Set to load log at end.\n"); } elsif ($ch eq 'r') { prt("Set to remove HREFS, and write file.\n"); $remove_hrefs = 1; } elsif ($ch eq 'o') { need_arg(@av); shift @av; $cnt++; $def_out_file = $av[0]; prt("Set to remove HREFS, and write file $def_out_file.\n"); $remove_hrefs = 1; } elsif ($ch eq 's') { $show_ips = 1; prt("Set to show IP, and page fetch.\n"); } else { pgm_exit(1,"ERROR: Unknown option [$arg]! Try -h for valid options.\n"); } } else { if (-f $arg) { push(@arr,$arg); } else { pgm_exit(1,"ERROR: Unable to locate [$arg] file! Argument # $cnt. Check name, location...\n"); } } shift @av; } if (! @arr && $debug_on ) { @in_files = @def_files; prt("Set INPUT to debug default!\n"); } if (@arr) { prt("Set input to "); foreach my $f (@arr) { prt("$f "); } prt("\n"); $g_base_dir = ""; @in_files = @arr; } elsif (!@in_files) { pgm_exit(1,"ERROR: No input files found in command!\n"); } } # eof