#!/perl -w # NAME: chklinks03.pl # AIM: Given a input FOLDER, check all the HTML found for a ... # AND check ALL image links , if it is a LOCAL file, # and other 'link' items, like .zip, .txt, etc. # 2019-05-06 - Review # 17/10/2013 - Default is to follow page links. Added -nofollow to only do the page # 22/05/2011 - More tidying, reducing output to 'essentials' unless -v2 or more # 19/05/2011 - Fix of 20100312 fix, to find substitutes '.txt' files # 02/04/2010 - Some more 'fixes' - continue 2010/04/04 - 2010/04/12... # 2010/04/01 - Fix for 'remove_script' - to move to htmltools.pl # 15/03/2010 - After a number of 'fixes', starting to feel SOLID ;=)) # 12/03/2010 - Special case - I have replaced large ZIPS, with a TEXT # file, so do NOT give this WARNING, if such a .txt file exists. # 11/03/2010 - Lots of tidying, and fixing... experimented with another collecthrefs2 # but found only difference is in the DUPLICATE that can be in htmltools::collecthrefs # 13/11/2008 - some tidying, especially when from ONE FILE # where there will be NO from|to references # 18/06/2007 - Add some command parameters, and help # 02/06/2007 - geoff mclane - geoffair.com/mperl/index.htm use strict; use warnings; use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] ) use File::Spec; # File::Spec->rel2abs($rel); # get ABSOLUTE form use Socket; use Cwd; # for cwd() my $os = $^O; unshift(@INC, 'C:/GTools/perl'); # for htmltools, if functions used my @imgs = (); my @hrefs = (); require 'logfile.pl' or die "Unable to load logfile.pl ...\n"; require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n"; # log file stuff my ($LF); my $pgmname = $0; if ($pgmname =~ /\w{1}:\\.*/) { my @tmpsp = split(/\\/,$pgmname); $pgmname = $tmpsp[-1]; } my $perl_base = 'C:/GTools/perl'; my $outfile = "$perl_base/temp.$pgmname.txt"; open_log($outfile); my $currworkdir = cwd(); my $out_hrefs = $perl_base."\\temphrefs.txt"; my $out_hosts = $perl_base."\\temphosts.txt"; my $out_domains = $perl_base."\\tempdomains.txt"; # some FEATURES and USER variables # my @excludes = qw( cvineng2.htm ); my @excludes = qw( desktop.ini php.ini blank.html blank.htm index3.htm ); my @splexcludes = qw( macpc tmp wedding ); my @def_indexes = qw( default.html default.htm index.aspx index.htm index.html index.jsp index.php index.shtml home.htm home.html welcome.html welcome.htm ); my $recurse = 1; # recursive # follow links to other files, and check them also my $ignfpd = 1; # ignore FRONTPAGE folders my $chkip = 0; # check the IP address my $check_host_ip = 0; # check the host domain only IP my $load_log = 0; # load log file at end -ll to set. my $show_no_index = 0; # show when NOT html in folder, OR no 'index' type file my $showhreflinks = 0; # show a WARNING when an IMG, ICO, etc is a REMOTE link my $showlinks = 0; # show the links for each file my $shownolinks = 1; # show NO links for each file my $showscripts = 0; # show SCRIPT files my $writeips = 1; # write IP found to a file my $refreships = 0; # if $chkip, and $writeips, re-write NEW check file my $shownohrefs = 0; # show when NO HREF found in file my $show_all_not_found = 0; # avoid repeating an image file not found my $not_found_file = ''; # if file name given, write list to that file... my $out_remote_links = 0; # potential BIG LIST of remote LINKS my $in_folder = ""; my $ipfile = "iplinks.txt"; my @ipsfound = (); my $verbosity = 0; my @html_ext = qw( .htm .html .shtml .php ); my @graf_ext = qw( .jpg .jpeg .gif .png .bmp .ico .mpg ); my @css_ext = qw( .css ); my @script_ext = qw( .js .class .cgi ); my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived ); my @g_xclude_dir = (); my @excused = ( '?dir=test', '?dir=.' ); # *** DEBUG *** my $debug_on = 0; # OFF for release - only to load default # SET A DEFAULT INPUT FOLDER / FILE ##my $def_file = "C:\\HOMEPAGE\\FG\\index.html"; my $def_file = ""; my $def_in_folder = 'F:\Projects\html-tidy.org.api\tidy\tidylib_api_next'; # program variables # NOTE: Each of these is a multidimensional array - see offset below my @htm_files = (); # store files found in folder my @img_files = (); my @css_files = (); my @zip_files = (); my @txt_files = (); my @script_files = (); my @other_files = (); my @g_empty_folders = (); my @g_folders_noind = (); my %g_images_notfound = (); my %g_dir_files = (); # store files %hash->dir->%type->@files my $g_active_file = ''; my $g_active_lnn = ''; my $g_user_file = ''; my $single_file = 0; # offsets in above arrays my $of_ff = 0; # full file name my $of_hr = 1; # array ref of href links my $of_im = 2; # array ref of image links my $of_lk = 3; # linked count my $of_rh = 4; # ref hash from 'collecthrefs2' my $of_to = 5; # links TO, but this is really already in $of_hr, after excluding in-file, and external links my $of_fm = 6; # links FROM my $of2hr = 7; # more or less duplicate of $of_hr my $of2im = 8; # more or less duplicate of $of_im my $of_s1 = 9; # spare 1 my $of_s2 = 10; # spare 2 # 0 1 2 3 4 5 6 7 8 9 10 # push(@{$arr_ref}, [$ff, '', '', 0, 0, '', '', 0, 0, 0, 0] ); my @donesrcs = (); my @doneimgs = (); my %ext_hash = (); ##my $cnt = 0; ##my $file = ''; my @warnings = (); # list of errors, warnings during running my @httprefs = (); # set of HREF src values push(@httprefs, [$src, $fil, $lnnos] ); my @httpsrefs = (); my @ftprefs = (); my @mtrefs = (); my %g_hrefs = (); # full list of ALL http items found my %g_hosts = (); # list of http://domain.names my %g_domains = (); # just domain names my @scripts = (); my $tot_imgcnt = 0; my $homefile = ''; my $total_hrefs = 0; my $total_imgs = 0; my @missed = (); my $excusecnt = 0; my $hrflnkcnt = 0; # $showhreflinks my $homeoffset = -1; my @offsdone = (); my @htmlinks = (); my @missing_links = (); # store files with NO links # debug only bits my $dbg1 = 0; # prt( "[dbg1] Processed $inf folder finding $fcnt HTML files ...\n" ) if ($dbg1); my $dbg1b = 0; # prt( "[dbg1b] Processing [$inf] folder...\n" ) if ($dbg1b); my $dbg2 = 0; # show ALL HREF entries ... my $dbg3 = 0; # show IP found ... my $dbg4 = 0; # show entered/exit script my $dbg5 = 0; # show 'ok' when found my $dbg6 = 0; # show processing lines my $dbg7 = 0; # show anchor count my $dbg8 = 0; # show unique anchor href my $dbg9 = 0; # show files with SCRIPTS my $dbg10 = 0; # show diag for get_img_srcs() ... my $dbg11 = 0; # in image processing show entered/exits script my $dbg12 = 0; # in image processing show processing count my $dbg13 = 0; # in image processing show ok - found file my $dbg14 = 0; # in image processing show image count found my $dbg15 = 0; # in image processing show image count when NONE found my $dbg16 = 0; # show WARNINGS during run ... my $dbg17 = 0; # show MISSING or BLANK HREF in PHP file my $dbg18 = 0; # check_linkages: show 'ok', in 2nd link check my $dbg19 = 0; # check_local_links: show progress ... my $dbg20 = 0; # check_local_links: show ALL link COUNTS - NONE IS ALWAYS SHOWN ... my $dbg21 = 0; # check_local_links: show LINK when found ... my $dbg22 = 0; # mark_image_link: show comparing, and comparision ... my $dbg23 = 0; # mark_image_link: show count of new images marked ... my $dbg24 = 0; # show each image file being marked my $dbg25 = 0; # prt( "[dbg25] NO LINK FOUND HREF [$src]($msrc) in $totcnt file - $ff ($fnd) - ($lev)!\n" ) if ($dbg25); my $dbg26 = 0; # show EACH HTML FILE BEING PROCESSED my $dbg27 = 0; # show EACH extesnions, and counts my $dbg28 = 0; # show image links information ... my $dbg29 = 0; # show ZIP, TXT, CSS, SCRIPT and OTHER file links information ... my $dbg30 = 0; # show HTML HREF links information ... my $dbg31 = 0; # like $dbg20 - check_local_links: show ALL links - NONE IS ALWAYS SHOWN ... my $dbg32 = 0; # show missing during processing my $dbg33 = 0; # show HAS NO LINKS during processing my $dbg34 = 0; # show prt( "REL PATH [$src] to UNIX PATH [$nusrc] my $dbg35 = 0; # prt("[dbg35] Got anchor [$hrf]($len2) ...\n") if ($dbg35); my $dbg36 = 0; # prt("[dbg36] Got file [$fil], with ext = [$ext] = $type\n") if ($dbg36); my $dbg37 = 0; # prt("[dbg37] File [$fil] EXCLUDED!\n") if ($dbg37); my $dbg38 = 0; # prt( "Got [$hr2] = [$txt]\n" ); my $dbg39 = 0; # prt( "[dbg39] Got [$hr2] = [$txt] - no inverted commas! [$fil]\n" ) if ($dbg39); my $dbg40 = 0; # prt("[dbg40] mark_link fmfil=[$fmfil] fnd=[$fnd] src=[$src] lenv=[$lev]\n"); my $dbg41 = 0; # prt( "[dbg41] $i2: compare \n[$mfil] with \n[$msrc]\n" ) if ($dbg41); my $dbg42 = 0; # prt("[dbg42] Processing file [$file]...\n") if ($dbg42); my $dbg43 = 0; # prt("[dbg43] Got ".scalar @hr." hrefs, and ".scalar @is." image links.\n") if ($dbg43); my $dbg44 = 0; # prt("[dbg44] Sending [$src] to 'mark_dir_link'...\n") if ($dbg44); my $dbg45 = 0; # prt("[dbg45] LINK not found in HREF files, maybe IMAGES, zip, etc TO=[$src], FROM=[$fmfil]\n") if ($dbg45); my $dbg46 = 0; # prt("[dbg46] LINK found, but not in HREF files! TO=[$src], FROM=[$fmfil]\n") if ($dbg46); sub set_debug_val($) { my ($v) = shift; $dbg1 = $v; $dbg1b = $v; $dbg2 = $v; $dbg3 = $v; $dbg4 = $v; $dbg5 = $v; $dbg6 = $v; $dbg7 = $v; $dbg8 = $v; $dbg9 = $v; $dbg10 = $v; $dbg11 = $v; $dbg12 = $v; $dbg13 = $v; $dbg14 = $v; $dbg15 = $v; $dbg16 = $v; $dbg17 = $v; $dbg18 = $v; $dbg19 = $v; $dbg20 = $v; $dbg21 = $v; $dbg22 = $v; $dbg23 = $v; $dbg24 = $v; $dbg25 = $v; $dbg26 = $v; $dbg27 = $v; $dbg28 = $v; $dbg29 = $v; $dbg30 = $v; $dbg31 = $v; $dbg32 = $v; $dbg33 = $v; $dbg34 = $v; $dbg35 = $v; $dbg36 = $v; $dbg37 = $v; $dbg38 = $v; $dbg39 = $v; $dbg40 = $v; $dbg41 = $v; $dbg42 = $v; $dbg43 = $v; $dbg44 = $v; $dbg45 = $v; $dbg46 = $v; } sub set_debug_on() { set_debug_val(1); } sub set_debug_off() { set_debug_val(0); } # forward refs sub mark_link($$$$); sub trace_from_htm($$); sub process_folder($); sub is_same_file($$); sub mark_dir_link($$$$); sub is_htm_file_ext($); sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if !($msg =~ /\n$/); prt($msg); } #show_warnings($val); close_log($outfile,$load_log); exit($val); } sub show_warnings() { my $wcnt = scalar @warnings; if ($wcnt) { prt( "\nWARNINGS FOLLOW ($wcnt):\n" ); foreach my $w (@warnings) { prt( "$w\n" ); } } else { ##### prt( "No warnings ...\n" ); } } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } sub u2d { my ($f) = shift; $f =~ s/\//\\/g; return $f; } sub VERB1() { return ($verbosity >= 1); } sub VERB2() { return ($verbosity >= 2); } sub VERB5() { return (($verbosity >=5) ? 1 : 0); } sub VERB9() { return (($verbosity >=9) ? 1 : 0); } sub sub_common_folder_dos { my ($f1, $f2) = @_; my $df1 = u2d($f1); my $df2 = u2d($f2); if ($os =~ /Win/) { $df1 = lc($df1); $df2 = lc($df2); } # paddle across, stopping at first difference my $off = 0; while ( substr($df1,$off,1) && substr($df2,$off,1) && ( substr($df1,$off,1) eq substr($df2,$off,1) ) ) { $off++; } return substr($f1,$off); } sub sub_in_folder($) { my ($path) = shift; $path = sub_common_folder_dos($path,$in_folder); $path =~ s/^(\\|\/)//; # kick off any leading '\' or '/' - 2010-04-02 return $path; } # 2010.03.16 - seek an 'index.htm' type file sub is_def_index_file($) { my ($fil) = @_; foreach my $f (@def_indexes) { return 1 if ($f eq $fil); } return 0; } # 20100311 - only report MISSING LINKS on HTML files sub is_link_type_file($) { my ($f) = @_; my ($n,$d,$e) = fileparse( $f, qr/\.[^.]*/ ); return 1 if (is_htm_ext($e)); return 0; } sub show_startup() { prt( "Checking $in_folder ..." ); prt( " Using HOME file $homefile ..." ) if length($homefile); prt("\n"); if (VERB5()) { prt( "\nOptions:\n" ); prt( sprintf(" -checkips - HREF check %s, \n", ($chkip ? "On" : "Off")) ); prt( sprintf(" -showhreflinks - Show HREF %s, \n", ($showhreflinks ? "On" : "Off")) ); prt( sprintf(" -showlinks - Show links %s, \n", ($showlinks ? "On" : "Off")) ); prt( sprintf(" -showscripts - Show script files %s, \n", ($showscripts ? "On" : "Off")) ); prt( sprintf(" -writeips - Write HREF $ipfile %s, \n", ($writeips ? "On" : "Off")) ); prt( sprintf(" -refreships - Refresh HREF %s\n", ($refreships ? "On" : "Off")) ); prt( sprintf(" -shownohrefs - Show NO HREF found %s\n", ($shownohrefs ? "On" : "Off")) ); prt( sprintf(" -nofollow - Follow page links found %s\n", ($recurse ? "On" : "Off")) ); if (@excludes) { prt( "Have ".scalar @excludes." excluded files - " ); foreach my $ex (@excludes) { prt( "$ex " ); } prt("\n"); } } ###pgm_exit(1,"TEMP EXIT(1)"); } # if (!is_in_array_ref0($tmp, \@hr)) sub is_in_array_ref0($$) { my ($test,$ra) = @_; my $lct1 = lc($test); my $ac = scalar @{$ra}; for (my $i = 0; $i < $ac; $i++) { my $rt = ${$ra}[$i]; # extract array ref my $t = ${$rt}[0]; # extract HREF my $lct2 = lc($t); return 1 if ($lct1 eq $lct2); } return 0; } sub is_in_array_ref($$) { my ($test,$ra) = @_; my $lct1 = lc($test); my $ac = scalar @{$ra}; for (my $i = 0; $i < $ac; $i++) { my $t = ${$ra}[$i]; # extract item my $lct2 = lc($t); return 1 if ($lct1 eq $lct2); } return 0; } sub get_anchor_hash_ref($$$) { my ($fank,$fil,$dbg) = @_; my %hash = (); my ($ank,$len,$i,$ch,$pc,$hr2,$txt); if ($fank =~ /$/) { $ank = trim_all($1); $len = length($ank); $ch = ''; $hr2 = ''; for ($i = 0; $i < $len; $i++) { $pc = $ch; $ch = substr($ank,$i,1); if ($ch =~ /\w/) { $hr2 .= $ch; # accumulate \w chars - alphanumeric, including _ } elsif (length($hr2)) { if (($ch ne '=') && ($ch =~ /\s/)) { $i++; for (; $i < $len; $i++) { $ch = substr($ank,$i,1); last if ($ch eq '='); last if !($ch =~ /\s/); } } if ($ch eq '=') { # found our equal sign $i++; # move on... for (; $i < $len; $i++) { $ch = substr($ank,$i,1); last if ($ch =~ /('|")/); last if !($ch =~ /\s/); } if (($ch eq '"')||($ch eq "'")) { $pc = $ch; $i++; # move on... $txt = ''; for (; $i < $len; $i++) { $ch = substr($ank,$i,1); last if ($ch eq $pc); $txt .= $ch; } if ($ch eq $pc) { $hash{$hr2} = $txt; prt( "[dbg38] Got [$hr2] = [$txt] [$fil]\n" ) if ($dbg38); } else { prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO END INVERTED COMMA! ($pc) [$fil]\n"); pgm_exit(1,"") if ($dbg); } } else { if (($ch =~ /\w/) && (($hr2 =~ /name/i)||($hr2 =~ /id/i))) { # accept these WITHOUT inverted comma $txt = $ch; $i++; # MOVING ON for (; $i < $len; $i++) { $ch = substr($ank,$i,1); last if !($ch =~ /\w/); $txt .= $ch; } $hash{$hr2} = $txt; prt( "[dbg39] Got [$hr2] = [$txt] - no inverted commas! [$fil]\n" ) if ($dbg39); } else { prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO START INVERTED COMMA! [$fil]\n"); pgm_exit(1,"") if ($dbg); } } } else { prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO EQUAL SIGN! [$fil]\n"); pgm_exit(1,"") if ($dbg); } $hr2 = ''; } } } return \%hash; } # Collect HREF anchors from a TEXT stream # 25/07/2007 - Skip over comments # return a reference array sub collecthrefs2($$) { my ($txt,$fil) = @_; my $ntxt = ''; my $len = length($txt); my ($ch,$pc); my $hrf = ''; my ($i,$j,$len2,$hr2,$hfile); my @hr = (); my @nr = (); for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); if ($ch eq '<') { $hrf = $ch; # start a tag $i++; # go to the end of that tag for ( ; $i < $len; $i++) { $ch = substr($txt,$i,1); $hrf .= $ch; # 20100312 and 25/07/2007 watch OUT for COMMENTS - skip these if ($ch eq '-') { if ($hrf eq '' found if ($hrf =~ /-->$/) { last; } } } } } if ($ch eq '>') { last; } } if ($hrf =~ /^')); # stop only on NON SPACE, or '>' $hfile .= $ch; } #pgm_exit(1,"ERROR EXIT 2"); } } else { prt("PROBLEM: No equal sign [$hrf]! [$fil]\n"); pgm_exit(1,"ERROR EXIT 3"); } last; # end this scan of the anchor } $hr2 = ''; } } if (length($hfile)) { push(@hr, [$hfile, $hrf, 0, '', $ahr]) ; } else { push(@nr, [$hfile, $hrf, 0, '', $ahr]) ; } } } } prt( "[dbg35] Collected ". scalar @hr . " HREF... [$fil]\n" ) if ($dbg35); my %h = (); $h{'href'} = \@hr; $h{'name'} = \@nr; return \%h; } sub secs_2_hhmmss($) { my ($secs) = shift; my $rt = ''; my $mins = int($secs / 60); $secs = $secs - ($mins * 60); $secs = (int(($secs * 10) + 0.5)) / 10; if ($mins > 60) { my $hrs = int($mins / 60); $mins = $mins - ($hrs * 60); $mins = '0'.$mins if ($mins < 10); $secs = '0'.$secs if ($secs < 10); $rt = "$hrs:$mins:$secs"; } else { $mins = '0'.$mins if ($mins < 10); $secs = '0'.$secs if ($secs < 10); $rt = "$mins:$secs"; } return $rt; } sub show_time($$$$) { my ($totcnt, $lncnt, $bgntime, $msg) = @_; my ($currtime, $difftime, $persec, $remains, $remsecs, $tenths, $remtm, $elapsed); $currtime = time(); $difftime = $currtime - $bgntime; $persec = $lncnt / $difftime; $remains = $totcnt - $lncnt; $remsecs = $remains / $persec; $tenths = (int(($persec * 100) + 0.05)) / 100; $remtm = secs_2_hhmmss($remsecs); $elapsed = secs_2_hhmmss($difftime); prt( "$elapsed Done $lncnt, at $tenths/sec, left $remains in $remtm - $msg\n" ); } sub test_remove_script { my ($txt) = shift; my $ntxt = ''; my $len = length($txt); my ($i,$ch,$pc,$bal,$ppc); $pc = ''; $ch = ''; for ($i = 0; $i < $len; $i++) { $ppc = $pc; $pc = $ch; $ch = substr($txt,$i,1); if (($ch eq '<')&&(substr($txt,$i) =~ /^ my $ntxt = test_remove_script( $rawtxt ); # remove SCRIPTS ... ... ... my @is = ret_imgs_array($ntxt); $ntxt = trimblanklines($ntxt); @hrefs = (); # clear my @hr = ret_hrefs_array( $ntxt ); my $rhash = collecthrefs2( $ntxt, $file ); ### collecthrefs( $txt, 0 ); ### collectimgs( $txt, 0 ); # bump the counts of HREF and IMGS collected $itot = scalar @is; $htot = scalar @hr; # store the references ... that is a reference to an array $htm_files[$i][$of_hr] = \@hr; # like $of_to, but is ALL HREF links $htm_files[$i][$of_im] = \@is; $htm_files[$i][$of_rh] = $rhash; # store the reference hash to 2nd ref arrays 'href' & 'name' $refa = ${$rhash}{'href'}; $cnth = scalar @{$refa}; $refa = ${$rhash}{'name'}; $cntn = scalar @{$refa}; $pmsg .= " a=$htot, i=$itot, h=$cnth, n=$cntn"; # new code to do some similar things, but while in the array # BUT IS REALLY JUST A DUPLICATE OF THE ABOVE # one day should choose ONE or the OTHER ONLY ;=)) it just wastes time ###@lines = drop_php_from_array( @lines ) if (lc($ext) eq '.php'); #write2file((join("\n",@lines)."\n"),'tempbefore.htm'); @lines = dropcomments_from_array(@lines); # 20100313 *MUST* drop scripts from line array, before partying #write2file((join("\n",@lines)."\n"),'tempbefore2.htm'); @lines = dropscripts_from_array(@lines); #write2file((join("\n",@lines)."\n"),'tempafter.htm'); #pgm_exit(1,"Check tempafter.htm"); my @isrcs = get_img_srcs($file, @lines); $tot_imgcnt += check_images( $file, @isrcs ); my @hsrcs = get_href_srcs($file, @lines); check_hrefs( $file, \@hsrcs ); $htm_files[$i][$of2hr] = \@hsrcs; # really just duplicate... $htm_files[$i][$of2im] = \@isrcs; # should be removed!!! prt("[dbg43] Got $htot hrefs (h=$cnth,n=$cntn)(".scalar @hsrcs."), and $itot image links (".scalar @isrcs.").\n") if ($dbg43); $pmsg .= " h2=".scalar @hsrcs.", i2=".scalar @isrcs; prt("$pmsg\n") if (VERB5()); #if ($file =~ /$test_file/) { # pgm_exit(1,"CHECK THIS FILE [$ntxt]\n"); #} } $total_hrefs += $htot; $total_imgs += $itot; if ((($pcnt % 100) == 0)||($max < 10)) { ###local $| = 1; ###prt( "\rDone $pcnt HTML files ..." ); if ($max < 10) { prt( "Done $file HTML file ... href,other($htot,$itot)\n" ); } else { $msg = "href,other ($total_hrefs,$total_imgs)"; show_time( $max, $pcnt, $bgntime, $msg ); #prt( "Done $pcnt HTML files ... href,other ($total_hrefs,$total_imgs)\n" ); } } } prt( "Completed $pcnt HTML files ... Found $total_hrefs HREF, and $total_imgs IMG/OTHER tokens.\n" ); } sub get_href_type($) { my ($src) = shift; if ($src =~ /^http:/i) { #push(@httprefs, [$src, $fil, $lnnos] ); return 1; # remote HREF } elsif ($src =~ /^https:/i) { return 1; # remote HREF #push(@httpsrefs, [$src, $fil, $lnnos] ); } elsif ($src =~ /^ftp:/i) { #push(@ftprefs, [$src, $fil, $lnnos] ); return 3; # remote HREF } elsif ($src =~ /^mailto:/i) { #push(@mtrefs, [$src, $fil, $lnnos] ); return 4; # remote HREF } elsif ( $src =~ /^javascript:/i ) { return 5; # a JAVASCRIPT HREF } elsif ($src =~ /^file:/i) { return 5; # remote HREF } elsif ( substr($src,0,1) eq '#') { # local in page HREF return 6; } else { my $ind = index($src,'#'); $src = substr($src,0,$ind) if ( $ind != -1 ); $ind = index($src,'?'); $src = substr($src,0,$ind) if ( $ind != -1 ); $src =~ s/\/$//; return 7 if (length($src)); } return 0; } sub get_local_href($) { my ($src) = shift; my $ind = index($src,'#'); $src = substr($src,0,$ind) if ( $ind != -1 ); $ind = index($src,'?'); $src = substr($src,0,$ind) if ( $ind != -1 ); $src =~ s/\/$//; # remove any TRAILING '/' char # 25/07/2007 - also 'convert' '%20' to space $src =~ s/%20/ /g; return $src; } sub dos_2_unix($) { my ($du) = shift; $du =~ s/\\/\//g; return $du; } ### my @donesrcs = (); sub in_done_srcs($) { my ($f) = shift; foreach my $fd (@donesrcs) { if ($fd eq $f) { return 1; } } return 0; } sub in_done_imgs($) { my ($f) = shift; foreach my $fd (@doneimgs) { if ($fd eq $f) { return 1; } } return 0; } sub fix_rel_unix_path($) { my ($path) = shift; $path = dos_2_unix($path); # pgm_exit(1,"ERROR: Passed PATH that starts relative! [$path]\n") if (($path =~ /^\.\./)||($path =~ /^\.(\\|\/)\.\./)); my @a = split(/\//, $path); my $npath = ''; my $max = scalar @a; my @na = (); for (my $i = 0; $i < $max; $i++) { my $p = $a[$i]; if ($p eq '.') { # ignore this } elsif ($p eq '..') { if (@na) { pop @na; # discard previous } else { prt( "WARNING: Got relative .. without previous!!! path=[$path]\n" ); } } else { push(@na,$p); } } foreach my $pt (@na) { $npath .= "/" if length($npath); $npath .= $pt; } return $npath; } sub add_new_link($$) { my ($nlnk, $lnks) = @_; my @arr = split(',', $lnks); foreach my $lk (@arr) { if ($lk eq $nlnk) { return 0; } } return 1; } sub mark_image_link($$$$) { my ($fmfil, $fnd, $src, $lev) = @_; my $fcnt = scalar @img_files; my $msrc = lc(dos_2_unix($src)); my $lnks = ''; my ($val); prt( "[dbg22] Seeking [$msrc] in $fcnt images files ...\n" ) if ($dbg22); my ($i,$fil,$mfil); for ($i = 0; $i < $fcnt; $i++) { $fil = $img_files[$i][$of_ff]; if (is_same_file($src,$fil)) { $val = $img_files[$i][$of_lk]; $val++; # add image file linked to from what file $lnks = $img_files[$i][$of_fm]; if (length($lnks) == 0) { $lnks = $fmfil; } elsif (add_new_link($fmfil, $lnks)) { $lnks .= ','; $lnks .= $fmfil; } else { $val--; # reduce due to being a repeated link } $img_files[$i][$of_lk] = $val; $img_files[$i][$of_fm] = $lnks; prt( "[dbg28] IMG src in $fmfil, of $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg28); return 0; } } for ($i = 0; $i < $fcnt; $i++) { $fil = $img_files[$i][$of_ff]; $mfil = lc(dos_2_unix($fil)); prt( "[dbg22] Comparing to $mfil ...\n" ) if ($dbg22); if ($msrc eq $mfil) { $val = $img_files[$i][$of_lk]; $val++; $img_files[$i][$of_lk] = $val; # add image file linked to from what file $lnks = $img_files[$i][$of_fm]; if (length($lnks) == 0) { $lnks = $fmfil; } elsif (add_new_link($fmfil, $lnks)) { $lnks .= ','; $lnks .= $fmfil; } $img_files[$i][$of_fm] = $lnks; prt( "[dbg28] IMG src in $fmfil, of $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg28); return 0; } } prt( "[dbg21] $src - NOT FOUND![1]\n" ) if ($dbg21); return 1; } sub mark_other_links($$$$) { my ($fmfil, $fnd, $src, $lev) = @_; my $totcnt = 0; my $msrc = lc(dos_2_unix($src)); my $fcnt = scalar @img_files; my $i = 0; my $lnks = ''; my ($val); $totcnt += $fcnt; if (mark_image_link( $fmfil, $fnd, $src, $lev ) == 0) { return 0; } # maybe ZIP files $fcnt = scalar @zip_files; $totcnt += $fcnt; for ($i = 0; $i < $fcnt; $i++) { my $fil = $zip_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $zip_files[$i][$of_lk]; $val++; # add zip file linked to from what file $lnks = $zip_files[$i][$of_fm]; if (length($lnks) == 0) { $lnks = $fmfil; } elsif (add_new_link($fmfil, $lnks)) { $lnks .= ','; $lnks .= $fmfil; } else { $val--; } $zip_files[$i][$of_lk] = $val; $zip_files[$i][$of_fm] = $lnks; prt( "ZIP link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29); return 0; } } # maybe TXT files $fcnt = scalar @txt_files; $totcnt += $fcnt; for ($i = 0; $i < $fcnt; $i++) { my $fil = $txt_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $txt_files[$i][$of_lk]; $val++; # add txt file linked to from what file $lnks = $txt_files[$i][$of_fm]; if (length($lnks) == 0) { $lnks = $fmfil; } elsif (add_new_link($fmfil, $lnks)) { $lnks .= ','; $lnks .= $fmfil; } else { $val--; } $txt_files[$i][$of_lk] = $val; $txt_files[$i][$of_fm] = $lnks; prt( "TXT link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29); return 0; } } # maybe CSS files $fcnt = scalar @css_files; $totcnt += $fcnt; for ($i = 0; $i < $fcnt; $i++) { my $fil = $css_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $css_files[$i][$of_lk]; $val++; # add txt file linked to from what file $lnks = $css_files[$i][$of_fm]; if (length($lnks) == 0) { $lnks = $fmfil; } elsif (add_new_link($fmfil, $lnks)) { $lnks .= ','; $lnks .= $fmfil; } else { $val--; } $css_files[$i][$of_lk] = $val; $css_files[$i][$of_fm] = $lnks; prt( "CSS link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29); return 0; } } # maybe SCRIPT files $fcnt = scalar @script_files; $totcnt += $fcnt; for ($i = 0; $i < $fcnt; $i++) { my $fil = $script_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $script_files[$i][$of_lk]; $val++; # add script file linked to from what file $lnks = $script_files[$i][$of_fm]; if (length($lnks) == 0) { $lnks = $fmfil; } elsif (add_new_link($fmfil, $lnks)) { $lnks .= ','; $lnks .= $fmfil; } else { $val--; } $script_files[$i][$of_lk] = $val; $script_files[$i][$of_fm] = $lnks; prt( "SCRIPT link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29); return 0; } } # OK, OTHER $fcnt = scalar @other_files; $totcnt += $fcnt; for ($i = 0; $i < $fcnt; $i++) { my $fil = $other_files[$i][$of_ff]; my $mfil = lc(dos_2_unix($fil)); if ($msrc eq $mfil) { $val = $other_files[$i][$of_lk]; $val++; # add script file linked to from what file $lnks = $other_files[$i][$of_fm]; if (length($lnks) == 0) { $lnks = $fmfil; } elsif (add_new_link($fmfil, $lnks)) { $lnks .= ','; $lnks .= $fmfil; } else { $val--; } $other_files[$i][$of_lk] = $val; $other_files[$i][$of_fm] = $lnks; prt( "OTHER link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29); return 0; } } $totcnt += 1 if ($totcnt == 0); return $totcnt; } ####################################################################### # mark a link # parameters: # LINK IS IN FILE - $fmfil = FROM # Offset in $htm_files[$fnd] # Link item source - $src = TO # Level # 2010/04/08 - If given a DIRECTORY, try to find an 'index' to link to... sub mark_link($$$$) { my ($fmfil, $fnd, $src, $lev) = @_; my $fcnt = scalar @htm_files; my $msrc = lc(dos_2_unix($src)); my $ff = $htm_files[$fnd][$of_ff]; my $sfil = sub_in_folder($fmfil); my $ssrc = sub_in_folder($src); #prt("[dbg40]mark_link: for src=[$src] in fil=[$fmfil] off=[$fnd] lev=$lev ff=[$ff]\n") if ($dbg40); #prt("[dbg40]mark_link:$lev: for src=[$ssrc] from=[$sfil]($fnd) ff=[$ff]\n") if ($dbg40); prt("[dbg40]mark_link:$lev: TO=[$ssrc] from=[$sfil]($fnd)\n") if ($dbg40); my $i = 0; my $totcnt = $fcnt; my $fil = ''; my $mfil = ''; my $val = 0; my $lnks = ''; my ($hrf,$j,$hrt,$tmp,$i2); if (-d $src) { prt("[dbg44] Sending [$src] to 'mark_dir_link'...\n") if ($dbg44); return mark_dir_link($fmfil, $fnd, $src, $lev); } for ($i = 0; $i < $fcnt; $i++) { $i2 = $i + 1; if ($i != $fnd) { $fil = $htm_files[$i][$of_ff]; $mfil = lc(dos_2_unix($fil)); prt( "[dbg41] $i2: compare \n[$mfil] with \n[$msrc]\n" ) if ($dbg41); # VERY NOISY if (($msrc eq $mfil) || is_same_file($src,$fil)) { # FOUND the TO file in LIST - MARK IT $val = $htm_files[$i][$of_lk]; $val++; # add HTML file linked to from what file $lnks = $htm_files[$i][$of_fm]; if (length($lnks) == 0) { $lnks = $fmfil; } elsif (add_new_link($fmfil, $lnks)) { $lnks .= ','; $lnks .= $fmfil; } else { $val--; # already COUNTED } $htm_files[$i][$of_lk] = $val; $htm_files[$i][$of_fm] = $lnks; # add to LINKS prt( "[dbg30] SET link to $fil($i) $val ok [lnks=$lnks] in $fmfil\n" ) if ($dbg30); ###prt( "$ff ($fnd) linked to $fil ($i) $val\n" ) if ($dbg21); my $hr = $htm_files[$i][$of_hr]; # extract HREF ref.array - all LINKS to another! my $im = $htm_files[$i][$of_im]; # extract IMAGE ref.array # my $hr2 = $htm_files[$i][$of_rh]; # extract the NEW ANCHOR collection # $htm_files[$i][$of_rh] = $rhash; # store the reference hash to 2 ref arrays 'href' & 'name' my $rhash = $htm_files[$i][$of_rh]; my $hr2 = ${$rhash}{'href'}; # extract the NEW ANCHOR collection my $hrc = scalar @{$hr}; my $imc = scalar @{$im}; # get count of images my $hr2c = scalar @{$hr2}; if ($hrc != $hr2c) { # ====== this is mainly diagnostic ====== ##prt("WARNING: Counts old [$hrc], new [$hr2c] in [$fil]\n"); my %th1 = (); for ($j = 0; $j < $hrc; $j++) { $hrf = ${$hr}[$j]; $th1{$hrf} = 1; } $tmp = scalar keys(%th1); if ($tmp == $hr2c) { ##prt("WARNING: Solved when dupes removed! New counts old [$tmp] = new [$hr2c]\n"); } else { prt("PROBLEM: Counts old [$hrc], new [$hr2c] in [$fil]\n"); prt("List of $hrc OLD\n"); for ($j = 0; $j < $hrc; $j++) { $hrf = ${$hr}[$j]; $hrt = get_href_type($hrf); $val = is_in_array_ref0($hrf,$hr2); $tmp = ($val ? "Yes" : "No"); prt("[$hrf] ($hrt) $tmp\n"); } prt("List of $hr2c NEW\n"); for ($j = 0; $j < $hr2c; $j++) { $hrf = ${$hr2}[$j][0]; $hrt = get_href_type($hrf); $val = is_in_array_ref($hrf,$hr); $tmp = ($val ? "Yes" : "No"); prt("[$hrf] ($hrt) $tmp anchor=<".${$hr2}[$j][1].">\n"); } pgm_exit(1,"What is different/missing...\n"); } # ====== above is mainly diagnostic ====== } # found the file LINKED TO... my ($itmnam, $itmdir) = fileparse($fil); # get name and path $itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/); prt( "$lev [$fil] has $hrc href, $imc img/other links ..\n" ) if ($dbg19 || $dbg26); # now process links to links... for ($j = 0; $j < $hrc; $j++) { $hrf = ${$hr}[$j]; $hrt = get_href_type($hrf); if ($hrt == 7) { my $nsrc = fix_rel_unix_path($itmdir.get_local_href($hrf)); if ( !in_done_srcs($nsrc) ) { push(@donesrcs, $nsrc); # put it in DONE list mark_link( $fil, $i, $nsrc, $lev + 1 ); # and MARK its links now } } } $val = 0; prt( "$fil - Checking $imc images files ...\n") if ($dbg24); for ($j = 0; $j < $imc; $j++) { # do each, in this linked file my $img = ${$im}[$j]; # get the image string my $isrc = $itmdir.$img; # join it with the path my $nisrc = fix_rel_unix_path($isrc); # fix rel, and force unix path prt( "Marking [$nisrc] - ".($j+1)." of $imc img/other links ..\n" ) if ($dbg19 || $dbg26); if ( !in_done_imgs($nisrc) ) { push(@doneimgs, $nisrc); # put it in DONE list mark_other_links( $fil, $j, $nisrc, 0 ); # and MARK the link in @img_files $val++; } else { prt( "Already IN doneimgs ...\n" ) if ($dbg19 || $dbg26); } } prt( "$fil - Marked $val of $imc images files ...\n") if ($val && $dbg24); return 0; } } # do NOT find self } # hmmmm, LINK not found in HREF files, maybe IMAGES, zip, etc ... prt("[dbg45] LINK not found in HREF files, maybe IMAGES, zip, etc TO=[$src], FROM=[$fmfil]\n") if ($dbg45); if (is_htm_file_ext($src)) { # 19/05/2011 - BUt if the file does EXIST, pobably just means it is outside the start point if (-f $src) { prt("[dbg46] LINK found, but not in HREF files! TO=[$src], FROM=[$fmfil]\n") if ($dbg46); } else { prtw("WARNING: LINK not found in HREF files! TO=[$src], FROM=[$fmfil]\n"); } } $val = mark_other_links( $fmfil, $fnd, $src, $lev ); if ($val) { $totcnt += $val; prt( "[dbg25] NO LINK FOUND to [$src]($msrc) in $totcnt file - $ff ($fnd) - ($lev)!\n" ) if ($dbg25); return 1; } return 0; # SUCCESS - FOUND THE LINK } # ============================================================ # sub seek_dir_index_file($$$$) { # Seek the first 'index' type file for a DIRECTORY # IN - ($directory, $resultref); # OUT - 0 = SUCCESS, and result reference contains file # 1 = FAILED no 'index' type found # 2 = FAILED directory NOT in %g_dir_file # ============================================================ sub seek_dir_index_file($$) { my ($nusrc, $rind) = @_; my $hr = \%g_dir_files; my ($dhr,$v2,$i,$ii,$cnt,$v3,$icnt,$ff,$k2); if (defined ${$hr}{$nusrc}) { $dhr = ${$hr}{$nusrc}; # get HASH reference for this DIRECTORY (in unix form) foreach $k2 (sort keys %{$dhr}) { # for each type HTML, CSS, etc... $v2 = ${$dhr}{$k2}; # extract array reference for this TYPE $cnt = scalar @{$v2}; # count to process # prt(" Type $k2 = $cnt\n"); for ($i = 0; $i < $cnt; $i++) { $v3 = ${$v2}[$i][0]; # get the file name $ii = ${$v2}[$i][1]; # and if it is 'index' type # prt(" File: [$v3] "); if ($ii) { # FOUND an 'index; type # the $v3 file name is an INDEX type file $ff = $nusrc.'/'.$v3; # build FULL FILE name $icnt++; #prt(" ('index' type)"); ${$rind} = $ff; return 0; # SUCCESS } # prt("\n"); } } return 1; } else { pgm_exit(1,"ERROR: Why is [$nusrc] NOT in directory/file reference?\n"); } return 2; # FAILED!!! } # 2010/04/08 - If given a DIRECTORY, try to find an 'index' to link to... # $fil = $htm_files[$fnd][$of_ff]; # extract FULL PATH name of file CONTAINING the HREF... # ($itmnam, $itmdir) = fileparse($fil); # get name and path # $itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/); # my $lref = get_local_href($hrf); # $src = $itmdir.$lref; # my $nusrc = fix_rel_unix_path($src); # $nusrc already tested as a DIRECTORY - did I keep DIRECTORY LISTS/ #sub show_dir_files($) { # my ($hr) = @_; # = \%g_dir_files # my ($key,$val,$k2,$v2,$v3,$dir,$cnt,$i,$ii,$icnt); # foreach $key (sort keys %{$hr}) { # $dir = sub_in_folder($key); # $dir = '' if (length($dir)==0); # $val = ${$hr}{$key}; # extract hash ref BY TYPE # prt("Directory: [$dir]\n"); # $icnt = 0; # foreach $k2 (sort keys %{$val}) { # # for each type # $v2 = ${$val}{$k2}; # $cnt = scalar @{$v2}; # prt(" Type $k2 = $cnt\n"); # for ($i = 0; $i < $cnt; $i++) { # $v3 = ${$v2}[$i][0]; # $ii = ${$v2}[$i][1]; # prt(" File: [$v3] "); # if ($ii) { # $icnt++; # prt(" ('index' type)"); # } # prt("\n"); # } # } # if (!$icnt) { # prt("Directory: [$dir] has NO 'index' type file.\n"); # } elsif ($icnt > 1) { # prt("Directory: [$dir] has $icnt 'index' type files.\n"); # } # } # pgm_exit(1,"Debug run show of directories and files...\n"); #} sub mark_dir_link($$$$) { my ($fil, $fnd, $nusrc, $lev) = @_; my $fcnt = scalar @htm_files; my $hr = \%g_dir_files; my ($dhr,$v2,$i,$ii,$cnt,$v3,$icnt,$ff,$k2); if (defined ${$hr}{$nusrc}) { $dhr = ${$hr}{$nusrc}; # get HASH reference for this DIRECTORY (in unix form) foreach $k2 (sort keys %{$dhr}) { # for each type HTML, CSS, etc... $v2 = ${$dhr}{$k2}; # extract array reference for this TYPE $cnt = scalar @{$v2}; # count to process # prt(" Type $k2 = $cnt\n"); for ($i = 0; $i < $cnt; $i++) { $v3 = ${$v2}[$i][0]; # get the file name $ii = ${$v2}[$i][1]; # and if it is 'index' type # prt(" File: [$v3] "); if ($ii) { # FOUND an 'index; type # the $v3 file name is an INDEX type file $ff = $nusrc.'/'.$v3; # build FULL FILE name $icnt++; #prt(" ('index' type)"); return mark_link($fil,$fnd,$ff,0); # and mark this file } # prt("\n"); } } } else { pgm_exit(1,"ERROR: Why is [$nusrc] NOT in directory/file reference?\n"); } return 1; # FAILED!!! } sub get_links_stg($) { my ($lnks) = @_; my @arr = split(',',$lnks); my $nlnks = ''; foreach my $l (@arr) { my $sl = sub_in_folder($l); $nlnks .= ',' if (length($nlnks)); $nlnks .= $sl; } return $nlnks; } ########################################################################### # show link count, and links, in passed multidimensional file array # # If showlinks (or $dbg20 or $dbg31) is ON, shows internal LINKS # NOTE: Presently DOES NOT get all LINKS??? BAH!!! ########################################################################### sub show_link_counts($$) { my ($m, $hf) = @_; my $fcnt = scalar @{$hf}; my $mcnt = 0; my $mss = "Checking LINKS for $fcnt $m files ...\n"; my ($i,$fil,$hrt,$sfil,$lnks,$min,$len,$msg,$i2); if ($fcnt) { $min = 0; for ($i = 0; $i < $fcnt; $i++) { $fil = ${$hf}[$i][$of_ff]; $sfil = sub_in_folder($fil); $len = length($sfil); $min = $len if ($len > $min); } for ($i = 0; $i < $fcnt; $i++) { $i2 = $i + 1; $fil = ${$hf}[$i][$of_ff]; $hrt = ${$hf}[$i][$of_lk]; $lnks = ${$hf}[$i][$of_fm]; $sfil = sub_in_folder($fil); if ($hrt) { if ($dbg20 || $dbg31 || $showlinks) { prt( $mss ) if (length($mss)); $mss = ''; $sfil .= ' ' while (length($sfil) < $min); $msg = sprintf("%3d:",$i2); if ($dbg31 || $showlinks) { prt( "$msg $sfil has $hrt link(s) [".get_links_stg($lnks)."]\n" ); } else { prt( "$msg $sfil has $hrt link(s)\n" ); } } } else { # expect NONE for most types - only check HTML if ( is_link_type_file($fil) ) { prt( $mss ) if (length($mss)); $mss = ''; prtw("WARNING: $sfil($i) HAS NO LINKS!") if ($dbg33 || $showlinks); push(@missing_links,$fil); } $mcnt++; # count a MISSED item if ($shownolinks || $showlinks) { prt( $mss ) if (length($mss)); $mss = ''; prt( "$i: [$sfil] has NO links\n" ); } } } if ($mcnt) { prt( $mss ) if (length($mss)); $mss = ''; prt( "Done LINKS for $fcnt $m files ... MISSED $mcnt!!!\n" ); } } else { prt( "There are NO $m files ...\n" ) if ($dbg20 || $showlinks); } } sub in_excused($) { my ($tx) = shift; foreach my $t (@excused) { if ($t eq $tx) { return 1; } } return 0; } sub is_same_file($$) { my ($fil1,$fil2) = @_; #prt("Comparing -\n[$fil1] to\n[$fil2]\n"); if ($fil1 =~ /^\.(\\|\/)/) { $fil1 = $currworkdir.substr($fil1,1); } if ($fil2 =~ /^\.(\\|\/)/) { $fil2 = $currworkdir.substr($fil2,1); } #prt("Comparing -\n[$fil1] to\n[$fil2]\n"); my $len = length($fil1); return 0 if (length($fil2) != $len); # NOT same length return 1 if ($fil1 eq $fil2); my $ufil1 = dos_2_unix($fil1); my $ufil2 = dos_2_unix($fil2); #prt("Comparing UNIX -\n[$ufil1] to\n[$ufil2]\n"); return 1 if ($ufil1 eq $ufil2); my ($i,$ch1,$ch2); for ($i = 0; $i < $len; $i++) { $ch1 = substr($fil1,$i,1); $ch2 = substr($fil2,$i,1); if ($ch1 ne $ch2) { #prt("Not same on char [$ch1] vs [$ch2] ($i)\n"); return 0; } } return 1 if ($i == $len); return 0; } # ========================================================== # using the given (home) PAGE, try to TRACE ALL LINKS # not really meaingful unless it is a main 'index' type file # =========================================================== sub check_local_links($) { my ($hf) = shift; my $fcnt = scalar @htm_files; my ($hfnm,$hfdir) = fileparse($hf); my $lchf = lc($hfnm); my $fnd = get_home_offset($hf); my ($fil,$nm,$dir,$ext); if ($hfdir eq ".\\") { $hfdir = $in_folder."\\"; } my $itmdir = ''; my $itmnam = ''; my $i = 0; my $i2 = 0; prt( "Checking local links, for $fcnt files, from $hf ...\n") if (VERB9()); if ($fnd == -1) { prt( "WARNING: check_local_links: Unable to find [$hf] ...\n" ); return 1; } # process item 1 ... my $pcnt = 1; my $hr = $htm_files[$fnd][$of_hr]; my $im = $htm_files[$fnd][$of_im]; my $hrc = scalar @{$hr}; my $imc = scalar @{$im}; my $hrf = ''; my $img = ''; my $hrt = 0; my $src = ''; my $nsrc = ''; my $ff = ''; my $shwerr = 0; my $emsg = ''; my ($msg,$sfil); $fil = $htm_files[$fnd][$of_ff]; # extract FULL PATH name of file CONTAINING the HREF... $htm_files[$fnd][$of_lk] = 1; # mark as DONE ($itmnam, $itmdir) = fileparse($fil); # get name and path $itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/); $sfil = sub_in_folder($fil); prt( "HOME [$fil] has $hrc href, $imc img/other links ..\n" ) if ($dbg19 || $dbg26); for ($i = 0; $i < $hrc; $i++) { $hrf = ${$hr}[$i]; $hrt = get_href_type($hrf); $i2 = $i + 1; $shwerr = 0; $emsg = "IN[$sfil] $i2 HREF [$hrf]$hrt"; if ($hrt == 0) { if (in_excused($hrf)) { $excusecnt++; } else { $emsg .= " CHECK UNKNOWN ****** CHECK ME ******[1]"; push(@warnings, "WARNING: $emsg!" ); $shwerr = 1; } } elsif ($hrt < 5) { $emsg .= " REMOTE"; } elsif ($hrt == 5) { $emsg .= " JAVASCRIPT"; } elsif ($hrt == 6) { $emsg .= " IN PAGE"; } else { my $lref = get_local_href($hrf); $src = $itmdir.$lref; my $nusrc = fix_rel_unix_path($src); prt( "[dbg34] REL PATH [$src] to UNIX PATH [$nusrc] hrt=$hrt file=[$fil]\n" ) if ($dbg34); push(@donesrcs, $nusrc); # put it in DONE list # mark link - FROM $fil if (is_same_file($fil,$nusrc)) { $emsg .= " LINK to SELF ok"; } elsif (-d $nusrc) { if ( mark_dir_link( $fil, $fnd, $nusrc, 0 ) ) { pgm_exit(1,"CODE ERROR: To be done mark links for DIRECTORY! [$nusrc]\n"); } } else { if ( mark_link( $fil, $fnd, $nusrc, 0 ) ) { $emsg .= " SITE REF [$nusrc] ***NO IN-SITE LINK***???"; $msg = "$i2 [$fil] HREF [$hrf]$hrt SITE REF [$nusrc] ***NO IN-SITE LINK***???"; if (-f $src) { $msg .= "\n*** BUT FILE EXISTS [$src] ***"; $emsg .= "\n*** BUT FILE EXISTS [$src] ***"; # 19/05/2011 - do NOT post this warning # push(@warnings, "WARNING: Local HREF [$lref] in [$fil] OUTSIDE WEB! but EXISTS!" ); } else { push(@missed, $msg ); $shwerr = 1; } } else { $emsg .= " SITE REF [$src] ok" if ($dbg19); } } } prt( "$emsg\n" ) if ($dbg19 || $shwerr || VERB5()); } prt( "HOME - Marking $imc images files ...\n") if ($dbg24); for ($i = 0; $i < $imc; $i++) { $img = ${$im}[$i]; $src = $itmdir.$img; $nsrc = fix_rel_unix_path($src); prt( "HOME $fil - Mark $src ($nsrc) image ...\n" ) if ($dbg24); push(@doneimgs, $nsrc); # put it in DONE list mark_other_links( $fil, $i, $nsrc, 0 ); } return 0; } sub offset_done($$) { my ($off, @done) = @_; foreach my $num (@done) { if ($off == $num) { return 1; } } return 0; } sub trace_from_htm($$) { my ($hf, $lev) = @_; my $fnd = get_offset_of_htm($hf); my $msg = ''; if (($fnd != -1) && !offset_done($fnd,@offsdone)) { push(@offsdone,$fnd); my $hr = $htm_files[$fnd][$of_hr]; my $hrc = scalar @{$hr}; my @offsets = (); my($itmnam, $itmdir) = fileparse($hf); # get name and path $itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/); for (my $i = 0; $i < $hrc; $i++) { my $hrf = ${$hr}[$i]; my $hrt = get_href_type($hrf); if ($hrt == 7) { #my $src = fix_rel_unix_path($itmdir.get_local_href($hrf)); my $src = $itmdir.get_local_href($hrf); push(@offsets,$src); trace_from_htm($src, ($lev + 1)); } } $hrc = scalar @offsets; my $cnt = $lev; $msg = sprintf("%4d ", $lev); prt( $msg ); while($cnt) { prt( ' ' ); $msg .= ' '; $cnt--; } prt( "$hf links to $hrc files ...\n" ); $msg .= "$hf links to $hrc files ..."; push(@htmlinks, [$lev, $msg]); foreach my $fil (@offsets) { $cnt = $lev; $msg = sprintf("%4d ", $lev); prt( $msg ); while($cnt) { prt( ' ' ); $msg .= ' '; $cnt--; } prt( "$fil\n" ); $msg .= $fil; push(@htmlinks, [$lev, $msg]); } } } sub get_offset_of_htm($) { my ($hf) = shift; my $fcnt = scalar @htm_files; my ($hfnm,$hfdir) = fileparse($hf); if ($hfdir eq ".\\") { $hfdir = $in_folder."\\"; } my $lchf = lc($hfnm); my $fnd = -1; for (my $i = 0; $i < $fcnt; $i++) { my $fil = $htm_files[$i][$of_ff]; my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); if (lc($nm.$ext) eq $lchf) { # have at least the NAME, but maybe not the FOLDER if (lc($hfdir) eq lc($dir)) { $fnd = $i; last; } } } return $fnd; } sub get_home_offset($) { if ($homeoffset != -1) { return $homeoffset; } my ($hf) = shift; prt( "Getting offset of HOME file [$hf]... "); my $fnd = get_offset_of_htm($hf); if ($fnd == -1) { prtw("WARNING: Unable to find [$hf] ...\n" ); } else { prt( "index $fnd\n" ); } $homeoffset = $fnd; # set HOME file offset in list... return $homeoffset; } ############################################################################# # check linkages ############################################################################# sub check_linkages($) { my ($hf) = shift; my $fcnt = scalar @htm_files; my ($hfnm,$hfdir) = fileparse($hf); my $lchf = lc($hfnm); my $fnd = get_home_offset($hf); my ($fil,$nm,$dir,$ext); if ($hfdir eq ".\\") { $hfdir = $in_folder."\\"; } my $itmdir = ''; my $itmnam = ''; my $i = 0; my $i2 = 0; my ($tmp); prt( "Re-checking HREF and IMG/OTHER links, for $fcnt files ...\n") if (VERB9()); if ($fnd == -1) { prt( "WARNING: Unable to find [$hf] ...\n" ); push(@warnings, "WARNING: Unable to find [$hf] ..."); return 1; } # process item 1 ... my $pcnt = 1; my $hr = $htm_files[$fnd][$of_hr]; my $im = $htm_files[$fnd][$of_im]; my $hrc = scalar @{$hr}; my $imc = scalar @{$im}; my $hrf = ''; my $img = ''; my $hrt = 0; my $src = ''; my $ff = ''; my $shwerr = 0; my $emsg = ''; $fil = $htm_files[$fnd][$of_ff]; # extract FULL PATH name of file ... ($itmnam, $itmdir) = fileparse($fil); # get name and path $itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/); prt( "\n" ) if ($dbg18); prt( "$pcnt [$fil] has $hrc href, and $imc image links ..\n" ) if ($dbg18); for ($i = 0; $i < $hrc; $i++) { $hrf = ${$hr}[$i]; $hrt = get_href_type($hrf); $i2 = $i + 1; $shwerr = 0; $emsg = "HH[$fil] "; $emsg .= "$i2 HREF [$hrf]$hrt"; if ($hrt == 0) { if (in_excused($hrf)) { $excusecnt++; } else { $emsg .= " CHECK UNKNOWN ****** CHECK ME ******[2]"; push(@warnings, "WARNING: $emsg" ); $shwerr = 1; } } elsif ($hrt < 5) { $emsg .= " REMOTE"; } elsif ($hrt == 5) { $emsg .= " JAVASCRIPT"; } elsif ($hrt == 6) { $emsg .= " LOCAL"; } else { $src = $itmdir.get_local_href($hrf); if (-f $src) { $emsg .= " SITE REF [$src] ok"; } elsif (-d $src) { $emsg .= " SITE REF [$src] okd (as .txt)"; } else { # special sometimes replacement with a TEXT file $tmp = $src.".txt"; if (-f $tmp) { $emsg .= " SITE REF [$src] ok (as .txt)"; } else { $emsg .= " SITE REF [$src] ***MISSING***?[1]"; push(@missed, $emsg ); $shwerr = 1; } } } prt( "$emsg\n" ) if ($dbg18 || $shwerr || VERB9() ); } # From this beginning for (my $j = 0; $j < $fcnt; $j++) { $fil = $htm_files[$j][$of_ff]; ($itmnam, $itmdir) = fileparse($fil); # get name and path $itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/); if ($j != $fnd) { $pcnt++; $hr = $htm_files[$j][$of_hr]; $im = $htm_files[$j][$of_im]; $hrc = scalar @{$hr}; $imc = scalar @{$im}; prt( "\n" ) if ($dbg18); prt( "$pcnt [$fil] has $hrc href, and $imc image links ..\n" ) if ($dbg18 || VERB9()); for ($i = 0; $i < $hrc; $i++) { $hrf = ${$hr}[$i]; $hrt = get_href_type($hrf); $i2 = $i + 1; $shwerr = 0; $emsg = "HF[".sub_in_folder($fil)."] "; $emsg .= "$i2 HREF [$hrf]$hrt"; if ($hrt == 0) { if (in_excused($hrf)) { $excusecnt++; } else { $emsg .= " CHECK UNKNOWN ****** CHECK ME ******[3]"; push(@warnings, "WARNING: $emsg"); $shwerr = 1; } } elsif ($hrt < 5) { $emsg .= " REMOTE"; } elsif ($hrt == 5) { $emsg .= " JAVASCRIPT"; } elsif ($hrt == 6) { $emsg .= " LOCAL"; } else { $src = get_local_href($hrf); if ($src eq '.') { if (length($homefile)) { # && ($fdir eq $in_folder) $src = $homefile; # translate a DOT to HOME FILE } } $ff = $itmdir.$src; if (-f $ff) { $emsg .= " SITE REF [$ff] ok"; } elsif (-d $ff) { $emsg .= " SITE REF [$ff] okd"; # maybe check if there is an 'index' type file my $res = fix_rel_unix_path($ff); if (seek_dir_index_file($res,\$res)) { $emsg .= " BUT NO 'index' found in [$ff]"; $shwerr = 1; push(@missed, $emsg ); } else { $emsg .= " found 'index' [$res]"; } } else { $tmp = $ff.".txt"; if (-f $tmp) { $emsg .= " SITE REF [$ff] ok (as .txt)"; } else { $emsg .= " SITE REF [$src][$ff] ***MISSING***?[3]"; push(@missed, $emsg ); $shwerr = 1; } } } prt( "$emsg\n" ) if ($dbg18 || ($shwerr && $dbg32) || VERB9()); } for (my $i = 0; $i < $imc; $i++) { $img = ${$im}[$i]; my $sfil = sub_in_folder($fil); my $simg = sub_in_folder($img); $emsg = "IF[$sfil] [$simg] "; if ($img =~ /^http:\/\/.*/i) { if ($showhreflinks) { prtw("WARNING: IMG link is HREF $emsg [1]"); } else { $hrflnkcnt++; } } else { # 25/07/2007 - deal with '%20' in text $img =~ s/%20/ /g; $src = $itmdir.$img; $shwerr = 0; if (-f $src) { $emsg .= " IMG ok"; } else { $tmp = $src.".txt"; if (-f $tmp) { $emsg .= " IMG ok (as .txt)"; } else { $emsg .= " IMG ***MISSING***?[5]"; push(@missed, $emsg ); $shwerr = 1; } } } prt( "$emsg\n" ) if ($dbg18 || ($shwerr && $dbg32) || VERB9()); } } } return 0; } sub check_images($$) { my ($ifile, @srcs) = @_; my ($nm, $dir) = fileparse($ifile); my $scnt = scalar @srcs; my ($tmp); if ($scnt) { prt( "Found $scnt imgs in $nm ...\n" ) if ($dbg14); for (my $i = 0; $i < $scnt; $i++) { my $src = $srcs[$i][0]; my $lnn = $srcs[$i][1]; if ($src =~ /^http:\/\//i) { # remote HREF } else { # 25/07/2007 - deal with '%20' to space $src =~ s/%20/ /g; my $ff = $dir.$src; if ( -f $ff ) { prt( "$src - ok\n" ) if ($dbg13); } else { $tmp = $ff.".txt"; if (-f $tmp) { prt( "$src - ok (as .txt)\n" ) if ($dbg13); } else { if ($show_all_not_found) { prtw("WARNING: [$src] $ifile:$lnn NOT FOUND![2]a"); } elsif (!defined $g_images_notfound{$src}) { prtw("WARNING: [$src] $ifile:$lnn NOT FOUND![2]b"); $g_images_notfound{$src} = "$ifile,$lnn"; } } } } } # for image count } else { prt( "Found NO imgs in [$ifile] ...\n" ) if ($dbg15); } return $scnt; } sub get_img_srcs($$) { my ($fil, @lns) = @_; my $lc = scalar @lns; my $scnt = 0; my ($nm,$dir) = fileparse( $fil ); prt( "Processing $lc lines from [$nm] dir=[$dir]...\n" ) if ($dbg12); my @isrc = (); my $ln = ''; my $bal = ''; my $inscript = 0; my $msg = ''; my $bgnln = 0; my $lnnos = ''; for (my $i = 0; $i < $lc; $i++) { $ln = $bal; $bal = ''; $ln .= $lns[$i]; chomp $ln; prt( "$i [$ln] ...\n" ) if ($dbg10); if ($inscript) { if ($ln =~ /<\/script>/i) { $inscript =0; prt( "EXIT a SCRIPT ...\n" ) if ($dbg11); } next; } if ( $ln =~ //i ) { $msg = "WARNING: Also found SCRIPT in IMG line ...[$ln]"; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } prt( "[dbg10] Found [$iln] ...\n" ) if ($dbg10); $bgnln = $i; while ( !($iln =~ />/) && ($i < $lc)) { $i++; my $nxln = $lns[$i]; chomp $nxln; prt( "[dbg10] Adding [$nxln] ...\n" ) if ($dbg10); $iln .= ' ' if !($iln =~ /=$/); $iln .= $nxln; } $lnnos = "$bgnln:$i"; my $ind = index($iln, '>'); if ($ind != -1) { $bal = substr($iln, $ind+1); $iln = substr($iln, 0, $ind+1); } $iln = trim_all($iln); #if ($iln =~ /src=\"(.+)\"/i) { if ($iln =~ /src=\s*\"(\S+)\"/i) { prt( "[dbg10] SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10); push(@isrc, [$1, $lnnos, $fil]); $scnt++; } elsif ($iln =~ /src=\s*(\S+)/i) { # without QUOTES prt( "[dbg10] SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10); push(@isrc, [$1, $lnnos, $fil]); $scnt++; } elsif ($iln =~ /src=\s*\'(\S+)\'/i) { # single QUOTES prt( "SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10); push(@isrc, [$1, $lnnos, $fil]); $scnt++; } else { $msg = "WARNING: SRC NOT FOUND in [$iln]$fil:$lnnos..."; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } } elsif ( $ln =~ //i ) { $inscript = 1; prt( "Entered a SCRIPT ...\n" ) if ($dbg11); if ($ln =~ /<\/script>/i) { $inscript =0; prt( "EXIT a SCRIPT ...\n" ) if ($dbg11); } } } prt( "Returning $scnt img sources ...\n") if ($dbg10); #$dbg12 = $sdbg12; #$dbg11 = $sdbg11; #$dbg16 = $sdbg16; #$dbg10 = $sdbg10; return @isrc; } sub get_text_sub($) { my $ff = shift; # full file name my @arr = split(/\./,$ff); # split on DOT $arr[-1] = 'txt'; # make last 'txt' my $nf = ''; # start a new file name my ($part); foreach $part (@arr) { $nf .= '.' if (length($nf)); $nf .= $part; } return $nf; } # ================================================================ # sub check_hrefs($$) # called from primary file processing - sub process_file_array() # split passed 'anchor' list into 'type' of link # and store in global @httprefs, @httpsrefs, @ftprefs, @mtrefs # skipping javascript:, and checking file/dir for file links # The global @httprefs is later used for an IP check, if requested. # ================================================================ sub check_hrefs($$) { my ($fil, $rsrcs) = @_; my ($fnm,$fdir,$fext) = fileparse( $fil, qr/\.[^.]*/ ); $fdir = $currworkdir.'/' if ($fdir =~ /^\.(\\|\/)/); # 20100401 - if local use current directory my $scnt = scalar @{$rsrcs}; my $isphp = (lc($fext) eq '.php'); my $sfil = sub_in_folder($fil); if ($scnt) { prt( "Found $scnt anchor href= in $fnm$fext ...\n" ) if ($dbg7); for (my $i = 0; $i < $scnt; $i++) { my $orgsrc = ${$rsrcs}[$i][0]; my $lnnos = ${$rsrcs}[$i][1]; my $src = $orgsrc; if ($src =~ /^http:/i) { # remote HREF push(@httprefs, [$src, $fil, $lnnos] ); } elsif ($src =~ /^https:/i) { # remote HREF push(@httpsrefs, [$src, $fil, $lnnos] ); } elsif ($src =~ /^ftp:/i) { # remote HREF push(@ftprefs, [$src, $fil, $lnnos] ); } elsif ($src =~ /^mailto:/i) { # remote HREF push(@mtrefs, [$src, $fil, $lnnos] ); } elsif ( $src =~ /^#/ ) { # local in page HREF } elsif ( $src =~ /^javascript:/i ) { # a JAVASCRIPT HREF } else { my $ind = index($src,'#'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $ind = index($src,'?'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $src =~ s/\/$//; if (length($src)) { if ($src eq '.') { # HREF is just a DOT if (length($homefile)) { # && ($fdir eq $in_folder) $src = $homefile; # translate a DOT to HOME FILE } } # 25/07/2007 - deal with '%20' back to space $src =~ s/%20/ /g; my $ff = $fdir.$src; if ( -f $ff ) { prt( "[dbg5] $src - ok\n" ) if ($dbg5); } elsif (-d $ff) { prt( "[dbg5] $src - okd\n" ) if ($dbg5); } else { # 20100312 - Special case - I have replaced large ZIPS, and others, with a TEXT # file, so do NOT give this WARNING, if such a .txt file exists. my $tmpf = $ff.".txt"; if ( -f $tmpf ) { prt( "[dbg5] $src - ok AS $tmpf REPLACEMENT FILE!\n" ) if ($dbg5); } else { $tmpf = get_text_sub($ff); if ( -f $tmpf ) { prt( "[dbg5] $src - ok AS $tmpf REPLACEMENT FILE!2\n" ) if ($dbg5); } else { if ($orgsrc eq $src) { prtw("WARNING: href [$orgsrc] file [$ff] NOT FOUND![3]s in [$sfil]$lnnos\n"); } else { prtw("WARNING: href [$orgsrc] [$src] file [$ff] NOT FOUND![3] in [$sfil]$lnnos\n"); } } } } } else { if ($isphp) { prt( "Found BLANK HREF [$orgsrc] in PHP [$sfil]...\n" ) if ($dbg17); } else { prtw("WARNING: Found BLANK HREF [$orgsrc] in [$sfil] ..."); } } } } } else { if ($isphp) { prt( "Found NO HREFs in PHP [$sfil]...\n" ) if ($dbg17); } else { prt( "NO HREF FOUND in [$sfil]...\n" ) if ($shownohrefs); } } } ############################################################ # Only used is $chkip = 1; # Show IP Address # uses sockets, gethostbyname # Return 0, if can NOT be resolved. # else the number of IP addresses resolved. ############################################################ sub checkIPAddress($) { my ($nm) = shift; my @addr = gethostbyname($nm); my $cnt = 0; if( !@addr ) { my $err = $!; $err =~ s/\n/ /g; $err =~ s/^\s+//; $err =~ s/\s+$//; if ($nm =~ /:\d+$/) { my $nm2 = $nm; $nm2 =~ s/:\d+$//; @addr = gethostbyname($nm2); if (@addr) { $nm = $nm2; } else { my $err2 = $1; $err2 =~ s/\n/ /g; $err2 =~ s/^\s+//; $err2 =~ s/\s+$//; prt( "Can't resolve $nm: $err!\nNor $nm2: $err2" ); return 0; } } else { prt( "Can't resolve $nm: $err!\n" ); return 0; } } @addr = map { inet_ntoa($_) } @addr[4 .. $#addr]; foreach my $k (@addr) { $cnt++; prt( "$cnt: $nm resolves to IP [$k]\n" ) if ($dbg3); } return $cnt; } sub getIPAddress($) { my ($uri) = shift; my $packed_ip = gethostbyname($uri); my $ip_address = 'UNKNOWN'; if (defined $packed_ip) { $ip_address = inet_ntoa($packed_ip); } return $ip_address; } sub show_uri_ip($$) { my ($u,$l) = @_; my $msg = "[$u]"; $msg .= ' ' while (length($msg) < $l); if (checkIPAddress($u)) { my $ip = getIPAddress($u); prt("$msg ok, on IP [$ip]\n"); } else { prt("$msg FAILED\n"); } } ################################################ # Add to @scripts multidimensional array, # if NOT already in there, when on the line # numbers are added. ############################################### sub add_2_scripts($$) { my ($fil, $lns) = @_; my $sc = scalar @scripts; for (my $i = 0; $i < $sc; $i++) { my $cf = $scripts[$i][0]; if ($cf eq $fil) { my $lc = $scripts[$i][1]; $lc .= ":$lns"; $scripts[$i][1] = $lc; return 0; } } push(@scripts, [$fil, $lns]); return 1; } #################################################### # Get HREF sources # Given an ARRAY of file lines, check for # anchor href="something" ... # Return the "something" in an array #################################################### sub get_href_srcs($$) { my ($fil, @lns) = @_; my $lc = scalar @lns; my $scnt = 0; my $slns = 0; # count the SCRIPT lines my ($nm,$dir) = fileparse( $fil ); prt( "Processing $lc lines from [$nm] dir=[$dir]...\n" ) if ($dbg6); my @isrc = (); my $ln = ''; my $bal = ''; my $inscript = 0; $slns = 0; my $bgnln = 0; my $endln = 0; for (my $i = 0; $i < $lc; $i++) { $ln = $bal; $bal = ''; $ln .= $lns[$i]; chomp $ln; prt( "$i [$ln] ...\n" ) if ($dbg10); if ($inscript) { if ($ln =~ /<\/script>/i) { $inscript =0; prt( "EXIT a SCRIPT ...\n" ) if ($dbg4); add_2_scripts( $fil, $slns ); $slns = 0; next; } $slns++; next; } if ( $ln =~ //) && ($i < $lc)) { $i++; my $nxln = $lns[$i]; chomp $nxln; prt( "Adding [$nxln] ...\n" ) if ($dbg10); $iln .= ' ' if !($iln =~ /=$/); $iln .= $nxln; } $endln = $i; my $ind = index($iln, '>'); if ($ind != -1) { $bal = substr($iln, $ind+1); $iln = substr($iln, 0, $ind+1); } #if ($iln =~ /src=\"(.+)\"/i) { if ($iln =~ /href\s*=\s*\"(\S+)\"/i) { prt( "HREF = $1\nIn line [$iln]...\n" ) if ($dbg10); push(@isrc, [$1, "$bgnln:$endln"] ); $scnt++; } else { # hmmm... NO 'href' in his line if (( $iln =~ /name=\s*\"(\S+)\"/i )||( $iln =~ /name=(\S+)/i )) { # ignore BOOKMARKS } elsif (( $iln =~ /id=\s*\"(\S+)\"/i )||( $iln =~ /id=(\S+)/i )) { # ignore BOOKMARKS } else { prtw("WARNING:$fil:$bgnln: HREF NOT FOUND in [$iln] [$ln] [$fil]"); } } } elsif ( $ln =~ //i ) { $inscript = 1; prt( "Entered a SCRIPT ...\n" ) if ($dbg4); $slns = 0; $ln = substr($ln, 7); if ($ln =~ /<\/script>/i) { $inscript =0; prt( "EXIT a SCRIPT ...\n" ) if ($dbg4); add_2_scripts( $fil, 1 ); $slns = 0; } } } if ($inscript) { my $msg = "WARNING: EXIT WHILE IN SCRIPT in [$fil]..."; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } prt( "Returning $scnt HREF sources ...\n") if ($dbg10); return @isrc; } ######################################################### # Passed an array REF of extensions, # check if this is one of them? ######################################################### sub is_this_extent($$) { my ($ext, $rex) = @_; my $lcx = lc($ext); foreach my $x (@{$rex}) { return 1 if ($lcx eq lc($x)); } return 0; } ############################################ # only looking for HTM, HTML, PHP, # could be extended to others maybe ... ############################################ # test an EXTENSION, or form '.htm'... sub is_htm_ext($) { my ($ext) = shift; return( is_this_extent($ext,\@html_ext) ); } sub is_graf_ext($) { my ($ext) = shift; return( is_this_extent($ext,\@graf_ext) ); } sub is_zip_ext($) { my ($ext) = shift; my @arr = qw( .zip .gz ); return( is_this_extent($ext,\@arr) ); } sub is_css_ext($) { my ($ext) = shift; return( is_this_extent($ext, \@css_ext) ); } sub is_txt_ext($) { my ($ext) = shift; my @arr = qw( .txt ); return( is_this_extent($ext, \@arr) ); } sub is_script_ext($) { my ($fil) = shift; return( is_this_extent($fil, \@script_ext) ); } # test a FILE/PATH extension sub is_htm_file_ext($) { my ($fil) = shift; my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/); return( is_htm_ext($e) ); } sub is_graphic_file_ext($) { my ($fil) = shift; my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/); return( is_graf_ext($e) ); } sub is_zip_file_ext($) { my ($fil) = shift; my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/); return( is_zip_ext($e) ); } sub is_css_file_ext($) { my ($fil) = shift; my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/); return( is_css_ext($e) ); } sub is_txt_file_ext($) { my ($fil) = shift; my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/); return( is_txt_ext($e) ); } sub is_script_file_ext($) { my ($fil) = shift; my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/); return( is_script_ext($e) ); } ################################################ # my $ignfpd = 1; # ignore FRONTPAGE folders ################################################ sub is_fp_folder($) { my ($inf) = shift; foreach my $fil (@fpfolders) { if (lc($inf) eq lc($fil)) { return 1; } } return 0; } sub is_xclude_folder($) { my ($inf) = shift; foreach my $fil (@g_xclude_dir) { if (lc($inf) eq lc($fil)) { return 1; } } return 0; } #################################### # Check if FILE is in EXCLUDE list #################################### sub in_excludes($) { my ($fil) = shift; my $lcf = lc($fil); foreach my $f (@excludes) { if (lc($f) eq $lcf) { return 1; } } return 0; } sub in_spl_excludes($) { my ($fldr) = shift; my $lfldr = lc($fldr); foreach my $f (@splexcludes) { if (lc($f) eq $lfldr) { return 1; } } return 0; } #################################################################### # process_folder(folder) # Main DIRECTORY processing function # # Open the FOLDER given, and collect ALL files found, # iterate into sub-directories, if $recurse is non-zero, # and it is NOT a special FRONTPAGE (hidden) FOLDER. # # Files are collected into multidemensional arrays, and # %g_dir_files - store per directory, per type array of files #################################################################### sub process_folder($) { my ($inf) = shift; my $fcnt = 0; # start with NO 'html' files my ($fil,$ff,$fnd,$dhr,$far,$isind); my $rg_dir_files = \%g_dir_files; prt( "[dbg1b] Processing [$inf] folder...\n" ) if ($dbg1b); my @dirs = (); my $uinf = dos_2_unix($inf); # keep HASH in UNIX form $fnd = 0; # count of 'index.htm' type files found in folder if ( opendir( DIR, $inf ) ) { my @files = readdir(DIR); closedir DIR; $fcnt = scalar @files; #prt("Processing $fcnt files from [$inf]...\n"); $fcnt = 0; ${$rg_dir_files}{$uinf} = {} if (!defined ${$rg_dir_files}{$uinf}); # store files %hash->dir->%type->@files foreach $fil (@files) { next if (($fil eq ".")||($fil eq "..")); $isind = 0; $ff = $inf."\\".$fil; if ( -d $ff ) { if ($recurse) { next if ($ignfpd && is_fp_folder($fil)); # ignore FRONTPAGE folders next if (@splexcludes && in_spl_excludes($fil)); next if (is_xclude_folder($fil)); #process_folder( $ff ); push(@dirs,$ff); } } else { # NOTE: multidimensional arrays pushed - offsets into arrays # my $of_ff = 0; # 1 - full file name # my $of_hr = 1; # 2 - array ref of href links # my $of_im = 2; # 3 - array ref of image links # my $of_lk = 3; # 4 - linked count # my $of_rh = 4; # 5 - contains 'collecthrefs2' collections # my $of_to = 5; # links TO # my $of_fm = 6; # links FROM # my $of2hr = 7; # more or less duplicate of $of_hr # my $of2im = 8; # more or less duplicate of $of_im if ( in_excludes($fil) ) { # NOT in @excludes prt("[dbg37] File [$fil] EXCLUDED!\n") if ($dbg37); } else { $dhr = ${$rg_dir_files}{$uinf}; # extract hash ref for this (unix) path my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); if (defined $ext_hash{$ext}) { $ext_hash{$ext}++; } else { $ext_hash{$ext} = 1; } my $arr_ref = \@other_files; my $type = "Other"; if (is_htm_file_ext($fil)) { $arr_ref = \@htm_files; $fcnt++; $isind = is_def_index_file($fil); $fnd++ if ($isind); $type = "HTML"; } elsif (is_graphic_file_ext($fil)) { $arr_ref = \@img_files; $type = "IMG"; } elsif (is_zip_file_ext($fil)) { $arr_ref = \@zip_files; $type = "ZIP"; } elsif (is_css_file_ext($fil)) { $arr_ref = \@css_files; $type = "CSS"; } elsif (is_txt_file_ext($fil)) { $arr_ref = \@txt_files; $type = "TEXT"; } elsif (is_script_file_ext($fil)) { $arr_ref = \@script_files; $type = "SCRIPT"; } # ========================================================== # 0 1 2 3 4 5 6 7 8 9 10 push(@{$arr_ref}, [$ff, '', '', 0, 0, '', '', 0, 0, 0, 0] ); # ========================================================== ${$dhr}{$type} = [] if (!defined ${$dhr}{$type}); $far = ${$dhr}{$type}; push(@{$far},[$fil,$isind,0,[],[],0]); prt("[dbg36] Got file [$fil], with ext = [$ext] = $type\n") if ($dbg36); } } } prt( "[dbg1] Processed $inf folder finding $fcnt HTML files ...\n" ) if ($dbg1); if ($fcnt) { # got some, but any 'index.htm' type push(@g_folders_noind, $inf) if (!$fnd); } else { # NO 'html' files found push(@g_empty_folders, $inf); # store the EMPTY folder } } else { pgm_exit(1,"ERROR: Failed to open folder [$inf]!\n" ); } if (@dirs) { # process the subdirectories, if any to process foreach $ff (@dirs) { process_folder( $ff ); } } } ############################################## # Just to show the COUNTS in the ARRAYS ############################################## sub show_found_counts() { my ($cnt); if (VERB1()) { $cnt = scalar @htm_files; prt( "show_found_counts() $cnt HTML, "); $cnt = scalar @img_files; prt( "$cnt images, " ); $cnt = scalar @css_files; prt( "$cnt css, " ); $cnt = scalar @zip_files; prt( "$cnt zip, " ); $cnt = scalar @txt_files; prt( "$cnt txt, " ); $cnt = scalar @script_files; prt( "$cnt script, " ); $cnt = scalar @other_files; prt( "and $cnt others ...\n" ); } $cnt = scalar keys %ext_hash; if ($dbg27 || VERB5()) { prt( "$cnt extensions, and each count ...\n" ); foreach my $key (keys %ext_hash) { my $val = $ext_hash{$key}; prt( "$val $key "); } prt("\n"); } my $cnt1 = scalar @g_empty_folders; my $cnt2 = scalar @g_folders_noind; prtw("WARNING: Got $cnt1 folders with NO HTML type files, and $cnt2 with no 'index.htm'\n") if (($cnt1 || $cnt2) && $show_no_index); } # @ipsfound = ; sub in_ips_found($) { my ($ip) = shift; my $lcip = lc($ip); foreach my $i (@ipsfound) { chomp $i; if (lc($i) eq $lcip) { return 1; } } return 0; } sub trim_href($) { my ($fh) = shift; my $h = $fh; my $ind = index($h,'#'); $h = substr($h,0,$ind) if ($ind > 0); $ind = index($h,'?'); $h = substr($h,0,$ind) if ($ind > 0); $h =~ s/\/$//; # remove any trailing '/' if ($h =~ /\.$/) { $h =~ s/\.$//; # remove any trailing '.' - but wonder what this is prt("PROBLEM: File [$g_active_file]$g_active_lnn\n"); pgm_exit(1,"Check out this trailing '.', from [$fh]!\n"); } return $h; } sub get_host_link($) { my ($h) = shift; if ($h =~ /^(\w+:\/\/[\w\.-]+\.[A-Z]{2,4})/i) { return "$1"; } return $h; } sub get_host_link13($) { my ($uri) = shift; # print "$1,$2, $3,$4, $5, $6,$7, $8, $9\n" if ($uri =~ m{^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?}) { return $1.$3; } return $uri; } sub get_host_link4($) { my ($uri) = shift; # print "$1,$2, $3,$4, $5, $6,$7, $8, $9\n" if ($uri =~ m{^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?}) { return $4; } return $uri; } sub get_domain_name($$) { my ($uri,$rw) = shift; $uri = get_host_link4($uri); $uri =~ s/^www(\d*)\.// if ($rw); return $uri; } sub show_remote_links() { my $rh = \%g_hrefs; my $rh2 = \%g_hosts; my $rd = \%g_domains; my ($len,$key,$val,$href,$cnt,$chk,$dom); my $min = 0; my @hosts = (); my @domains = (); foreach $key (keys %{$rh}) { $len = length($key); $min = $len if ($len > $min); } $min += 2; $min = 65 if ($min > 65); my $icnt = 0; foreach $key (sort keys %{$rh}) { $val = ${$rh}{$key}; # still to decide which is best??? $href = get_host_link($key); #my $href13 = get_host_link13($key); #if ($href ne $href13) { # pgm_exit(1, "CHECK: from [$key], got \n [$href] and \n [$href13] FIX CODE!\n"); #} $chk = ''; if (defined ${$rh2}{$href}) { ${$rh2}{$href}++; } else { ${$rh2}{$href} = 1; push(@hosts,$href); $dom = lc(get_domain_name($key,0)); if (!defined ${$rd}{$dom}) { ${$rd}{$dom} = 1; push(@domains,$dom) } if ($check_host_ip) { if (checkIPAddress($href)) { $chk = 'ok'; } else { $chk = 'NA'; } } } $key = '['.$key.']'; $key .= ' ' while (length($key) < $min); $icnt++; $cnt = ${$rh2}{$href}; prt( "$icnt: $key ($cnt)$chk in $val\n" ) if ($out_remote_links); } if (@hosts) { if ($os =~ /Win/) { $out_hosts = path_u2d($out_hosts); } write2file(join("\n",sort(@hosts))."\n",$out_hosts); prt("Full list of HOSTS written to [$out_hosts]...\n") if (VERB2()); } if (@domains) { if ($os =~ /Win/) { $out_domains = path_u2d($out_domains); } write2file(join("\n",sort(@domains))."\n",$out_domains); prt("Full list of Domains written [$out_domains]...\n") if (VERB2()); } $icnt = 0; foreach $key (sort keys %{$rh2}) { $val = ${$rh2}{$key}; $icnt++; prt("$icnt: $key $val\n") if ($out_remote_links); } } ####################################################### # Process the HTTP HREF sources # if $chkip = 1; then attempt to resolve the IP # addresses from the host name. # push(@httprefs, [$src, $fil, $lnnos] ); ####################################################### sub process_host_array() { my $hcnt = scalar @httprefs; my $newcnt = 0; my ($val,$msg,$file,$i,$href,$lnn,$nm,$dir,$key,$icnt,$rawhr); my @hrefsarr = (); $icnt = 0; if ($hcnt) { prt( "Found total $hcnt remote 'http' HREF entries ...\n" ) if (VERB2()); for ($i = 0; $i < $hcnt; $i++) { $lnn = $httprefs[$i][2]; # get file LINE NUMBER $file = $httprefs[$i][1]; # get FULL FILE NAME $g_active_file = $file; $g_active_lnn = $lnn; # $href = $httprefs[$i][0]; # should remove any hash like '#pos', and any query like '?a=b&c=d' $rawhr = $httprefs[$i][0]; push(@hrefsarr,$rawhr); $href = trim_href($rawhr); ($nm,$dir) = fileparse($file); if (defined( $g_hrefs{$href} )) { $val = $g_hrefs{$href}; $val .= ' '.$file; } else { $val = $file; $newcnt++; } $val .= ":$lnn"; $g_hrefs{$href} = $val; prt( "$href in [$file]$lnn\n" ) if ($dbg2); } if (@hrefsarr) { my %dupes = (); foreach $href (@hrefsarr) { $dupes{$href} = 1; } @hrefsarr = keys(%dupes); } if (@hrefsarr) { if ($os =~ /Win/) { $out_hrefs = path_u2d($out_hrefs); } write2file(join("\n",sort(@hrefsarr))."\n",$out_hrefs); prt("Full list of HREFS written to [$out_hrefs] file.\n") if (VERB2()); } $hcnt = scalar keys(%g_hrefs); show_remote_links(); prt( "Found $hcnt different remote entries ...($newcnt) " ); if ($chkip) { prt("and now checking for valid IP\n"); my $inips = 0; prt( "Checking $hcnt IP addresses ... " ); if ( !$refreships && ( -f $ipfile)) { if (open INF, "<$ipfile") { @ipsfound = ; close INF; prt( "Have ".scalar @ipsfound." in $ipfile" ); } else { prt( "Warning: Failed to open $ipfile" ); } } prt("\n"); $icnt = 0; foreach $key (keys %g_hrefs) { $val = $g_hrefs{$key}; $icnt++; prt( "$key in $val\n" ) if ($dbg8); if ($key =~ /^http:\/\//i) { my $hkey = substr($key, 7); # REMOVE protocol string my @arr = split( /\//, $hkey ); # and get just the domain name $hkey = $arr[0]; # but this may not work with '#', or queries '?' # ***TBF*** TO BE FIXED if ( !in_ips_found($hkey) ) { if (checkIPAddress( $hkey ) == 0) { $msg = "FAILED: NO IP FOR HOST [$hkey][$val]"; push(@warnings, $msg); prt( "$msg\n" ) if ($dbg16); } elsif ($writeips) { push(@ipsfound,"$hkey\n"); } } else { $inips++; } } if (($icnt % 100) == 0) { prt( "Done $icnt IP Addresses ...\n" ); } } prt( "Completed $icnt IP Addresses ... " ); if ($writeips) { $val = join("\n", sort @ipsfound); $val = trimblanklines($val); write2file($val, $ipfile); prt( "$inips in previous. Written ".scalar @ipsfound." to $ipfile" ); } prt("\n"); } else { prt(", but IP checking is OFF! (add -checkips)\n"); } } } sub write_missing($$) { my ($fil,$rh) = @_; # ($not_found_file,\%g_images_notfound) if (length($not_found_file)); my ($key,$list); $list = ''; foreach $key (keys %{$rh}) { $list .= "\n" if (length($list)); $list .= $key; } if (length($list)) { $list .= "\n"; write2file($list,$fil); prt("Written missing list to [$fil]\n"); } } sub show_scripts() { my ($val,$scnt,$file); $scnt = scalar @scripts; if ($scnt && ($dbg9 || $showscripts)) { prt( "Got $scnt files containing SCRIPTS ...\n" ); # push(@scripts, [$fil, $lns]); for (my $i = 0; $i < $scnt; $i++) { $file = $scripts[$i][0]; $val = $scripts[$i][1]; prt( "$file $val\n" ); } } } sub show_dir_files($) { my ($hr) = @_; # = \%g_dir_files my ($key,$val,$k2,$v2,$v3,$dir,$cnt,$i,$ii,$icnt); foreach $key (sort keys %{$hr}) { $dir = sub_in_folder($key); $dir = '' if (length($dir)==0); $val = ${$hr}{$key}; # extract hash ref BY TYPE prt("Directory: [$dir]\n"); $icnt = 0; foreach $k2 (sort keys %{$val}) { # for each type $v2 = ${$val}{$k2}; $cnt = scalar @{$v2}; prt(" Type $k2 = $cnt\n"); for ($i = 0; $i < $cnt; $i++) { $v3 = ${$v2}[$i][0]; $ii = ${$v2}[$i][1]; prt(" File: [$v3] "); if ($ii) { $icnt++; prt(" ('index' type)"); } prt("\n"); } } if (!$icnt) { prt("Directory: [$dir] has NO 'index' type file.\n"); } elsif ($icnt > 1) { prt("Directory: [$dir] has $icnt 'index' type files.\n"); } } pgm_exit(1,"Debug run show of directories and files...\n"); } # #################################################### ### MAIN ### parse_args(@ARGV); if (length($in_folder) == 0) { mydie( "No input folder (or file) given/found in command ...\n" ); } if (-f $in_folder) { #$g_user_file = File::Spec->rel2abs($in_folder); $g_user_file = $in_folder; ($homefile, $in_folder) = fileparse($in_folder); $in_folder =~ s/[\\\/]$//; $in_folder = $currworkdir if ($in_folder eq '.'); # 2010-04-02 - use FULL work directory if just '.' $single_file = 1; } # set_debug_on(); # set_htools_dbg_on(); show_startup(); if ( !$recurse && length($g_user_file) && (-f $g_user_file)) { # 0 1 2 3 4 5 6 7 8 9 10 push(@htm_files, [$g_user_file, '', '', 0, 0, '', '', 0, 0, 0, 0] ); } else { process_folder($in_folder); } ###show_dir_files(\%g_dir_files) if ($debug_on); show_found_counts(); process_file_array(); # main file processing - each HTML file found is processed process_host_array(); if (length($homefile)) { ###trace_from_htm( $homefile, 0 ); check_linkages( $homefile ); check_local_links( $homefile ); show_link_counts("HTML ", \@htm_files); show_link_counts("IMG ", \@img_files); show_link_counts("CSS ", \@css_files); show_link_counts("ZIP ", \@zip_files); show_link_counts("TXT ", \@txt_files); show_link_counts("Script ", \@script_files); show_link_counts("Other ", \@other_files); } show_scripts(); my $mlcnt = scalar @missing_links; if ($mlcnt && !$dbg33) { prtw("WARNING: Found $mlcnt files with NO LINKS!\n"); } ############################################################## prt( "\n###### SHOW RESULTS ########\n" ); prt( "WARNING: $hrflnkcnt images by HREF not shown! (use -showhreflinks)\n" ) if ($hrflnkcnt); show_warnings(); if (@missed) { prt( "\nMISSING FOLLOW: ".scalar @missed."\n" ); foreach my $mfile (@missed) { prt( "$mfile\n"); } } prt( "###### END RESULTS ########\n" ); write_missing($not_found_file,\%g_images_notfound) if (length($not_found_file)); ############################################################## pgm_exit(0,"Normal exit(0)\n"); ################################## sub show_help { prt( "$pgmname [Options] input-folder or home-file-name.\n" ); prt( "Purpose: To take a folder, or home file in a folder, and\n" ); prt( "check the assume local Web Site for internal consistency.\n" ); prt( "Options:\n" ); prt( " -checkips = Will check the IP resolution of REMOTE HREF items.\n" ); prt( " -showhreflinks = Show a WARNING when an IMG, ICO, etc is a REMOTE link\n" ); prt( " -showlinks = Show the links for each file ...\n" ); prt( " -showremote = Show remote links, and file containing link.\n" ); prt( " -showscripts = Show SCRIPT files ...\n" ); prt( " -shownoindex = Show folders with NO 'index' type file, and/or NO html types files.\n"); prt( " -writeips = Write HREF of IP found to a file ...\n" ); prt( " -refreships = If -checkips, and -writeips, re-write NEW check file...\n" ); prt( " -ipfile out-file = Set HREF output file. Default is [$ipfile].\n" ); prt( " -ignore in-file = Ignore this file. Repeat for more. use '.none.' to reset list.\n" ); prt( " -shownohrefs = Show when NO HREF found in a file.\n" ); prt( " -loadlog (-ll) = Load LOG file at end.\n"); prt( " -showallnotfound = Show ALL images not found. Default to show only 'different'.\n"); prt( " -nffile out-file = Write not found files to file name given.\n"); prt( " -v[NN] = Bump verbal, or set to 'NN'.\n" ); prt( " -x = Exclude directory. '.reset.' clears the list.\n" ); prt( " -load-log (-l) = load log with output at end.\n"); prt( " -nofollow = Do not follow page links.\n"); prt( "If an input-folder given, then no trace of internal links will be done.\n" ); prt( "If a home file name is given, the folder used will be of that file.\n" ); prt( "All arguments can also be given with a '--' prefix, if that 'feels' better ;=))\n"); prt( "Following are the current default settings ...\n" ); $verbosity += 5; show_startup(); mydie(" Happy link checking ;=))\n"); } # Ensure argument exists, or die. sub require_arg { my ($arg, @arglist) = @_; mydie( "ERROR: no argument given for option '$arg' ...\n" ) if ! @arglist; } ########################################################## # Parse USER input # Largerly still to be done ########################################################## sub parse_args { my (@av) = @_; my ($arg,$narg,$sarg); while (@av) { $arg = $av[0]; if ($arg =~ /^-/) { $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); if (($sarg eq 'h')||($sarg eq 'help')||($sarg eq '?')) { prt("Showing help...\n"); show_help(); } elsif ($sarg =~ /^v/) { if ($sarg =~ /^v.*(\d+)$/) { $verbosity = $1; } else { $sarg = substr($sarg,1); while ($sarg =~ /^v/) { $sarg = substr($sarg,1); $verbosity++; } } prt( "Set verbal to $verbosity\n" ) if (VERB1()); } elsif ($sarg eq 'checkips') { $chkip = 1; prt( "Will check IP of REMOTE HREF items.\n" ); } elsif ($sarg eq 'showhreflinks') { $showhreflinks = 1; prt( "Show a WARNING when an IMG, ICO, etc is a REMOTE link\n" ); } elsif ($sarg eq 'showlinks') { $showlinks = 1; prt( "Show the links for each file ...\n" ); } elsif ($sarg eq 'showremote') { $out_remote_links = 1; prt( "Show remote links, and each file...\n" ); } elsif ($sarg eq 'showscripts') { $showscripts = 1; prt( "Show SCRIPT files ...\n" ); } elsif ($sarg eq 'writeips') { $writeips = 1; prt( "Write HREF of IP found to a file ...\n" ); } elsif ($sarg eq 'refreships') { $refreships = 1; prt( "If -checkips, and -writeips, re-write NEW check file...\n" ); } elsif ($sarg eq 'ipfile') { require_arg(@av); shift @av; $ipfile = $av[0]; prt( "HREF output set to $ipfile ...\n" ); } elsif ($sarg eq 'ignore') { require_arg(@av); shift @av; $arg = $av[0]; if ($arg eq '.none.') { @excludes = (); prt( "Reset EXCLUDES array ...\n" ); } else { push(@excludes, $arg); prt( "Added file [$arg] to EXCLUDES ...\n" ); } } elsif ($sarg eq 'x') { require_arg(@av); shift @av; $arg = $av[0]; if ($arg eq '.reset.') { @g_xclude_dir = (); prt( "Reset EXCLUDE directory array ...\n" ); } else { push(@g_xclude_dir, $arg); prt( "Added file [$arg] to EXCLUDE directories ...\n" ); } } elsif ($sarg eq 'shownoindex') { $show_no_index = 1; prt("Set to show folders with NO 'index' type file, and/or NO html types files.\n"); } elsif (($sarg eq 'll')||($sarg eq 'loadlog')) { $load_log = 1; prt("Set to load log at end.\n"); } elsif ($sarg eq 'showallnotfound') { prt( "Set to show ALL images not found.\n"); $show_all_not_found = 1; } elsif ($sarg eq 'nffile') { require_arg(@av); shift @av; $narg = $av[0]; $not_found_file = $narg; # if file name given, write list to that file... prt("Will write missing to file [$narg]\n"); } elsif ($sarg eq 'debug') { prt("Setting BIG debug ON...\n"); set_debug_on(); set_htools_dbg_on(); $load_log = 1; } elsif ($sarg =~ /^l/) { $load_log = 1; prt("Set to load log at end.\n"); } elsif ($sarg eq 'nofollow') { $recurse = 0; prt("Set to NOT follow links\n"); } else { mydie( "ERROR: Unknown argument [$arg] ...\n" ); } } else { # no leading '-' $in_folder = $av[0]; prt( "Input folder set to [$in_folder]...\n" ) if (VERB2()); } shift @av; } # check the INPUT folder if ( $debug_on ) { prtw("WARNING: DEBUG is ON!\n"); if (length($in_folder) == 0) { if (length($def_in_folder) && (-d $def_in_folder)) { $in_folder = $def_in_folder; prt("[debug_on] Input folder set to DEFAULT [$in_folder]...\n"); } elsif (length($def_file) && (-f $def_file)) { $in_folder = $def_file; prt("[debug_on] Input folder set to DEFAULT [$in_folder]...\n"); } } $load_log = 1; } if (length($in_folder) == 0) { mydie( "ERROR: No VALID FOLDER OR FILE NAME found in command\n" ); } if ( !( (-d $in_folder) || (-f $in_folder ) ) ) { mydie( "ERROR: [$in_folder] is NOT VALID FOLDER OR FILE NAME\n" ); } # pgm_exit(1,"Current work directory: [$currworkdir]...\n"); } # eof - chklinks03.pl