#!/perl -w
# NAME: gethrefs02.pl
# AIM: Parse a HTML file, and extract HREF links
# 2016-08-03 - Reciew
# 05/11/2015 - Lots of quick improvements
# 18/07/2010 - revisit and test...
use strict;
use warnings;
use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use constant {
HRT_UNKNOWN => 0,
HRT_LOCAL => 1,
HRT_LINK => 2,
HRT_SCRIPT => 4,
HRT_FILE => 8,
HRT_BASE => 16,
HRT_PARAMS => 32
};
use constant {
FT_UNKNOWN => 0,
FT_HTML => 1,
FT_GRAF => 2,
FT_CSS => 3,
FT_SCRIPT => 4,
FT_TEXT => 5,
FT_ZIP => 6,
FT_BIN => 7,
FT_CODE => 8,
FT_DIR => 9,
FT_HIDDEN => 10,
FT_PARAM => 11
};
# offsets in file array
use constant {
OF_FF => 0, # full file name
OF_HR => 1, # array ref of href links
OF_IM => 2, # array ref of image links
OF_LK => 3, # linked count
OF_SP => 4, # spare
OF_TO => 5, # links TO
OF_FM => 6, # links FROM
OF_FT => 7 # file type
};
my $perl_root = 'C:\Gtools\perl';
unshift(@INC,$perl_root);
#require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
require 'lib_utils.pl' or die "Unable to load lib_utils.pl ...\n";
require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n";
# for htmltools, if functions used
my @imgs = ();
my @hrefs = ();
# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /\w{1}:\\.*/) {
my @tmpsp = split(/\\/,$pgmname);
$pgmname = $tmpsp[-1];
}
my $outfile = $perl_root."\\temp.$pgmname.txt";
open_log($outfile);
###prt( "$0 ... Hello, World ...\n" );
my $os = $^O;
my $in_file = '';
## my $in_file = 'C:\GTools\java\examples\JavaTech\Code_List.htm';
## my $in_file = 'C:\HOMEPAGE\GA\travel\maroc\index.htm';
## my $in_file = 'temphtml.htm';
my $usr_url = '';
my @all_hrefs = ();
my $outtemp = $perl_root."\\templist.txt";
my $show_full_list = 0;
my $show_missed_files = 0;
my $verbosity = 0;
my $load_log = 0;
my $out_file = '';
my $VERS = "0.0.5 2015-11-05";
##my $VERS = "0.0.4 2010-07-18";
sub VERB1() { return $verbosity >= 1; }
sub VERB2() { return $verbosity >= 2; }
sub VERB5() { return $verbosity >= 5; }
sub VERB9() { return $verbosity >= 9; }
# CONSTANTS
###########
# File Type Extensions
my @html_extension = qw( .htm .html .shtml .php );
my @graf_extension = qw( .jpg .jpeg .gif .png .bmp .ico .mpg );
my @css_extension = qw( .css );
my @script_extension = qw( .js .class .cgi .java .remote );
my @zip_extension = qw( .zip .tar .gz .jar .tgz );
my @txt_extension = qw( .txt .doc .bat .cmd .old .bak .policy .pdf .cfg );
my @code_extension = qw( .c .cxx .cpp .h .hxx .hpp .idl .mak );
my @bin_extension = qw( .dat .exe .au );
# private FRONTPAGE folders
my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived );
# features
my $ignfpd = 1; # ignore FRONTPAGE folders
my @excludes = qw( desktop.ini php.ini blank.html blank.htm );
my $recurse = 0; # recursive
my @splexcludes = qw( macpc );
my %ext_hash = ();
my @all_files = ();
my $refcnt = 0;
my @done_files = ();
my %not_found = ();
my ($base_file,$base_dir);
my $base_href = ''; # set if found
# DEBUG
my $dbg1 = 0; # show discarded material
my $dbg2 = 0; # show "$hcnt:HREF: hr[$hr] $ctyp tail[$tail] pre[$disc] tag[$tag] bgn[$bgn]...
my $dbg3 = 0; # show Processing $lncnt lines from $fil ...
my $dbg4 = 0; # show File [$name], in [$rdir] ...
my $dbg5 = 0; # show HREF immediately
my $dbg6 = 0; # show FOLDERS searched...
# ### DEBUG ###
my $debug_on = 0;
my $def_file = 'C:\Users\user\Downloads\temp\index.html';
my @warnings = ();
sub show_warnings($) {
my ($val) = @_;
if (@warnings) {
prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
foreach my $itm (@warnings) {
prt("$itm\n");
}
prt("\n");
} else {
prt( "\nNo warnings issued.\n\n" ) if (VERB9());
}
}
sub pgm_exit($$) {
my ($val,$msg) = @_;
if (length($msg)) {
$msg .= "\n" if (!($msg =~ /\n$/));
prt($msg);
}
show_warnings($val);
close_log($outfile,$load_log);
exit($val);
}
sub prtw($) {
my ($tx) = shift;
$tx =~ s/\n$//;
prt("$tx\n");
push(@warnings,$tx);
}
##################################################################
# OF_FF => 0, # full file name
# OF_HR => 1, # array ref of href links
# OF_IM => 2, # array ref of image links
# OF_LK => 3, # linked count
# OF_SP => 4, # spare
# OF_TO => 5, # links TO
# OF_FM => 6, # links FROM
# OF_FT => 7 # file type
sub show_results {
my $fcnt = scalar @all_files;
my ($i, $ff, $ft, $cnt, $mcnt);
$mcnt = 0;
for ($i = 0; $i < $fcnt; $i++) {
$ft = $all_files[$i][OF_FT];
if ($ft == FT_HTML) {
$cnt = $all_files[$i][OF_SP];
if ($cnt == 0) {
$ff = $all_files[$i][OF_FF];
$mcnt++; # prt( "Missed [$ff]\n" );
}
}
}
if ($mcnt) {
prt("Got $mcnt 'missed' files... not marked...\n");
for ($i = 0; $i < $fcnt; $i++) {
$ft = $all_files[$i][OF_FT];
if ($ft == FT_HTML) {
$cnt = $all_files[$i][OF_SP];
if ($cnt == 0) {
$ff = $all_files[$i][OF_FF];
prt( "Missed [$ff]\n" );
}
}
}
}
}
# =========================================================================
# url_parse - needs some more to remove any other post, like index.htm?a=b...
# ---------
sub url_parse($) {
my ($url) = @_;
my $post = '';
my $name = '';
my $dir = '';
my $ind = index($url,'#');
if ($ind > 0) {
$post = substr($url,$ind);
$url = substr($url,0,$ind);
}
if ($url =~ /\/$/) {
$dir = $url;
} else {
($name,$dir) = fileparse($url);
if ( !($name =~ /\./) ) {
# without an EXTENT, assume directory
$dir .= $name.'/';
$name = '';
}
}
return $dir,$name,$post; # url_parse - return (dir,name,post)
}
sub uri_parse2($) {
my ($uri) = shift;
$uri =~ /^(([^:\/\?#]+):)?(\/\/([^\/\?#]*))?([^\?#]*)(\?([^#]*))?(#(.*))?/;
# Then:
my $scheme = (defined $2) ? $2 : '';
my $authority = (defined $4) ? $4 : '';
my $path = (defined $5) ? $5 : '';
my $query = (defined $7) ? $7 : '';
my $fragment = (defined $9) ? $9 : '';
return $scheme,$authority,$path,$query,$fragment;
}
##################################################################
#########################################################
# Passed an array of extensions,
# check if this is one of them?
#########################################################
sub is_my_extension {
my ($fil, $rexts) = @_;
my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
my $lcext = lc($ext);
my ($ex);
foreach $ex (@{$rexts}) {
return 1 if (lc($ex) eq $lcext);
}
return 0;
}
############################################
# only looking for HTM, HTML, PHP,
# could be extended to others maybe ...
############################################
sub is_htm_extension {
my ($fil) = shift;
return( is_my_extension($fil, \@html_extension) );
}
sub is_graphic_extension {
my ($fil) = shift;
return( is_my_extension($fil, \@graf_extension) );
}
sub is_zip_extension {
my ($fil) = shift;
return( is_my_extension($fil, \@zip_extension) );
}
sub is_css_extension {
my ($fil) = shift;
return( is_my_extension($fil, \@css_extension) );
}
sub is_txt_extension {
my ($fil) = shift;
return( is_my_extension($fil, \@txt_extension) );
}
sub is_code_extension {
my ($fil) = shift;
return( is_my_extension($fil, \@code_extension) );
}
sub is_script_extension {
my ($fil) = shift;
return( is_my_extension($fil, \@script_extension) );
}
sub is_bin_extension {
my ($fil) = shift;
return( is_my_extension($fil, \@bin_extension) );
}
#use constant {
# FT_UNKNOWN => 0,
## FT_HTML => 1,
## FT_GRAF => 2,
## FT_CSS => 3,
## FT_SCRIPT => 4,
## FT_TEXT => 5,
## FT_ZIP => 6,
## FT_DIR => 7
#};
sub get_file_type_const($);
sub get_file_type_const($) {
my ($fil) = shift;
if (is_htm_extension($fil)) {
return FT_HTML;
} elsif (is_graphic_extension($fil)) {
return FT_GRAF;
} elsif (is_zip_extension($fil)) {
return FT_ZIP;
} elsif (is_css_extension($fil)) {
return FT_CSS;
} elsif (is_txt_extension($fil)) {
return FT_TEXT;
} elsif (is_script_extension($fil)) {
return FT_SCRIPT;
} elsif (is_bin_extension($fil)) {
return FT_BIN;
} elsif (is_code_extension($fil)) {
return FT_CODE;
} elsif ($fil =~ /\/$/) {
return FT_DIR;
} elsif ($fil =~ /#/) {
my $ih = index($fil,'#');
if ($ih > 0) {
my $f2 = substr($fil,0,$ih);
return get_file_type_const($f2);
}
} elsif ($fil =~ /\//) {
return FT_DIR; # gross assumption
} elsif ($fil =~ /^\w+$/) {
return FT_DIR; # another gross assumption
}
return FT_HIDDEN if ($fil =~ /^\./);
return FT_PARAM if ($fil =~ /^\?.+/);
### pgm_exit(1,"Why UNKNOWN for [$fil]?\n");
return FT_UNKNOWN;
}
sub file_type_const_to_string {
my ($ft) = shift;
if ($ft == FT_HTML) {
return "html";
} elsif ($ft == FT_GRAF) {
return "graphic";
} elsif ($ft == FT_ZIP) {
return "zip";
} elsif ($ft == FT_CSS) {
return "css";
} elsif ($ft == FT_TEXT) {
return "text";
} elsif ($ft == FT_SCRIPT) {
return "script";
} elsif ($ft == FT_BIN) {
return "binary";
} elsif ($ft == FT_CODE) {
return "code";
} elsif ($ft == FT_DIR) {
return "directory";
} elsif ($ft == FT_UNKNOWN) {
return "unknown";
} elsif ($ft == FT_HIDDEN) {
return "hidden";
} elsif ($ft == FT_PARAM) {
return "parameter";
}
pgm_exit(1,"***FIX ME*** uncased type [$ft]!");
return "";
}
sub fix_rel_url($) {
my ($path) = @_;
my @a = split(/\//, $path);
my $npath = '';
my $max = scalar @a;
my @na = ();
for (my $i = 0; $i < $max; $i++) {
my $p = $a[$i];
if ($p eq '.') {
# ignore this
} elsif ($p eq '..') {
if (@na) {
pop @na; # discard previous
} else {
pgm_exit(1,"ERROR: Got relative .. without previous!!! path=$path\n" );
}
} else {
push(@na,$p);
}
}
foreach my $pt (@na) {
$npath .= "/" if length($npath);
$npath .= $pt;
}
return $npath;
}
sub is_http_link($) {
my ($hr) = shift;
return 1 if ($hr =~ /^http(s*):\/\//);
return 0;
}
sub get_full_base_href($$) {
my ($rrhr, $bhr) = @_;
my $rhr = ${$rrhr};
my ($nm,$dir,$fhr);
if ( length($bhr) && !is_http_link($rhr) && !($rhr =~ /^#/) ) {
if ($bhr =~ /\/$/) {
$dir = $bhr; # assume a DIRECTORY
} else {
($nm,$dir) = fileparse($bhr); # assume a FILE, so get the dir only...
}
$fhr = $dir.$rhr;
$fhr = fix_rel_url($fhr);
${$rrhr} = $fhr;
return 1;
}
return 0;
}
sub get_href_type_const($);
sub get_href_type_const($) {
my ($hrf) = shift;
my ($ih,$id,$is);
if (is_http_link($hrf)) {
return HRT_LINK;
} elsif ($hrf =~ /^ftp:\/\//i) {
return HRT_LINK;
} elsif ($hrf =~ /^javascript:/i) {
return HRT_SCRIPT;
} elsif (substr($hrf,0,1) eq '#') {
return HRT_LOCAL;
}
#if ( get_full_base_href(\$hrf,$base_href) ) {
# if ($hrf =~ /^http(s*):\/\//i) {
# return HRT_LINK;
# }
#}
$ih = index($hrf,'#');
$id = rindex($hrf,'.');
$is = rindex($hrf,'/');
if ($ih > 0) {
my $hr2 = substr($hrf,0,$ih);
my $srt = get_href_type_const($hr2);
$id = rindex($hr2,'.');
$is = rindex($hr2,'/');
if ($id > 0) {
return ($srt | HRT_FILE);
}
}
if ($id > 0) {
# contains a DOT - assume a file
return HRT_FILE;
}
if ($hrf =~ /\/$/) {
# ends in '/', assume file - acutally directory
return HRT_FILE;
}
if ($hrf =~ /\//) {
# contains any '/', assume a file
return HRT_FILE;
}
if ($hrf =~ /^\w+$/) {
# contains any alphanumeric only, assume a file
return HRT_FILE;
}
if ($hrf =~ /^\?.+/) {
# if a href param, like '?C=N;O=D', ...
return HRT_PARAMS;
}
prtw("WARNING: Why UNKNOWN on href [$hrf] ih=$ih id=$id is=$is\n");
return HRT_UNKNOWN;
}
my %done_warning = ();
sub href_type_to_string {
my ($hrt) = shift;
my $ret = '';
if ($hrt & HRT_LINK) {
$ret .= "extern link ";
}
if ($hrt & HRT_SCRIPT) {
$ret .= "script ";
}
if ($hrt & HRT_LOCAL) {
$ret .= "local ";
}
if ($hrt & HRT_FILE) {
$ret .= "file ";
}
if ($hrt & HRT_BASE) {
$ret .= "BASE ";
}
if ($hrt == HRT_UNKNOWN) {
$ret = "unknown";
}
if ($hrt == HRT_PARAMS) {
$ret = "parameter";
}
$ret =~ s/\s+$//;
if (length($ret) == 0) {
my $err = "***FIX ME*** uncased type [$hrt]!";
if (!defined $done_warning{$err}) {
$done_warning{$err} = 1;
prtw("WARNING: $err\n");
}
}
return $ret;
}
##############################################
sub get_hrefs_from_string($) {
my ($ln) = shift;
my ($i, $j, $line, $ch, $ch2, $len, $tag, $disc, $hcnt);
my ($bgn, $fhr, $hr, $tail, $max, $hrt, $ft, $ctyp);
my ($sp,$tag2,$gottag);
my @hrf = ();
$ln =~ s/\n/ /g;
$ln = trim_all($ln);
# sub write2file { my ($txt,$fil) = @_;
# write2file($fulln,'tempfl.txt');
$len = length($ln);
$disc = '';
$hcnt = 0;
$base_href = ''; # assume NO
# process single long string, char by char
for ($i = 0; $i < $len; $i++) {
$ch = substr($ln,$i,1);
if ($ch eq '<') {
$tag = $ch; # start a tag
$i++;
$ch = substr($ln,$i,1);
# could check for things like ') {
last;
}
if (!$gottag) {
if ($ch =~ /\w/) {
$tag2 .= $ch;
} else {
$gottag = 1;
}
}
}
if ($tag =~ /(.*\s+)href(\s*)=/i) {
$bgn = $1;
$sp = length($2);
$hcnt++;
$fhr = substr($tag,length($bgn)+5+$sp);
$fhr = substr($fhr,1) while ($fhr =~ /^\s/); # remove all LEADING space
$ch = substr($fhr,0,1);
prt("$tag [$tag2] [$fhr]\n") if ($dbg5);
if (($ch eq '"')||($ch eq "'")) {
$max = length($fhr);
$hr = '';
$tail = '';
# collect actual HREF=
for ($j = 1; $j < $max; $j++) {
$ch2 = substr($fhr,$j,1);
if ($ch eq $ch2) {
$tail = substr($fhr,$j);
last;
}
$hr .= $ch2;
}
if ($tag2 =~ /^BASE$/i) {
$hrt = HRT_BASE;
#prt("Got [$tag2] [$fhr] [$hr]\n");
my ($d,$n,$p) = url_parse($hr);
prt("Got BASE [$hr] = [$d]+[$n]+[$p]\n");
$base_href = $hr;
} else {
get_full_base_href(\$hr,$base_href);
$hrt = get_href_type_const($hr);
}
$ctyp = '';
$ft = FT_UNKNOWN;
if ($hrt & HRT_FILE) {
$ft = get_file_type_const($hr);
$ctyp = "ext[".file_type_const_to_string($ft)."] ";
}
$ctyp = 'type['.href_type_to_string($hrt)."] $ctyp";
#prt("tag [$tag2] [$hr] $ctyp\n");
prt( "$hcnt:HREF: hr[$hr] $ctyp tail[$tail] pre[$disc] tag[$tag] bgn[$bgn]\n" ) if ($dbg2);
# href HRT-type FT-file
# 0 1 2
push(@hrf, [$hr, $hrt, $ft]);
} else {
prt( "$hcnt:HREF: fhr[$fhr] pre[$disc] tag[$tag] bgn[$bgn] CHECK ME\n" );
}
} else {
prt( "DISCARDED: pre[$disc] tag[$tag] ...\n" ) if ($dbg1);
}
$disc = '';
} else {
$disc .= $ch;
}
}
return @hrf;
}
sub trim_href($) {
my $fil = shift;
my $nfil = '';
my $len = length($fil);
my ($i,$ch);
for ($i = 0; $i < $len; $i++) {
$ch = substr($fil,$i,1);
if (($i == 0)&&($ch eq '/')) {
next;
}
if (($ch eq '#')||($ch eq '?')) {
last;
}
$nfil .= $ch;
}
return $nfil;
}
sub parse_file($$) {
my ($bdir,$bfil) = @_;
#if ($bdir = /^\.(\\|\/)$/) {
# $bdir = '';
#}
my $fil = $bdir.$bfil;
prt( "Processing file '$fil' ...\n" );
my ($lncnt, $full, $hrcnt, $i, $hfcnt, $typ, $filcnt,$linkcnt,$ra,$hr,$ci2);
my @hrf = ();
if ( ! open INF, "<$fil") {
prt( "WARNING: Can NOT open file [$fil]...\n" );
return @hrf;
}
my @lines = ;
close INF;
$lncnt = scalar @lines;
prt( "Processing $lncnt lines from [$fil] ...\n" );
$full = join('',@lines);
# sub write2file { my ($txt,$fil) = @_;
#my $scrp = return_tag($full,'script');
##my $scrp = get_all_tag_text($full,'script');
##write2file($scrp,'tempscript.txt');
##prt( "Got script text [$scrp]\n" );
@hrf = get_hrefs_from_string($full);
$hrcnt = scalar @hrf;
prt( "Got $hrcnt HREF entries, from $fil...\n" );
$filcnt = 0;
# href HRT-type FT-file
# 0 1 2
# push(@hrf, [$hr, $hrt, $ft]);
my %hr_dupes = ();
my $url = $usr_url; # get any user url
my @list = ();
if (length($url)) {
$url .= '/' if (!($url =~ /\/$/)); # ensure ends with '/'
}
for ($i = 0; $i < $hrcnt; $i++) {
$ra = $hrf[$i];
$hr = ${$ra}[0];
$typ = ${$ra}[1];
$fil = $bdir.$hr;
$fil = trim_href($fil);
if (defined $hr_dupes{$hr}) {
$hr_dupes{$hr}++;
next;
} else {
$ci2 = sprintf("%3d", ($i + 1));
push(@list,"$url$hr");
prt("$ci2: $url$hr\n") if (VERB1());
}
next if (-d $fil);
if ( ($typ & HRT_FILE) && !($typ & HRT_LINK) ) {
$filcnt++;
if (! -f $fil) {
if (defined $not_found{$fil}) {
$not_found{$fil}++;
} else {
prt( "WARNING: File [$fil] NOT found ...\n" ) if (VERB2());
$not_found{$fil} = 1;
}
}
}
}
my $cnt = scalar @list;
prt( "Got $cnt diff HREF entries, from $bfil... $filcnt appear file refs...\n" );
if ($cnt && length($out_file)) {
$fil = join("\n",@list)."\n";
write2file($fil,$out_file);
prt("List written to '$out_file'...\n");
}
$linkcnt = 0;
my %counted = ();
my %by_extent = ();
my %by_fn = ();
my @dupes = ();
my $msg = '';
my ($nm,$dir,$ext,$ind,$ff);
for ($i = 0; $i < $hrcnt; $i++) {
$fil = $hrf[$i][0];
$typ = $hrf[$i][1];
if ($typ & HRT_LINK) {
$ind = index($fil,'#');
$fil = substr($fil,0,$ind) if ($ind > 0);
if (defined $counted{$fil}) {
$counted{$fil}++;
} else {
$counted{$fil} = 1;
$linkcnt++;
($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
$by_extent{$ext} = [] if (!defined $by_extent{$ext});
push( @{$by_extent{$ext}}, $fil );
$ff = $nm.$ext;
if ($ext =~ /^\.java/) {
if (defined $by_fn{$ff}) {
push(@dupes,$ff);
$by_fn{$ff}++;
} else {
$by_fn{$ff} = 1;
}
}
}
}
}
if ($linkcnt) {
%counted = ();
prt("Listing $linkcnt links...\n");
if ($show_full_list) {
for ($i = 0; $i < $hrcnt; $i++) {
$fil = $hrf[$i][0];
$typ = $hrf[$i][1];
if ($typ & HRT_LINK) {
$ind = index($fil,'#');
$fil = substr($fil,0,$ind) if ($ind > 0);
if (defined $counted{$fil}) {
$counted{$fil}++;
} else {
$counted{$fil} = 1;
prt("$fil\n");
$msg .= "$fil\n";
}
}
}
}
foreach $ext (keys %by_extent) {
my $list = $by_extent{$ext};
foreach $fil (@{$list}) {
#prt("$fil\n");
$msg .= "$fil\n";
}
}
write2file($msg,$outtemp);
prt("Written list to $outtemp...\n");
if (@dupes) {
prt("Note: ".scalar @dupes." duplicated file names...\n");
prt( join(" ",@dupes)."\n");
} else {
prt("Appears NO duplicated names...\n");
}
#} else {
# prt("No link count in $fil...\n");
}
return @hrf;
}
####################################
####################################################################
# process_folder(folder)
# Main DIRECTORY processing function
#
# Open the FOLDER given, and collect ALL files found,
# iterate into sub-directories, if $recurse is non-zero,
# and it is NOT a special FRONTPAGE (hidden) FOLDER.
#
# Files are collected into multidemensional arrays
####################################################################
sub process_folder {
my ($inf) = shift;
my ($ft,$ff,$nm,$dir,$ext,$val,$fil,$idir);
my $fcnt = 0;
prt( "Processing $inf folder ...\n" ) if ($dbg1 || VERB5());
if ( opendir( DIR, $inf ) ) {
my @files = readdir(DIR);
closedir DIR;
$idir = $inf;
$idir .= "\\" if (!($idir =~ /(\\|\/)$/));
foreach $fil (@files) {
next if (($fil eq ".")||($fil eq ".."));
$ff = $idir.$fil;
if ( -d $ff ) {
if ($recurse) {
if ($ignfpd && is_fp_folder($fil)) { # ignore FRONTPAGE folders
next;
}
if (@splexcludes && in_spl_excludes($fil)) {
next;
}
process_folder( $ff );
}
} else {
$ft = get_file_type_const($fil);
# NOTE: multidimensional arrays pushed - offsets into arrays
if ( !in_excludes($fil) ) { # NOT in @excludes
($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
$val = 0;
$val = $ext_hash{$ext} if ( defined $ext_hash{$ext} );
$val++;
$ext_hash{$ext} = $val;
push(@all_files, [$ff, '', '', 0, 0, '', '', $ft] );
$fcnt++;
}
}
}
prt( "Processed $inf folder finding $fcnt files ...\n" ) if ($dbg6 || VERB2());
} else {
prt( "ERROR: Failed to open folder $inf ...\n" );
}
}
################################################
# my $ignfpd = 1; # ignore FRONTPAGE folders
################################################
sub is_fp_folder {
my ($inf) = shift;
foreach my $fil (@fpfolders) {
if (lc($inf) eq lc($fil)) {
return 1;
}
}
return 0;
}
####################################
# Check if FILE is in EXCLUDE list
####################################
sub in_excludes {
my ($fil) = shift;
my $lcf = lc($fil);
foreach my $f (@excludes) {
if (lc($f) eq $lcf) {
return 1;
}
}
return 0;
}
sub in_spl_excludes {
my ($fldr) = shift;
my $lfldr = lc($fldr);
foreach my $f (@splexcludes) {
if (lc($f) eq $lfldr) {
return 1;
}
}
return 0;
}
sub set_status_case {
my ( $ch, $pch, $inccm, $inlnc, $inqot, $qot ) = @_;
my $ldbg2 = 0;
if ($$inccm) {
if (($ch eq '/')&&($pch eq '*')) {
$$inccm = 0;
prt( "status: End C comment /* */ ...\n" ) if ($ldbg2);
}
} elsif ($$inlnc ) {
if ($ch eq "\n") {
$$inlnc = 0;
prt( "status: End line comment // ...\n" ) if ($ldbg2);
}
} elsif ($$inqot ) {
if ($ch eq $$qot) {
prt( "status: End quote $$qot ...\n" ) if ($ldbg2);
$$inqot = 0;
$$qot = '';
}
} else {
if ($ch eq '/') {
if ($pch eq '/') {
$$inlnc = 1;
prt( "status: Entered line comment // ...\n" ) if ($ldbg2);
}
} elsif ($ch eq '*') {
if ($pch eq '/') {
$$inccm = 1;
prt( "status: Entered C comment /* */ ...\n" ) if ($ldbg2);
}
} elsif (($ch eq '"')||($ch eq "'")) {
$$qot = $ch;
$$inqot = 1;
prt( "status: Entered quote $$qot ...\n" ) if ($ldbg2);
}
}
}
sub get_all_tag_text {
my ($txt, $tag) = @_;
my $len = length($txt);
my $ldbg1 = 0;
my $ntxt = '';
my $ch = '';
my $pch = '';
my $ftag = '';
my $nline = '';
my $i = 0;
my $intag = 0;
my $incomment = 0;
my $inqot = 0; # in quotes ' or "
my $qot = '';
my $inlnc = 0; # in line comment
my $inccm = 0; # in C comment
my ($part, $shlen);
###prt("Processing $len chars for $tag ...\n");
for ($i = 0; $i < $len; $i++) {
$pch = $ch;
$ch = substr($txt, $i, 1);
set_status_case( $ch, $pch, \$inccm, \$inlnc, \$inqot, \$qot );
if ($incomment) {
$ntxt .= $ch;
if ($ch eq '>') {
$shlen = -15;
if (length($ntxt) < 15) {
$shlen = 0 - length($ntxt);
}
prt( "Potential close [".substr($ntxt,$shlen)."] ...($i)" ) if ($ldbg1);
if (substr($ntxt,-3) eq '-->') {
if (!$inqot && !$inlnc && !$inccm) {
prt( " Yes\n" ) if ($ldbg1);
$incomment = 0; # no longer IN comment
prt("End comment ...\n") if ($ldbg1);
} else {
if ($inqot) {
prt( " NO DUE TO IN QUOTE\n" ) if ($ldbg1);
} elsif ($inlnc) {
prt( " NO DUE TO IN LINE COMMENT\n" ) if ($ldbg1);
} elsif ($inccm) {
prt( " NO DUE TO IN C COMMENT\n" ) if ($ldbg1);
} else {
prt( " NO DUE TO SOME REASON!!! **** CHECK ME!!! ****\n" ) if ($ldbg1);
}
}
} else {
prt( " NO!\n" ) if ($ldbg1);
}
}
} elsif ($intag) {
if ($ch eq "<") {
###prt("Got begin < ...\n");
$part = substr($txt,$i,4);
if ($part eq '