#!/usr/bin/perl -w # NAME: chkxml2.pl # AIM: Given a directory, check ALL xml files # (a) for a BOM, and (b) for xml character encoding use strict; use warnings; use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] ) use File::stat; # get file info if ($sb = stat($fil)){$dt = $sb->mtime; $sz = $sb->size;} use Cwd; my $os = $^O; my $perl_dir = '/home/geoff/bin'; my $PATH_SEP = '/'; my $temp_dir = '/tmp'; if ($os =~ /win/i) { $perl_dir = 'C:\GTools\perl'; $temp_dir = $perl_dir; $PATH_SEP = "\\"; } unshift(@INC, $perl_dir); require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n"; # log file stuff our ($LF); my $pgmname = $0; if ($pgmname =~ /(\\|\/)/) { my @tmpsp = split(/(\\|\/)/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt"; open_log($outfile); # user variables my $VERS = "0.0.1 2012-05-05"; my $load_log = 0; my $in_file = ''; my $verbosity = 0; my $out_xml = ''; ### debug ### my $debug_on = 1; #my $def_file = 'c:\FG\15\fgdata'; my $def_file = 'c:\FGCVS\flightgear\data'; ### program variables my @warnings = (); my $cwd = cwd(); my @xml_array = (); my $total_lines = 0; my $total_files = 0; my $blank_files = 0; my $bom_files = 0; my $files_no_decl = 0; my @no_xml_decl = (); my $files_with_enc = 0; my $files_no_enc = 0; my @blanks_founda = (); my %encs_foundh = (); my @encs_founda = (); my %boms_foundh = (); my @boms_founda = (); my $curr_file_bom = ''; my %encs_found = (); my $total_bytes = 0; my $largest_file_size = 0; my $largest_file_name = ''; my $largest_file_line = 0; sub VERB1() { return $verbosity >= 1; } sub VERB2() { return $verbosity >= 2; } sub VERB5() { return $verbosity >= 5; } sub VERB9() { return $verbosity >= 9; } # BOM list - name, count, values my @BOM_list = ( [ "UTF-8", 3, [0xEF,0xBB,0xBF ] ], # 239 187 191 [ "UTF-16 (BE)", 2, [0xFE,0xFF ] ], # 254 255 [ "UTF-16 (LE)", 2, [0xFF,0xFE ] ], # 255 254 [ "UTF-32 (BE)", 4, [0x00,0x00,0xFE,0xFF] ], # 0 0 254 255 [ "UTF-32 (LE)", 4, [0xFF,0xFE,0x00,0x00] ], # 255 254 0 0 [ "UTF-7a" , 4, [0x2B,0x2F,0x76,0x38] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-7b" , 4, [0x2B,0x2F,0x76,0x39] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-7c" , 4, [0x2B,0x2F,0x76,0x2B] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-7d" , 4, [0x2B,0x2F,0x76,0x2F] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-1" , 3, [0xF7,0x64,0x4C ] ], # 247 100 76 [ "UTF-EBCDIC" , 4, [0xDD,0x73,0x66,0x73] ], # 221 115 102 115 [ "SCSU" , 3, [0x0E,0xFE,0xFF ] ], # 14 254 255 [ "BOCU-1" , 3, [0xFB,0xEE,0x28 ] ], # 251 238 40 [ "GB-18030" , 4, [0x84,0x31,0x95,0x33] ] # 132 49 149 51 ); sub show_warnings($) { my ($val) = @_; if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS...\n" ); foreach my $itm (@warnings) { prt("$itm\n"); } prt("\n"); } else { prt( "\nNo warnings issued.\n\n" ) if (VERB9()); } } sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg); } show_warnings($val); close_log($outfile,$load_log); exit($val); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } # LOAD without a BOM my $last_bom_name = ''; sub line_has_bom($$) { my ($line,$rname) = @_; my $max = scalar @BOM_list; my $len = length($line); my ($i,$j,$name,$cnt,$ra,$ch,$val); for ($i = 0; $i < $max; $i++) { $name = $BOM_list[$i][0]; # name $cnt = $BOM_list[$i][1]; # length $ra = $BOM_list[$i][2]; # ref array of values if ($len > $cnt) { # make sure line length GT BOM for ($j = 0; $j < $cnt; $j++) { $ch = substr($line,$j,1); # extract CHAR $val = ord($ch); # get VALUE last if ($val != ${$ra}[$j]); # compare } if ($j == $cnt) { # if ALL values found ${$rname} = $name; # give back 'name' $last_bom_name = $name; return $cnt; # and return count } } } return 0; # no BOM found } sub remove_utf_bom($$) { my ($ff,$ra) = @_; my $line = ${$ra}[0]; # get first line my $name = ''; my $len = line_has_bom($line,\$name); my $iret = 0; if ($len) { $curr_file_bom = substr($line,0,$len); $line = substr($line,$len); # truncate line ${$ra}[0] = $line; # and return minus BOM my ($nm,$dr) = fileparse($ff); # just show name prt("NOTE: File [$ff] is $name encoding. BOM($len) removed.\n"); $iret = 1; } return $iret; } sub process_in_dir($); sub process_in_dir($) { my $dir = shift; my @dirs = (); if (!opendir(DIR,$dir)) { prtw("WARNING: Unable to open directory [$dir}!\n"); return; } my @files = readdir(DIR); closedir(DIR); my $cnt = scalar @files; prt("Have $cnt files, from [$dir] to process...\n") if (VERB9()); my ($file,$ff); ut_fix_directory(\$dir); @dirs = (); foreach $file (@files) { next if (($file eq ".") || ($file eq "..")); $ff = $dir.$file; if ( -l $ff) { # ignore LINKS } elsif (-d $ff) { push(@dirs,$ff); } elsif (-f $ff) { if ($file =~ /\.xml$/i) { push(@xml_array,$ff); } } } foreach $dir (@dirs) { process_in_dir($dir); } } sub local_xml_to_lines($$$) { my ($rlm, $rlns,$file) = @_; my $intag = 0; my $text = ''; my @nlines = (); my ($fln, $ln, $ch, $pch, $nch, $len, $i, $i2, $tag, $xml, $dnx); my ($lnnm, $lnb, $nlnm); $pch = ''; $nch = ''; $tag = ''; $xml = ''; $dnx = 0; $lnnm = 0; $nlnm = 0; $lnb = 0; my $nxtxt = ''; foreach $fln (@{$rlns}) { chomp $fln; $ln = trim_all($fln); $len = length($ln); $lnnm++; # count another xml line for ($i = 0; $i < $len; $i++) { $i2 = $i + 1; $ch = substr($ln,$i,1); $nch = (($i2 < $len) ? substr($ln,$i2,1) : ' '); if ($intag) { # on first SPACE $tag .= $ch; if ($ch eq '>') { # end of XML tag $nlnm++; push(@nlines,$tag); ### prt( "push(\@xlnmap, [ $nlnm, $lnb, $lnnm ]); # each NEW line has BEGIN and END\n" ); $$rlm{$nlnm} = "$lnb-$lnnm"; # each NEW line has BEGIN and END $nxtxt .= "$lnb-$lnnm: [$tag]\n"; $tag = ''; $intag = 0; $xml = ''; } } else { if ($ch eq '<') { $tag = $ch; # start a tag line $intag = 1; # signal in a tag $xml = ''; $dnx = 0; $lnb = $lnnm; # set the BEGIN xml line } } $pch = $ch; } # done a line - this is like a SPACE if ($intag && length($tag)) { $tag .= ' ' if !($tag =~ /\s$/); } } if (length($tag)) { prtw("WARNING:local_xml_to_lines: xml re-lining error! Left pending tag [$tag]\nin file [$file]...\n"); } #$dnx = scalar @nlines; #if ($dnx < 5) { # prt("ERROR: Input file [$g_sc_act_vcproj] only has $dnx lines!\n"); # pgm_exit(1, "INPUT TOO SMALL!!!\n"); #} #if ($write_temp_xml) { # my $tmpxml = "C:\\GTools\\perl\\tempvcx.xml"; # if (! -f $tmpxml) { # write2file($nxtxt,$tmpxml); # prt( "Written relined XML to '$tmpxml'\n" ); # } #} return \@nlines; } sub local_array_2_hash_on_equals($) { my $rlns = shift; # (@inarr) = @_; my %hash = (); my ($itm, @arr, $key, $val, $al, $a, $cnt, $titm); $cnt = 0; foreach $itm (@{$rlns}) { $cnt++; $titm = trim_all($itm); if (length($titm) == 0) { prt( "NOTE: fgutils:array_2_hash_on_equals: Item $cnt has NO length in passed array!\n" ); next; } elsif ($titm eq '=') { # 20090912 - lets overlook this = no noise ### prt( "NOTE: fgutils:array_2_hash_on_equals: Item $cnt is JUST an equal sign! [$itm]!\n" ); next; } @arr = split('=',$itm); $al = scalar @arr; $key = $arr[0]; $val = ''; for ($a = 1; $a < $al; $a++) { $val .= '=' if length($val); $val .= $arr[$a]; } if (defined $key && length($key)) { if (defined $hash{$key}) { prtw( "WARNING: array_2_hash_on_equals: Duplicate KEY: [$key] ... ADDING val [$val]\n" ); $hash{$key} .= "\@".$val; } else { $hash{$key} = $val; } } else { if (defined $key) { prt( "NOTE: fgutils:array_2_hash_on_equals: Item $cnt:$itm: key=[$key] has NO length in passed array!\n" ); } else { prt( "NOTE: fgutils:array_2_hash_on_equals: Item $cnt:$itm: key is NOT set in passed array!\n" ); } } } return \%hash; } sub do_xml_file($) { my $xml = shift; my ($sb,$tm,$sz); if (! open INF, "<$xml") { prtw("WARNING: can NPT open file [$xml]\n"); } my @lines = ; close INF; my $cnt = scalar @lines; $total_lines += $cnt; $total_files++; my $withBOM = 0; my %lnmap = (); if ($cnt) { my ($rlines,$max); $withBOM = remove_utf_bom($xml,\@lines); $rlines = local_xml_to_lines(\%lnmap,\@lines,$xml); $max = scalar @{$rlines}; my ($line,$i,$rh,@arr,$enc,$haddec,$msg,$rea,$tmp); $msg = "Processing $cnt lines, $max xml, from [$xml] "; $enc = ''; $haddec = 0; for ($i = 0; $i < $max; $i++) { $line = ${$rlines}[$i]; if ($line =~ /<\?xml/) { @arr = space_split($line); $rh = local_array_2_hash_on_equals(\@arr); if (defined ${$rh}{'encoding'}) { $tmp = strip_both_quotes(${$rh}{'encoding'}); $tmp =~ s/>$//; $tmp =~ s/\?$//; $enc = strip_both_quotes($tmp); if ($enc =~ /^utf/i) { $enc = uc($enc); } elsif ($enc =~ /^iso/i) { $enc = uc($enc); } } $haddec = 1; last; } } if ($haddec) { $msg .= "decl "; if (length($enc)) { $msg .= "encoding=\"$enc\" "; $files_with_enc++; if (defined $encs_foundh{$enc}) { $encs_foundh{$enc}++; } else { $encs_foundh{$enc} = 1; push(@encs_founda,$xml); } $encs_found{$enc} = [] if (!defined $encs_found{$enc}); $rea = $encs_found{$enc}; push(@{$rea},$xml); } else { $msg .= "NO encoding "; $files_no_enc++; } } else { $msg .= "NO DECL "; $files_no_decl++; push(@no_xml_decl,$xml); } if ($withBOM) { $msg .= "BOM $last_bom_name"; $bom_files++; push(@boms_founda,$xml); if (defined $boms_foundh{$last_bom_name}) { $boms_foundh{$last_bom_name}++; } else { $boms_foundh{$last_bom_name} = 1; } } prt("$msg\n"); # if (VERB5() || $withBOM); } else { # no line count??? $blank_files++; push(@blanks_founda,$xml); } if ($sb = stat($xml)) { $tm = $sb->mtime; $sz = $sb->size; if ($sz > $largest_file_size) { $largest_file_size = $sz; $largest_file_name = $xml; $largest_file_line = $cnt; } $total_bytes += $sz; } } sub process_xml_files() { my $cnt = scalar @xml_array; prt("Have $cnt XML files to process...\n"); my ($file); foreach $file (@xml_array) { do_xml_file($file); } } sub show_stats() { prt("\nProcessed $total_lines lines, $total_bytes bytes, from $total_files files.\n"); my (@arr,$key,$rea,$cnt,$msg); if ($blank_files) { prt("Found $blank_files files BLANK!\n"); prt(join("\n",@blanks_founda)."\n"); } if ($bom_files) { @arr = keys(%boms_foundh); prt("Found $bom_files files with a BOM. ".join(" ",@arr)."\n"); prt(join("\n",@boms_founda)."\n"); } if ($files_no_decl) { prt("Found $files_no_decl files with NO 'xml' declaration.\n"); prt(join("\n",@no_xml_decl)."\n"); } if ($files_no_enc) { prt("Found $files_no_enc files with NO 'encoding' declaration.\n"); } if ($files_with_enc) { @arr = keys(%encs_foundh); prt("Found $files_with_enc files with encodings like ".join(" ",@arr)."\n"); $msg = ''; foreach $key (keys %encs_found) { $rea = $encs_found{$key}; $cnt = scalar @{$rea}; prt(" Encoding $key - $cnt files\n"); prt(join("\n",@{$rea})."\n"); $msg .= "$key = $cnt "; } ###prt(join("\n",@encs_founda)."\n"); } prt("$msg\n"); prt("Counts: $files_no_decl no decl, $files_with_enc with enc, $files_no_enc no enc\n"); prt("Counts: ".get_nn($total_lines)." lines, ".get_nn($total_bytes)." bytes (".util_bytes2ks($total_bytes)."), from ".get_nn($total_files)." files.\n"); prt("Largest file ".get_nn($largest_file_size)." bytes, [$largest_file_name], ".get_nn($largest_file_line)." lines.\n"); } ######################################### ### MAIN ### parse_args(@ARGV); prt("Collecting xml files from [$in_file] directory... recursive... moment...\n"); process_in_dir($in_file); process_xml_files(); show_stats(); pgm_exit(0,""); ######################################## sub give_help { prt("$pgmname: version $VERS\n"); prt("Usage: $pgmname [options] in-file\n"); prt("Options:\n"); prt(" --help (-h or -?) = This help, and exit 0.\n"); prt(" --verb[n] (-v) = Bump [or set] verbosity. def=$verbosity\n"); prt(" --load (-l) = Load LOG at end. ($outfile)\n"); prt(" --out (-o) = Write output to this file.\n"); } sub need_arg { my ($arg,@av) = @_; pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av); } sub parse_args { my (@av) = @_; my ($arg,$sarg); while (@av) { $arg = $av[0]; if ($arg =~ /^-/) { $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); if (($sarg =~ /^h/i)||($sarg eq '?')) { give_help(); pgm_exit(0,"Help exit(0)"); } elsif ($sarg =~ /^v/) { if ($sarg =~ /^v.*(\d+)$/) { $verbosity = $1; } else { while ($sarg =~ /^v/) { $verbosity++; $sarg = substr($sarg,1); } } prt("Verbosity = $verbosity\n") if (VERB1()); } elsif ($sarg =~ /^l/) { $load_log = 1; prt("Set to load log at end.\n") if (VERB1()); } elsif ($sarg =~ /^o/) { need_arg(@av); shift @av; $sarg = $av[0]; $out_xml = $sarg; prt("Set out file to [$out_xml].\n") if (VERB1()); } else { pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n"); } } else { $in_file = $arg; prt("Set input to [$in_file]\n") if (VERB1()); } shift @av; } if ((length($in_file) == 0) && $debug_on) { $in_file = $def_file; $load_log = 2; prt("Set DEFAULT input to [$in_file]\n"); } if (length($in_file) == 0) { pgm_exit(1,"ERROR: No input files found in command!\n"); } if (! -d $in_file) { pgm_exit(1,"ERROR: Unable to find directory [$in_file]! Check name, location...\n"); } } # eof - chkxml2.pl