#!/usr/bin/perl -W # 23/01/2013 - Strip a BOM # 27/01/2010 - Initial cut # BOM Type Hex = Decimal # UTF-8 EF BB BF = 239 187 191 # UTF-16 (BE) FE FF = 254 255 # UTF-16 (LE) FF FE = 255 254 # UTF-32 (BE) 00 00 FE FF = 0 0 254 255 # UTF-32 (LE) FF FE 00 00 = 255 254 0 0 # UTF-7 2B 2F 76, and one of: [38|39|2B|2F] 43 47 118, and one of: [56|57|43|47] +/v, and one of 8 9 + / # UTF-1 F7 64 4C = 247 100 76 # UTF-EBCDIC DD 73 66 73 = 221 115 102 115 # SCSU 0E FE FF = 14 254 255 # BOCU-1 FB EE 28 +optional FF 251 238 40 +optional 255 # GB-18030 84 31 95 33 = 132 49 149 51 use strict; use warnings; use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] ) ###use Cwd; my $os = $^O; my $perl_dir = '/home/geoff/bin'; my $PATH_SEP = '/'; my $temp_dir = '/tmp'; if ($os =~ /win/i) { $perl_dir = 'C:\GTools\perl'; $temp_dir = $perl_dir; $PATH_SEP = "\\"; } unshift(@INC, $perl_dir); ##require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n"; # log file stuff our ($LF); my $pgmname = $0; if ($pgmname =~ /(\\|\/)/) { my @tmpsp = split(/(\\|\/)/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt"; ###open_log($outfile); my $VERS = "0.0.3 2017-12-23"; ###my $VERS = "0.0.2 2013-01-23"; my $load_log = 0; my $in_file = ''; my $verbosity = 0; my $out_file = ''; my $strip_bom = 1; my $removed_bom = 0; my $modify = 0; # write out to same as in file # ### DEBUG ### my $debug_on = 0; #my $def_file = 'def_file'; #my $def_file = 'C:\DTEMP\libxml2-2.6.30\result\slashdot16.xml'; my $def_file = 'C:\HOMEPAGE\GA\projects\ini.htm'; my $do_quick_test = 0; my $BOM_offset = -1; sub VERB1() { return $verbosity >= 1; } sub VERB2() { return $verbosity >= 2; } sub VERB5() { return $verbosity >= 5; } sub VERB9() { return $verbosity >= 9; } my @BOM_list = ( [ "UTF-8", 3, [0xEF,0xBB,0xBF ] ], # 239 187 191 [ "UTF-16 (BE)", 2, [0xFE,0xFF ] ], # 254 255 [ "UTF-16 (LE)", 2, [0xFF,0xFE ] ], # 255 254 [ "UTF-32 (BE)", 4, [0x00,0x00,0xFE,0xFF] ], # 0 0 254 255 [ "UTF-32 (LE)", 4, [0xFF,0xFE,0x00,0x00] ], # 255 254 0 0 [ "UTF-7a" , 4, [0x2B,0x2F,0x76,0x38] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-7b" , 4, [0x2B,0x2F,0x76,0x39] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-7c" , 4, [0x2B,0x2F,0x76,0x2B] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-7d" , 4, [0x2B,0x2F,0x76,0x2F] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-1" , 3, [0xF7,0x64,0x4C ] ], # 247 100 76 [ "UTF-EBCDIC" , 4, [0xDD,0x73,0x66,0x73] ], # 221 115 102 115 [ "SCSU" , 3, [0x0E,0xFE,0xFF ] ], # 14 254 255 [ "BOCU-1" , 3, [0xFB,0xEE,0x28 ] ], # 251 238 40 [ "GB-18030" , 4, [0x84,0x31,0x95,0x33] ] # 132 49 149 51 ); sub prt($) { print shift; } sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg); } ###show_warnings($val); ###close_log($outfile,$load_log); exit($val); } sub has_utf_16_BOM($) { my ($fil) = shift; if (open INF, "<$fil") { binmode INF; my $buf = ""; if ((read INF, $buf, 2) == 2) { close INF; my $od1 = ord(substr($buf,0,1)); my $od2 = ord(substr($buf,1,1)); if (($od1 == 0xFF)&&($od2 == 0xFE)) { return 2; # LittleEndians (intel) } elsif (($od1 == 0xFE)&&($od2 == 0xFF)) { return 3; # BigEndians (amd) } return 1; } close INF; } return 0; } if ($do_quick_test) { if (@ARGV) { $in_file = $ARGV[0]; } my $bom = has_utf_16_BOM($in_file); print "File [$in_file]\n"; if ($bom == 2) { print " *** IS *** UTF-16 (LE) LittleEndians 0xFFFE"; } elsif ($bom == 3) { print " *** IS *** UTF-16 (BE) BigEndians 0xFEFF"; } elsif ($bom == 1) { print "is *** NOT *** UTF-16..."; } else { print "Can not open file!"; } print "\n"; } sub line_has_bom($$) { my ($line,$rname) = @_; my $max = scalar @BOM_list; my $len = length($line); my ($i,$j,$name,$cnt,$ra,$ch,$val); for ($i = 0; $i < $max; $i++) { $name = $BOM_list[$i][0]; # name $cnt = $BOM_list[$i][1]; # length $ra = $BOM_list[$i][2]; # ref array of values if ($len > $cnt) { # make sure line length GT BOM for ($j = 0; $j < $cnt; $j++) { $ch = substr($line,$j,1); # extract CHAR $val = ord($ch); # get VALUE last if ($val != ${$ra}[$j]); # compare } if ($j == $cnt) { # if ALL values found $BOM_offset = $i; ${$rname} = $name; # give back 'name' return $cnt; # and return count } } } return 0; # no BOM found } sub remove_utf_bom($$) { my ($ff,$ra) = @_; my $line = ${$ra}[0]; # get first line my $name = ''; my $len = line_has_bom($line,\$name); my ($nm,$dr) = fileparse($ff); # just show name if ($len) { $line = substr($line,$len); # truncate line ${$ra}[0] = $line; # and return minus BOM prt("NOTE: File [$nm] is $name encoding. BOM($len) removed.\n"); $removed_bom = 1; } else { prt("NOTE: File [$nm] has no BOM.\n"); } } sub load_file_lines($$) { my ($ff,$ra) = @_; my $lncnt = 0; if (open INF, "<$ff") { @{$ra} = ; close INF; $lncnt = scalar @{$ra}; remove_utf_bom($ff,$ra) if ($strip_bom); } else { prtw("WARNING: Unable to open [$ff]!\n"); } return $lncnt; } ######################################### # RENAME A FILE TO .OLD, or .BAK # 0 - do nothing if file does not exist. # 1 - rename to .OLD if .OLD does NOT exist # 2 - rename to .BAK, if .OLD already exists, # 3 - deleting any previous .BAK ... sub my_rename_2_old_bak { my ($fil) = shift; my $ret = 0; # assume NO SUCH FILE if ( -f $fil ) { # is there? my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); my $nmbo = $dir . $nm . '.old'; $ret = 1; # assume renaming to OLD if ( -f $nmbo) { # does OLD exist $ret = 2; # yes - rename to BAK $nmbo = $dir . $nm . '.bak'; if ( -f $nmbo ) { $ret = 3; unlink $nmbo; } } rename $fil, $nmbo; } return $ret; } sub my_write2file { my ($txt,$fil) = @_; open WOF, ">$fil" or mydie("ERROR: Unable to open $fil! $!\n"); print WOF $txt; close WOF; } sub process_in_file($) { my ($inf) = @_; my @lines = (); load_file_lines($inf,\@lines); my $lncnt = scalar @lines; if ($removed_bom) { prt("Got $lncnt lines, from [$inf]...\n"); if (VERB9() && ($BOM_offset >= 0)) { my $i = $BOM_offset; my $name = $BOM_list[$i][0]; # name my $cnt = $BOM_list[$i][1]; # length my $ra = $BOM_list[$i][2]; # ref array of values prt("Removed BOM: $name, $cnt ["); for ($i = 0; $i < $cnt; $i++) { prt(" ") if ($i); prt(sprintf("%#02x", ${$ra}[$i])); } prt("]\n"); } my $txt = join("",@lines)."\n"; if ($modify) { if (!length($out_file)) { $out_file = $inf; } } if (length($out_file)) { my_rename_2_old_bak($out_file); my_write2file($txt,$out_file); prt("Data written to outfile [$out_file]\n"); } else { my_write2file($txt,$outfile); prt("Data written to DEFAULT outfile [$outfile]\n"); } } else { prt("In file '$inf', $lncnt lines, does not have a known BOM...\n"); } } ######################################### ### MAIN ### parse_args(@ARGV); process_in_file($in_file); pgm_exit(0,""); ######################################## sub need_arg { my ($arg,@av) = @_; pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av); } sub parse_args { my (@av) = @_; my ($arg,$sarg); while (@av) { $arg = $av[0]; if ($arg =~ /^-/) { $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); if (($sarg =~ /^h/i)||($sarg eq '?')) { give_help(); pgm_exit(0,"Help exit(0)"); } elsif ($sarg =~ /^v/) { if ($sarg =~ /^v.*(\d+)$/) { $verbosity = $1; } else { while ($sarg =~ /^v/) { $verbosity++; $sarg = substr($sarg,1); } } prt("Verbosity = $verbosity\n") if (VERB1()); #} elsif ($sarg =~ /^l/) { # if ($sarg =~ /^ll/) { # $load_log = 2; # } else { # $load_log = 1; # } # prt("Set to load log at end. ($load_log)\n") if (VERB1()); } elsif ($sarg =~ /^o/) { need_arg(@av); shift @av; $sarg = $av[0]; $out_file = $sarg; prt("Set out file to [$out_file].\n") if (VERB1()); } elsif ($sarg =~ /^m/) { $modify = 1; prt("Set modify in file, if BOM.\n") if (VERB1()); } else { pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n"); } } else { $in_file = $arg; prt("Set input to [$in_file]\n") if (VERB1()); } shift @av; } if ((length($in_file) == 0) && $debug_on) { $in_file = $def_file; prt("Set DEFAULT input to [$in_file]\n"); } if (length($in_file) == 0) { pgm_exit(1,"ERROR: No input files found in command!\n"); } if (! -f $in_file) { pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n"); } } sub give_help { prt("$pgmname: version $VERS\n"); prt("Usage: $pgmname [options] in-file\n"); prt("Options:\n"); prt(" --help (-h or -?) = This help, and exit 0.\n"); prt(" --verb[n] (-v) = Bump [or set] verbosity. def=$verbosity\n"); ### prt(" --load (-l) = Load LOG at end. ($outfile)\n"); prt(" --out (-o) = Write output to this file.\n"); prt(" --modify (-m) = Modify input file, if BOM exists.\n"); } # eof - utf_16_BOM.pl