# "
# | " "
# | "
# | "
# | QUOTATION MARK
# ...
sub process_in_file3($) {
my ($inf) = @_;
if (! open INF, "<$inf") {
pgm_exit(1,"ERROR: Unable to open file [$inf]\n");
}
my @lines = ;
close INF;
my $lncnt = scalar @lines;
prt("Processing $lncnt lines, from [$inf]...\n");
my ($line,$inc,$lnn,$len,@arr,$ent,$ver,$code);
my ($i,$ch,$cnt,@arr2,$class,$cnt2,$attr);
my ($type,$value,$hexchar,$tmp,$decimal,$ra,$added);
$lnn = 0;
my @ents = ();
my $intable = 0;
my $tag = '';
my $tail = '';
my $intag = 0;
my $trcnt = 0;
my $entcnt = 0;
my $modents = 0;
$class = '';
foreach $line (@lines) {
$lnn++;
chomp $line;
$line = trim_all($line);
$len = length($line);
next if ($len == 0);
if ($intable) {
if ($line =~ /^<\/table>/) {
prt("$lnn: End table...\n") if (VERB9());
$intable = 0;
} else {
for ($i = 0; $i < $len; $i++) {
$ch = substr($line,$i,1);
if ($ch eq '<') {
$tag = $ch;
$i++;
$intag = 1;
for (; $i < $len; $i++) {
$ch = substr($line,$i,1);
if ($ch eq '>') {
if ($intag) {
$tag .= $ch;
} else {
# $tail .= $ch;
}
#prt("$tag $tail\n");
if (length($tag) || length($tail)) {
if (length($tail)) {
if (length($tag)) {
@arr = space_split($tail);
$cnt = scalar @arr;
if ($tag =~ /^
foreach $attr (@arr) {
@arr2 = split("=",$attr);
$cnt2 = scalar @arr2;
if ($cnt2 == 2) {
$type = $arr2[0];
$value = strip_quotes($arr2[1]);
if ($type eq 'title') {
# title="U+00009 CHARACTER TABULATION"
} elsif ($type eq 'data-block') {
# data-block="C0 Controls and Basic Latin"
} elsif ($type eq 'data-category') {
# data-category="Cc"
} elsif ($type eq 'data-set') {
# data-set="mmlextra">
} else {
pgm_exit(1,"Failed with '$attr'. type '$type', value '$value' - FIX ME\n");
}
} else {
pgm_exit(1,"Failed with '$attr'\n");
}
}
prt("Tag: TR with $cnt attrs\n") if (VERB9())
;
$trcnt++;
} elsif ($cnt == 1) {
@arr2 = split("=",$arr[0]);
$cnt2 = scalar @arr2;
if (($cnt2 == 2) && ($arr2[0] =~ /class/i)) {
$class = strip_quotes($arr2[1]);
# prt("Tag: $tag with class '$class'\n");
if ($class eq 'character') {
prt("Tag: $tag with class '$class'\n") if (VERB5());
} elsif ($class eq 'named') {
prt("Tag: $tag with class '$class'\n") if (VERB5());
} elsif ($class eq 'hex') {
prt("Tag: $tag with class '$class'\n") if (VERB5());
} elsif ($class eq 'dec') {
prt("Tag: $tag with class '$class'\n") if (VERB5());
} elsif ($class eq 'desc') {
prt("Tag: $tag with class '$class'\n") if (VERB5());
} else {
pgm_exit(1,"Error: Unknown class '$class' - FIX ME\n");
}
} else {
prtw("Warning: Tag: $tag with $cnt attrs, $tail - FIX ME\n");
}
} else {
prt("Tag: $tag with $cnt attrs, $tail\n");
}
} else {
prt("Tail: $tail\n");
}
} else {
if ($tag =~ //) {
# skip this
} else {
prt("$tag ONLY\n");
}
}
}
$tag = '';
$tail = '';
$intag = 0;
} elsif ($ch eq '<') {
# start of another tag closes this...
$tail = trim_all($tail);
if (length($tag) || length($tail)) {
if (length($tail)) {
if (length($tag)) {
@arr = space_split($tail);
$cnt = scalar @arr;
prt("tag: $tag with $cnt attrs CHECK ME\n");
} else {
#prt("tail: $tail, class $class\n");
if ($class eq 'character') {
#prt("tail: $tail, class $class\n");
if ($tail =~ /^\([0-9A-F]+);$/) {
$hexchar = $1;
prt("C: $class = 0x$hexchar\n") if (VERB2());
} else {
pgm_exit(1,"Failed with tail '$tail', class '$class'\n");
}
} elsif ($class eq 'named') {
#prt("tail: $tail, class $class\n");
@arr2 = split(/\s+/,$tail);
@ents = ();
foreach $tmp (@arr2) {
if ($tmp =~ /^\&(\w+);$/) {
$ent = $1;
# $entities{$ent} = [$ver, $code, 0];
if (defined $entities{$ent}) {
$ra = $entities{$ent};
$ver = ${$ra}[0];
$code = ${$ra}[1]; # maybe check the 'code' value
$added = ${$ra}[2]; # check if a duplicate
${$ra}[2]++;
prtw("Warning: $ent, $ver, $code already added! CHECK ME!\n") if ($added);
} else {
$ver = 'VERS_CHECK';
}
prt("E: $class = '$ent', vers '$ver'\n") if (VERB2());
###################################
push(@ents,[$ent,$ver]);
###################################
# $entcnt++;
} else {
pgm_exit(1,"Failed with tail '$tail', class '$class'\n");
}
}
} elsif ($class eq 'hex') {
prt("tail: $tail, class $class\n") if (VERB9());
} elsif ($class eq 'dec') {
# tail: 𝕪, class dec
# prt("tail: $tail, class $class\n");
if ($tail =~ /^\&\#(\d+);$/) {
$decimal = $1;
prt("D: $class = '$decimal'\n") if (VERB2());
} else {
pgm_exit(1,"Failed with tail '$tail', class '$class'\n");
}
} elsif ($class eq 'desc') {
prt("tail: $tail, class $class\n") if (VERB9());
} else {
pgm_exit(1,"Error: Unknown class '$class' - FIX ME\n");
}
}
} else {
if ($tag =~ //) {
# skip this
} else {
prt("$tag ONLY\n");
}
}
}
# start of NEXT tag
$tag = $ch;
$tail = '';
$intag = 1;
} elsif ($intag) {
if ($ch =~ /\s/) {
$intag = 0;
$tail = '';
} else {
$tag .= $ch;
}
} else {
$tail .= $ch;
}
}
}
} # for the line
# at end of line
if (length($tag) || length($tail)) {
if (length($tail)) {
if (length($tag)) {
@arr = space_split($tail);
$cnt = scalar @arr;
prt("tag: $tag with $cnt attrs\n");
} else {
prt("tail: $tail, class $class\n") if (VERB5());
}
} else {
if ($tag =~ //) {
# skip this
} else {
prtw("Warning: $tag ONLY - CHECK ME\n");
}
}
}
$tag = '';
$tail = '';
if (defined $ent && defined $ver && defined $decimal && @ents) {
#my %all_ents = (); # entities found in current charref.html
#push(@ents,[$ent,$ver]);
my ($ra2);
foreach $ra (@ents) {
$ent = ${$ra}[0];
$ver = ${$ra}[1];
#######################################
if (defined $preferred_vals{$ent}) {
$code = $preferred_vals{$ent};
if ($code != $decimal) {
prt("Modifying $ent code $decimal to $code\n");
$modified_ents{$ent} = [$code, $decimal];
$decimal = $code;
$modents++;
}
}
#######################################
if (defined $exclude_pairs{$ent}) {
$ra = $exclude_pairs{$ent};
$ver = ${$ra}[0];
$code = ${$ra}[1];
prt("Excluding $ent, a 'surrogate' pair $ver:$code, dec $decimal!\n");
} else {
#######################################
if (defined $all_ents{$ent}) {
$ra2 = $all_ents{$ent};
my $pver = ${$ra2}[0];
my $pcod = ${$ra2}[1];
if (($pver ne $ver) || ($pcod != $decimal)) {
pgm_exit(1,"Already have $ent, $pcod, $pver, NOW $decimal, $ver!\n");
}
} else {
$entcnt++;
$all_ents{$ent} = [$ver, $decimal, 0];
}
#######################################
### Prepare list by VALUE
### NOTE: can be repeated values
### NOTE: this MISSSES the numeric equivalent ABOVE,
### but not ALL are exact equivalents!
$all_by_val{$decimal} = [] if (!defined $all_by_val{$decimal});
$ra2 = $all_by_val{$decimal};
push(@{$ra2}, [$ent,$ver]);
#######################################
$len = length($ent);
$max_ent = $len if ($len > $max_ent);
$len = length($ver);
$max_ver = $len if ($len > $max_ver);
$len = length($decimal);
$max_dec = $len if ($len > $max_dec);
}
}
$ent = undef;
$ver = undef;
$decimal = undef;
@ents = ();
}
}
} else {
if ($line =~ /^/) {
prt("$lnn: Start table...\n") if (VERB9());
$intable = 1;
}
}
}
#######################################################
# $all_ents{$ent} = [$ver, $decimal, 0];
#######################################################
@arr = keys %all_ents;
$cnt = scalar @arr;
prt("Found $trcnt lines... $entcnt entities, $cnt keys...\n");
@arr = keys %entities;
foreach $ent (@arr) {
$ra = $entities{$ent};
$ver = ${$ra}[0];
$decimal = ${$ra}[1];
$added = ${$ra}[2];
if ($added == 0) {
prtw("Warning: Seem to have missed $ent, $ver, $decimal!\n");
}
}
######################################################
### write html table sorted by VALUE
# push(@{$ra2}, [$ent,$ver]);
@arr = sort mycmp_dec keys(%all_by_val);
$cnt = scalar @arr; # just count of VALUES stored, each could be more than 1 entity
prt("Got $cnt decimal value... Write to a html file...\n");
my $htm = '';
my $cols = 4;
my $wrap = 0;
my $entries = 0;
my ($ra2);
$htm .= "";
for ($wrap = 0; $wrap < $cols; $wrap++) {
$htm .= "Name | ";
$htm .= "Code | ";
$htm .= "G | ";
$htm .= "N | ";
}
$htm .= " \n";
$wrap = 0;
foreach $code (@arr) {
$ra = $all_by_val{$code};
foreach $ra2 (@{$ra}) {
$entries++;
$ent = ${$ra2}[0];
$ver = ${$ra2}[1];
$htm .= "" if ($wrap == 0);
$htm .= "&$ent; | ";
$htm .= "$code | ";
$htm .= "&$ent; | ";
$htm .= "$code; | ";
$wrap++;
if ($wrap == $cols) {
$wrap = 0;
$htm .= " \n";
}
}
}
if ($wrap) {
while ($wrap < $cols) {
$wrap++;
$htm .= " | ";
$htm .= " | ";
$htm .= " | ";
$htm .= " | ";
}
$htm .= "\n";
}
my ($name,$dir) = fileparse($inf);
# my $out_file7 = $temp_dir.$PATH_SEP."tempallents1.html";
write_table("$entries entries, $entcnt entities, $cnt values, from $name",$htm,$out_file7);
prt("$entries entries, $cnt values, $entcnt entities, written to $out_file7...\n");
######################################################
### TODO: Write out modified ents
# my $out_file8 = $temp_dir.$PATH_SEP."tempmodents1.html";
# $modified_ents{$ent} = [$code, $decimal];
@arr = sort keys( %modified_ents );
$cnt = scalar @arr;
if ($cnt != $modents) {
pgm_exit(1,"Error: Need to store mods in an array of arrays!\n");
}
prt("Write out $cnt ($modents) modified entities...\n");
$htm = "";
$wrap = 0;
$htm .= "";
$htm .= "Name | ";
$htm .= "G | ";
$htm .= "Code1 | ";
$htm .= "N | ";
$htm .= "Code2 | ";
$htm .= "N | ";
$htm .= " \n";
foreach $ent (@arr) {
$ra2 = $modified_ents{$ent};
$code = ${$ra2}[0];
$ver = ${$ra2}[1];
$htm .= "";
$htm .= "&$ent; | ";
$htm .= "&$ent; | ";
$htm .= "$code | ";
$htm .= "$code; | ";
$htm .= "$ver | ";
$htm .= "$ver; | ";
$htm .= " \n";
}
write_table("$cnt modified entities, 2 values",$htm,$out_file8);
prt("$cnt entities, written to $out_file8...\n");
######################################################
# $load_log = 1;
# pgm_exit(1,"TEMP EXIT 2\n");
}
#############################################################################
# list of some 253 entities, up to HTML 4 it seems
#
#
#Name |
#Character |
#Unicode code point (decimal) |
#Standard |
#DTD[a] |
#Old ISO subset[b] |
#Description[c] |
#
#
#quot |
#" |
#U+0022 (34) |
#HTML 2.0 |
#HTMLspecial |
#ISOnum |
#quotation mark (APL quote) |
#
#
#amp |
#& |
#U+0026 (38) |
#Original html specification(html 1.0) and HTML 2.0 |
#HTMLspecial and http://info.cern.ch/MarkUp/html-spec/html.dtd (originally) |
#ISOnum |
#ampersand |
#
#
#apos |
#' |
#U+0027 (39) |
#XHTML 1.0 |
#HTMLspecial |
#ISOnum |
#apostrophe (apostrophe-quote); see below |
#
sub process_in_file4($) {
my ($inf) = @_;
if (! open INF, "<$inf") {
pgm_exit(1,"ERROR: Unable to open file [$inf]\n");
}
my @lines = ;
close INF;
my $lncnt = scalar @lines;
prt("Processing $lncnt lines, from [$inf]...\n");
my ($line,$ent,$dec,$ver,$ra);
my $lnn = 0;
my $intable = 0;
my $trcnt = 0;
my $trcnt2 = 0;
my $intr = 0;
my $tdcnt = 0;
my $tvers = 'N/A';
my $code = "Unk";
my $verx = '';
foreach $line (@lines) {
chomp $line;
$lnn++;
if (!$intable) {
if ($line =~ //) {
$intable = 1;
prt("$lnn: Start table $line\n");
}
next;
}
if ($line =~ /<\/table>/) {
$intable = 0;
prt("$lnn: End table $line\n");
}
if ($line =~ /^/) {
$trcnt++;
$intr = 1;
$tdcnt = 0;
next;
}
if ($line =~ /^<\/tr>/) {
$intr = 0;
if ($trcnt2) {
if (defined $ent && defined $dec && defined $ver) {
prt("$lnn: '$ent', dec $dec, ver$verx $ver $tvers $code\n");
$ent = undef;
$dec = undef;
$ver = undef;
} else {
pgm_exit(1,"$lnn: Failed to get threesome...\n");
}
}
$trcnt2++;
next;
}
# 422 ' |
if ($line =~ /^|\s+)/) {
$tdcnt++;
if ($tdcnt == 1) {
if ($line =~ / | (\w+)<\/td>/) {
$ent = $1;
$tvers = 'N/A';
$code = 'Unk';
# $entities{$ent} = [$ver, $code, 0];
if (defined $entities{$ent}) {
$ra = $entities{$ent};
$tvers = ${$ra}[0];
$code = ${$ra}[1]; # maybe check the 'code' value
}
prt("$lnn: $ent\n") if (VERB9());
} else {
pgm_exit(1,"$lnn: First TD failed '$line'\n");
}
} elsif ($tdcnt == 3) {
if ($line =~ / | U.+\((\d+)\)<\/td>/) {
$dec = $1;
} else {
pgm_exit(1,"$lnn: Third TD failed '$line'\n");
}
} elsif ($tdcnt == 4) {
$verx = '';
if ($line =~ /^ | HTML\s+(.+)<\/td>/) {
$ver = $1;
} elsif ($line =~ /Original\s+html\h+specification\(html 1.0\)/) {
$ver = '1.0';
} elsif ($line =~ /^ | XHTML\s+(.+)<\/td>/) {
$ver = $1;
$verx = 'X';
} else {
pgm_exit(1,"$lnn: Forth TD failed '$line'\n");
}
}
}
}
prt("$lnn: EOF - tr count $trcnt\n");
$load_log = 1;
pgm_exit(1,"TEMP EXIT 2\n");
}
# ========================================================================
# HTML 4
# https://www.w3.org/TR/REC-html40/sgml/entities.html
#
#
#
#
#
# to
#
# In comparison with Tidy
# Got 3 WARNINGS...
# Warning: 391: ent lang, code 9001 vs 10216, vers VERS_FROM40
# Warning: 395: ent rang, code 9002 vs 10217, vers VERS_FROM40
# Warning: Seem to have missed apos, VERS_FROM40|VERS_XML, 39!
# https://www.w3.org/TR/html5/syntax.html#named-character-references
# lang = U+027E8 = 10216
# rang = U+027E9 = 10217
# and
# apos = U+00027 = 39
# ========================================================================
sub process_in_file4f($) {
my ($inf) = @_;
if (! open INF, "<$inf") {
pgm_exit(1,"ERROR: Unable to open file [$inf]\n");
}
my @lines = ;
close INF;
my $lncnt = scalar @lines;
prt("Processing $lncnt lines, from [$inf]...\n");
my ($line,$ent,$val,$code,$tvers,$tcode,$ra);
my $lnn = 0;
my $entcnt = 0;
foreach $line (@lines) {
chomp $line;
$lnn++;
if ($line =~ /NameCode | G | N | ";
$htm .= "";
for ($wrap = 0; $wrap < $cols; $wrap++) {
$htm .= $head;
}
$htm .= " \n";
$wrap = 0;
foreach $key (@arr) {
$ra = ${$rh}{$key};
$ver = ${$ra}[0];
$dec = ${$ra}[1];
if (defined $decimals{$dec}) {
$decimals{$dec}++;
# is case sensitive, so is NORMAL to have repeated values
# prtw("Warning: REPEATED decimal $ent $ver $dec\n");
} else {
$decimals{$dec} = 1;
}
# got ent=$key,$dec,$ver
$td = "&$key; | $dec | &$key; | $dec; | ";
$htm .= "" if ($wrap == 0);
$htm .= $td;
$wrap++;
if ($wrap == $cols) {
$htm .= " \n";
$wrap = 0;
}
$ent = "\"$key\",";
$ver .= ',';
if ($pad_struct) {
$ent .= ' ' while (length($ent) < $max_ent);
$ver .= ' ' while (length($ver) < $max_ver);
$dec = ' '.$dec while (length($dec) < $max_dec);
$line = " { $ent$ver$dec },";
} else {
$line = " { $ent $ver $dec },";
}
push(@lines,$line);
$entcnt++;
}
# { NULL, VERS_UNKNOWN, 0 }
$ent = 'NULL,';
$ver = 'VERS_UNKNOWN';
$ver .= ',';
$dec = 0;
if ($pad_struct) {
$ent .= ' ' while (length($ent) < $max_ent);
$ver .= ' ' while (length($ver) < $max_ver);
$dec = ' '.$dec while (length($dec) < $max_dec);
$line = " { $ent$ver$dec }";
} else {
$line = " { $ent $ver $dec },";
}
push(@lines,$line);
$line = "};";
push(@lines,$line);
$line = join("\n",@lines)."\n";
write2file($line,$out);
prt("$entcnt entities written to $out...\n");
# my ($n,$d,$e) = fileparse($out, qr/\.[^.]*/);
if ($wrap) {
#$td = "&$key; | $dec | &$key; | $dec; | ";
$td = " | &ndsp; | | | ";
while ($wrap < $cols) {
$htm .= $td;
$wrap++;
}
$htm .= " | \n";
}
my ($name,$dir) = fileparse($inf);
$out .= ".html";
write_table("$entcnt entities from $name",$htm,$out);
prt("$entcnt entities written to $out...\n");
}
# '∨' => {
# 'codepoints' => [
# 8744
# ],
# 'characters' => "\x{2228}"
# },
# A problem with 'zwnj'?
# "": { "codepoints": [8204], "characters": "\u200C" }
#
sub process_in_json2($) {
my ($inf) = @_;
if (! open INF, "<$inf") {
pgm_exit(1,"ERROR: Unable to open file [$inf]\n");
}
my @lines = ;
close INF;
my $lncnt = scalar @lines;
prt("Processing $lncnt lines, from [$inf]...\n");
my ($line,$num,$lnn,$val,$rh2,$ent,$key);
my ($ra,$cnt,$tvers,$tcode,$ra2,$i,$msg,$ver);
$line = join("",@lines);
my $json = JSON->new->allow_nonref;
my $rh = $json->decode( $line );
#prt(Dumper($rh));
#$load_log = 1;
my @arr = sort keys %{$rh};
$num = scalar @arr;
prt("Got $num keys, from json input...\n");
clear_hash_counts(\%entities);
my %plus1 = ();
my $keycnt = 0;
my $rpts = 0;
foreach $key (@arr) {
$rh2 = ${$rh}{$key};
$ent = $key;
$ent =~ s/^&//;
$ent =~ s/;$//;
#if (defined ${$rh2}{codepoints} && defined ${$rh2}{characters}) {
if (defined ${$rh2}{codepoints}) {
$ra2 = ${$rh2}{codepoints};
$cnt = scalar @{$ra2};
# first codepoint
$val = ${$ra2}[0];
if (defined $entities{$ent}) {
$ra = $entities{$ent};
$tvers = ${$ra}[0];
$tcode = ${$ra}[1];
${$ra}[2]++;
} else {
$tvers = 'VERS_CHECK';
$tcode = $val;
}
if ($cnt == 2) {
# MULTIPLE codepoints
# 0 1 2 3 4
$plus1{$ent} = [$tvers,$tcode,0,$cnt,$ra2];
} elsif ($cnt == 1) {
if (defined $json_ents{$ent}) {
$ra = $json_ents{$ent};
$ver = ${$ra}[0];
$val = ${$ra}[1];
if (($ver ne $tvers) || ($val != $tcode)) {
pgm_exit(1,"json_ents $ent ALREADY defined as [$ver, $val], now [$tvers, $tcode]\n");
}
$rpts++;
} else {
$keycnt++;
$json_ents{$ent} = [$tvers, $tcode, 0];
}
} else {
prt(Dumper($ra2));
pgm_exit(1,"Reference array $cnt - not 1 or 2!\n");
}
} else {
prt(Dumper($rh2));
pgm_exit(1,"Reference hash does not contain 'codepoints'!\n");
}
}
##########################################################
### deal with those with two...
my $htm = '';
my $tmp = '';
@arr = sort keys %plus1;
$cnt = scalar @arr;
$tmp = $keycnt + $cnt + $rpts;
prt("Added $keycnt entities, with code, to json_ents hash... hv $rpts rpts, $cnt with 2, tot $tmp\n");
my $head = "# | Name | G | Code | N | Code | N | NN | ";
$tmp = '';
if ($cnt) {
$verbosity = 1;
prt("Note: $cnt entities, have more than one codepoint!\n");
###prt(join(", ", @arr)."\n");
$lnn = 0;
$htm .= "";
$htm .= $head;
$htm .= " \n";
foreach $ent (@arr) {
$lnn++;
$ra = $plus1{$ent};
$tvers = ${$ra}[0];
$tcode = ${$ra}[1];
$cnt = $plus1{$ent}[3];
$ra2 = $plus1{$ent}[4];
$msg = "$lnn: $ent, $tvers, $tcode - ";
$htm .= "$lnn | &$ent; | &$ent; | ";
$tmp = '';
for ($i = 0; $i < $cnt; $i++) {
$val = ${$ra2}[$i];
$msg .= "$val ";
$htm .= "$val | $val; | ";
$tmp .= "$val;";
}
$htm .= "$tmp | ";
prt("$msg\n") if (VERB5());
$htm .= " \n";
}
$cnt = scalar @arr;
prt("Display of $cnt entities, with more than one codepoint!\n");
write_table("$cnt Multi-codepoints entities",$htm,$out_file3);
prt("HTML table written to $out_file3...\n");
}
}
sub compare_ent_hashes($$) {
my ($rh1,$rh2) = @_;
my @arr1 = sort keys(%{$rh1});
my @arr2 = sort keys(%{$rh2});
my $cnt1 = scalar @arr1;
my $cnt2 = scalar @arr2;
if (!$cnt1) {
prt("Warning: First hash has NO keys...\n");
return;
}
if (!$cnt2) {
prt("Warning: Second hash has NO keys...\n");
return;
}
prt("Comparing 2 hashes: $cnt1 and $cnt2 keys respectively..\n");
my ($key,$ra1,$vers1,$code1,$ra2,$vers2,$code2,$ent);
my $samecnt = 0;
my $diffcnt = 0;
my $misscnt1 = 0;
my $misscnt2 = 0;
my $htm = "";
$htm .= "# | Name | G | ";
$htm .= "Code | N | ";
$htm .= "Code | N | ";
$htm .= " \n";
foreach $key (@arr1) {
$ent = $key;
$ra1 = ${$rh1}{$key};
$vers1 = ${$ra1}[0];
$code1 = ${$ra1}[1];
if (defined ${$rh2}{$key}) {
$ra2 = ${$rh2}{$key};
$vers2 = ${$ra2}[0];
$code2 = ${$ra2}[1];
if (($vers1 eq $vers2) && ($code1 == $code2)) {
# the SAME - no interest...
$samecnt++;
} else {
prt("Diff: ent $key - rh1 $vers1, $code1 vs rh2 $vers2, $code2\n") if (VERB2());
$diffcnt++;
$htm .= "";
$htm .= "D: $diffcnt | ";
$htm .= "&$ent; | &$ent; | ";
$htm .= "$code1 | $code1; | ";
$htm .= "$code2 | $code2; | ";
$htm .= " \n"
}
# mark BOTH as compared
${$ra1}[2] = 1;
${$ra2}[2] = 1;
} else {
prt("Missed: ent $key $vers1, $code1 ONLY in rh1\n") if (VERB2());
$misscnt1++;
$htm .= "";
$htm .= "M1 $misscnt1 | ";
$htm .= "&$ent; | &$ent; | ";
$htm .= "$code1 | $code1; | ";
$htm .= " | | ";
$htm .= " \n"
}
}
foreach $key (@arr2) {
$ent = $key;
$ra2 = ${$rh2}{$key};
$vers2 = ${$ra2}[0];
$code2 = ${$ra2}[1];
next if (${$ra2}[2]);
prt("Missed: ent $key $vers2, $code2 ONLY in rh2\n") if (VERB2());
$misscnt2++;
$htm .= "";
$htm .= "M2 $misscnt2 | ";
$htm .= "&$ent; | &$ent; | ";
$htm .= " | | ";
$htm .= "$code2 | $code2; | ";
$htm .= " \n"
}
prt("Done 2 hashes: $cnt1 and $cnt2 keys - same $samecnt, diff $diffcnt, miss1 $misscnt1, miss2 $misscnt2...\n");
write_table("Hash Differences - same $samecnt, diff $diffcnt, miss1 $misscnt1, miss2 $misscnt2",$htm,$out_file5);
prt("Has differences written to $out_file5\n");
}
sub show_array_ref($) {
my $ra = shift;
my ($tmp,$cnt);
$cnt = scalar @{$ra};
prt("Show of ra of $cnt values...\n");
$cnt = 0;
foreach $tmp (@{$ra}) {
$cnt++;
prt("$cnt: '$tmp'\n");
}
}
# Character Entity Name Entity Number Description
# 1 2 3 4
# | |   | Space |
# ! | | ! | Exclamation mark |
# & | & | & | Ampersand |
# tmp '♦'
# tmp '♦'
# tmp '♦'
# tmp 'Diamond'
sub get_cols($$$$$) {
my ($line,$rnum,$rent,$rdec,$rdesc) = @_;
$line =~ s/^//;
$line =~ s/<\/tr>$//;
my $len = length($line);
my @arr = split(//,$line);
my $cnt = scalar @arr;
#prt("cnt = $cnt, line '$line'\n");
my ($tmp);
my ($i,$num,$ent,$dec,$desc);
my @arr2 = ();
$i = 0;
foreach $tmp (@arr) {
$i++;
# prt("$i: tmp '$tmp'\n");
next if ($tmp =~ /^\s*$/);
next if (length($tmp) == 0);
push(@arr2,$tmp);
#prt("$i: tmp '$tmp'\n");
}
$cnt = scalar @arr2;
if ($cnt == 4) {
# extract each column value
$i = 0;
# for ($i = 0; $i < $cnt; $i++) {
foreach $tmp (@arr2) {
# $tmp = $arr2[$i];
if ($tmp =~ /<\/td>$/) {
$tmp =~ s/<\/td>$//;
}
if ($i == 0) {
if ($tmp =~ /^&\#(\d+);$/) {
$num = $1;
} elsif ($tmp eq ' ') {
$num = 160;
} else {
prt("$i: tmp '$tmp'\n");
show_array_ref(\@arr2);
return 0;
}
} elsif ($i == 1) {
if ($tmp =~ /^&(\w+);$/) {
$ent = $1;
} elsif ($tmp =~ /^\s*$/) {
$ent = '';
} else {
prt("$i: tmp '$tmp'\n");
return 0;
}
} elsif ($i == 2) {
if ($tmp =~ /^&\#(\d+);$/) {
$dec = $1;
} else {
prt("$i: tmp '$tmp'\n");
return 0;
}
} elsif ($i == 3) {
$desc = $tmp;
}
$i++;
}
${$rnum} = $num;
${$rent} = $ent;
${$rdec} = $dec;
${$rdesc} = $desc;
return 1;
}
return 0;
}
# Just 238 entities
# my $in_file6 = 'C:\Users\user\Documents\Tidy\html-entities.html';
# 'C:\Users\user\Documents\Tidy\html-entities.html';
# from : https://www.freeformatter.com/html-entities.html
#
# HTML entities.
sub process_in_file6($) {
my ($inf) = @_;
if (! open INF, "<$inf") {
pgm_exit(1,"ERROR: Unable to open file [$inf]\n");
}
my @lines = ;
close INF;
my $lncnt = scalar @lines;
prt("Processing $lncnt lines, from [$inf]...\n");
my ($line,$len,$num,$ent,$dec,$desc,$ra,$tvers,$tcode);
my $lnn = 0;
my $intable = 0;
my $intbody = 0;
my $tblcnt = 0;
my $entcnt = 0;
my $htm = '';
clear_hash_counts(\%entities);
foreach $line (@lines) {
chomp $line;
$line = trim_all($line);
$lnn++;
$len = length($line);
next if ($len == 0);
if ($line =~ /^/) {
pgm_exit(1,"$lnn: Already IN tbody! '$line'\n") if ($intbody);
prt("$lnn: Start tbody '$line'\n") if (VERB9());
$intbody = 1;
next;
} elsif ($line =~ /<\/tbody>/) {
pgm_exit(1,"$lnn: NOT IN tbody! '$line'\n") if (!$intbody);
prt("$lnn: End tbody '$line'\n") if (VERB9());
$intbody = 0;
next;
}
if ($intbody) {
if ($line =~ /^/) {
$ent = '';
if (get_cols($line,\$num,\$ent,\$dec,\$desc)) {
$len = length($ent);
next if ($len == 0);
$entcnt++;
if (defined $entities{$ent}) {
$ra = $entities{$ent};
$tvers = ${$ra}[0];
$tcode = ${$ra}[1];
${$ra}[2]++;
prtw("Warning: free_ents $ent $tvers, tidy $tcode vs $dec\n") if ($tcode != $dec);
} else {
$tvers = 'VERS_CHECK';
$tcode = $dec;
}
$free_ents{$ent} = [$tvers, $tcode, 0];
$htm .= " $entcnt | &$ent; | $dec | &$ent; | ";
$htm .= "$dec; | ";
$htm .= "$tvers | ";
$htm .= "$desc | ";
$htm .= " \n";
} else {
pgm_exit(1,"$lnn: TR NOT HANDLED! '$line'\n");
}
} else {
pgm_exit(1,"$lnn: NOT HANDLED! '$line'\n");
}
}
}
}
my @arr = keys %entities;
my ($added);
my $missed = 0;
foreach $ent (@arr) {
$ra = $entities{$ent};
$tvers = ${$ra}[0];
$tcode = ${$ra}[1];
$added = ${$ra}[2];
$dec = $tcode;
if ($added == 0) {
prtw("Warning: Seem to have missed $ent, $tvers, $tcode!\n") if (VERB9());
$htm .= "*** NOT FOUND IN FreeFormatter.com TABLES *** | \n" if ($missed == 0);
$missed++;
$htm .= "$missed | &$ent; | $dec | &$ent; | ";
$htm .= "$dec; | ";
$htm .= "$tvers | ";
$htm .= "NOT FOUND in tables | ";
$htm .= " \n";
}
}
write_table("$entcnt FreeFormatter.com, missing $missed",$htm,$out_file6);
prt("$tblcnt tables, found $entcnt entities... written to $out_file6\n");
clear_hash_counts(\%entities);
# pgm_exit(1,"TEMP EXIT 6\n");
}
#########################################
### MAIN ###
###parse_args(@ARGV);
### check_hash();
process_in_file2($in_file2); # 253 - 'C:\Users\user\Documents\Tidy\entities.c';
#process_in_file6($in_file6); # 238 'C:\Users\user\Documents\Tidy\html-entities.html';
#process_in_file4f($in_file4f); # 'C:\Users\user\Documents\Tidy\html4-ents.html';
#process_in_file4($in_file4); # 253 - 'C:\Users\user\Documents\Tidy\wiki-list-entities.html';
process_in_file3($in_file3); # 'C:\Users\user\Documents\Tidy\charref.html'; => %all_ents
do_output(\%all_ents,$out_file,$in_file3);
# process_in_json($in_file1); # 'C:\Users\user\Documents\Tidy\htmlmathml.json';
process_in_json2($in_file5j); # = 'C:\Users\user\Documents\Tidy\html5-ents.json';
do_output(\%json_ents,$out_file2,$in_file5j);
compare_ent_hashes(\%all_ents,\%json_ents);
pgm_exit(0,"");
########################################
sub need_arg {
my ($arg,@av) = @_;
pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av);
}
sub parse_args {
my (@av) = @_;
my ($arg,$sarg);
my $verb = VERB2();
while (@av) {
$arg = $av[0];
if ($arg =~ /^-/) {
$sarg = substr($arg,1);
$sarg = substr($sarg,1) while ($sarg =~ /^-/);
if (($sarg =~ /^h/i)||($sarg eq '?')) {
give_help();
pgm_exit(0,"Help exit(0)");
} elsif ($sarg =~ /^v/) {
if ($sarg =~ /^v.*(\d+)$/) {
$verbosity = $1;
} else {
while ($sarg =~ /^v/) {
$verbosity++;
$sarg = substr($sarg,1);
}
}
$verb = VERB2();
prt("Verbosity = $verbosity\n") if ($verb);
} elsif ($sarg =~ /^l/) {
if ($sarg =~ /^ll/) {
$load_log = 2;
} else {
$load_log = 1;
}
prt("Set to load log at end. ($load_log)\n") if ($verb);
} elsif ($sarg =~ /^o/) {
need_arg(@av);
shift @av;
$sarg = $av[0];
$out_file = $sarg;
prt("Set out file to [$out_file].\n") if ($verb);
} else {
pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
}
} else {
$in_file = $arg;
prt("Set input to [$in_file]\n") if ($verb);
}
shift @av;
}
if ($debug_on) {
prtw("WARNING: DEBUG is ON!\n");
if (length($in_file) == 0) {
$in_file = $def_file;
prt("Set DEFAULT input to [$in_file]\n");
}
}
if (length($in_file) == 0) {
pgm_exit(1,"ERROR: No input files found in command!\n");
}
if (! -f $in_file) {
pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
}
}
sub give_help {
prt("$pgmname: version $VERS\n");
prt("Usage: $pgmname [options] in-file\n");
prt("Options:\n");
prt(" --help (-h or -?) = This help, and exit 0.\n");
prt(" --verb[n] (-v) = Bump [or set] verbosity. def=$verbosity\n");
prt(" --load (-l) = Load LOG at end. ($outfile)\n");
prt(" --out (-o) = Write output to this file.\n");
}
# Hypertext Markup Language - 2.0 September 22, 1995
# https://www.w3.org/MarkUp/html-spec/html-spec_9.html#SEC9.7
#
sub get_html2_ents() {
my $txt = <
EOF
return $txt;
}
# from : https://www.w3.org/TR/REC-html32#latin1
sub get_html3_ents() {
my $txt = <
EOF
return $txt;
}
# eof - tidyentities.pl
| |