#!/usr/bin/perl -w
# NAME: airhtml.pl
# AIM: Given a Wiki aircraft html file, try to extract specification
use strict;
use warnings;
use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use HTML::Strip;
use Cwd;
# Platform-dependent locations.
#   $perl_dir - directory holding 'lib_utils.pl'; pushed to the front of @INC
#   $temp_dir - directory that receives the log file
#   $PATH_SEP - path separator used when building the log file name
# The defaults are the Unix values; Windows overrides them below.
my $os = $^O;
my $perl_dir = '/home/geoff/bin';
my $PATH_SEP = '/';
my $temp_dir = '/tmp';
# Anchor the test on the Windows-style platform names. A bare /win/i also
# matches 'darwin' (macOS), which would wrongly select the C:\ paths and the
# backslash separator on a Unix-like system.
if ($os =~ /^(?:MSWin|cygwin)/i) {
    $perl_dir = 'C:\GTools\perl';
    $temp_dir = $perl_dir;
    $PATH_SEP = "\\";
}
# Make the private library directory the first place 'require' looks.
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";
# Log file set-up: derive the bare script name from $0 and open
# <temp_dir><sep>temp.<script>.txt as the log through open_log().
our ($LF);
my $pgmname = $0;
if ($pgmname =~ /(\\|\/)/) {
    # $0 carries a directory part; keep only the last path component
    my @path_parts = split(/(\\|\/)/, $pgmname);
    $pgmname = pop @path_parts;
}
my $outfile = join($PATH_SEP, $temp_dir, "temp.$pgmname.txt");
open_log($outfile);
# ---- user-adjustable settings (mostly set from the command line) ----
my $VERS      = "0.0.3 2014-11-19";   # version string shown by give_help()
my $load_log  = 0;                    # handed to close_log() when the program exits
my $in_file   = '';                   # input HTML file
my $verbosity = 0;                    # tested by the VERBn() predicates
my $out_file  = '';                   # set by -o; only stored in the code visible here

# ---- debug aids ----
my $debug_on  = 0;                    # when true, parse_args() falls back to $def_file
my $def_file  = 'F:\Projects\aircraft\707\Boeing_707.html';

# ---- program state ----
my @warnings;                         # messages collected by prtw()
my $cwd       = cwd();                # working directory at start-up
# Verbosity predicates: each returns true when the current $verbosity
# is at least the level named in the sub.
sub VERB1() {
    return ($verbosity >= 1);
}
sub VERB2() {
    return ($verbosity >= 2);
}
sub VERB5() {
    return ($verbosity >= 5);
}
sub VERB9() {
    return ($verbosity >= 9);
}
# show_warnings($val)
# Print every message collected in @warnings, preceded by a count line.
# When nothing was collected, a "No warnings" note is printed only at
# verbosity 9 or above. $val is accepted but not used.
sub show_warnings($) {
    my ($val) = @_;
    my $total = scalar @warnings;
    if ($total == 0) {
        prt("\nNo warnings issued.\n\n") if (VERB9());
    } else {
        prt("\nGot $total WARNINGS...\n");
        for my $warning (@warnings) {
            prt($warning."\n");
        }
        prt("\n");
    }
}
# pgm_exit($val, $msg)
# Common exit path: print $msg (if not empty, newline-terminated), list
# any collected warnings, close the log and exit with status $val.
# Does not return.
sub pgm_exit($$) {
    my ($exit_code, $text) = @_;
    if (length($text)) {
        # make sure the message ends the line it is printed on
        $text = $text."\n" unless ($text =~ /\n$/);
        prt($text);
    }
    show_warnings($exit_code);
    close_log($outfile,$load_log);
    exit($exit_code);
}
# prtw($tx)
# Print a warning now and also remember it (without its trailing
# newline) in @warnings, so show_warnings() can repeat it at exit.
sub prtw($) {
    my ($warning) = @_;
    $warning =~ s/\n$//;
    prt($warning."\n");
    push @warnings, $warning;
}
# process_in_file2($inf)
# Alternative converter built on HTML::Strip: read the whole file $inf,
# strip the markup in one pass, then print the non-empty lines that remain
# after each has been passed through trim_all() (a helper not defined in
# this file).
# Exits through pgm_exit(1, ...) when the file cannot be opened.
# Side effect: sets $load_log to 1.
sub process_in_file2($) {
    my ($inf) = @_;
    # Three-argument open with a lexical handle: the file name can no longer
    # be interpreted as an open mode, and the handle is scoped to this sub.
    open(my $fh, '<', $inf)
        or pgm_exit(1,"ERROR: Unable to open file [$inf]\n");
    my @lines = <$fh>;      # slurp every line of the file
    close $fh;
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");

    # HTML::Strip works on one string, so glue the lines back together.
    my $hs = HTML::Strip->new();
    my $clean_text = $hs->parse( join("",@lines) );
    $hs->eof;
    my $cnt = length($clean_text);
    prt("Got $cnt plain text... ");
    ###prt($clean_text);

    @lines = split("\n",$clean_text);
    $lncnt = scalar @lines;
    prt("$lncnt lines ");

    # Keep only the lines that still hold something after trimming.
    my @nlines = ();
    foreach my $line (@lines) {
        chomp $line;
        $line = trim_all($line);
        next if (length($line) == 0);
        push(@nlines,$line);
    }
    $lncnt = scalar @nlines;
    prt("$lncnt clean lines\n");
    prt(join("\n",@nlines)."\n");
    $load_log = 1;
}
# prth($txt)
# Print text taken from HTML: every '&#160;' (non-breaking space entity)
# is replaced by a plain space before the text is handed to prt().
sub prth($) {
    my ($html_text) = @_;
    $html_text =~ s/\&\#160;/ /g;
    prt($html_text);
}
# process_in_file($inf)
# Hand-rolled HTML-to-text pass over the file $inf.
#
# The file is scanned character by character with a small state machine:
#   - characters between '<' and '>' are collected into $tag;
#   - all other characters are collected into $text, which is moved into
#     the pending output line $txtline when the next tag starts;
#   - everything inside <head>...</head> and <script>...</script> is dropped;
#   - block-level tags (p, br, hN, li, tr, closing td/th/table) flush
#     $txtline through prth(); inline tags (span, a, b, i, ...) are ignored
#     so their text stays on the same output line;
#   - any other opening tag is echoed as "<tag>" so it can be noticed.
# A pending line is also flushed before appending text that would push it
# past $max_line characters.
#
# Each input line is passed through trim_all() (a helper not defined in
# this file) and skipped when empty. NOTE(review): text or a tag that spans
# a line break is joined without a separator - confirm whether that matters
# for the input files used.
#
# Exits through pgm_exit(1, ...) when the file cannot be opened.
# Side effect: sets $load_log to 1.
sub process_in_file($) {
    my ($inf) = @_;
    # Three-argument open with a lexical handle: the file name can no longer
    # be interpreted as an open mode, and the handle is scoped to this sub.
    open(my $fh, '<', $inf)
        or pgm_exit(1,"ERROR: Unable to open file [$inf]\n");
    my @lines = <$fh>;      # slurp every line of the file
    close $fh;
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");

    my $inhtm    = 0;       # true while between '<' and '>'
    my $inhead   = 0;       # true while inside <head>...</head>
    my $inscript = 0;       # true while inside <script>...</script>
    my $txtline  = '';      # output line being assembled
    my $max_line = 100;     # soft limit for the length of an output line
    my $col_num  = 0;       # cells seen in the current table row (tracked only)
    my $intable  = 0;       # true while inside <table> (tracked only)
    my $tag      = '';      # tag text collected so far
    my $text     = '';      # plain text collected since the last tag

    # Print the pending output line, if any, and start a new one.
    my $flush_line = sub {
        prth("$txtline\n") if (length($txtline));
        $txtline = '';
    };

    # Move the collected text into the output line, unless it belongs to the
    # document head or to a script; the collected text is cleared either way.
    my $commit_text = sub {
        if (!$inhead && !$inscript && length($text)) {
            if (length($txtline) && (length($txtline.$text) > $max_line)) {
                $flush_line->();
            }
            $txtline .= $text;
        }
        $text = '';
    };

    foreach my $line (@lines) {
        chomp $line;
        $line = trim_all($line);
        my $len = length($line);
        next if ($len == 0);
        for (my $i = 0; $i < $len; $i++) {
            my $ch = substr($line,$i,1);

            if (!$inhtm) {
                # outside a tag: collect text until a tag opens
                if ($ch eq '<') {
                    $commit_text->();
                    $tag = '';
                    $inhtm = 1;
                } else {
                    $text .= $ch;
                }
                next;
            }

            if ($ch ne '>') {
                # inside a tag: keep collecting its text
                $tag .= $ch;
                next;
            }

            # '>' seen - the tag is complete, act on it
            $inhtm = 0;
            if ($inhead) {
                # inside <head> only the closing tag is of interest
                $inhead = 0 if ($tag =~ /^\/head\b/i);
            } elsif ($tag =~ /^head\b/i) {
                # \b keeps <header> from being mistaken for <head>
                $inhead = 1;
            } elsif ($inscript) {
                # inside <script> only the closing tag is of interest
                $inscript = 0 if ($tag =~ /^\/script/i);
            } elsif ($tag =~ /^script/i) {
                $inscript = 1;
            } elsif ($tag =~ /^\//) {
                # closing tag
                my $name = substr($tag,1);
                if ($name =~ /^table/i) {
                    $intable = 0;
                    # the added blank means a line is always printed here
                    $txtline .= ' ';
                    $flush_line->();
                } elsif ($name =~ /^(?:p|td|th|h\d+)$/i) {
                    $flush_line->();
                }
            } elsif (($tag =~ /^(?:span|a\s+|div|sup|small|!|img\s+)/i)
                  || ($tag =~ /^[bi]$/i)) {
                # inline markup, comments and images: text flows on
            } elsif ($tag =~ /^table/i) {
                $intable = 1;
            } elsif ($tag =~ /^(?:br|h\d+|li|p)/i) {
                # block-level start: finish the current output line
                $flush_line->();
            } elsif ($tag =~ /^t[dh]/i) {
                $col_num++;
            } elsif ($tag =~ /^tr/i) {
                $flush_line->();
                $col_num = 0;
            } else {
                # unknown opening tag: echo it so it can be noticed
                prt("<$tag>\n");
            }
        }
    }

    # End of input: emit whatever is still buffered instead of dropping it.
    $commit_text->();
    $flush_line->();
    $load_log = 1;
}
#########################################
### MAIN ###
# Read the command line (this sets $in_file), convert that file to
# text, then leave through the common exit path with status 0.
parse_args( @ARGV );
process_in_file( $in_file );
pgm_exit( 0, "" );
########################################
# need_arg($arg, @remaining)
# Guard for options that take a value: $arg is the option itself and
# @remaining is whatever follows it on the command line. Exits with an
# error through pgm_exit() when nothing follows.
sub need_arg {
    my ($option, @following) = @_;
    pgm_exit(1,"ERROR: [$option] must have a following argument!\n") if (scalar(@following) == 0);
}
# parse_args(@argv)
# Interpret the command line and set the file-level option variables.
#
# Options may be written with one or more leading dashes:
#   -h, -?      show help and exit 0
#   -v[n]       set verbosity to n when the option ends in digits
#               (-v2, --verb10), otherwise add one per leading 'v' (-vvv)
#   -l / -ll    set $load_log to 1 / 2
#   -o <file>   store <file> in $out_file
# Any argument not starting with '-' is taken as the input file; the last
# one given wins.
#
# With $debug_on set and no input file given, $def_file is used instead.
# Exits through pgm_exit(1, ...) on an unknown option, a missing option
# value, a missing input file, or an input file that does not exist.
sub parse_args {
    my (@av) = @_;
    my $verb = VERB2();
    while (@av) {
        my $arg = shift @av;

        if ($arg !~ /^-/) {
            # bare word: the input file
            $in_file = $arg;
            prt("Set input to [$in_file]\n") if ($verb);
            next;
        }

        # drop every leading dash so -x and --x are treated alike
        (my $sarg = $arg) =~ s/^-+//;

        if (($sarg =~ /^h/i)||($sarg eq '?')) {
            give_help();
            pgm_exit(0,"Help exit(0)");
        } elsif ($sarg =~ /^v/) {
            # Non-greedy '.*?' lets (\d+) take the whole trailing number.
            # A greedy '.*' leaves it only the last digit, so -v10 set 0.
            if ($sarg =~ /^v.*?(\d+)$/) {
                $verbosity = $1;
            } else {
                while ($sarg =~ /^v/) {
                    $verbosity++;
                    $sarg = substr($sarg,1);
                }
            }
            # verbosity may have changed: refresh the cached test
            $verb = VERB2();
            prt("Verbosity = $verbosity\n") if ($verb);
        } elsif ($sarg =~ /^l/) {
            $load_log = ($sarg =~ /^ll/) ? 2 : 1;
            prt("Set to load log at end. ($load_log)\n") if ($verb);
        } elsif ($sarg =~ /^o/) {
            # the option needs a value: take the next argument
            need_arg($arg, @av);
            $out_file = shift @av;
            prt("Set out file to [$out_file].\n") if ($verb);
        } else {
            pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
        }
    }

    if ($debug_on) {
        prtw("WARNING: DEBUG is ON!\n");
        if (length($in_file) == 0) {
            $in_file = $def_file;
            prt("Set DEFAULT input to [$in_file]\n");
        }
    }
    if (length($in_file) == 0) {
        pgm_exit(1,"ERROR: No input files found in command!\n");
    }
    if (! -f $in_file) {
        pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
    }
}
# give_help()
# Print the usage text: program name and version, the calling
# convention, and one line per command-line option.
sub give_help {
    my @usage = (
        "$pgmname: version $VERS\n",
        "Usage: $pgmname [options] in-file\n",
        "Options:\n",
        " --help (-h or -?) = This help, and exit 0.\n",
        " --verb[n] (-v) = Bump [or set] verbosity. def=$verbosity\n",
        " --load (-l) = Load LOG at end. ($outfile)\n",
        " --out (-o) = Write output to this file.\n",
    );
    foreach my $usage_line (@usage) {
        prt($usage_line);
    }
}
# eof - airhtml.pl