#!/usr/bin/perl -w
# NAME: airhtml.pl
# AIM: Given a Wiki aircraft html file, try to extract specification
use strict;
use warnings;
use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use HTML::Strip;
use Cwd;

my $os = $^O;
my $perl_dir = '/home/geoff/bin';
my $PATH_SEP = '/';
my $temp_dir = '/tmp';
# $^O is "darwin" on macOS, which also contains "win"; exclude it so that
# only Windows-like platforms pick up the Windows paths.
if (($os =~ /win/i) && ($os !~ /darwin/i)) {
    $perl_dir = 'C:\GTools\perl';
    $temp_dir = $perl_dir;
    $PATH_SEP = "\\";
}
unshift(@INC, $perl_dir);
# prt(), open_log(), close_log() and trim_all() used below are not defined in
# this file; they are expected to come from lib_utils.pl.
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";

# log file stuff
our ($LF);
my $pgmname = $0;
if ($pgmname =~ /(\\|\/)/) {
    # keep only the last path component of the script name
    my @tmpsp = split(/(\\|\/)/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt";
open_log($outfile);

# user variables
my $VERS = "0.0.3 2014-11-19";
my $load_log = 0;
my $in_file = '';
my $verbosity = 0;
my $out_file = '';

# ### DEBUG ###
my $debug_on = 0;
my $def_file = 'F:\Projects\aircraft\707\Boeing_707.html';

### program variables
my @warnings = ();
my $cwd = cwd();

# Verbosity predicates: true when the current verbosity is at least N.
sub VERB1() { return $verbosity >= 1; }
sub VERB2() { return $verbosity >= 2; }
sub VERB5() { return $verbosity >= 5; }
sub VERB9() { return $verbosity >= 9; }

# Print every warning collected by prtw(), or (only at verbosity 9+) a note
# that there were none. The argument is accepted but not used.
sub show_warnings($) {
    my ($val) = @_;
    if (@warnings) {
        prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
        foreach my $itm (@warnings) {
            prt("$itm\n");
        }
        prt("\n");
    } else {
        prt( "\nNo warnings issued.\n\n" ) if (VERB9());
    }
}

# Print an optional final message, show collected warnings, close the log
# and exit the process with status $val.
sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if (!($msg =~ /\n$/));
        prt($msg);
    }
    show_warnings($val);
    close_log($outfile,$load_log);
    exit($val);
}

# Print a warning now and remember it (without its trailing newline) for the
# summary shown by show_warnings().
sub prtw($) {
    my ($tx) = shift;
    $tx =~ s/\n$//;
    prt("$tx\n");
    push(@warnings,$tx);
}

# Read all lines of file $inf and return them as a list.
# Exits through pgm_exit(1,...) when the file cannot be opened.
sub airhtml_read_lines {
    my ($inf) = @_;
    # 3-arg open with a lexical handle: the file name is never parsed for a mode
    my $fh;
    if (! open($fh, '<', $inf)) {
        pgm_exit(1,"ERROR: Unable to open file [$inf]\n");
    }
    my @lines = <$fh>;
    close($fh);
    return @lines;
}

# Alternative extractor: strip all markup from file $inf with HTML::Strip and
# print the remaining non-blank lines, each passed through trim_all().
# Sets $load_log to 1.
sub process_in_file2($) {
    my ($inf) = @_;
    my @lines = airhtml_read_lines($inf);
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");

    my $hs = HTML::Strip->new();
    my $clean_text = $hs->parse( join("",@lines) );
    $hs->eof;
    my $cnt = length($clean_text);
    prt("Got $cnt plain text... ");
    ###prt($clean_text);

    @lines = split("\n",$clean_text);
    $lncnt = scalar @lines;
    prt("$lncnt lines ");

    my @nlines = ();
    foreach my $line (@lines) {
        chomp $line;
        $line = trim_all($line);
        next if (length($line) == 0);
        push(@nlines,$line);
    }
    $lncnt = scalar @nlines;
    prt("$lncnt clean lines\n");
    prt(join("\n",@nlines)."\n");
    $load_log = 1;
}

# Print text after replacing each "&#160;" entity with a plain space.
sub prth($) {
    my $txt = shift;
    $txt =~ s/\&\#160;/ /g;
    prt($txt);
}

# Main extractor: scan file $inf character by character, separating tag
# content (between '<' and '>') from text. Text outside <head> and <script>
# is accumulated into an output line that is printed through prth() when a
# block-level tag is seen, or when adding more text would push the line
# beyond $max_line characters. Open tags that are not recognised are printed
# as "<tag>". Sets $load_log to 1.
sub process_in_file($) {
    my ($inf) = @_;
    my @lines = airhtml_read_lines($inf);
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");

    my $max_line = 100;     # wrap threshold for an output line
    my $inhtm    = 0;       # between '<' and '>'
    my $inhead   = 0;       # between <head> and </head>
    my $inscript = 0;       # between <script> and </script>
    my $intable  = 0;       # between <table> and </table> (tracked only)
    my $col_num  = 0;       # td/th cells seen in the current row (tracked only)
    my $tag      = '';      # tag content being collected
    my $text     = '';      # text collected since the last tag
    my $txtline  = '';      # pending output line

    # Print the pending output line, if any, and start a new one.
    my $flush = sub {
        prth("$txtline\n") if (length($txtline));
        $txtline = '';
    };

    # Move the collected text onto the pending line, wrapping first when the
    # result would be too long. Text seen inside <head> or <script> is dropped.
    my $add_text = sub {
        if (!$inhead && !$inscript && length($text)) {
            $flush->() if (length($txtline.$text) > $max_line);
            $txtline .= $text;
        }
        $text = '';
    };

    foreach my $line (@lines) {
        chomp $line;
        $line = trim_all($line);
        my $len = length($line);
        next if ($len == 0);
        for (my $i = 0; $i < $len; $i++) {
            my $ch = substr($line,$i,1);
            if (!$inhtm) {
                if ($ch eq '<') {
                    $add_text->();
                    $tag = '';
                    $inhtm = 1;
                } else {
                    $text .= $ch;
                }
                next;
            }
            if ($ch ne '>') {
                $tag .= $ch;
                next;
            }

            # a complete tag has been collected
            $inhtm = 0;
            if ($inhead) {
                # \b keeps </header> from ending the head section
                $inhead = 0 if ($tag =~ /^\/head\b/i);
            } elsif ($tag =~ /^head\b/i) {
                # \b keeps the HTML5 <header> element from being taken as <head>
                $inhead = 1;
            } elsif ($inscript) {
                $inscript = 0 if ($tag =~ /^\/script/i);
            } elsif ($tag =~ /^script/i) {
                $inscript = 1;
            } elsif ($tag =~ /^\//) {
                # close tag
                my $name = substr($tag,1);
                my $clear = 0;
                if ($name =~ /^(?:p|td|th|h\d+)$/i) {
                    $clear = 1;
                } elsif ($name =~ /^table/i) {
                    $intable = 0;
                    $clear = 1;
                    # the added space means a line is always printed here
                    $txtline .= ' ';
                }
                $flush->() if ($clear);
            } else {
                # open tag
                if ($tag =~ /^(?:span|a\s+|div|sup|small|b$|i$|!|img\s+)/i) {
                    # inline markup, comments and images: nothing to do
                } elsif ($tag =~ /^table/i) {
                    $intable = 1;
                } elsif ($tag =~ /^(?:br|h\d+|li\s*|p\s*)/i) {
                    $flush->();
                } elsif ($tag =~ /^t[dh]/i) {
                    $col_num++;
                } elsif ($tag =~ /^tr/i) {
                    $flush->();
                    $col_num = 0;
                } else {
                    prt("<$tag>\n");
                }
            }
        }
    }

    # End of input: emit text that was still pending, so content after the
    # last flushing tag is not lost.
    $add_text->() if (!$inhtm);
    $flush->();
    $load_log = 1;
}

#########################################
### MAIN ###
parse_args(@ARGV);
process_in_file($in_file);
pgm_exit(0,"");
########################################

# Exit with an error unless option $arg is followed by at least one more
# command line item.
sub need_arg {
    my ($arg,@av) = @_;
    pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av);
}

# Parse the command line: set $verbosity, $load_log, $out_file and $in_file.
# Exits with status 1 on an invalid option, when no input file was given, or
# when the input file does not exist; exits with status 0 after help.
sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg);
    my $verb = VERB2();
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            # drop all leading dashes, so -x and --x are treated alike
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            if (($sarg =~ /^h/i)||($sarg eq '?')) {
                give_help();
                pgm_exit(0,"Help exit(0)");
            } elsif ($sarg =~ /^v/) {
                # \D* (not .*) so that the whole trailing number is captured:
                # -v10 gives 10
                if ($sarg =~ /^v\D*(\d+)$/) {
                    $verbosity = $1;
                } else {
                    # no number: each leading 'v' bumps the level by one
                    while ($sarg =~ /^v/) {
                        $verbosity++;
                        $sarg = substr($sarg,1);
                    }
                }
                $verb = VERB2();
                prt("Verbosity = $verbosity\n") if ($verb);
            } elsif ($sarg =~ /^l/) {
                if ($sarg =~ /^ll/) {
                    $load_log = 2;
                } else {
                    $load_log = 1;
                }
                prt("Set to load log at end. ($load_log)\n") if ($verb);
            } elsif ($sarg =~ /^o/) {
                need_arg(@av);
                shift @av;
                $sarg = $av[0];
                $out_file = $sarg;
                prt("Set out file to [$out_file].\n") if ($verb);
            } else {
                pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
            }
        } else {
            $in_file = $arg;
            prt("Set input to [$in_file]\n") if ($verb);
        }
        shift @av;
    }

    if ($debug_on) {
        prtw("WARNING: DEBUG is ON!\n");
        if (length($in_file) == 0) {
            $in_file = $def_file;
            prt("Set DEFAULT input to [$in_file]\n");
        }
    }
    if (length($in_file) == 0) {
        pgm_exit(1,"ERROR: No input files found in command!\n");
    }
    if (! -f $in_file) {
        pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
    }
}

# Print the usage summary.
sub give_help {
    prt("$pgmname: version $VERS\n");
    prt("Usage: $pgmname [options] in-file\n");
    prt("Options:\n");
    prt(" --help (-h or -?) = This help, and exit 0.\n");
    prt(" --verb[n] (-v) = Bump [or set] verbosity. def=$verbosity\n");
    prt(" --load (-l) = Load LOG at end. ($outfile)\n");
    prt(" --out (-o) = Write output to this file.\n");
}

# eof - template.pl