airhtml.pl to HTML.

index -|- end

Generated: Mon Aug 29 19:34:10 2016 from airhtml.pl 2014/11/20 11.7 KB. text copy

#!/usr/bin/perl -w
# NAME: airhtml.pl
# AIM: Given a Wiki aircraft html file, try to extract specification
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use HTML::Strip;
use Cwd;
# Platform-dependent locations of the helper library and the log directory.
my $os = $^O;
my $perl_dir = '/home/geoff/bin';
my $PATH_SEP = '/';
my $temp_dir = '/tmp';
# Only switch to the Windows layout on Windows-style platforms. A bare /win/i
# test also matches "darwin" (macOS), which would wrongly select C:\GTools\perl
# there. "cygwin" is still accepted, as it was by the looser test.
if ($os =~ /^(?:MSWin|cygwin)/i) {
    $perl_dir = 'C:\GTools\perl';
    $temp_dir = $perl_dir;
    $PATH_SEP = "\\";
}
# Put $perl_dir first on the module search path so lib_utils.pl is looked for
# there before anywhere else.
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";
# log file stuff
our ($LF);
# Reduce $0 to the last path component, accepting either '\' or '/' as the
# separator; the result names the log file and appears in the help text.
my $pgmname = $0;
if ($pgmname =~ /(\\|\/)/) {
    my @tmpsp = split(/(\\|\/)/,$pgmname);
    $pgmname = $tmpsp[-1];
}
# Log file is <temp_dir>/temp.<script name>.txt
my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt";
open_log($outfile);

# user variables
# $VERS      - version string shown by give_help().
# $load_log  - handed to close_log() by pgm_exit(); set to 1 by -l, 2 by -ll,
#              and forced to 1 once an input file has been processed.
# $in_file   - input HTML file; the last bare (non-option) argument wins.
# $verbosity - level tested by the VERBn() predicates; raised or set by -v.
# $out_file  - stored by -o; NOTE(review): not read anywhere in the code
#              visible here.
my $VERS = "0.0.3 2014-11-19";
my $load_log = 0;
my $in_file = '';
my $verbosity = 0;
my $out_file = '';

# ### DEBUG ###
# When $debug_on is true, parse_args() warns about it and falls back to
# $def_file if no input file was given on the command line.
my $debug_on = 0;
my $def_file = 'F:\Projects\aircraft\707\Boeing_707.html';

### program variables
# @warnings collects the messages passed to prtw(); show_warnings() replays
# them at exit.
my @warnings = ();
# Working directory at start-up. NOTE(review): not referenced in the code
# visible here.
my $cwd = cwd();

# Verbosity predicates: VERBn() is true once $verbosity has reached level n.
sub VERB1() {
    return 1 <= $verbosity;
}
sub VERB2() {
    return 2 <= $verbosity;
}
sub VERB5() {
    return 5 <= $verbosity;
}
sub VERB9() {
    return 9 <= $verbosity;
}

# Replay every message collected in @warnings, preceded by a count line.
# When nothing was collected, say so only at verbosity 9 or above.
# The single argument is accepted but not used.
sub show_warnings($) {
    my ($val) = @_;
    my $total = scalar(@warnings);
    if ($total == 0) {
        prt( "\nNo warnings issued.\n\n" ) if (VERB9());
    } else {
        prt( "\nGot $total WARNINGS...\n" );
        foreach my $warning (@warnings) {
            prt($warning."\n");
        }
        prt("\n");
    }
}

# Common exit path: print an optional final message (newline-terminated if it
# is not already), replay collected warnings, close the log, and exit.
#   $val - process exit status, also passed to show_warnings()
#   $msg - final message; nothing is printed when it is empty
sub pgm_exit($$) {
    my ($exit_code,$message) = @_;
    if (length($message)) {
        # Append the newline only when the caller did not supply one.
        prt( ($message =~ /\n$/) ? $message : $message."\n" );
    }
    show_warnings($exit_code);
    close_log($outfile,$load_log);
    exit($exit_code);
}


# Print a warning now and remember it (without its trailing newline) in
# @warnings so that show_warnings() can repeat it at exit.
sub prtw($) {
    my ($warning) = @_;
    $warning =~ s/\n$//;
    prt($warning."\n");
    push @warnings, $warning;
}

# Alternative converter: strip all markup from the HTML file $inf with
# HTML::Strip and print the remaining non-empty lines through prt().
#
# Parameters: $inf - path of the HTML file to read.
# Exits through pgm_exit(1, ...) when the file cannot be opened.
# Sets $load_log to 1 on completion.
sub process_in_file2($) {
    my ($inf) = @_;
    # Three-argument open with a lexical handle: the file name is taken
    # literally, so a name starting with '>' or ending in '|' cannot be
    # interpreted as an open mode or a command.
    open(my $in_fh, '<', $inf)
        or pgm_exit(1,"ERROR: Unable to open file [$inf]\n");
    my @lines = <$in_fh>;
    close $in_fh;
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");

    # Hand the whole document to HTML::Strip in one piece.
    my $hs = HTML::Strip->new();
    my $clean_text = $hs->parse( join("",@lines) );
    $hs->eof;
    my $cnt = length($clean_text);
    prt("Got $cnt plain text... ");
    ###prt($clean_text);

    @lines = split(/\n/,$clean_text);
    $lncnt = scalar @lines;
    prt("$lncnt lines ");

    # Keep only the lines that are non-empty after trim_all().
    my @nlines = ();
    foreach my $line (@lines) {
        chomp $line;
        $line = trim_all($line);
        next if (length($line) == 0);
        push(@nlines,$line);
    }
    $lncnt = scalar @nlines;
    prt("$lncnt clean lines\n");
    prt(join("\n",@nlines)."\n");
    $load_log = 1;
}

# Print extracted HTML text, first turning every "&#160;" (non-breaking
# space) character reference into an ordinary space.
sub prth($) {
    my ($html_text) = @_;
    $html_text =~ s/\&\#160;/ /g;
    prt($html_text);
}

# Convert the HTML file $inf to plain text, emitted through prth()/prt().
#
# The input is scanned one character at a time with a small state machine:
#   - outside a tag, characters accumulate in $text;
#   - '<' moves the pending $text onto the current output line ($txtline)
#     and starts collecting the tag text in $tag;
#   - '>' ends the tag, which is then classified: <head> and <script>
#     sections are skipped, block-level tags flush $txtline, inline tags are
#     ignored, and any other opening tag is reported as "<tag>".
# An output line is also flushed before it would grow past $max_line chars.
#
# Parameters: $inf - path of the HTML file to read.
# Exits through pgm_exit(1, ...) when the file cannot be opened.
# Sets $load_log to 1 on completion.
sub process_in_file($) {
    my ($inf) = @_;
    # Three-argument open with a lexical handle: the file name is taken
    # literally, so a name starting with '>' or ending in '|' cannot be
    # interpreted as an open mode or a command.
    open(my $in_fh, '<', $inf)
        or pgm_exit(1,"ERROR: Unable to open file [$inf]\n");
    my @lines = <$in_fh>;
    close $in_fh;
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");

    my $inhtm = 0;      # true between '<' and the matching '>'
    my $inhead = 0;     # true between <head> and </head>
    my $inscript = 0;   # true between <script> and </script>
    my $txtline = '';   # output line being assembled
    my $text = '';      # text seen since the last tag
    my $tag = '';       # contents of the tag being read (without '<' and '>')
    my $max_line = 100;
    # Table bookkeeping; updated below but not read anywhere in this sub.
    my $col_num = 0;
    my $intable = 0;

    # Print the assembled output line, if any, and start a new one.
    my $flush_line = sub {
        prth("$txtline\n") if (length($txtline));
        $txtline = '';
    };

    # Move the pending text onto the output line. Text found inside <head>
    # or <script> is discarded: script source is not page content.
    my $absorb_text = sub {
        if (!$inhead && !$inscript && length($text)) {
            if (length($txtline) && length($txtline.$text) > $max_line) {
                $flush_line->();
            }
            $txtline .= $text;
        }
        $text = '';
    };

    foreach my $line (@lines) {
        chomp $line;
        $line = trim_all($line);
        next if (length($line) == 0);
        foreach my $ch (split(//,$line)) {
            if (!$inhtm) {
                if ($ch eq '<') {
                    $absorb_text->();
                    $tag = '';
                    $inhtm = 1;
                } else {
                    $text .= $ch;
                }
                next;
            }
            if ($ch ne '>') {
                $tag .= $ch;
                next;
            }

            # '>' seen: the tag is complete, classify it.
            $inhtm = 0;
            if ($inhead) {
                $inhead = 0 if ($tag =~ /^\/head\b/i);
            } elsif ($tag =~ /^head\b/i) {
                # \b keeps <header> from being mistaken for <head>.
                $inhead = 1;
            } elsif ($inscript) {
                $inscript = 0 if ($tag =~ /^\/script\b/i);
            } elsif ($tag =~ /^script\b/i) {
                $inscript = 1;
            } elsif ($tag =~ /^\//) {
                # close tag
                my $name = substr($tag,1);
                # Tolerate "</p >" and a close tag whose '>' is on the next line.
                $name =~ s/\s+$//;
                if ($name =~ /^(?:p|td|th|h\d+)$/i) {
                    $flush_line->();
                } elsif ($name =~ /^table/i) {
                    $intable = 0;
                    # The added blank means a line is always emitted here.
                    $txtline .= ' ';
                    $flush_line->();
                }
            } else {
                # open tag
                if ($tag =~ /^(?:br|h\d+|li|p)/i) {
                    # br, h#, li..., p...: start a new output line
                    $flush_line->();
                } elsif ($tag =~ /^t[dh]/i) {
                    # td / th
                    $col_num++;
                } elsif ($tag =~ /^tr/i) {
                    $flush_line->();
                    $col_num = 0;
                } elsif ($tag =~ /^table/i) {
                    $intable = 1;
                } elsif ($tag =~ /^(?:span|a\s+|div|sup|small|b$|i$|!|img\s+)/i) {
                    # inline markup, comments and images: nothing to emit
                } else {
                    prt("<$tag>\n");
                }
            }
        }
        # A line break in the input separates words (or tag attributes) just
        # like a space; without this "foo\nbar" would come out as "foobar".
        if ($inhtm) {
            $tag .= ' ' if (length($tag));
        } elsif (length($text)) {
            $text .= ' ';
        }
    }

    # End of input: emit whatever is still pending instead of dropping it.
    # An unterminated tag ($inhtm still set) is discarded.
    $absorb_text->() if (!$inhtm);
    $flush_line->();
    $load_log = 1;
}


#########################################
### MAIN ###
# Program flow: parse the command line (which sets $in_file and exits if it
# is missing or not a file), convert that file, then leave with status 0
# through pgm_exit(), which also shows warnings and closes the log.
parse_args(@ARGV);
process_in_file($in_file);
pgm_exit(0,"");
########################################

# Abort with an error unless the option in the first argument is followed by
# at least one more argument. Called with the remaining argument list, so
# the first element is the option itself.
sub need_arg {
    my ($option,@following) = @_;
    if (scalar(@following) == 0) {
        pgm_exit(1,"ERROR: [$option] must have a following argument!\n");
    }
}

# Parse the command-line arguments and validate the result.
#
# Options (any number of leading dashes is accepted):
#   -h... / -?  show help and exit 0
#   -v...       verbosity: a trailing number sets it (-v5, --verb12),
#               otherwise each leading 'v' bumps it by one (-vv)
#   -l / -ll    load the log at exit ($load_log = 1 or 2)
#   -o <file>   store <file> in $out_file
# Any other argument starting with '-' is an error; a bare argument becomes
# $in_file (the last one wins).
#
# With $debug_on set, $def_file is used when no input file was given.
# Exits through pgm_exit(1, ...) if no input file results or it is not a
# plain file.
sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg);
    my $verb = VERB2();
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            # Strip every leading dash so -x and --x are treated alike.
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            if (($sarg =~ /^h/i)||($sarg eq '?')) {
                give_help();
                pgm_exit(0,"Help exit(0)");
            } elsif ($sarg =~ /^v/) {
                # Non-greedy .*? so the whole trailing number is captured;
                # a greedy .* leaves only the last digit (-v12 would give 2).
                if ($sarg =~ /^v.*?(\d+)$/) {
                    $verbosity = $1;
                } else {
                    while ($sarg =~ /^v/) {
                        $verbosity++;
                        $sarg = substr($sarg,1);
                    }
                }
                $verb = VERB2();
                prt("Verbosity = $verbosity\n") if ($verb);
            } elsif ($sarg =~ /^l/) {
                if ($sarg =~ /^ll/) {
                    $load_log = 2;
                } else {
                    $load_log = 1;
                }
                prt("Set to load log at end. ($load_log)\n") if ($verb);
            } elsif ($sarg =~ /^o/) {
                # need_arg() exits unless something follows the option.
                need_arg(@av);
                shift @av;
                $sarg = $av[0];
                $out_file = $sarg;
                prt("Set out file to [$out_file].\n") if ($verb);
            } else {
                pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
            }
        } else {
            $in_file = $arg;
            prt("Set input to [$in_file]\n") if ($verb);
        }
        shift @av;
    }

    if ($debug_on) {
        prtw("WARNING: DEBUG is ON!\n");
        if (length($in_file) ==  0) {
            $in_file = $def_file;
            prt("Set DEFAULT input to [$in_file]\n");
        }
    }
    if (length($in_file) ==  0) {
        pgm_exit(1,"ERROR: No input files found in command!\n");
    }
    if (! -f $in_file) {
        pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
    }
}

# Print the usage summary: program name and version, then one line per
# option, each sent through prt().
sub give_help {
    my @help_lines = (
        "$pgmname: version $VERS\n",
        "Usage: $pgmname [options] in-file\n",
        "Options:\n",
        " --help  (-h or -?) = This help, and exit 0.\n",
        " --verb[n]     (-v) = Bump [or set] verbosity. def=$verbosity\n",
        " --load        (-l) = Load LOG at end. ($outfile)\n",
        " --out <file>  (-o) = Write output to this file.\n",
    );
    foreach my $help_line (@help_lines) {
        prt($help_line);
    }
}

# eof - airhtml.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional