getimg4htmurl.pl to HTML.

index -|- end

Generated: Sun Apr 15 11:46:20 2012 from getimg4htmurl.pl 2012/02/26 23.1 KB.

#!/usr/bin/perl -w
# NAME: getimg4htmurl.pl
# AIM: Read a HTML file, and get the <img, and fetch the image via HTTP, and save...
# 23/02/2012 geoff mclane http://geoffair.net/mperl
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use File::stat;
use Cwd;
use LWP::Simple;
my $os = $^O;
my $perl_dir = '/home/geoff/bin';
my $PATH_SEP = '/';
my $temp_dir = '/tmp';
if ($os =~ /win/i) {
    $perl_dir = 'C:\GTools\perl';
    $temp_dir = $perl_dir;
    $PATH_SEP = "\\";
}
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";
require 'lib_css.pl' or die "Unable to load 'lib_css.pl' Check paths in \@INC...\n";
# log file stuff
our ($LF);
my $pgmname = $0;
if ($pgmname =~ /(\\|\/)/) {
    my @tmpsp = split(/(\\|\/)/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt";
open_log($outfile);

# user variables
my $VERS = "0.0.1 2012-02-23";
my $load_log = 0;
my $in_file = '';
my $verbosity = 0;
my $out_dir = '';

### program variables
my @warnings = ();
my $cwd = cwd();

sub VERB1() { return $verbosity >= 1; }
sub VERB2() { return $verbosity >= 2; }
sub VERB5() { return $verbosity >= 5; }
sub VERB9() { return $verbosity >= 9; }

# my @noclose = qw(img link meta);
my @noclose = qw( meta link area base basefont br frame hr isindex param bgsound embed keygen img );

# tags which do NOT need a closing, like </p>, tag
my @optclose = ( "body", "colgroup", "dd", "dt", "head", "html", "li", "optgroup", "option",
"p", "tbody", "td", "tfoot", "th", "thead", "tr", "marquee" );
# my @optclose = qw(li);

my @html_ext = qw( .htm .html .shtml .php );
my @graf_ext = qw( .jpg .jpeg .gif .png .bmp .ico .mpg .tif );

# debug
my $debug_on = 1;
my $def_file = 'C:\HOMEPAGE\GA\fg\release260.htm';
my $def_out = 'C:\GTools\perl\temp';
my $do_fetch = 0;   # switch off file fetch
my $dbg_css = 0;

# 1,031 bridge.jpg
my $test_url = "http://geoffair.org/images/bridge.jpg";
#my $test_url = "http://geoffair.org/images/spacer.gif";
my $test_out = $temp_dir.$PATH_SEP."tempimg.jpg";

sub show_warnings($) {
    my ($val) = @_;
    if (@warnings) {
        prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
        foreach my $itm (@warnings) {
           prt("$itm\n");
        }
        prt("\n");
    } else {
        prt( "\nNo warnings issued.\n\n" ) if (VERB9());
    }
}

sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if (!($msg =~ /\n$/));
        prt($msg);
    }
    show_warnings($val);
    close_log($outfile,$load_log);
    exit($val);
}


sub prtw($) {
   my ($tx) = shift;
   $tx =~ s/\n$//;
   prt("$tx\n");
   push(@warnings,$tx);
}

sub is_noclose_tag($) {
    my ($tag) = @_;
    my $lctag = lc($tag);
    my ($tst);
    foreach $tst (@noclose) {
        return 1 if ($tst eq $tag);
    }
    return 0;
}

sub is_optclose_tag($) {
    my ($tag) = @_;
    my $lctag = lc($tag);
    my ($tst);
    foreach $tst (@optclose) {
        return 1 if ($tst eq $tag);
    }
    return 0;
}

sub is_ext_html($) {
    my $ext = shift;
    $ext = lc($ext);
    my ($tst);
    foreach $tst (@html_ext) {
        return 1 if ($tst eq $ext);
    }
    return 0;
}

sub is_ext_graf($) {
    my $ext = shift;
    $ext = lc($ext);
    my ($tst);
    foreach $tst (@graf_ext) {
        return 1 if ($tst eq $ext);
    }
    return 0;
}


sub file_has_htm_ext($) {
    my $fn = shift;
    my ($nm,$dir,$ext) = fileparse($fn, qr/\.[^.]*/);
    if ($ext && length($ext)) {
        return 4 if ($ext eq '.css');
        return 3 if (is_ext_graf($ext));
        return 2 if (is_ext_html($ext));
        return 1;
    }
    return 0;
}

sub fetch_url {
   my ($url,$fil) = @_;
   prt( "Fetching: $url...\n" );
   my $img = get($url);
    my ($sb,$sz,$isz);
   if ($img) {
        $isz = length($img);
        prt("Writing $isz to $fil...\n");
        if (open OF, ">$fil") {
            binmode OF;
            print OF $img;
            close OF;
            if ($sb = stat($fil)) {
                $sz = $sb->size;
                prt("Written $sz bytes to $fil\n");
            }
        } else {
            pgm_exit(1,"ERROR: Unable to create file [$fil]\n");
        }
    } else {
        prtw("WARNING: get of URL '$url' FAILED!\n");
    }
}

sub get_attr_hash($) {
    my $ra = shift;
    my %hash = ();
    my ($itm,$key,$val,$len);
    foreach $itm (@{$ra}) {
        if ($itm =~ /^(\w+)=/) {
            $key = lc($1);
            $len = length($key) + 1;
            $val = substr($itm,$len);
            $val = strip_quotes($val);
            $hash{$key} = $val;
        }
    }
    return \%hash;
}

sub sanetise_filename($) {
    my $rn = shift;
    ${$rn} =~ s/\?/_/g;
    ${$rn} =~ s/\*/_/g;
    ${$rn} =~ s/&amp;/&/g;
    ${$rn} =~ s/%3D/=/g;
    ${$rn} =~ s/:/_/g;
}

#    $hash{'REF_HASH'} = $rhash;
#    $hash{'H_CLASSES'} = \%h_classes;
#    $hash{'H_NAMES'} = \%h_names;
#    $hash{'H_ELEMENTS'} = \%h_elements;
#    $hash{'H_ELECLASS'} = \%h_eleclass;
#    $hash{'H_ELEHASH'} = \%h_elehash;
#    $hash{'H_ELECOLON'} = \%h_elecolon;
#    $hash{'H_ELEPLUS'} = \%h_eleplus;
#    $hash{'H_OTHERS'} = \%h_others;
sub check_class_list($$) {
    my ($rcc,$rclasses) = @_;
    if (!defined ${$rcc}{'REF_HASH'}) {
        prtw("WARNING: Has passed does NOT contain main ref hash!\n");
        return;
    }
    my $rhash = ${$rcc}{'REF_HASH'};
    my $rh_classes = ${$rcc}{'H_CLASSES'};       # = \%h_classes; begin with '.'
    my $rh_names  = ${$rcc}{'H_NAMES'};         # = \%h_names;   begin with '#'
    my $rh_elements = ${$rcc}{'H_ELEMENTS'};    # = \%h_elements; raw elements
    my $rh_eleclass = ${$rcc}{'H_ELECLASS'};    # = \%h_eleclass; element.class
    my $rh_elehash = ${$rcc}{'H_ELEHASH'};      # = \%h_elehash;  element#class
    my $rh_elecolon = ${$rcc}{'H_ELECOLON'};    # = \%h_elecolon; element:class
    my $rh_eleplus = ${$rcc}{'H_ELEPLUS'};      # = \%h_eleplus;  element+element
    my $rh_others = ${$rcc}{'H_OTHERS'};        # = \%h_others;   NOT any of the above???
    my ($class,$val,$cont,$rh2,$key2,$done,$val2);
    foreach $class (sort keys %{$rclasses}) {
        $val = ${$rclasses}{$class}; # this is only the count of useage
        if (defined ${$rh_classes}{$class}) {
            $class = '.'.$class;
            if (defined ${$rhash}{$class}) {
                $rh2 = ${$rhash}{$class};
                $cont = '';
                foreach $key2 (sort keys %{$rh2}) {
                    $val = ${$rh2}{$key2};
                    $cont .= "$key2:$val;"
                }
                prt("$class { $cont }\n");
            } else {
                prt("$class NOT FOUND! dot.class format\n");
            }
        } elsif (defined ${$rh_names}{$class}) {
            $class = '#'.$class;
            if (defined ${$rhash}{$class}) {
                $rh2 = ${$rhash}{$class};
                $cont = '';
                foreach $key2 (sort keys %{$rh2}) {
                    $val = ${$rh2}{$key2};
                    $cont .= "$key2:$val;"
                }
                prt("$class { $cont }\n");
            } else {
                prt("$class NOT FOUND! hash.class format\n");
            }
        } else {
            $done = 0;
            foreach $key2 (keys %{$rh_eleclass}) {
                $val = ${$rh_eleclass}{$key2};
                foreach $val2 (@{$val}) {
                    ###prt("Checking $key2.$val2\n");
                    if ($val2 eq $class) {
                        $class = $key2.".".$class;
                        if (defined ${$rhash}{$class}) {
                            $rh2 = ${$rhash}{$class};
                            $cont = '';
                            foreach $key2 (sort keys %{$rh2}) {
                                $val = ${$rh2}{$key2};
                                $cont .= "$key2:$val;"
                            }
                            prt("$class { $cont }\n");

                        } else {
                            prt("$class NOT FOUND! element.class format\n");
                        }
                        $done = 1;
                        last;
                    }
                }
            }

            if ($done == 0) {
                prt("$class NOT FOUND!\n");
            }
        }
    }
    ### pgm_exit(1,"TEMP EXIT 2");
}

sub process_html_file($) {
    my ($inf) = @_;
    if (! open INF, "<$inf") {
        pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); 
    }
    my @lines = <INF>;
    close INF;
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");
    my ($file_name,$file_dir) = fileparse($inf);
    my ($line,$inc,$lnn);
    my ($inquot,$intag,$i,$len,$ch,$qc,$tag,@arr,$ptag,$cnt,$rh,$val,$fn,$of,$res);
    my ($tmp,$msg,$tscnt,$ff,$nf,$rcss,$ra,$rceh);
    $lnn = 0;
    $intag = 0;
    $tag = '';
    my @tagstack = ();
    my %classes = ();
    my %class_element = ();
    my %elements = ();
    my %element_list = ();
    my %local_jumps = ();
    my %local_names = ();
    my %local_id = ();
    my %file_srcs = ();
    my %file_hrefs = ();
    my %remote_srcs = ();
    my %remote_hrefs =();
    my %css_hash = ();
    foreach $line (@lines) {
        chomp $line;
        $lnn++;
        $line = trim_all($line);
        $len = length($line);
        next if ($len == 0);
        $inquot = 0;
        for ($i = 0; $i < $len; $i++) {
            $ch = substr($line,$i,1);
            if ($intag) {
                $tag .= $ch;
                if ($inquot) {
                    $inquot = 0 if ($ch eq $qc);
                    prt("$lnn: End   quote...($ch)\n") if (($inquot == 0) && VERB9());
                } elsif (($ch eq '"')||($ch eq "'")) {
                    $inquot = 1;
                    $qc = $ch;
                    prt("$lnn: Begin quote...($ch)\n") if (VERB9());
                } else {
                    if ($ch eq '>')  {
                        $intag = 0;
                        $tag =~ s/^<(.*)>$/$1/;
                        #@arr = split(/\s+/,$tag);
                        @arr = space_split($tag);
                        $cnt = scalar @arr;
                        $rh = get_attr_hash(\@arr);
                        $tag = $arr[0];
                        if ($tag =~ /^\//) {
                            # CLOSE TAG ie </element>
                            $tag = substr($tag,1);
                            $tscnt = scalar @tagstack;
                            if ($tscnt) {
                                $ptag = $tagstack[-1];  # get last
                                if (lc($tag) eq lc($ptag)) {
                                    $ptag = pop @tagstack;
                                    $tscnt = scalar @tagstack;
                                    prt("$lnn: Close tag [$tag] [$ptag]$tscnt...\n") if (VERB2());
                                } else {
                                    if (is_optclose_tag($ptag) && ($tscnt > 1)) {

                                    }
                                    prtw("WARNING:$lnn: Close tag [$tag] ne [$ptag], but NOT last on STACK!\n");
                                }
                            } else {
                                prtw("WARNING:$lnn: Close tag [$tag], but NONE on STACK!\n");
                            }
                        } elsif ($tag =~ /^!/) {
                            # SPECIAL TAG - <!DOCTYPE...>
                            prt("$lnn: Special tag [$tag]...\n") if (VERB9());
                        } elsif (is_noclose_tag($tag)) {
                            # ELEMENTS WHICH DO NOT HAVE A CLOSE
                            prt("$lnn: Noclose tag [$tag]...\n") if (VERB5());
                        } else {
                            # normal ELEMENT
                            push(@tagstack,$tag);
                            $element_list{$tag} = 1;
                            $elements{$tag} = [ ] if (!defined $elements{$tag});
                            $ra = $elements{$tag};
                            push(@{$ra},$rh);   # store the attribute hash for the element
                            $msg = '';
                            foreach $tmp (@tagstack) {
                                $msg .= ":" if (length($msg));
                                $msg .= $tmp;
                            }
                            $tscnt = scalar @tagstack;
                            prt("$lnn: Push tag [$tag]$tscnt... $msg\n") if (VERB2());
                            if (VERB9()) {
                                @arr = sort keys(%{$rh});
                                $msg = '';
                                foreach $tmp (@arr) {
                                    $val = ${$rh}{$tmp};
                                    $msg .= ' ' if (length($msg));
                                    $msg .= "$tmp=\"$val\"";
                                }
                                prt("$lnn: Attributes: $msg\n") if (length($msg));
                            }
                        }
                        if (defined ${$rh}{'src'}) {
                            $val = ${$rh}{'src'};
                            if ($val =~ /^http:\/\//i) {
                                @arr = split(/\//,$val);
                                $fn = $arr[-1];
                                prt("$lnn: src = [$val] file [$fn]\n") if (VERB1());
                                $of = $out_dir.$PATH_SEP.$fn;
                                fetch_url($val,$of) if ($do_fetch);
                                $remote_srcs{$val} = $of;
                            } else {
                                $ff = $file_dir.$val;
                                $nf = 1;
                                if (-f $ff) {
                                    $msg = 'ok';
                                    $nf = 0;
                                } else {
                                    $msg = "NOT FOUND!";
                                }
                                prt("$lnn: src = [$val] $msg\n") if ($nf || VERB1());
                                $file_srcs{$val} = $ff;
                            }
                        }
                        if (defined ${$rh}{'href'}) {
                            $val = ${$rh}{'href'};
                            if ($val =~ /^http:\/\//i) {
                                @arr = split(/\//,$val);
                                $fn = $arr[-1];
                                sanetise_filename(\$fn);
                                $res = file_has_htm_ext($fn);
                                if ($res == 0) {
                                    $fn .= ".htm";
                                } elsif ($res == 1) {
                                    $fn .= ".html";
                                }
                                prt("$lnn: src = [$val] file [$fn]\n") if (VERB1());
                                $of = $out_dir.$PATH_SEP.$fn;
                                fetch_url($val,$of) if ($do_fetch);
                                $remote_hrefs{$val} = $of;
                            } elsif ($val =~ /^\#/) {
                                # local jump target
                                $val = substr($val,1);
                                $local_jumps{$val} = 1;
                            } else {
                                $ff = $file_dir.$val;
                                $nf = 1;
                                if (-f $ff) {
                                    $msg = 'ok';
                                    $nf = 0;
                                    if (lc($tag) eq 'link') {
                                        $rcss = read_css_file(\%css_hash,$ff,$dbg_css);
                                        ### pgm_exit(1,"TEMP EXIT");
                                    }
                                } else {
                                    $msg = "NOT FOUND!";
                                }
                                prt("$lnn: href = [$val] $msg\n") if ($nf || VERB1());
                                $file_hrefs{$val} = $ff;
                            }
                        }
                        if (defined ${$rh}{'class'}) {
                            $val = ${$rh}{'class'};
                            $val = strip_quotes($val);
                            @arr = split(/\s+/,$val);
                            foreach $tmp (@arr) {
                                if (defined $classes{$tmp}) {
                                    $classes{$tmp}++;
                                } else {
                                    $classes{$tmp} = 1;
                                }
                                $class_element{$tmp} = { } if (!defined $class_element{$tmp});
                                $rceh = $class_element{$tmp};
                                ${$rceh}{$tag} = 1;
                            }
                        }
                        if (defined ${$rh}{'name'}) {
                            $val = ${$rh}{'name'};
                            $val = strip_quotes($val);
                            if (defined $local_names{$val}) {
                                prtw("WARNING:$lnn: Attribute name [$val] REPEATED!\n");
                            } else {
                                $local_names{$val} = 1;
                            }
                        }
                        if (defined ${$rh}{'id'}) {
                            $val = ${$rh}{'id'};
                            $val = strip_quotes($val);
                            if (defined $local_id{$val}) {
                                prtw("WARNING:$lnn: Attribute name [$val] REPEATED!\n");
                            } else {
                                $local_id{$val} = 1;
                            }
                        }
                        $tag = ''# kill this tag
                    }
                }
            } else {
                if ($ch eq '<') {
                    $intag = 1;
                    prt("$lnn: Begin tag...\n") if (VERB9());
                    $tag = $ch;
                }
            }
        } # for this line
        $tag .= ' ' if (length($tag));
        if ($inquot) {
            prtw("WARNING:$lnn: Ended line in QUOTES!\n");
            $inquot = 0;
        }
    }
    @arr = sort keys(%local_jumps);
    $tscnt = scalar @arr;
    if ($tscnt) {
        $msg = '';
        foreach $tmp (@arr) {
            $msg .= ' ' if (length($msg));
            $msg .= "$tmp";
            if (! defined $local_names{$tmp}) {
                $msg .= " NOT FOUND!";
            }
        }
        prt("Found $tscnt jumps: $msg\n");
    }
    my $rcc = get_class_counts(\%css_hash);
    @arr = sort keys(%classes);
    $tscnt = scalar @arr;
    if ($tscnt) {
        $msg = '';
        foreach $tmp (@arr) {
            $val = $classes{$tmp};
            $msg .= ' ' if (length($msg));
            $msg .= "$tmp($val)"
        }
        prt("Found $tscnt classes: $msg\n");
        check_class_list($rcc,\%classes) if (VERB9());
    }
    $tscnt = scalar @tagstack;
    if ($tscnt) {
        $msg = '';
        foreach $tmp (@tagstack) {
           $msg .= ":" if (length($msg));
           $msg .= $tmp;
        }
        prtw("WARNING: $lnn: End of file [$inf], with tag stack [$msg] NOT cleared.\n");
    } else {
        prt("$lnn: End of file [$inf], with tag stack cleared.\n");
    }
    my %hash = ();
    $hash{'INPUT_FILE'} = $inf;
    $hash{'CLASSES'} = \%classes;
    $hash{'ELEMENTS'} = \%elements;
    $hash{'CLASS_ELEMENT'} = \%class_element;
    $hash{'ELEMENT_LIST'} = \%element_list;
    $hash{'LOCAL_JUMPS'} = \%local_jumps;
    $hash{'LOCAL_NAMES'} = \%local_names;
    $hash{'LOCAL_ID'} = \%local_id;
    $hash{'FILE_SRCS'} = \%file_srcs;
    $hash{'FILE_HREFS'} = \%file_hrefs;
    $hash{'REMOTE_SRCS'} = \%remote_srcs;
    $hash{'REMOTE_HREFS'} = \%remote_hrefs;
    $hash{'CSS_HAHS'} = \%css_hash;
    return \%hash;

}

sub process_in_file($) {
    my $file = shift;
    my $ref_hash = process_html_file($file);
}

#########################################
### MAIN ###
#fetch_url($test_url,$test_out);
#pgm_exit(1,"EOP");
parse_args(@ARGV);
process_in_file($in_file);
pgm_exit(0,"");
########################################
sub give_help {
    prt("$pgmname: version $VERS\n");
    prt("Usage: $pgmname -o dir [options] in-file\n");
    prt("Options:\n");
    prt(" --help  (-h or -?) = This help, and exit 0.\n");
    prt(" --verb[n]     (-v) = Bump [or set] verbosity. def=$verbosity\n");
    prt(" --load        (-l) = Load LOG at end. ($outfile)\n");
    prt(" --out <dir>   (-o) = Write output to this directory.\n");
    prt(" Read the input HTML page, and fetch any remote href or src items,\n");
    prt(" and write them to the output directory.\n");
}

sub need_arg {
    my ($arg,@av) = @_;
    pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av);
}

sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg);
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            if (($sarg =~ /^h/i)||($sarg eq '?')) {
                give_help();
                pgm_exit(0,"Help exit(0)");
            } elsif ($sarg =~ /^v/) {
                if ($sarg =~ /^v.*(\d+)$/) {
                    $verbosity = $1;
                } else {
                    while ($sarg =~ /^v/) {
                        $verbosity++;
                        $sarg = substr($sarg,1);
                    }
                }
                prt("Verbosity = $verbosity\n") if (VERB1());
            } elsif ($sarg =~ /^l/) {
                $load_log = 1;
                prt("Set to load log at end.\n") if (VERB1());
            } elsif ($sarg =~ /^o/) {
                need_arg(@av);
                shift @av;
                $sarg = $av[0];
                $out_dir = $sarg;
                prt("Set out directory to [$out_dir].\n") if (VERB1());
            } else {
                pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
            }
        } else {
            $in_file = $arg;
            prt("Set input to [$in_file]\n") if (VERB1());
        }
        shift @av;
    }

    if ((length($in_file) ==  0) && $debug_on) {
        $in_file = $def_file;
        prt("Set DEFAULT input to [$in_file]\n");
        $out_dir = $def_out;
        $load_log = 1;
        #$verbosity = 1;
    }
    if (length($in_file) ==  0) {
        pgm_exit(1,"ERROR: No input file found in command!\n");
    }
    if (! -f $in_file) {
        pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
    }
    if (length($out_dir) == 0) {
        pgm_exit(1,"ERROR: No output directory found in command!\n");
    }
    if (! -d $out_dir) {
        pgm_exit(1,"ERROR: Unable to find output directory [$out_dir]!\n");
    }
}

# eof - template.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional