autoword03.pl to HTML.

index -|- end

Generated: Tue Feb 2 17:54:22 2010 from autoword03.pl 2006/06/16 4.8 KB.

#!/Perl
# Series: Microsoft Word Automation
# original from : http://www.stouk.com/documents/perl/gui/guiref/page04.htm
# purpose: To load a Microsoft Word document, get the contents of the document,
# and put all the words into a HASH ... sort and show the HASH ...
# NOTE: If a NEW application is loaded, or the document is LARGE, the
# script can appear stalled. Increase $verb to 9 to see more action
# happening ... sometimes 30-50 seconds to process even 1 page ... I assume
# something to do with getting the OLE server running ...
# Setting word visible seems to INCREASE the time taken?
# It seems to run FASTEST when a copy of Word is already open ...
# author: geoff mclane - email: geoffair _at_ hotmail _dot_ com - 2006-06-12
#
# NOTE WELL: ON LARGE FILES, THIS CAN TAKE A VERY LONG TIME, UNLESS
# YOU ALREADY HAVE A COPY OF WORD RUNNING. WHEN THERE IS A COPY OF
# WORD RUNNING, THE RESULTS CAN BE QUITE QUICK!!!!!
# IN FACT IT MAY BE COMPLETELY STALLING IF AN INSTANCE OF WORD IS NOT
# ALREADY RUNNING - THIS INSTANCE ALSO GETS CLOSED AT THE END ...
#
use strict; 
use Win32::OLE qw(in with); 
use Win32::OLE::Const 'Microsoft Word'; 
# *** ALTER THIS TO POINT TO YOUR OWN TEST DOCUMENT ***
#######################################################
my $infile = 'c:\tmp\test.doc';
# #####################################################
my $inhalt = '';
my %distinct = (); # TO HOLD THE FINAL LIST
# just for LOG FILE ouput ...
my ($LOG);
my $write_log = 0;
my $verb = 1; # increase to 9 to see more output
my $outfile = "$0.txt"; # note name of perl file used as base
if ( open( $LOG, ">$outfile" ) ) {
    $write_log = 1; # we have a LOG file
} else {
    $write_log = 0;
    prt( "WARNING: Unable to open $outfile LOG ...\n" );
}
# a stepped approach to openning, or loading Microsoft Word
prt( "Attaching to Word application ...\n" );
my $Word = Win32::OLE->GetActiveObject('Word.Application');
if ($Word) {
   prt( "Using existing running Word application ...\n" );
} else {
   prt( "Starting NEW Word application ...\n" );
   $Word = Win32::OLE->new('Word.Application', 'Quit'); 
   if ($Word) {
      prt("New application running ...\n");
   } else {
      mydie( "ERROR: Failed to load Word application ...\n" );
   }
}
###prt( "Setting word as visible ...\n" );
###$Word->{'Visible'} = 1; # if you want to see something, but it seems to SLOW UP things ;=))
# Load the application with the document
prt( "Openning document $infile ...\n" );
$Word->Documents->Open($infile) 
|| mydie("Unable to open [$infile] document!\nError: ". Win32::OLE->LastError() . "\n"); 
prt( "Getting contents of the ActiveDocument ... wait ...\n" );
my $myRange = $Word->ActiveDocument->Content; 
prt( "Processing ActiveDocument contents ... wait ...\n" );
# Collections - Characters Words Sentences Paragraphs Sections HeadersFooters 
foreach my $word (in $myRange->Words){ 
   $inhalt = lc($word->{Text});
   # try to trim it up a bit
   chomp $inhalt; # remove trailing \n char, if any ...
   $inhalt =~ s/^\'//; # remove any beginning single quotes
   $inhalt =~ s/^\"//g; # remove any beginning double quotes
   $inhalt =~ s/\'$//; # remove any ending single quotes
   $inhalt =~ s/\"$//g; # remove any ending double quotes
   prtv9( "Got lc text [$inhalt] ...\n" );
   next if not $inhalt =~ m/^[a-z]{2,}/; # forget it if not start with 2 alpha 
   ###$inhalt =~ s/[\s] $//i; 
   ##$inhalt =~ s/[\s] $//g; # remove trailing spaces
   while ($inhalt =~ / $/) {
      $inhalt =~ s/ $//g; # remove any trailing spaces 
   }
   prtv9( "Modified to [$inhalt], " );
   ### $distinct{$inhalt}; # was this
   ### $distinct{$inhalt} = $inhalt; # tried this
   # but better to keep count
   if (defined $distinct{$inhalt} ) {
      $distinct{$inhalt} = $distinct{$inhalt} + 1;
      prtv9( "and bumped count to $distinct{$inhalt}...\n" );
   } else {
      prtv9( "and stored first time...\n" );
      $distinct{$inhalt} = 1;
   }
} 
my $wcnt = keys( %distinct );
prt( "Showing sorted output per $wcnt HASH keys ... and the count for each ...\n" );
my $cnt = 0;
foreach my $wort (sort keys %distinct){
   $cnt++;
   if ($cnt < 10) {
      prt("  $cnt ");
   } elsif ($cnt < 100) {
      prt(" $cnt ");
   } else {
      prt("$cnt ");
   }
   prt( "[$wort] $distinct{$wort}\n" ); 
} 
prt( "Done $cnt output of sorted keys, with count ...\n" );
prt( "Closing and quitting word ...\n" );
$Word->Documents->Close; 
$Word->Documents->Quit; 
$Word->Quit; # quit the application
prt("Done ...\n");
################################
### output and log file
sub wlog {
   my $ml = shift;
   print $LOG $ml;
}
sub prt {
   my $m = shift;
   if ($write_log) {
      wlog($m);
   }
   print $m;
}
sub prtv9 {
   my $ms = shift;
   if ($verb > 8) {
      prt($ms);
   }
}
sub mydie {
   my $msg = shift;
   if ($write_log) {
      wlog($msg);
   }
   die $msg;
}
# eof - autoword03.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional