Generated: Tue Feb 2 17:54:22 2010 from autoword03.pl 2006/06/16 4.8 KB.
#!/Perl # Series: Microsoft Word Automation # original from : http://www.stouk.com/documents/perl/gui/guiref/page04.htm # purpose: To load a Microsoft Word document, get the contents of the document, # and put all the words into a HASH ... sort and show the HASH ... # NOTE: If a NEW application is loaded, or the document is LARGE, the # script can appear stalled. Increase $verb to 9 to see more action # happening ... sometimes 30-50 seconds to process even 1 page ... I assume # something to do with getting the OLE server running ... # Setting word visible seems to INCREASE the time taken? # It seems to run FASTEST when a copy of Word is already open ... # author: geoff mclane - email: geoffair _at_ hotmail _dot_ com - 2006-06-12 # # NOTE WELL: ON LARGE FILES, THIS CAN TAKE A VERY LONG TIME, UNLESS # YOU ALREADY HAVE A COPY OF WORD RUNNING. WHEN THERE IS A COPY OF # WORD RUNNING, THE RESULTS CAN BE QUITE QUICK!!!!! # IN FACT IT MAY BE COMPLETELY STALLING IF AN INSTANCE OF WORD IS NOT # ALREADY RUNNING - THIS INSTANCE ALSO GETS CLOSED AT THE END ... # use strict; use Win32::OLE qw(in with); use Win32::OLE::Const 'Microsoft Word'; # *** ALTER THIS TO POINT TO YOUR OWN TEST DOCUMENT *** ####################################################### my $infile = 'c:\tmp\test.doc'; # ##################################################### my $inhalt = ''; my %distinct = (); # TO HOLD THE FINAL LIST # just for LOG FILE ouput ... my ($LOG); my $write_log = 0; my $verb = 1; # increase to 9 to see more output my $outfile = "$0.txt"; # note name of perl file used as base if ( open( $LOG, ">$outfile" ) ) { $write_log = 1; # we have a LOG file } else { $write_log = 0; prt( "WARNING: Unable to open $outfile LOG ...\n" ); } # a stepped approach to openning, or loading Microsoft Word prt( "Attaching to Word application ...\n" ); my $Word = Win32::OLE->GetActiveObject('Word.Application'); if ($Word) { prt( "Using existing running Word application ...\n" ); } else { prt( "Starting NEW Word application ...\n" ); $Word = Win32::OLE->new('Word.Application', 'Quit'); if ($Word) { prt("New application running ...\n"); } else { mydie( "ERROR: Failed to load Word application ...\n" ); } } ###prt( "Setting word as visible ...\n" ); ###$Word->{'Visible'} = 1; # if you want to see something, but it seems to SLOW UP things ;=)) # Load the application with the document prt( "Openning document $infile ...\n" ); $Word->Documents->Open($infile) || mydie("Unable to open [$infile] document!\nError: ". Win32::OLE->LastError() . "\n"); prt( "Getting contents of the ActiveDocument ... wait ...\n" ); my $myRange = $Word->ActiveDocument->Content; prt( "Processing ActiveDocument contents ... wait ...\n" ); # Collections - Characters Words Sentences Paragraphs Sections HeadersFooters foreach my $word (in $myRange->Words){ $inhalt = lc($word->{Text}); # try to trim it up a bit chomp $inhalt; # remove trailing \n char, if any ... $inhalt =~ s/^\'//; # remove any beginning single quotes $inhalt =~ s/^\"//g; # remove any beginning double quotes $inhalt =~ s/\'$//; # remove any ending single quotes $inhalt =~ s/\"$//g; # remove any ending double quotes prtv9( "Got lc text [$inhalt] ...\n" ); next if not $inhalt =~ m/^[a-z]{2,}/; # forget it if not start with 2 alpha ###$inhalt =~ s/[\s] $//i; ##$inhalt =~ s/[\s] $//g; # remove trailing spaces while ($inhalt =~ / $/) { $inhalt =~ s/ $//g; # remove any trailing spaces } prtv9( "Modified to [$inhalt], " ); ### $distinct{$inhalt}; # was this ### $distinct{$inhalt} = $inhalt; # tried this # but better to keep count if (defined $distinct{$inhalt} ) { $distinct{$inhalt} = $distinct{$inhalt} + 1; prtv9( "and bumped count to $distinct{$inhalt}...\n" ); } else { prtv9( "and stored first time...\n" ); $distinct{$inhalt} = 1; } } my $wcnt = keys( %distinct ); prt( "Showing sorted output per $wcnt HASH keys ... and the count for each ...\n" ); my $cnt = 0; foreach my $wort (sort keys %distinct){ $cnt++; if ($cnt < 10) { prt(" $cnt "); } elsif ($cnt < 100) { prt(" $cnt "); } else { prt("$cnt "); } prt( "[$wort] $distinct{$wort}\n" ); } prt( "Done $cnt output of sorted keys, with count ...\n" ); prt( "Closing and quitting word ...\n" ); $Word->Documents->Close; $Word->Documents->Quit; $Word->Quit; # quit the application prt("Done ...\n"); ################################ ### output and log file sub wlog { my $ml = shift; print $LOG $ml; } sub prt { my $m = shift; if ($write_log) { wlog($m); } print $m; } sub prtv9 { my $ms = shift; if ($verb > 8) { prt($ms); } } sub mydie { my $msg = shift; if ($write_log) { wlog($msg); } die $msg; } # eof - autoword03.pl