Generated: Tue Feb 2 17:55:00 2010 from wordindex01.pl 2007/08/22 5 KB.
#!/perl -w # NAME: wordindex01.pl # AIM: Given a FILE, load it in WORD, extract the text, and build an alphabetic # index of words ... # Uses Word OLE engine # see http://www.ngbdigital.com/perl_ole_word.html # 21/08/2007 geoff mclane - http://geoffair.net/mperl # use strict; use warnings; use Win32::OLE; use Win32::OLE qw(in with); use Win32::OLE::Variant; use Win32::OLE::Const 'Microsoft Word'; require 'logfile.pl' or die "Unable to load logfile.pl ...\n"; # log file stuff my ($LF); my $outfile = 'temp.'.$0.'.txt'; if ($0 =~ /\w{1}:\\.*/) { my @tmpsp = split(/\\/,$0); $outfile = 'temp.'.($tmpsp[-1]).'.txt'; } open_log($outfile); prt( "$0 ... Hello, World ...\n" ); my $in_file = 'C:\Documents and Settings\Geoff McLane\My Documents\Tidy\Php-01.doc'; my @common = qw( am as be br but by can do eof etc for from get got has have hi if in it its may my no not now of or re see so some an on such sure at to too us is was with you ); sub in_common { my ($tx) = shift; foreach my $t (@common) { if ($t eq $tx) { return 1; } } return 0; } # debug my $dbg1 = 0; # show stored value my $dbg2 = 0; # show REPEATED words my $dbg3 = 0; # show progress each 100 words my $dbg9 = 1; # show actions sent to prtv9 ... my %distinct = (); # TO HOLD THE FINAL LIST enumerate_doc( $in_file ); my $wcnt = keys( %distinct ); prt( "Showing sorted output per $wcnt HASH keys ... and the count for each ...\n" ); my $cnt = 0; foreach my $wort (sort keys %distinct){ $cnt++; if ($cnt < 10) { prt(" $cnt "); } elsif ($cnt < 100) { prt(" $cnt "); } else { prt("$cnt "); } prt( "[$wort] $distinct{$wort}\n" ); } prt( "Done $cnt output of sorted keys, with count ...\n" ); close_log($outfile,1); exit(0); ##################################### ###### subs sub Quit { my( $Obj ) = @_; $Obj->Quit(); } sub enumerate_doc { my ($infile) = shift; my $wdcnt = 0; my $lcword = ''; my $newcnt = 0; # a stepped approach to openning, or loading Microsoft Word prt( "Attaching to Word application ...\n" ); my $Word = Win32::OLE->GetActiveObject('Word.Application'); if ($Word) { prt( "Using existing running Word application ...\n" ); } else { prt( "Starting NEW Word application ...\n" ); $Word = Win32::OLE->new('Word.Application', 'Quit'); if ($Word) { prt("New application running ...\n"); } else { mydie( "ERROR: Failed to load Word application ...\n" ); } } $Word->{'Visible'} = 0; $Word->{DisplayAlerts} = 0; # Load the application with the document prt( "Openning document $infile ...\n" ); my $Doc = $Word->Documents->Open($infile) || mydie("Unable to open [$infile] document!\nError: ". Win32::OLE->LastError() . "\n"); prt( "Getting contents of the ActiveDocument ... wait ...\n" ); my $myRange = $Word->ActiveDocument->Content; # Collections - Characters Words Sentences Paragraphs Sections HeadersFooters foreach my $word (in $myRange->Words){ $wdcnt++; if (($wdcnt % 100) == 0) { prt( "Processed $wdcnt words ...\n" ) if ($dbg3); } $lcword = lc($word->{Text}); # extract the 'word', in lowercase # try to trim it up a bit chomp $lcword; # remove trailing \n char, if any ... $lcword = replace_hibits($lcword); # have seen 0xA0 in string - replace with SPACE $lcword = remove_quotes($lcword); # remove any QUOTES, " or ' at begin/end $lcword = trim_all($lcword); # trim it up $lcword = remove_quotes($lcword); # remove any onner quotes $lcword = trim_all($lcword); # and trim AGAIN ###next if not $lcword =~ m/^[a-z]{2,}/; # forget it if not start with 2 alpha next if ( !($lcword =~ /^\w{2}/) ); # forget it if not start with 2 alphanumeric if ($lcword =~ /^\d+$/) { next if (length($lcword) < 4); # dump numbers less than length 4 } next if (in_common($lcword)); # exclude a bumch of 'common' words if (length($lcword) > 3) { # tried to exclude plurals, but mainly FAILED if (substr($lcword,length($lcword)-1) eq 's') { my $tmp = substr($lcword,0,length($lcword)-1); next if (defined $distinct{$tmp} ); } } # keep count of words collected if (defined $distinct{$lcword} ) { $distinct{$lcword}++; prtv9( "[$lcword] bumped count to $distinct{$lcword}...\n" ) if ($dbg2); } else { prtv9( "[$lcword] stored ...\n" ) if ($dbg1); $distinct{$lcword} = 1; $newcnt++; } } prt( "Processed $wdcnt words ... collected $newcnt ...\n" ); } sub prtv9 { my ($txt) = shift; prt( "$txt" ) if ($dbg9); } sub remove_quotes { my ($tx) = shift; $tx =~ s/^'//; # remove any beginning single quotes $tx =~ s/^"//g; # remove any beginning double quotes $tx =~ s/'$//; # remove any ending single quotes $tx =~ s/"$//g; # remove any ending double quotes return $tx; } sub replace_hibits { my ($tx) = shift; my $mx = length($tx); my $ntx = ''; my ($ch, $val); for (my $i = 0; $i < $mx; $i++) { $ch = substr($tx,$i,1); $val = ord($ch); if ($val > 127) { $ch = ' '; } $ntx .= $ch; } return $ntx; } # eof - wordindex01.htm