Generated: Tue Feb 2 17:54:22 2010 from autoword03b.pl 2006/06/16 10.5 KB.
#!/Perl # Series: Microsoft Word Automation # original from : http://www.stouk.com/documents/perl/gui/guiref/page04.htm # purpose: To load a Microsoft Word document, get the contents of the document, # and put all the words into a HASH ... sort and show the HASH ... # NOTE: If a NEW application is loaded, or the document is LARGE, the # script can appear stalled. Increase $verb to 9 to see more action # happening ... sometimes 30-50 seconds to process even 1 page ... I assume # something to do with getting the OLE server running ... # Setting word visible seems to INCREASE the time taken? # It seems to run FASTEST when a copy of Word is already open ... # author: geoff mclane - email: geoffair _at_ hotmail _dot_ com - 2006-06-12 # # NOTE WELL: ON LARGE FILES, THIS CAN TAKE A VERY LONG TIME, UNLESS # YOU ALREADY HAVE A COPY OF WORD RUNNING. WHEN THERE IS A COPY OF # WORD RUNNING, THE RESULTS CAN BE QUITE QUICK!!!!! # IN FACT IT MAY BE COMPLETELY STALLING IF AN INSTANCE OF WORD IS NOT # ALREADY RUNNING - THIS INSTANCE ALSO GETS CLOSED AT THE END ... # IT CAN TAKE A MINUTE OR TWO - A LONG TIME TO WAIT ;=)) # like Done ... in 80.1525819301605 seconds ... # WITH FURTHER TESTS, IT DOES APPEAR TO STALL IF NO OTHER # WORD INSTANCE RUNNING, AND THE APPLICATION IS LEFT HIDDEN? # HARD TO WORK OUT ... # # MORE TRIES - it appears - # prt( "Getting object for $infile ...\n" ); # $WordObj = Win32::OLE->GetObject($infile); # works MUCH FASTER ... # Done ... in 7.13679909706116 seconds ... # WITH OR WITHOUT ANOTHER INSTANCE RUNNING!!! # use strict; use Win32::OLE qw(in with); use Win32::OLE::Const 'Microsoft Word'; use Time::HiRes qw(gettimeofday); # provide more accurate timings # *** ALTER THIS TO POINT TO YOUR OWN TEST DOCUMENT *** ####################################################### ###my $infile = 'c:\tmp\test.doc'; my $infile = 'c:\tmp\test2.doc'; # ##################################################### my @exclwordlist = qw( aborting accept action active addition all already alter an and any anything appear appears append application apply approach archive are as associated assume at attaching author background base be before beginning better bit breaks build bumped but can caption char characters checkspelling close closing cmd cnt code collate collections com command commented considerable const content contents continue control copies copy could count created dead default defined deleting die different distinct do doc docs documents does doing done dos double each else elsif email end ending enumerations eof error even examples existing exit extensible extracts failed faq fastest file fileformat filename files filesearch filtered final first following for forget format formats found from full function geoff geoffair get getting going got guiref had happening have headersfooters help hence hold hotmail htm html http hv05213080 if in increase index infile information inhalt initial input into is it item items its june just kcnt keep key keys language large lasterror lc left library library' like line list load loaded loading locate log lot lots mailto maybe mclane me microsoft ml mocrosfot modified monday more move ms msdn msg my mydie myrange name need needed nerror new next nmay no not note notepad notes numbers objects of on one online only open openning options or original other ouput out outfile output outputfilename own page page04 pages pagesetup pagetype paragraphs parent part passing path per perl pl point presently preserved previous print printer printout prints printzoomcolumn printzoompaperheight printzoompaperwidth printzoomrow processing properly properties property prt prtv9 purpose put quick quit quite quitting quotes qw readonly remind remove represents resize result review reviewed rich rtf run running saveas script seconds sections see seems selection sendfax sentences series server setting shift should show showing simply single slow so som some something sometimes sort sorted spaces specified stalled standard start starting stepped stored stouk strict stuff sub sure system taken template test tested text that the then there these things this time tmp to trailing tried tries trim trouble try trying txt unable up url us use used username using usually various vbawd11 verb version versions wait want warning was wcnt wd wdalertsnone wdformatdostext wdformatencodedtext wdformatfilteredhtml wdformathtml wdformatrtf wdformattemplate wdformattext wdformattextlinebreaks wdformatunicodetext wdformatwebarchive wdformatxml wdgreen wdopenformatauto wdprintalldocument wdprintdocumentcontent we web what what's whatever when while whole will win32 window windows winfaq12 with wlog womthsaveas1 word words words' work worked wort wrconwordobjectmodeloverview wrcore wrgrfapplicationobject write www xml yeek you your yours ); my $wid = 0; my $inhalt = ''; my %distinct = (); # TO HOLD THE FINAL LIST # just for LOG FILE ouput ... my ($LOG); my $write_log = 0; my $verb = 1; # increase to 9 to see more output my $outfile = "temp.$0.txt"; # note name of perl file used as base if ( open( $LOG, ">$outfile" ) ) { $write_log = 1; # we have a LOG file } else { $write_log = 0; prt( "WARNING: Unable to open $outfile LOG ...\n" ); } my $bt = gettimeofday; my $WordApp = undef; my $WordObj = undef; ### Various ways to do this ... # Collections - Characters Words Sentences Paragraphs Sections HeadersFooters use_get_object(); ### OR ##use_word_app(0); my $wcnt = keys( %distinct ); prt( "Showing sorted output per $wcnt HASH keys ... and the count for each ...\n" ); my $cnt = 0; foreach my $wort (sort keys %distinct){ $cnt++; if ($cnt < 10) { prt(" $cnt "); } elsif ($cnt < 100) { prt(" $cnt "); } else { prt("$cnt "); } prt( "[$wort] $distinct{$wort} " ); if (is_in_common($wort)) { prt( "EXCLUDED" ); } prt("\n"); } prt( "Done $cnt output of sorted keys, with count ...\n" ); prt( "Now output a perl qw list ...\n" ); $wid = 0; my $wmax = 80; prt( "my \@newwordlist = qw(\n" ); foreach my $word (sort keys %distinct){ if ( ! is_in_common($word) ) { my $len = length($word); if (($len + $wid) > $wmax) { prt("\n"); $wid = 0; } $wid += $len + 1; prt($word.' '); } } prt(");\n"); if ($WordApp) { prt( "Closing and quitting word ...\n" ); $WordApp->Documents->Close; $WordApp->Documents->Quit; $WordApp->Quit; # quit the application } prt("Done ... in ". (gettimeofday - $bt) . " seconds ...\n"); exit(0); ####################################################################### sub use_get_object { my $wd; # $document = Win32::OLE->GetObject($infile) prt( "Getting object for $infile ...\n" ); $WordObj = Win32::OLE->GetObject($infile); if ($WordObj) { prt( "Getting the collection ... (object $WordObj)\n" ); my $cwords = $WordObj->Words(); # get collection if ($cwords) { prt( "Getting an enumerator ...(collection $cwords)\n" ); my $enumerate = new Win32::OLE::Enum($cwords); if ($enumerate) { my $cnt = 0; prt( "Processing enumeration ...(enumerator $enumerate)\n" ); while( defined( $wd = $enumerate->Next() ) ) { $cnt++; prtv9( "$cnt $wd " ); # my $txt = lc($wd->{Range}->{Text}); # this is NOT right! my $txt = lc($wd->{Text}); # this works fine ... prtv9( "[$txt] \n" ); add_word($txt); } prt( "Done $cnt enumerations ...\n" ); } else { prt( "Failed to get enumerator ...\n" ); } } else { prt( "Failed to get collection ...\n" ); } } else { prt("Failed to GetObject ...\n"); } } # or paragraphs # $paragraphs = $document->Paragraphs(); # $enumerate = new Win32::OLE::Enum($paragraphs); # while(defined($paragraph = $enumerate->Next())) { # $style = $paragraph->{Style}->{NameLocal}; # $text = $paragraph->{Range}->{Text}; } sub use_word_app { my ($vis) = shift; # a stepped approach to openning, or loading Microsoft Word prt( "Attaching to Word application ...\n" ); $WordApp = Win32::OLE->GetActiveObject('Word.Application'); if ($WordApp) { prt( "Using existing running Word application ...\n" ); } else { prt( "Starting NEW Word application ...\n" ); $WordApp = Win32::OLE->new('Word.Application', 'Quit'); if ($WordApp) { prt("New application running ...\n"); } else { mydie( "ERROR: Failed to load Word application ...\n" ); } } if ($vis) { prt( "Setting word as visible ...\n" ); $WordApp->{'Visible'} = 1; # if you want to see something, but it seems to SLOW UP things ;=)) } # Load the application with the document prt( "Openning document $infile ...\n" ); $WordApp->Documents->Open($infile) || mydie("Unable to open [$infile] document!\nError: ". Win32::OLE->LastError() . "\n"); prt( "Getting contents of the ActiveDocument ... wait ...\n" ); my $myRange = $WordApp->ActiveDocument->Content; prt( "Processing ActiveDocument contents ... wait a while ...\n" ); foreach my $word (in $myRange->Words){ $inhalt = lc($word->{Text}); add_word($inhalt); ###print '.'; } } sub add_word { my ($inwd) = shift; # try to trim it up a bit chomp $inwd; # remove trailing \n char, if any ... $inwd =~ s/^\'//g; # remove any beginning single quotes $inwd =~ s/^\"//g; # remove any beginning double quotes $inwd =~ s/\'$//g; # remove any ending single quotes $inwd =~ s/\"$//g; # remove any ending double quotes prtv9( "Got lc text [$inwd] ...\n" ); return if not $inwd =~ m/^[a-z]{2,}/; # forget it if not start with 2 alpha ###$inwd =~ s/[\s] $//i; ##$inwd =~ s/[\s] $//g; # remove trailing spaces while ($inwd =~ / $/) { $inwd =~ s/ $//g; # remove any trailing spaces } prtv9( "Modified to [$inwd], " ); ### $distinct{$inwd}; # was this ### $distinct{$inwd} = $inhalt; # tried this # but better to keep count if (defined $distinct{$inwd} ) { $distinct{$inwd} = $distinct{$inwd} + 1; prtv9( "and bumped count to $distinct{$inwd}...\n" ); } else { prtv9( "and stored first time...\n" ); $distinct{$inwd} = 1; } } sub is_in_common { my ($wd) = shift; foreach my $w (@exclwordlist) { if ($w eq $wd) { return 1; } } return 0; } ################################ ### output and log file sub wlog { my $ml = shift; print $LOG $ml; } sub prt { my $m = shift; if ($write_log) { wlog($m); } print $m; } sub prtv9 { my $ms = shift; if ($verb > 8) { prt($ms); } } sub mydie { my $msg = shift; if ($write_log) { wlog($msg); } die $msg; } # eof - autoword03.pl