Generated: Tue Feb 2 17:54:22 2010 from autoword05.pl 2007/08/29 3.6 KB.
#!/Perl # NAME: autoword05.pl # AIM: To test out the following - # from : http://www.wellho.net/solutions/perl-using-perl-to-read-microsoft-word-documents.html # Extracting PARAGRAPHS from a word document. # 29/08/2007 - geoff mclane - geoffair.net/mperl use strict; use warnings; use Win32::OLE; use Win32::OLE::Enum; ###################################### # establish in and out file names my $in_file = 'c:\tmp\test.doc'; ###my $in_file = 'C:\Documents and Settings\Geoff McLane\My Documents\Tidy\Php-01.doc'; my $out_file = "temp.$0.txt"; ### OR USE COMMAND LINE ###$document = Win32::OLE -> GetObject($ARGV[1]); ###open (FH,">$ARGV[0]"); # $out_file = $ARGV[0]; # $in_file = $ARGV[1]; if ( ! -f $in_file) { die "Can not locate file [$in_file] ...\n"; } my ($i, $msg, $document, $paragraph, $paragraphs, $enumerate, $style, $text, $len, $hex, $ntxt, $ch, $val, $hv); $document = Win32::OLE->GetObject($in_file) or die "Unable to OPEN $in_file ...\n"; open (FH,">$out_file") or die "Unable to create $out_file ...\n"; $msg = "Extracting Text (in paragraphs) from $in_file ...\n"; print FH $msg; print $msg; show_crlf(); # just a DEBUG display ^M and ^J ... # Collections - Characters Words Sentences Paragraphs Sections HeadersFooters $paragraphs = $document->Paragraphs(); $enumerate = new Win32::OLE::Enum($paragraphs); while(defined($paragraph = $enumerate->Next())) { $style = $paragraph->{Style}->{NameLocal}; ###print FH "+[$style]\n"; $text = $paragraph->{Range}->{Text}; $len = length($text); $hex = ''; $ntxt = ''; for ($i = 0; $i < $len; $i++) { $ch = substr($text,$i,1); $val = ord($ch); $hv = dec2hex($val); if (($val >= 32) && ($val < 128)) { $hex .= $ch; $ntxt .= $ch; } else { $hex .= ' ' if length($hex); $hex .= $hv; if ($val >= 128) { $ntxt .= "\@$hv"; } else { $ntxt .= '^'; $ntxt .= chr($val + 64); } } } print FH "=[$text]\n"; print FH "h[$hex]\n"; $text =~ s/[\n\r]//g; $text =~ s/\x0b/\n/g; print "$ntxt ($hex)\n"; } close FH; system($out_file); exit(0); ############################ ##### subs sub dec2hex { my $decnum = shift; # or $_[0]; or @_; # parameter passed to the subfunction my $hexnum = ''; # the final hex number my $tempval = 0; while ($decnum != 0) { # get the remainder (modulus function) # by dividing by 16 $tempval = $decnum % 16; # convert to the appropriate letter # if the value is greater than 9 if ($tempval > 9) { $tempval = chr($tempval + 55); } # 'concatenate' the number to # what we have so far in what will # be the final variable $hexnum = $tempval . $hexnum ; # new actually divide by 16, and # keep the integer value of the # answer $decnum = int($decnum / 16); # if we cant divide by 16, this is the # last step if ($decnum < 16) { # convert to letters again.. if ($decnum > 9) { $decnum = chr($decnum + 55); } # add this onto the final answer.. # reset decnum variable to zero so loop # will exit $hexnum = $decnum . $hexnum; $decnum = 0 } } return $hexnum; } # end sub ################# sub show_crlf { $ch = "\r"; $val = ord($ch); $hv = dec2hex($val); $ntxt = '^'; $ntxt .= chr($val + 64); $msg = "Value $val = 0x$hv = Carriage return = $ntxt ...\n"; print FH $msg; print $msg; $ch = "\n"; $val = ord($ch); $hv = dec2hex($val); $ntxt = '^'; $ntxt .= chr($val + 64); $msg = "Value $val = 0x$hv = Line feed = $ntxt ...\n"; print FH $msg; print $msg; } # eof - autoword05.pl