autoword05.pl to HTML.

index -|- end

Generated: Tue Feb 2 17:54:22 2010 from autoword05.pl 2007/08/29 3.6 KB.

#!/Perl
# NAME: autoword05.pl
# AIM: To test out the following -
# from : http://www.wellho.net/solutions/perl-using-perl-to-read-microsoft-word-documents.html
# Extracting PARAGRAPHS from a word document.
# 29/08/2007 - geoff mclane - geoffair.net/mperl
use strict;
use warnings;
use Win32::OLE;
use Win32::OLE::Enum;
######################################
# establish in and out file names
my $in_file = 'c:\tmp\test.doc';
###my $in_file = 'C:\Documents and Settings\Geoff McLane\My Documents\Tidy\Php-01.doc';
my $out_file = "temp.$0.txt";
### OR USE COMMAND LINE
###$document = Win32::OLE -> GetObject($ARGV[1]);
###open (FH,">$ARGV[0]");
# $out_file = $ARGV[0];
# $in_file = $ARGV[1];
if ( ! -f $in_file) {
   die "Can not locate file [$in_file] ...\n";
}
my ($i, $msg, $document, $paragraph, $paragraphs, $enumerate, $style, $text, $len, $hex, $ntxt, $ch, $val, $hv);
$document = Win32::OLE->GetObject($in_file) or die "Unable to OPEN $in_file ...\n";
open (FH,">$out_file") or die "Unable to create $out_file ...\n";
$msg = "Extracting Text (in paragraphs) from $in_file ...\n";
print FH $msg;
print $msg;
show_crlf();   # just a DEBUG display ^M and ^J ...
# Collections - Characters Words Sentences Paragraphs Sections HeadersFooters 
$paragraphs = $document->Paragraphs();
$enumerate = new Win32::OLE::Enum($paragraphs);
while(defined($paragraph = $enumerate->Next()))
{
    $style = $paragraph->{Style}->{NameLocal};
    ###print FH "+[$style]\n";
    $text = $paragraph->{Range}->{Text};
   $len = length($text);
   $hex = '';
   $ntxt = '';
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($text,$i,1);
      $val = ord($ch);
      $hv = dec2hex($val);
      if (($val >= 32) && ($val < 128)) {
         $hex .= $ch;
         $ntxt .= $ch;
      } else {
         $hex .= ' ' if length($hex);
         $hex .= $hv;
         if ($val >= 128) {
            $ntxt .= "\@$hv";
         } else {
            $ntxt .= '^';
            $ntxt .= chr($val + 64);
         }
      }
   }
    print FH "=[$text]\n";
   print FH "h[$hex]\n";
    $text =~ s/[\n\r]//g;
    $text =~ s/\x0b/\n/g;
   print "$ntxt ($hex)\n";
}
close FH;
system($out_file);
exit(0);
############################
##### subs
sub dec2hex {
    my $decnum = shift;   # or $_[0]; or @_;    # parameter passed to the subfunction
    my $hexnum = '';     # the final hex number
    my $tempval = 0;
    while ($decnum != 0) {
      # get the remainder (modulus function)
      # by dividing by 16
      $tempval = $decnum % 16;
      # convert to the appropriate letter
      # if the value is greater than 9
      if ($tempval > 9) {
         $tempval = chr($tempval + 55);
      }
      # 'concatenate' the number to 
      # what we have so far in what will
      # be the final variable
      $hexnum = $tempval . $hexnum ;
      # new actually divide by 16, and 
      # keep the integer value of the 
      # answer
      $decnum = int($decnum / 16); 
      # if we cant divide by 16, this is the
      # last step
      if ($decnum < 16) {
         # convert to letters again..
         if ($decnum > 9) {
            $decnum = chr($decnum + 55);
         }
         # add this onto the final answer.. 
         # reset decnum variable to zero so loop
         # will exit
         $hexnum = $decnum . $hexnum; 
         $decnum = 0 
      }
    }
    return $hexnum;
} # end sub
#################
sub show_crlf {
   $ch = "\r";
   $val = ord($ch);
   $hv = dec2hex($val);
   $ntxt = '^';
   $ntxt .= chr($val + 64);
   $msg = "Value $val = 0x$hv = Carriage return = $ntxt ...\n";
   print FH $msg;
   print $msg;
   $ch = "\n";
   $val = ord($ch);
   $hv = dec2hex($val);
   $ntxt = '^';
   $ntxt .= chr($val + 64);
   $msg = "Value $val = 0x$hv = Line feed = $ntxt ...\n";
   print FH $msg;
   print $msg;
}
# eof - autoword05.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional