cleanhtm02.pl to HTML.

index -|- end

Generated: Tue Feb 2 17:54:25 2010 from cleanhtm02.pl 2006/10/03 16 KB.

#!/Perl
# cleanhtm02.pl - 2006.10.01 - geoff mclane (geoffmclane.com)
# AIM: To clean certain items from a HTML document ...
# specifically target microsoft word 'filtered' output which uses a
# paragraph style which places every line in a <p> ... </p>,
# and to denote REAL paragraphs, inserts a <p>&nbsp;</p> line ...
# Thus to remove some of the <p>, using a <br> to get to a new line ...
# =====================================================================
use strict;
require 'logfile.pl' or die "ERROR: Can NOT load logfile.pl ...\n";
require 'htmltools.pl' or die "ERROR: Can NOT load htmltools.pl ...\n";
# log file stuff
my ($LF);
my $outfile = 'temp'.$0.'.txt';
open_log($outfile);
prt( "$0 ... Hello, World ...\n" );
# user variable
###my $def_input = '..\javascript\messageontop.htm';
my $def_input = 'temptidy48in.htm';
###my $def_input = 'C:\Documents and Settings\Geoff McLane.PRO-1\My Documents\tidy\tidy-48.htm';
my $def_output = 'temptidy48.htm';
###my $def_input = 'C:\Documents and Settings\Geoff McLane.PRO-1\My Documents\FG\devel\fgd-008.htm';
###my $def_output = 'tempfgd02.htm';
# debug
my $dbg1 = 0;
my $dbg2 = 0;
my $dbg3 = 0;
my $dbg4 = 0;
my $dbg5 = 0;
my $dbg6 = 0;
my $dbg7 = 0;
my $dbg8 = 0;
# program variable
my $in_file = $def_input;
my $out_file = $def_output;
$in_file = pop @ARGV if (@ARGV);
$out_file = pop @ARGV if (@ARGV);
prt( "Got input from [$in_file], output to [$out_file] ...\n" );
if ( ! -f $in_file) {
   mydie("OOPS: Can NOT locate [$in_file] ...\n");
}
open IF, "<$in_file" or mydie("OOPS: Can NOT open [$in_file] ...\n");
my @lines = <IF>;   # slurp it all in
close IF;
my $cnt = scalar @lines;
prt("Processing $cnt lines from [$in_file] ...\n");
my $txt = join("\n", @lines);
my $ccnt = length($txt);
prt("Or $ccnt characters from [$in_file] ...\n");
my $newtxt = general_adjustments( $txt );
$ccnt = length($newtxt);
my @lines = split("\n", $newtxt);
my $line = '';
my $word = 0;
my $cont = '';
my $lncnt = scalar @lines;
my $style = '';
my @styles = ();
my %mystyles = ();
my %myattrs = ();
my @attribs = ();
my $stbgn = 0;
my $stend = 0;
my $bdybgn = 0;
my $bdyend = 0;
my @paralist = ();
my $blkparas = 0;
my $stybgn = 0;
my $styend = 0;
my %xmlblocks = ();
################################################################
my $newstyle = <<EOF;
<style type="text/css">
body {
 background-image:url('clds5.jpg');
 margin: 0cm 1cm 0cm 1cm;
}
.code {
 margin: 0px 10px 0px 10px;
 background: #f0f8ff;
 border-width: 1px;
 border-style: solid solid solid solid;
 border-color: #000090;
 width: 90%;
 padding: 0px 10px 0px 10px;
}
.diff {
 margin: 0px 10px 0px 10px;
 background: #f0ffef;
 border-width: 1px;
 border-style: solid solid solid solid;
 border-color: #900090;
 width: 90%;
 padding: 0px 10px 0px 10px;
}
</style>
EOF
my $my_pre = '<pre class="code">';
################################################################
$word = check_for_word();
get_the_style();
@attribs = keys %myattrs;
my $atcnt = scalar @attribs;
prt( "Got $atcnt attibutes to look for ... style bgn $stbgn($stybgn), end $stend($styend) body bgn $bdybgn, end $bdyend ...\n" );
##sub removetagattrib { my ($txt, $tag) = @_;
my $clntxt = removetagattrib( $newtxt, 'p' );
$clntxt = striptag( $clntxt, 'style' );
$clntxt = add_my_style( $clntxt );
$clntxt = inline_paras( $clntxt );
get_para_list( $clntxt );
prt( "In " . scalar @paralist . " paragraphs, found $blkparas blanks ...\n" );
show_xml_list( $clntxt );
$clntxt = prt_para_list( $clntxt );
write_out_file($clntxt, $out_file);
system($out_file);
close_log($outfile,1);
exit(0);
# ###############################################
# all subs below
# ##############
sub prt_para_list {
   my ($tx) = shift;
   my $cnt = scalar @paralist;
   my $mt = length($tx);
   my ($ch, $i, $ip, $stp, $edp, $ntx, $lstp, $ledp, $pstp, $pedp, $ps, $pe, $inp);
   my ($lpstp, $lpedp, $blk, $inx, $gotpre, $msg);
   prt( "Showing $cnt paragraphs ...\n" );
   $ip = 0;
   $ntx = '';
   $lstp = 0;
   $ledp = 0;
   $inx = 0;
   $stp = 0;
   $gotpre = 0;
   $inx = in_xml_range($stp); 
   if ($cnt) {
      $stp = $paralist[0][0];
      $edp = $paralist[0][1];
      $pstp = $paralist[0][2];
      $pedp = $paralist[0][3];
      $blk = $paralist[0][4];
      $ch = substr($tx, 0, $stp);
      $ntx = $ch;
      $lstp = $stp;
      $ledp = $edp;
      $lpstp = $pstp;
      $lpedp = $pedp;
   } # this includes a <p> - the first p ...
   $inp = 1;
   for ($i = 0; $i < $cnt; $i++) {
      $ip++;
      $stp = $paralist[$i][0];
      $edp = $paralist[$i][1];
      $pstp = $paralist[$i][2];
      $pedp = $paralist[$i][3];
      $blk = $paralist[$i][4];
      $inx = in_xml_range($stp); 
      if ($lpedp < $pstp) {
         $msg = '';
         if ($inp) {
            # exclude <p>
            $msg .= "In p exclude";
            $ch = substr($tx, $lpedp, $pstp - $lpedp);
            if ($inx) {
               ####$ch .= '</p>'; # close paragraph
               if (!$gotpre) {
                  $ch .= '</p>' . $my_pre;
                  $gotpre = 1;
                  $msg .= " added my pre";
               }
               $inp = 0;
            }
         } else {
            # include <p>
            if ($inx) {
               # exclude <p>
               $msg .= "Not p excl";
               $ch = substr($tx, $lpedp, $pstp - $lpedp);
               if (!$gotpre) {
                  $ch .= $my_pre; # '<pre>';
                  $gotpre = 1;
                  $msg .= " add pre";
               }
            } else {
               # include <p>
               $msg .= "Not p incl";
               $ch = substr($tx, $lpedp, $stp - $lpedp);
            }
         }
         ##prt( "$ip Inbetween [$ch] \n" ) if ($dbg5);
         prt( "$ip [$pstp-$stp-$edp-$pedp] Inbetween inp=$inp inx=$inx [$ch] [$msg]\n" ) if ($dbg8);
         $ntx .= $ch;
      }
      $ch = substr($tx, $stp, $edp - $stp);
      $ps = substr($tx, $pstp, $stp - $pstp);
      $pe = substr($tx, $edp, $pedp - $edp);
      ###if ($ch eq '&nbsp;') {
      if ($blk) {
         $ch = substr($tx, $pstp, $pedp - $pstp);
         prt( "$ip BLANK P[$ch]\n" ) if ($dbg5);
         if ($gotpre) {
            ###if ($inx) {
               $ntx .= '</pre>';
               $gotpre = 0;
            ###}
         } else {
            $ntx .= '</p>';
         }
         $inp = 0;
      } else {
         ###prt( "$ip P[$ch]\n" );
         prt( "$ip P[$ps][$pe]\n" ) if ($dbg5);
         $ntx .= $ch;
         if (($i + 1) < $cnt) {
            $blk = $paralist[$i+1][4];
            if ($blk == 0) {
               if (!$gotpre) {
                  $ntx .= '<br>';
               }
            }
         }
         $inp = 1;
      }
      $lstp = $stp;
      $ledp = $edp;
      $lpstp = $pstp;
      $lpedp = $pedp;
   }
   if ($inp) {
      $ntx .= '</p>';
   }
   if ($pedp < $mt) {
      $ntx .= substr($tx, $pedp); # add any remainder
   }
   return $ntx;
}
# check for things like
# <p>&lt;makeconf.mak&gt;, and if found seek to end
# <p>&lt;/makeconf.mak&gt;
sub check_my_xml {
   my ($tx, $i2) = @_;
   my $mt = length($tx);
   my $ch = '';
   my $i = 0;
   my $ln = '';
   for ($i = 0; $i < $mt; $i++) {
      $ch = substr($tx,$i,1);
      if ($ch eq '&') {
         $ln = $ch;
         $i++; # bump to next, and get the line
         for ( ; $i < $mt; $i++) {
            $ch = substr($tx,$i,1);
            if (($ch eq "\r")||($ch eq "\n")) {
               last;
            }
            $ln .= $ch;
         }
         if ($ln =~ /&lt;(\S+)&gt;/) {
            my $xml = $1;
            my $neg = 0;
            if (substr($xml,0,1) eq '/') {
               $xml = substr($xml,1);
               $neg = 1;
            } else {
               $i2 += length($ln);
            }
            if (defined $xmlblocks{$xml}) {
               prt("Old <$xml> ... ["  . $xmlblocks{$xml} . "]+[$neg $i2]\n") if ($dbg6);
               $xmlblocks{$xml} .= " $neg $i2";
            } else {
               prt("New <$xml> ... [$neg $i2]\n") if ($dbg6);
               $xmlblocks{$xml} = "$neg $i2";
            }
         }
         last;
      }
   }
}
sub in_xml_range {
   my ($j) = shift;
   foreach my $k (keys %xmlblocks) {
      my $v = $xmlblocks{$k}; # extract value
      my @a = split(' ',$v);
      if ((scalar @a == 2) && $a[0] && $a[1] && ($a[0] < $a[1]) ) {
         if (($j >= $a[0]) && ($j <= $a[1])){
            return 1;
         }
      }
   }
   return 0;
}
sub vv_in_xml_range {
   my ($j) = shift;
   prt( "Checking [$j] ...\n" );
   foreach my $k (keys %xmlblocks) {
      my $v = $xmlblocks{$k}; # extract value
      my @a = split(' ',$v);
      if ((scalar @a == 2) && $a[0] && $a[1] && ($a[0] < $a[1]) ) {
         if (($j >= $a[0]) && ($j >= $a[1])){
            return 1;
         }
      }
   }
   return 0;
}
sub show_xml_list {
   my ($tx) = shift;
   foreach my $k (keys %xmlblocks) {
      my $v = $xmlblocks{$k}; # extract value
      my @arr = split(" ",$v);
      if ((scalar @arr == 4) && ($arr[0] == 0) && ($arr[2] == 1)) {
         # appear to have 0 item start 1 item end
         $xmlblocks{$k} = "$arr[1] $arr[3]";
         prt( "Got $k bgn $arr[1], end $arr[3] ...\n" ) if ($dbg7);
      } else {
         prt( "Discarding [$k][$v] ...\n" ) if ($dbg7);
         delete $xmlblocks{$k}; # remove it
      }
   }
   prt( "Final list ...\n" );
   foreach my $k (keys %xmlblocks) {
      my $v = $xmlblocks{$k}; # extract value
      prt( "$k [$v]\n" );
      my @a = split(' ',$v);
      if ((scalar @a == 2) && $a[0] && $a[1] && ($a[0] < $a[1]) ) {
         my $blk = substr($tx, $a[0], $a[1] - $a[0]);
         prt( "BLOCK=[$blk]\n" ) if ($dbg7);
      } else {
         prt( "CHECH MISSED!!!\n" );
      }
   }
}
sub get_para_list {
   my ($tx) = shift;
   my $mt = length($tx);
   my ($ch, $i, $tg, $ip, $stp, $edp, $msg, $pstp, $pedp, $blk);
   $tg = '';
   $ip = 0;
   $stp = 0;
   $edp = 0;
   $pstp = 0;
   $pedp = 0;
   for ($i = 0; $i < $mt; $i++) {
      $ch = substr($tx,$i,1);
      if ($ch eq '<') {
         $edp = $i;
         $i++;
         $tg = $ch;
         for ( ; $i < $mt; $i++) {
            $ch = substr($tx,$i,1);
            $tg .= $ch;
            if ($ch eq '>') {
               $msg = "Got tag [$tg] ...";
               if ($tg =~ /<p.*>/i) {
                  $msg .= " in para";
                  $stp = $i + 1;
                  $pstp = $edp; # start of '<p...
                  $ip = 1;
               } elsif ($tg =~ /<\/p>/i) {
                  $msg .= " out para";
                  $ip = 0;
                  ###$edp = $i;
                  $pedp = $i + 1;
                  if ($stp && ($edp > $stp)) {
                     $ch = substr($tx, $stp, $edp - $stp);
                     # keep the intern stt   end outer sts end
                     $blk = 0;
                     if ($ch eq '&nbsp;') {
                        $blkparas++;
                        $blk = 1;
                     }
                     push(@paralist, [ $stp, $edp, $pstp, $pedp, $blk ]);
                  }
                  $stp = 0;
               }
               prt("$msg\n") if ($dbg3);
               last;
            }
         }
      } elsif ($ch eq '&') {
         check_my_xml( substr($tx,$i), $i );
      }
   }
}
sub process_style {
   my ($tx) = shift;
   my $sl = length($tx);
   my ($i, $ch, $sp, $ob, $nm, $stl);
   $sp = 0;
   $ob = 0;
   $nm = ''; # name of the style, can be comma separated
   $stl = ''; # material between braces
   for ($i = 0; $i < $sl; $i++) {
      $ch = substr($tx,$i,1);
      if ($ch eq '<') {
         # eat to end of this
         $i++;
         if ($i < $sl) {
            $ch = substr($tx,$i,1);
            if ($ch eq '!') {
               $i += 2;
            } else {
               for ( ; $i < $sl; $i++) {
                  $ch = substr($tx,$i,1);
                  if ($ch eq '>') {
                     last;
                  }
               }
            }
         }
      } elsif ($ch eq '/') {
         if ((($i + 1) < $sl) && (substr($tx,$i+1,1) eq '*')) {
            # entered a comment - eat it
            $i++;
            for ( ; $i < $sl; $i++) {
               $ch = substr($tx,$i,1);
               if ( ($ch eq '*') && (substr($tx,$i+1,1) eq '/')) {
                  last;
               }
            }
         }
      } elsif ($ch =~ /\s/) {
         $sp++;
      } elsif ($ch eq '{') {
         $ob = 1;
      } elsif ($ch eq '}') {
         $ob = 0;
         if (length($nm) && length($stl)) {
            my @arr = split(",", $nm);
            foreach my $bit (@arr) {
               $bit = trimall($bit);
               if (length($bit)) {
                  if (defined $mystyles{$bit}) {
                     prt( "Duplicate of [$bit], with val = [" . $mystyles{$bit} . "\n" );
                     $mystyles{$bit} .= ' ' . $stl;
                  } else {
                     $mystyles{$bit} = $stl;
                  }
               }
            }
         }
         prt( "Name = [$nm] Styles = [$stl] ...\n" ) if ($dbg4);
         $nm = '';
         $stl = '';
      } else {
         # seeking something { more ... }
         if ($ob) {
            $stl .= $ch;
         } else {
            $nm .= $ch;
         }
      }
   }
}
sub get_the_style {
   my $i = 0;
   my $off = 0;
   my $len = 0;
   # run one - extract the STYLE stuff
   for ($i = 0; $i < $lncnt; $i++) {
      $line = $lines[$i];
      $len = length($line);
      chomp $line;
      $line =~ s/\r$//;
      prt( "Line: ". ($i + 1) . " $line\n" ) if ($dbg2); 
      if ($line =~ /<style(.*)>/i) {
         prt( "Found [$line] ...\n" );
         $style = $line;
         $stbgn = $i; # keep BEGIN of STYLE
         $stybgn = $off;
         ###push(@styles, $line);
         while ( !($style =~ /<\/style>/i) && ($i < $lncnt) ) {
            $i++;
            $line = $lines[$i];
            $len = length($line);
            chomp $line;
            $line =~ s/\r$//;
            $style .= ' ' . $line;
            push(@styles, $line);
            $off += $len;
         }
         $stend = $i; # and END of STYLE
         $styend = $off;
         $len = 0;
      } elsif ($line =~ /<body(.*)>/i) {
         $bdybgn = $i;
      } elsif ($line =~ /<\/body>/i) {
         $bdyend = $i;
      }
      $off += $len;
   }
   process_style( $style );
   prt( "Style = [$style] ...\n" ) if ($dbg4);
   foreach my $k (keys %mystyles) {
      my $v = $mystyles{$k};
      prt( "$k { $v }\n" ) if ($dbg4);
      my @ar = split(/\./,$k);
      my $tg = trimall($ar[0]);
      my $at = $tg;
      if (scalar @ar == 2) {
         $at = trimall($ar[1]);
      }
      if (defined $myattrs{$at} ) {
         prt( "Adding [$tg] to [$at] ...\n" ) if ($dbg4);
         $myattrs{$at} .= ' ' . $tg;
      } else {
         prt( "Setting [$tg] to [$at] ...\n" ) if ($dbg4);
         $myattrs{$at} = $tg;
      }
   }
}
sub write_out_file {
   my ($tx, $fil) = @_;
   open OF, ">$fil" or mydie("YEEK! Can NOT create [$fil] ...\n");
   print OF $tx;
   close OF;
   prt("Written " . length($tx) . " characters to [$fil]...\n");
}
sub get_next_tag {
   my ($tx) = shift;
   my $tl = length($tx);
   my $ch = '';
   my $i = 0;
   my $nt = '';
   for ($i = 0; $i < $tl; $i++) {
      $ch = substr($tx,$i,1);
      if ($ch eq '<') {
         $nt = $ch;
         $i++;
         for ( ; $i < $tl; $i++) {
            $ch = substr($tx,$i,1);
            $nt .= $ch;
            if ($ch eq '>') {
               return $nt;
            }
         }
      }
   }
   return $nt;
}
sub inline_paras {
   my ($tx) = shift;
   my $tl = length($tx);
   my $ch = '';
   my $i = 0;
   my $nt = '';
   my $tg = '';
   my $ip = 0;
   for ($i = 0; $i < $tl; $i++) {
      $ch = substr($tx,$i,1);
      if ($ch eq '<') {
         $tg = get_next_tag( substr($tx,$i) );
         if ($tg =~ /^<p>/i) {
            $ip = 1;
         } elsif ($tg =~ /^<p\s+.+>/i) {
            $ip = 1;
         } elsif ($tg =~ /^<\/p>/) {
            $ip = 0;
         }
      }
      if ($ip) {
         if ($ch eq "\r") {
            # skip this
         } elsif ($ch eq "\n") {
            $nt .= ' ';
         } else {
            $nt .= $ch;
         }
      } else {
         $nt .= $ch;
      }
   }
   return $nt;
}
sub check_for_word {
   my $lc = scalar @lines;
   prt( "Processing $lc lines ... seeking MS word meta ...\n" );
   my $isword = 0;
   foreach $line (@lines) {
      chomp $line;
      $line =~ s/\r$//;
      ## <meta name="Generator" content="Microsoft Word 10 (filtered)">
      if ($line =~ /<meta\s+name="?Generator"?\s+?content="?(.*)"?>/i) {
         $cont = $1;
         if ($cont =~ /Microsoft/i) {
            $isword = 1;
            prt( "Found [$line] ...\n" );
            last;
         }
      }
   }
   return $isword;
}
sub general_adjustments {
   my ($tx) = shift;
   my $tl = length($tx);
   prt( "Begin len=$tl - Do some expansions, if required ...\n") if $dbg1;
   my $nt = htmlexpand($tx);
   $tl = length($nt);
   prt( "len=$tl - Add font tag to new line ...\n") if $dbg1;
   $nt = tag2newline($nt,'font');
   $tl = length($nt);
   prt( "len=$tl - Add input tag to new line ...\n") if $dbg1;
   $nt = tag2newline($nt,'input');
   $tl = length($nt);
   prt( "len=$tl - Add form tag to new line ...\n") if $dbg1;
   $nt = tag2newline($nt,'form');
   $tl = length($nt);
   prt( "len=$tl - Add comments to new line ...\n") if $dbg1;
   $nt = comments2newline($nt);
   $tl = length($nt);
   prt( "len=$tl - left before trimblanks ...\n") if $dbg1;
   ###$nt = trimblanklines($nt);
   $nt = trimblanks($nt);
   $tl = length($nt);
   prt("Now returning $tl characters ...\n") if $dbg1;
   return $nt;
}
sub trimall {
   my ($ln) = shift;
   chomp $ln;
   $ln =~ s/\r$//;
   $ln =~ s/\t/ /g;
   while ($ln =~ /\s\s/) {
      $ln =~ s/\s\s/ /g;
   }
   while ($ln =~ /^\s/) {
      $ln = substr($ln,1);
   }
   while ($ln =~ /\s$/) {
      $ln = substr($ln,0, length($ln) - 1);
   }
   return $ln;
}
sub add_my_style {
   my ($tx) = shift;
   my $nt = '';
   my $tl = length($tx);
   my $ch = '';
   my $i = 0;
   my $tg = '';
   my $di = 0;
   for ($i = 0; $i < $tl; $i++) {
      $ch = substr($tx,$i,1);
      if (!$di) {
         if ($ch eq '<') {
            $tg = get_next_tag( substr($tx,$i) );
            if ($tg =~ /<\/head>/i) {
               $nt .= $newstyle;
               $di = 1;
               prt( "Added new style ...\n" );
            }
         }
      }
      $nt .= $ch;
   }
   return $nt;
}
# eof - cleanhtm02.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional