Generated: Tue Feb 2 17:54:25 2010 from cleanhtm02.pl 2006/10/03 16 KB.
#!/Perl # cleanhtm02.pl - 2006.10.01 - geoff mclane (geoffmclane.com) # AIM: To clean certain items from a HTML document ... # specifically target microsoft word 'filtered' output which uses a # paragraph style which places every line in a <p> ... </p>, # and to denote REAL paragraphs, inserts a <p> </p> line ... # Thus to remove some of the <p>, using a <br> to get to a new line ... # ===================================================================== use strict; require 'logfile.pl' or die "ERROR: Can NOT load logfile.pl ...\n"; require 'htmltools.pl' or die "ERROR: Can NOT load htmltools.pl ...\n"; # log file stuff my ($LF); my $outfile = 'temp'.$0.'.txt'; open_log($outfile); prt( "$0 ... Hello, World ...\n" ); # user variable ###my $def_input = '..\javascript\messageontop.htm'; my $def_input = 'temptidy48in.htm'; ###my $def_input = 'C:\Documents and Settings\Geoff McLane.PRO-1\My Documents\tidy\tidy-48.htm'; my $def_output = 'temptidy48.htm'; ###my $def_input = 'C:\Documents and Settings\Geoff McLane.PRO-1\My Documents\FG\devel\fgd-008.htm'; ###my $def_output = 'tempfgd02.htm'; # debug my $dbg1 = 0; my $dbg2 = 0; my $dbg3 = 0; my $dbg4 = 0; my $dbg5 = 0; my $dbg6 = 0; my $dbg7 = 0; my $dbg8 = 0; # program variable my $in_file = $def_input; my $out_file = $def_output; $in_file = pop @ARGV if (@ARGV); $out_file = pop @ARGV if (@ARGV); prt( "Got input from [$in_file], output to [$out_file] ...\n" ); if ( ! -f $in_file) { mydie("OOPS: Can NOT locate [$in_file] ...\n"); } open IF, "<$in_file" or mydie("OOPS: Can NOT open [$in_file] ...\n"); my @lines = <IF>; # slurp it all in close IF; my $cnt = scalar @lines; prt("Processing $cnt lines from [$in_file] ...\n"); my $txt = join("\n", @lines); my $ccnt = length($txt); prt("Or $ccnt characters from [$in_file] ...\n"); my $newtxt = general_adjustments( $txt ); $ccnt = length($newtxt); my @lines = split("\n", $newtxt); my $line = ''; my $word = 0; my $cont = ''; my $lncnt = scalar @lines; my $style = ''; my @styles = (); my %mystyles = (); my %myattrs = (); my @attribs = (); my $stbgn = 0; my $stend = 0; my $bdybgn = 0; my $bdyend = 0; my @paralist = (); my $blkparas = 0; my $stybgn = 0; my $styend = 0; my %xmlblocks = (); ################################################################ my $newstyle = <<EOF; <style type="text/css"> body { background-image:url('clds5.jpg'); margin: 0cm 1cm 0cm 1cm; } .code { margin: 0px 10px 0px 10px; background: #f0f8ff; border-width: 1px; border-style: solid solid solid solid; border-color: #000090; width: 90%; padding: 0px 10px 0px 10px; } .diff { margin: 0px 10px 0px 10px; background: #f0ffef; border-width: 1px; border-style: solid solid solid solid; border-color: #900090; width: 90%; padding: 0px 10px 0px 10px; } </style> EOF my $my_pre = '<pre class="code">'; ################################################################ $word = check_for_word(); get_the_style(); @attribs = keys %myattrs; my $atcnt = scalar @attribs; prt( "Got $atcnt attibutes to look for ... style bgn $stbgn($stybgn), end $stend($styend) body bgn $bdybgn, end $bdyend ...\n" ); ##sub removetagattrib { my ($txt, $tag) = @_; my $clntxt = removetagattrib( $newtxt, 'p' ); $clntxt = striptag( $clntxt, 'style' ); $clntxt = add_my_style( $clntxt ); $clntxt = inline_paras( $clntxt ); get_para_list( $clntxt ); prt( "In " . scalar @paralist . " paragraphs, found $blkparas blanks ...\n" ); show_xml_list( $clntxt ); $clntxt = prt_para_list( $clntxt ); write_out_file($clntxt, $out_file); system($out_file); close_log($outfile,1); exit(0); # ############################################### # all subs below # ############## sub prt_para_list { my ($tx) = shift; my $cnt = scalar @paralist; my $mt = length($tx); my ($ch, $i, $ip, $stp, $edp, $ntx, $lstp, $ledp, $pstp, $pedp, $ps, $pe, $inp); my ($lpstp, $lpedp, $blk, $inx, $gotpre, $msg); prt( "Showing $cnt paragraphs ...\n" ); $ip = 0; $ntx = ''; $lstp = 0; $ledp = 0; $inx = 0; $stp = 0; $gotpre = 0; $inx = in_xml_range($stp); if ($cnt) { $stp = $paralist[0][0]; $edp = $paralist[0][1]; $pstp = $paralist[0][2]; $pedp = $paralist[0][3]; $blk = $paralist[0][4]; $ch = substr($tx, 0, $stp); $ntx = $ch; $lstp = $stp; $ledp = $edp; $lpstp = $pstp; $lpedp = $pedp; } # this includes a <p> - the first p ... $inp = 1; for ($i = 0; $i < $cnt; $i++) { $ip++; $stp = $paralist[$i][0]; $edp = $paralist[$i][1]; $pstp = $paralist[$i][2]; $pedp = $paralist[$i][3]; $blk = $paralist[$i][4]; $inx = in_xml_range($stp); if ($lpedp < $pstp) { $msg = ''; if ($inp) { # exclude <p> $msg .= "In p exclude"; $ch = substr($tx, $lpedp, $pstp - $lpedp); if ($inx) { ####$ch .= '</p>'; # close paragraph if (!$gotpre) { $ch .= '</p>' . $my_pre; $gotpre = 1; $msg .= " added my pre"; } $inp = 0; } } else { # include <p> if ($inx) { # exclude <p> $msg .= "Not p excl"; $ch = substr($tx, $lpedp, $pstp - $lpedp); if (!$gotpre) { $ch .= $my_pre; # '<pre>'; $gotpre = 1; $msg .= " add pre"; } } else { # include <p> $msg .= "Not p incl"; $ch = substr($tx, $lpedp, $stp - $lpedp); } } ##prt( "$ip Inbetween [$ch] \n" ) if ($dbg5); prt( "$ip [$pstp-$stp-$edp-$pedp] Inbetween inp=$inp inx=$inx [$ch] [$msg]\n" ) if ($dbg8); $ntx .= $ch; } $ch = substr($tx, $stp, $edp - $stp); $ps = substr($tx, $pstp, $stp - $pstp); $pe = substr($tx, $edp, $pedp - $edp); ###if ($ch eq ' ') { if ($blk) { $ch = substr($tx, $pstp, $pedp - $pstp); prt( "$ip BLANK P[$ch]\n" ) if ($dbg5); if ($gotpre) { ###if ($inx) { $ntx .= '</pre>'; $gotpre = 0; ###} } else { $ntx .= '</p>'; } $inp = 0; } else { ###prt( "$ip P[$ch]\n" ); prt( "$ip P[$ps][$pe]\n" ) if ($dbg5); $ntx .= $ch; if (($i + 1) < $cnt) { $blk = $paralist[$i+1][4]; if ($blk == 0) { if (!$gotpre) { $ntx .= '<br>'; } } } $inp = 1; } $lstp = $stp; $ledp = $edp; $lpstp = $pstp; $lpedp = $pedp; } if ($inp) { $ntx .= '</p>'; } if ($pedp < $mt) { $ntx .= substr($tx, $pedp); # add any remainder } return $ntx; } # check for things like # <p><makeconf.mak>, and if found seek to end # <p></makeconf.mak> sub check_my_xml { my ($tx, $i2) = @_; my $mt = length($tx); my $ch = ''; my $i = 0; my $ln = ''; for ($i = 0; $i < $mt; $i++) { $ch = substr($tx,$i,1); if ($ch eq '&') { $ln = $ch; $i++; # bump to next, and get the line for ( ; $i < $mt; $i++) { $ch = substr($tx,$i,1); if (($ch eq "\r")||($ch eq "\n")) { last; } $ln .= $ch; } if ($ln =~ /<(\S+)>/) { my $xml = $1; my $neg = 0; if (substr($xml,0,1) eq '/') { $xml = substr($xml,1); $neg = 1; } else { $i2 += length($ln); } if (defined $xmlblocks{$xml}) { prt("Old <$xml> ... [" . $xmlblocks{$xml} . "]+[$neg $i2]\n") if ($dbg6); $xmlblocks{$xml} .= " $neg $i2"; } else { prt("New <$xml> ... [$neg $i2]\n") if ($dbg6); $xmlblocks{$xml} = "$neg $i2"; } } last; } } } sub in_xml_range { my ($j) = shift; foreach my $k (keys %xmlblocks) { my $v = $xmlblocks{$k}; # extract value my @a = split(' ',$v); if ((scalar @a == 2) && $a[0] && $a[1] && ($a[0] < $a[1]) ) { if (($j >= $a[0]) && ($j <= $a[1])){ return 1; } } } return 0; } sub vv_in_xml_range { my ($j) = shift; prt( "Checking [$j] ...\n" ); foreach my $k (keys %xmlblocks) { my $v = $xmlblocks{$k}; # extract value my @a = split(' ',$v); if ((scalar @a == 2) && $a[0] && $a[1] && ($a[0] < $a[1]) ) { if (($j >= $a[0]) && ($j >= $a[1])){ return 1; } } } return 0; } sub show_xml_list { my ($tx) = shift; foreach my $k (keys %xmlblocks) { my $v = $xmlblocks{$k}; # extract value my @arr = split(" ",$v); if ((scalar @arr == 4) && ($arr[0] == 0) && ($arr[2] == 1)) { # appear to have 0 item start 1 item end $xmlblocks{$k} = "$arr[1] $arr[3]"; prt( "Got $k bgn $arr[1], end $arr[3] ...\n" ) if ($dbg7); } else { prt( "Discarding [$k][$v] ...\n" ) if ($dbg7); delete $xmlblocks{$k}; # remove it } } prt( "Final list ...\n" ); foreach my $k (keys %xmlblocks) { my $v = $xmlblocks{$k}; # extract value prt( "$k [$v]\n" ); my @a = split(' ',$v); if ((scalar @a == 2) && $a[0] && $a[1] && ($a[0] < $a[1]) ) { my $blk = substr($tx, $a[0], $a[1] - $a[0]); prt( "BLOCK=[$blk]\n" ) if ($dbg7); } else { prt( "CHECH MISSED!!!\n" ); } } } sub get_para_list { my ($tx) = shift; my $mt = length($tx); my ($ch, $i, $tg, $ip, $stp, $edp, $msg, $pstp, $pedp, $blk); $tg = ''; $ip = 0; $stp = 0; $edp = 0; $pstp = 0; $pedp = 0; for ($i = 0; $i < $mt; $i++) { $ch = substr($tx,$i,1); if ($ch eq '<') { $edp = $i; $i++; $tg = $ch; for ( ; $i < $mt; $i++) { $ch = substr($tx,$i,1); $tg .= $ch; if ($ch eq '>') { $msg = "Got tag [$tg] ..."; if ($tg =~ /<p.*>/i) { $msg .= " in para"; $stp = $i + 1; $pstp = $edp; # start of '<p... $ip = 1; } elsif ($tg =~ /<\/p>/i) { $msg .= " out para"; $ip = 0; ###$edp = $i; $pedp = $i + 1; if ($stp && ($edp > $stp)) { $ch = substr($tx, $stp, $edp - $stp); # keep the intern stt end outer sts end $blk = 0; if ($ch eq ' ') { $blkparas++; $blk = 1; } push(@paralist, [ $stp, $edp, $pstp, $pedp, $blk ]); } $stp = 0; } prt("$msg\n") if ($dbg3); last; } } } elsif ($ch eq '&') { check_my_xml( substr($tx,$i), $i ); } } } sub process_style { my ($tx) = shift; my $sl = length($tx); my ($i, $ch, $sp, $ob, $nm, $stl); $sp = 0; $ob = 0; $nm = ''; # name of the style, can be comma separated $stl = ''; # material between braces for ($i = 0; $i < $sl; $i++) { $ch = substr($tx,$i,1); if ($ch eq '<') { # eat to end of this $i++; if ($i < $sl) { $ch = substr($tx,$i,1); if ($ch eq '!') { $i += 2; } else { for ( ; $i < $sl; $i++) { $ch = substr($tx,$i,1); if ($ch eq '>') { last; } } } } } elsif ($ch eq '/') { if ((($i + 1) < $sl) && (substr($tx,$i+1,1) eq '*')) { # entered a comment - eat it $i++; for ( ; $i < $sl; $i++) { $ch = substr($tx,$i,1); if ( ($ch eq '*') && (substr($tx,$i+1,1) eq '/')) { last; } } } } elsif ($ch =~ /\s/) { $sp++; } elsif ($ch eq '{') { $ob = 1; } elsif ($ch eq '}') { $ob = 0; if (length($nm) && length($stl)) { my @arr = split(",", $nm); foreach my $bit (@arr) { $bit = trimall($bit); if (length($bit)) { if (defined $mystyles{$bit}) { prt( "Duplicate of [$bit], with val = [" . $mystyles{$bit} . "\n" ); $mystyles{$bit} .= ' ' . $stl; } else { $mystyles{$bit} = $stl; } } } } prt( "Name = [$nm] Styles = [$stl] ...\n" ) if ($dbg4); $nm = ''; $stl = ''; } else { # seeking something { more ... } if ($ob) { $stl .= $ch; } else { $nm .= $ch; } } } } sub get_the_style { my $i = 0; my $off = 0; my $len = 0; # run one - extract the STYLE stuff for ($i = 0; $i < $lncnt; $i++) { $line = $lines[$i]; $len = length($line); chomp $line; $line =~ s/\r$//; prt( "Line: ". ($i + 1) . " $line\n" ) if ($dbg2); if ($line =~ /<style(.*)>/i) { prt( "Found [$line] ...\n" ); $style = $line; $stbgn = $i; # keep BEGIN of STYLE $stybgn = $off; ###push(@styles, $line); while ( !($style =~ /<\/style>/i) && ($i < $lncnt) ) { $i++; $line = $lines[$i]; $len = length($line); chomp $line; $line =~ s/\r$//; $style .= ' ' . $line; push(@styles, $line); $off += $len; } $stend = $i; # and END of STYLE $styend = $off; $len = 0; } elsif ($line =~ /<body(.*)>/i) { $bdybgn = $i; } elsif ($line =~ /<\/body>/i) { $bdyend = $i; } $off += $len; } process_style( $style ); prt( "Style = [$style] ...\n" ) if ($dbg4); foreach my $k (keys %mystyles) { my $v = $mystyles{$k}; prt( "$k { $v }\n" ) if ($dbg4); my @ar = split(/\./,$k); my $tg = trimall($ar[0]); my $at = $tg; if (scalar @ar == 2) { $at = trimall($ar[1]); } if (defined $myattrs{$at} ) { prt( "Adding [$tg] to [$at] ...\n" ) if ($dbg4); $myattrs{$at} .= ' ' . $tg; } else { prt( "Setting [$tg] to [$at] ...\n" ) if ($dbg4); $myattrs{$at} = $tg; } } } sub write_out_file { my ($tx, $fil) = @_; open OF, ">$fil" or mydie("YEEK! Can NOT create [$fil] ...\n"); print OF $tx; close OF; prt("Written " . length($tx) . " characters to [$fil]...\n"); } sub get_next_tag { my ($tx) = shift; my $tl = length($tx); my $ch = ''; my $i = 0; my $nt = ''; for ($i = 0; $i < $tl; $i++) { $ch = substr($tx,$i,1); if ($ch eq '<') { $nt = $ch; $i++; for ( ; $i < $tl; $i++) { $ch = substr($tx,$i,1); $nt .= $ch; if ($ch eq '>') { return $nt; } } } } return $nt; } sub inline_paras { my ($tx) = shift; my $tl = length($tx); my $ch = ''; my $i = 0; my $nt = ''; my $tg = ''; my $ip = 0; for ($i = 0; $i < $tl; $i++) { $ch = substr($tx,$i,1); if ($ch eq '<') { $tg = get_next_tag( substr($tx,$i) ); if ($tg =~ /^<p>/i) { $ip = 1; } elsif ($tg =~ /^<p\s+.+>/i) { $ip = 1; } elsif ($tg =~ /^<\/p>/) { $ip = 0; } } if ($ip) { if ($ch eq "\r") { # skip this } elsif ($ch eq "\n") { $nt .= ' '; } else { $nt .= $ch; } } else { $nt .= $ch; } } return $nt; } sub check_for_word { my $lc = scalar @lines; prt( "Processing $lc lines ... seeking MS word meta ...\n" ); my $isword = 0; foreach $line (@lines) { chomp $line; $line =~ s/\r$//; ## <meta name="Generator" content="Microsoft Word 10 (filtered)"> if ($line =~ /<meta\s+name="?Generator"?\s+?content="?(.*)"?>/i) { $cont = $1; if ($cont =~ /Microsoft/i) { $isword = 1; prt( "Found [$line] ...\n" ); last; } } } return $isword; } sub general_adjustments { my ($tx) = shift; my $tl = length($tx); prt( "Begin len=$tl - Do some expansions, if required ...\n") if $dbg1; my $nt = htmlexpand($tx); $tl = length($nt); prt( "len=$tl - Add font tag to new line ...\n") if $dbg1; $nt = tag2newline($nt,'font'); $tl = length($nt); prt( "len=$tl - Add input tag to new line ...\n") if $dbg1; $nt = tag2newline($nt,'input'); $tl = length($nt); prt( "len=$tl - Add form tag to new line ...\n") if $dbg1; $nt = tag2newline($nt,'form'); $tl = length($nt); prt( "len=$tl - Add comments to new line ...\n") if $dbg1; $nt = comments2newline($nt); $tl = length($nt); prt( "len=$tl - left before trimblanks ...\n") if $dbg1; ###$nt = trimblanklines($nt); $nt = trimblanks($nt); $tl = length($nt); prt("Now returning $tl characters ...\n") if $dbg1; return $nt; } sub trimall { my ($ln) = shift; chomp $ln; $ln =~ s/\r$//; $ln =~ s/\t/ /g; while ($ln =~ /\s\s/) { $ln =~ s/\s\s/ /g; } while ($ln =~ /^\s/) { $ln = substr($ln,1); } while ($ln =~ /\s$/) { $ln = substr($ln,0, length($ln) - 1); } return $ln; } sub add_my_style { my ($tx) = shift; my $nt = ''; my $tl = length($tx); my $ch = ''; my $i = 0; my $tg = ''; my $di = 0; for ($i = 0; $i < $tl; $i++) { $ch = substr($tx,$i,1); if (!$di) { if ($ch eq '<') { $tg = get_next_tag( substr($tx,$i) ); if ($tg =~ /<\/head>/i) { $nt .= $newstyle; $di = 1; prt( "Added new style ...\n" ); } } } $nt .= $ch; } return $nt; } # eof - cleanhtm02.pl