Generated: Tue Feb 2 17:54:42 2010 from h2h01.pl 2009/08/01 15.8 KB.
#!/perl -w # NAME: h2h01.pl # AIM: Convert a HTML file, to an online HTML page, color coded... # 29/07/2009 geoff mclane http://geoffair.net/mperl3 use strict; use warnings; use File::Basename; # to split path into ($name, $dir, $ext) use File::stat; # to get the file date #require 'logfile.pl' or die "Unable to load logfile.pl ...\n"; require 'fgutils.pl' or die "Unable to load fgutils.pl ...\n"; # log file stuff my ($LF); my $pgmname = $0; if ($pgmname =~ /\w{1}:\\.*/) { my @tmpsp = split(/\\/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = "temp.$pgmname.txt"; open_log($outfile); my $in_file = 'C:\Projects\tidy\tests\2811690-04.html'; # options my $tab_space = ' '; my $out_file = 'temph2h.htm'; my $addspace = 0; # coloring my $a_class = 'a'; # RED my $b_class = 'b'; # comments (#006666) my $c_class = 'c'; # reserved words (blue) my $e_class = 'e'; # known WIN32 words my $t_class = 't'; # quoted - single and double (#006600) my $red_count = 0; my $blue_count = 0; my $comm_count = 0; my $quot_count = 0; my $eclass_count = 0; # debug my $dbg1 = 0; # show SET messages my $dbg2 = 0; # show type message my $dbg3 = 0; my $dbg4 = 0; my $convspace = 1; my $saveconv = 1; prt( "$0 ... Processing $in_file...\n" ); my @delimiters_NOT_USED = ( ' ', ',', '(', ')', '{', '}', '[', ']', '-', '+', '*', '%', '/', '=', '"', "'", '~', '!', '&', '|', '<', '>', '?', ':', ';', '.', '#', "\t" ); sub write_head($$$) { my ($fil, $title, $msg) = @_; my $head = <<EOF; <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <head> <title> $titlee </title> <meta http-equiv="Content-Language" content="en-us"> <meta http-equiv="Content-Type" content="text/html; charset=us-ascii"> <link rel="stylesheet" href="cxx.css" type="text/css"> </head> <body> <a name="top" id="top"></a> <h1> $titlee </h1> <p class="top"><a href="index.htm">index</a></p> <p>$msgg </p> EOF write2file($head,$fil); # create and write to file } sub open_pre($) { my ($fil) = shift; $saveconv = $convspace; append2file("\n<pre class=\"cd\">",$fil); # append to file $convspace = 0; } sub close_pre($) { my ($fil) = shift; append2file("\n</pre>\n",$fil); # append to file $convspace = $saveconv; } sub append_tail($$) { my ($fil, $msg) = @_; my $tail = <<EOF; <hr class="mini"> <p class="top"> <a target="_self" href="#top">top</a> </p> <p> <a name="end" id="end"></a> <a target="_blank" href="http://tidy.sourceforge.net/"><img border="0" src="images/checked_by_tidy.gif" alt="checked by tidy" width="32" height="32"></a> <a href="http://validator.w3.org/check?uri=referer" target="_blank"><img src="images/valid-html401.gif" alt="Valid HTML 4.01 Transitional" width="88" height="31"></a> </p> <!-- $msg --> </body> </html> EOF append2file($tail,$fil); # append to file } ###################################################### # Converting SPACES to ' ' # Of course this could be done just using perl's # powerful search and replace, but this handles # any number of spaces, only converting the number # minus 1 to ... not sure how to have # this level of control with regex replacement ###################################################### sub conv_spaces { my $t = shift; my ($c, $i, $nt, $ln, $sc, $sp); $nt = ''; # accumulate new line here $ln = length($t); for ($i = 0; $i < $ln; $i++) { $c = substr($t,$i,1); if ($c eq ' ') { $i++; # bump to next $sc = 0; $sp = ''; for ( ; $i < $ln; $i++) { $c = substr($t,$i,1); if ($c ne ' ') { last; # exit } $sc++; $sp .= $c; } if ($sc) { $sp =~ s/ / /g; $nt .= $sp; } $i--; # back up one $c = ' '; # add back the 1 space } $nt .= $c; } prt( "conv_space: from [$t] to [$nt] ...\n" ) if $dbg4; return $nt; } ########################################################################### # VERY IMPORTANT SERVICE # This converts the 'text' into HTML text, but only does a partial job! # 1. Convert '&' to '&' to avoid interpreting as replacement # 2. Convert '<' to '<' and '>' to '>', to avoid interpreting as HTML # 3. Convert '"' to '"' # 4. Convert '\t' to SPACES # 5. Finally, if there are double or more SPACES, convert to ' ' ########################################################################### sub html_line { my $t = shift; my $ot = $t; $t =~ s/&/&/g; # all '&' become '&' $t =~ s/</</g; # make sure all '<' is/are swapped out $t =~ s/>/>/g; # make sure all '>' is/are swapped out $t =~ s/\"/"/g; # and all quotes become " $t =~ s/\t/$tab_space/g; # tabs to spaces if ($convspace && ($t =~ /\s\s/)) { # if any two consecutive white space $t = conv_spaces($t); } prt( "html_line: from [$ot] to [$t] ...\n" ) if $dbg3; return $t; } sub add_red($) { my ($t) = shift; $red_count++; return ('<span class="'.$a_class.'">'.$t.'</span>'); } sub add_comm($) { my ($t) = shift; $comm_count++; return ('<span class="'.$b_class.'">'.$t.'</span>'); } # reserved words sub add_blue($) { my ($t) = shift; $blue_count++; return ('<span class="'.$c_class.'">'.$t.'</span>'); } sub add_eclass($) { my ($t) = shift; $eclass_count++; return ('<span class="'.$e_class.'">'.$t.'</span>'); } sub add_quot($) { my ($t) = shift; $quot_count++; return ('<span class="'.$t_class.'">'.$t.'</span>'); } sub space_split_keep { my ($txt) = shift; my $len = length($txt); my ($k, $ch, $tag, $incomm, $k2, $nch); my @arr = (); $tag = ''; $incomm = 0; for ($k = 0; $k < $len; $k++) { $ch = substr($txt,$k,1); $k2 = $k + 1; $nch = ($k2 < $len) ? substr($txt,$k2,1) : ""; if ($incomm) { $tag .= $ch; $incomm = 0 if ($ch eq '"'); } elsif ($ch =~ /\s/) { # any spacey char $tag .= $ch; while( ($k2 < $len) && length($nch) && ($nch =~ /\s/) ) { $tag .= $nch; $k++; $k2 = $k + 1; $nch = ($k2 < $len) ? substr($txt,$k2,1) : ""; } push(@arr, $tag); $tag = ''; } elsif (($ch =~ /\//)&&($nch eq '>')) { # 04/10/2008, but only if before '>' 24/09/2008 add this as well push(@arr, $tag) if (length($tag)); $tag = $ch; # restart tag with this character } else { $tag .= $ch; $incomm = 1 if ($ch eq '"'); } } push(@arr, $tag) if (length($tag)); return @arr; } sub get_tag_split($$$) { my ($tx,$ln,$rk) = @_; my ($k, $c, $sp); my $tg = ''; for ($k = 0; $k < $ln; $k++) { $c = substr($tx,$k,1); if ($c eq '<') { # start of tag $k++; # bump to next, and get whole tag for (; $k < $ln; $k++) { $c = substr($tx,$k,1); if ($c eq '>') { # $k++; # include this char last; } $tg .= $c; } last; } } if ($k == $ln) { prt("$tx"); prt("\n") if (! $tx =~ /\n$/); prt("ERROR: NOT coded\n" ); exit 1; } $$rk = $k; #prt( "tag: [$tg]\n" ); #my @arr = space_split_keep($tg); #my $cnt = 0; #foreach my $tmp (@arr) { # $cnt++; # prt(" $cnt: [$tmp]\n"); #} #return @arr; return space_split_keep($tg); } sub is_entity($) { my ($t) = shift; my $l = length($t); if (($l > 2)&&($t =~ /^&/)) { for (my $p = 1; $p < $l; $p++) { my $c = substr($t,$p,1); if (($c =~ /\w/) || (($c eq '#')&&($p == 1))) { # alphanumeric or '#' char is first next; # while suitable chars } elsif (($p > 1)&&($c eq ';')) { return 1; # reach the first ';' IT IS AN ENTITY! } else { last; # no go - out of here } } } return 0; } sub process_file($) { my ($fil) = shift; #my @html = (); my $html = (); if (open INF, "<$fil") { my @lines = <INF>; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from $fil...\n"); my ($i, $j, $k, $t, $line, $ch, $len, @atag, $tag, $cnt, $txt, $htm, $tmp, $ltx, $ch2); my ($lnbal, $iscomm, $tst, $isdata, $isphp, $isasp, $isjava, $isproc); $txt = ''; $line = ''; for ($i = 0; $i < $lncnt; $i++) { $line .= $lines[$i]; # chomp $line; # $line = trim_all($line); } $len = length($line); for ($j = 0; $j < $len; $j++) { $ch = substr($line,$j,1); if ($ch eq '<') { $ltx = length($txt); if ($ltx > 0) { prt("$txt"); #push(@html, html_line($txt)); if (($txt =~ /&/)&&($txt =~ /;/)) { $tmp = ''; # appears text contains one or more entities # so give these another color for ($t = 0; $t < $ltx; $t++) { $ch2 = substr($txt,$t,1); if ( ($ch2 eq '&') && (is_entity(substr($txt,$t)) ) ) { $html .= html_line($tmp) if length($tmp); $tmp = $ch2; # start the entity $t++; # bump to next for (; $t < $ltx; $t++) { $ch2 = substr($txt,$t,1); $tmp .= $ch2; if ($ch2 eq ';') { last; # end of entity } } $html .= add_eclass(html_line($tmp)); $tmp = ''; } else { $tmp .= $ch2; } } $html .= html_line($tmp) if length($tmp); # $html .= add_comm(html_line($txt)); } else { $html .= html_line($txt); } $txt = ''; } # need to got to the END of this tag - especially for say <![DATA[ ... ]]> $lnbal = substr($line,$j); # @atag = get_tag_split( substr($line,$j), $len - $j, \$k ); @atag = get_tag_split( $lnbal, $len - $j, \$k ); $cnt = scalar @atag; $tag = $atag[0]; # get the HTML TAG $iscomm = 0; $isdata = 0; $isphp = 0; $isasp = 0; $isjava = 0; $isproc = 0; if ($cnt > 1) { if ($tag =~ /^!--/) { prt("SET comment [$tag]\n") if ($dbg1); $iscomm = 1; } elsif ($tag =~ /^!\[CDATA\[/) { prt("SET CDATA [$tag]\n") if ($dbg1); $isdata = 1; } elsif ($tag =~ /^?php/) { prt("SET PHP [$tag]\n") if ($dbg1); $isphp = 1; } elsif ($tag =~ /^%/) { prt("SET ASP [$tag]\n") if ($dbg1); $isasp = 1; } elsif ($tag =~ /^#/) { prt("SET JAVA [$tag]\n") if ($dbg1); $isjava = 1; } elsif ($tag =~ /^\?/) { prt("SET PROC [$tag]\n") if ($dbg1); $isproc = 1; } else { prt("SET NONE [$tag]\n") if ($dbg1); } $html .= add_blue(html_line("<$tag")); $htm = "<$tag"; $tmp = ''; for ($t = 1; $t < $cnt; $t++) { $tst = $atag[$t]; if (($t + 1) == $cnt) { last if ($isdata && ($tst eq ']]')); last if ($iscomm && ($tst eq '--')); last if ($isphp && ($tst eq '?')); last if ($isasp && ($tst eq '%')); last if ($isjava && ($tst eq '#')); } $tmp .= " " if (length($tmp) && $addspace); $tmp .= $tst; } $htm .= $tmp; $html .= " " if ($addspace); if ($iscomm) { prt("IS comment [$tmp]\n") if ($dbg2); $html .= add_comm(html_line($tmp)); $html .= add_blue(html_line("-->")); $htm .= "--"; } elsif ($isdata) { prt("IS CDATA [$tmp]\n") if ($dbg2); $html .= add_quot(html_line($tmp)); $html .= add_blue(html_line("]]>")); $htm .= "]]>"; } elsif ($isphp) { prt("IS PHP [$tmp]\n") if ($dbg2); $html .= add_quot(html_line($tmp)); $html .= add_blue(html_line("?>")); $htm .= "?>"; } elsif ($isasp) { prt("IS ASP [$tmp]\n") if ($dbg2); $html .= add_quot(html_line($tmp)); $html .= add_blue(html_line("%>")); $htm .= "%>"; } elsif ($isjava) { prt("IS JAVA [$tmp]\n") if ($dbg2); $html .= add_quot(html_line($tmp)); $html .= add_blue(html_line("#>")); $htm .= "#>"; } elsif ($isproc) { prt("IS PROC [$tmp]\n") if ($dbg2); $html .= add_quot(html_line($tmp)); $html .= add_blue(html_line(">")); $htm .= ">"; } else { prt("IS NOT [$tmp]\n") if ($dbg2); $html .= add_red(html_line($tmp)); $htm .= ">"; $html .= add_blue(html_line(">")) } #push(@html, add_red(html_line($htm))); #$html .= add_blue(html_line($htm)); } else { # simple tag - cover it in BLUE $htm = "<$tag>"; $html .= add_blue(html_line($htm)); } prt("$htm"); $j += $k; } else { $txt .= $ch; } } #$html .= "\n"; #} } else { prt("ERROR: Unable to open [$fil]...\n"); } #return @html; return $html; } #my @html_lines = process_file($in_file); #if (@html_lines) { my $html_lines = process_file($in_file); my ($in_title,$dir) = fileparse($in_file); if ( length($html_lines) > 0) { prt( "Writing lines to $out_file ...\n" ); my $sb = stat($in_file); my $tit = "$in_title to HTML"; my $cur_tm = localtime(time()); my $msg = "Generated: On $cur_tm,\n<br>From: $in_file, dated ".scalar localtime($sb->mtime).", with size ".$sb->size." bytes."; my $tmsg = "GA: Generated by $pgmname, on $cur_tm, from $in_title"; #write_head($out_file, "HTML of $in_file", "HTML conversion of [$in_file]"); write_head($out_file, $tit, $msg); open_pre( $out_file ); #append2file(join("\n",@html_lines), $out_file); append2file($html_lines, $out_file); close_pre( $out_file ); append_tail( $out_file, $tmsg); system($out_file); } close_log($outfile,0); exit(0); # eof - h2h01.pl