#!/perl -w # NAME: chkhtml.pl # AIM: Just parse HTML elements, and report any problem # 08/01/2016 - Some UI enhancements # 2010/04/06 - geoff mclane - http://geoffair.net/mperl/ use strict; use warnings; use File::Basename; use Cwd; unshift(@INC, 'C:\GTools\perl'); require 'logfile.pl' or die "Unable to load logfile.pl ...\n"; # log file stuff my ($LF); my $pgmname = $0; if ($pgmname =~ /\w{1}:\\.*/) { my @tmpsp = split(/\\/,$pgmname); $pgmname = $tmpsp[-1]; } my $perl_dir = 'C:\GTools\perl'; my $outfile = $perl_dir."\\temp.$pgmname.txt"; open_log($outfile); # user variables my $load_log = 1; my $def_infile = 'F:\Projects\tidy-html5\test\input5\in_342-1.html'; #my $def_infile = 'U:\var\www\fg\www\Docs\getstart\getstart24.html'; #my $def_infile = 'C:\HOMEPAGE\FG\Docs\getstart\getstartch9.html'; my $in_file = ''; my $show_attributes = 0; my $show_text = 0; my $trim_text = 1; my $max_txt_len = 80; my $show_element_stack = 1; my @closed_tags = qw( meta link area base basefont br frame hr isindex param bgsound embed keygen img ); # tags which do NOT need a closing, like

, tag my @opt_tags = ( "body", "colgroup", "dd", "dt", "head", "html", "li", "optgroup", "option", "p", "tbody", "td", "tfoot", "th", "thead", "tr", "marquee" ); ### program variables my $verbosity = 0; my @warnings = (); my $cwd = cwd(); sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg) } close_log($outfile,$load_log); exit($val); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } sub show_warnings() { if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS...\n" ); foreach my $itm (@warnings) { prt("$itm\n"); } prt("\n"); } else { prt( "\nNo warnings issued.\n\n" ); } } sub is_closed_tag($) { my ($tt) = shift; my $lctt = lc($tt); foreach my $tag (@closed_tags) { return 1 if ($tag eq $lctt); } return 0; } sub is_opt_tag($) { my ($tt) = shift; my $lctt = lc($tt); foreach my $tag (@opt_tags) { return 1 if ($tag eq $lctt); } return 0; } # $drop = can_find_this_tag($tag,\@elements); sub can_find_this_tag($$) { my ($tag,$re) = @_; my $len = scalar @{$re}; my $drop = 0; my $bu = -1; my $last = ''; while ($len) { $drop++; # can pop this one $last = ${$re}[$bu][0]; # get tag if ($last eq $tag) { # if the desired tag return $drop; # return drop value } elsif ( ! is_opt_tag($last) ) { return 0; # oop, have a non-optional tag } $bu--; # back up one more $len--; # and reduce available to check } return 0; } sub is_all_optional($) { my ($re) = @_; my $len = scalar @{$re}; my $bu = -1; my ($last); while ($len) { $last = ${$re}[$bu][0]; # get tag if ( ! is_opt_tag($last) ) { return 0; # oop, have a non-optional tag } $bu--; # back up one more $len--; # and reduce available to check } return 1; # ALL were optiona } sub show_stack_elements($$$) { my ($tag,$rele,$rlns) = @_; my $cnt = scalar @{$rele}; my $lcnt = scalar @{$rlns}; if ($cnt) { prt("The stack has $cnt elements... The current closing element is [$tag]\n"); for (my $i = 0; $i < $cnt; $i++) { my $e = ${$rele}[$i][0]; my $n = ${$rele}[$i][1]; prt("$n: elelement [$e]"); prt(" SAME as tag [$tag]!") if ($e eq $tag); if ($n <= $lcnt) { my $ln = trim_all(${$rlns}[$n-1]); prt(" line=[$ln]"); } prt("\n"); } } } sub get_element_chain($) { my ($rele) = @_; my $cnt = scalar @{$rele}; my $chn = ''; if ($cnt) { for (my $i = 0; $i < $cnt; $i++) { my $e = ${$rele}[$i][0]; $chn .= '|' if length($chn); $chn .= $e; } } return $chn; } sub process_file($) { my ($inf) = shift; if (!open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]!\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Got $lncnt lines, from $inf...\n"); my ($i,$line,$ch,$tag,$len,$intag,$txt,$j,$pc,$ppc,$incdata,$hadsp,$attrs); my ($lnn,$last,$lln,$bgnlnn,$endlnn,$clnn,$stkdep,$maxdep); my ($maxelement,$echn,$tag_type,$tmp); $tag = ''; $attrs = ''; $intag = 0; $incdata = 0; $hadsp = 0; $txt = ''; $ch = ''; $pc = ''; my @elements = (); $lnn = 0; $maxdep = 0; $maxelement = ''; for ($i = 0; $i < $lncnt; $i++) { $line = $lines[$i]; chomp $line; $len = length($line); $lnn++; $clnn = sprintf("%3d",$lnn); for ($j = 0; $j < $len; $j++) { $ppc = $pc; $pc = $ch; $ch = substr($line,$j,1); if ($incdata) { $tag .= $ch; if (($ch eq '>')&&($pc eq ']')&&($ppc eq ']')) { $incdata = 0; prt("$clnn: End CDATA\n"); } } elsif ($intag) { if ($hadsp) { $attrs .= $ch if !($ch eq '>'); } elsif ($ch =~ /\s/) { $hadsp = 1; } else { $tag .= $ch if !($ch eq '>'); } if ($ch eq '>') { $intag = 0; $endlnn = $lnn; } elsif (($ch eq '[')&&($pc eq 'A')&&($tag =~ /^<\!\[CDATA\[/)) { $incdata = 1; prt("\n$clnn: Begin CDATA\n"); } if (!$intag) { $stkdep = scalar @elements; $tag = trim_all($tag); $tag_type = 'Unknonw'; if ($tag =~ /^(\!|\?)/) { $tag_type = "Special"; } elsif (($attrs =~ /\/$/) || is_closed_tag($tag)) { $tag_type = "self-closed"; } elsif ($tag =~ /^\//) { $tag_type = "Close"; } else { $tag_type = "Open"; } $clnn = sprintf("%3d",$lnn); if ($verbosity) { prt("$clnn: "); if ($show_text) { prt("Text [".trim_all($txt)."] ") if (length($txt) && !($txt =~ /^\s+$/)); } prt("End tag [$tag] "); if ($show_attributes) { prt("Attrs [".trim_all($attrs)."] ") if (length($attrs)); } } if ($tag =~ /^(\!|\?)/) { prt("Special") if ($verbosity); } else { # if ($attrs =~ /\/$/) but it may NOT end with '/' if (($attrs =~ /\/$/) || is_closed_tag($tag)) { prt("self-closed") if ($verbosity); } elsif ($tag =~ /^\//) { $tag = substr($tag,1); prt("Close") if ($verbosity); $echn = get_element_chain(\@elements); if ($verbosity && $show_element_stack) { prt(" s=$echn $stkdep"); } if (@elements) { $last = $elements[-1][0]; $lln = $elements[-1][1]; if ($last eq $tag) { pop @elements; } else { # but may have 'opt' tags - tags that need no close on the stack, which # can be dropped to get to this tag my $drop = can_find_this_tag($tag,\@elements); if ($drop) { prt("\nWARNING: Last [$last]$lln NE [$tag]$lnn drop $drop "); $tmp = ''; while($drop--) { $tmp .= "|" if (length($tmp)); $tmp .= pop @elements; } prt("$tmp "); } else { prt("\nERROR: Last [$last]$lln NE [$tag]$lnn line=[".trim_all($line)."]\n"); show_stack_elements($tag,\@elements,\@lines); pgm_exit(1,"ERROR:[1]: It is useless to continue when the element stack is out of order!\n"); } } } else { prt("\nERROR: The stack has NO elements... The current closing element is [$tag]\n"); pgm_exit(1,"ERROR:[2]: It is useless to continue when the element stack is out of order! [2]\n"); } } else { prt("Open") if ($verbosity); push(@elements,[$tag,$bgnlnn,$endlnn]); $echn = get_element_chain(\@elements); $stkdep = scalar @elements; if ($stkdep > $maxdep) { $maxdep = $stkdep; $maxelement = "$clnn: $tag $bgnlnn $endlnn [$echn]"; } if ($verbosity && $show_element_stack) { prt(" s=$echn $stkdep"); } } } prt("\n") if ($verbosity); # reset $txt = ''; $tag = ''; $attrs = ''; $hadsp = 0; } } else { if ($ch eq '<') { $tag = ''; $intag = 1; $hadsp = 0; $bgnlnn = $lnn; } else { $txt .= $ch; } } } # reached end of line - get next $ch = ' '; $txt .= $ch if (length($txt) && !($txt =~ /\s$/)); if ($hadsp) { $attrs .= $ch if (length($attrs) && !($attrs =~ /\s$/)); } else { $tag .= $ch if (length($tag) && !($tag =~ /\s$/)); } $ppc = $pc; $pc = $ch; } if (@elements && !is_all_optional(\@elements)) { show_stack_elements("At-end-of-file",\@elements,\@lines); pgm_exit(1,"ERROR:[3] It is useless to continue when the element stack is out of order!\n"); } prt("Max. element stack $maxdep...$maxelement\n"); prt("Done $lncnt lines... $inf appears ok...\n"); } ######################################### ### MAIN ### parse_args(@ARGV); prt( "$pgmname: in [$cwd]: Process $in_file...\n" ); process_file($in_file); pgm_exit(0,"Normal exit(0)"); ######################################## sub give_help { prt("$pgmname: version 0.0.1 2010-04-06\n"); prt("Usage: $pgmname [options] in_file_name\n"); prt("Options:\n"); prt(" -h (or -?) = THis help, and exit 0\n"); prt(" -l = Load log file at end.\n"); prt(" -v[num] = Bump, or set verbosity to [num]\n"); prt("Parse input file, and report any problems...\n"); } sub parse_args { my (@av) = @_; my ($arg,$sarg,$ch); while (@av) { $arg = $av[0]; if ($arg =~ /^-/) { $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); $ch = substr($sarg,0,1); if ($ch =~ /h/i) { give_help(); pgm_exit(0,"Help exit"); } elsif ($ch =~ /l/i) { $load_log = 1; prt("Set to load log at end\n"); } elsif ($ch =~ /v/i) { $sarg = substr($sarg,1); if (length($sarg)) { if ($sarg =~ /^\d+$/) { $verbosity = $sarg; prt("Set verbosity to [$verbosity]\n"); } else { pgm_exit(1,"Unknown argument [$arg] - verbosity is -v[num]. Try -h for help\n"); } } else { $verbosity++; prt("Bumped verbosity to [$verbosity]\n"); } } else { pgm_exit(1,"Unknown argument [$arg] Try -h for help\n"); } } else { $in_file = $arg; prt("Set input file to [$in_file]\n"); } shift @av; } if (!length($in_file)) { $in_file = $def_infile; $load_log = 1; $verbosity = 9; prt("Set DEFAULT input file to [$in_file], and set load_log=1, and verbosity=$verbosity\n"); } } # eof - template.pl