#!/usr/bin/perl -w # NAME: htmlclass.pl # AIM: Get class list from html file use strict; use warnings; use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] ) use Cwd; my $os = $^O; my $perl_dir = '/home/geoff/bin'; my $PATH_SEP = '/'; my $temp_dir = '/tmp'; if ($os =~ /win/i) { $perl_dir = 'C:\GTools\perl'; $temp_dir = $perl_dir; $PATH_SEP = "\\"; } unshift(@INC, $perl_dir); require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n"; require 'lib_url.pl' or die "Unable to load 'lib_url.pl' Check paths in \@INC...\n"; require 'lib_css.pl' or die "Unable to load 'lib_css.pl' Check paths in \@INC...\n"; # log file stuff our ($LF); my $pgmname = $0; if ($pgmname =~ /(\\|\/)/) { my @tmpsp = split(/(\\|\/)/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt"; open_log($outfile); # user variables my $VERS = "0.0.1 2012-07-18"; my $load_log = 0; my $in_file = ''; my $verbosity = 0; my $out_file = $temp_dir.$PATH_SEP."temphtm.html"; my $find = ''; # ### DEBUG ### my $debug_on = 1; my $def_file = 'C:\FG\17\crossfeed\test\list.html'; my $def_find = 'header_flights'; #my $def_find = "wrapper"; ### program variables my @warnings = (); my $cwd = cwd(); my $strip_bom = 1; my $curr_file_bom = ''; my @BOM_list = ( [ "UTF-8", 3, [0xEF,0xBB,0xBF ] ], # 239 187 191 [ "UTF-16 (BE)", 2, [0xFE,0xFF ] ], # 254 255 [ "UTF-16 (LE)", 2, [0xFF,0xFE ] ], # 255 254 [ "UTF-32 (BE)", 4, [0x00,0x00,0xFE,0xFF] ], # 0 0 254 255 [ "UTF-32 (LE)", 4, [0xFF,0xFE,0x00,0x00] ], # 255 254 0 0 [ "UTF-7a" , 4, [0x2B,0x2F,0x76,0x38] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-7b" , 4, [0x2B,0x2F,0x76,0x39] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-7c" , 4, [0x2B,0x2F,0x76,0x2B] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-7d" , 4, [0x2B,0x2F,0x76,0x2F] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-1" , 3, [0xF7,0x64,0x4C ] ], # 247 100 76 [ "UTF-EBCDIC" , 4, [0xDD,0x73,0x66,0x73] ], # 221 115 102 115 [ "SCSU" , 3, [0x0E,0xFE,0xFF ] ], # 14 254 255 [ "BOCU-1" , 3, [0xFB,0xEE,0x28 ] ], # 251 238 40 [ "GB-18030" , 4, [0x84,0x31,0x95,0x33] ] # 132 49 149 51 ); sub VERB1() { return $verbosity >= 1; } sub VERB2() { return $verbosity >= 2; } sub VERB5() { return $verbosity >= 5; } sub VERB9() { return $verbosity >= 9; } sub show_warnings($) { my ($val) = @_; if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS...\n" ); foreach my $itm (@warnings) { prt("$itm\n"); } prt("\n"); } else { prt( "\nNo warnings issued.\n\n" ) if (VERB9()); } } sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg); } show_warnings($val); close_log($outfile,$load_log); exit($val); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } # LOAD without a BOM sub line_has_bom($$) { my ($line,$rname) = @_; my $max = scalar @BOM_list; my $len = length($line); my ($i,$j,$name,$cnt,$ra,$ch,$val); for ($i = 0; $i < $max; $i++) { $name = $BOM_list[$i][0]; # name $cnt = $BOM_list[$i][1]; # length $ra = $BOM_list[$i][2]; # ref array of values if ($len > $cnt) { # make sure line length GT BOM for ($j = 0; $j < $cnt; $j++) { $ch = substr($line,$j,1); # extract CHAR $val = ord($ch); # get VALUE last if ($val != ${$ra}[$j]); # compare } if ($j == $cnt) { # if ALL values found ${$rname} = $name; # give back 'name' return $cnt; # and return count } } } return 0; # no BOM found } sub remove_utf_bom($$) { my ($ff,$ra) = @_; my $line = ${$ra}[0]; # get first line my $name = ''; my $len = line_has_bom($line,\$name); if ($len) { $curr_file_bom = substr($line,0,$len); $line = substr($line,$len); # truncate line ${$ra}[0] = $line; # and return minus BOM my ($nm,$dr) = fileparse($ff); # just show name prt("NOTE: File [$nm] is $name encoding. BOM($len) removed.\n"); } } sub load_file_lines($$) { my ($ff,$ra) = @_; my $lncnt = 0; $curr_file_bom = ''; if (open INF, "<$ff") { @{$ra} = ; close INF; $lncnt = scalar @{$ra}; remove_utf_bom($ff,$ra) if ($strip_bom); } else { prtw("WARNING: Unable to open [$ff]!\n"); } return $lncnt; } sub show_hash($$) { my ($rh,$msg) = @_; my ($key,$val); my $cnt = 0; my $hdr = "$msg ["; foreach $key (keys %{$rh}) { $val = ${$rh}{$key}; prt("$hdr") if (length($hdr)); if (length($val)) { prt("$key=$val "); } else { prt("$key "); } $cnt++; $hdr = ''; } prt("]\n"); } sub split_attrs($) { my $txt = shift; my @arr = space_split($txt); my ($cnt,$key,$val,$i); my %h = (); foreach $txt (@arr) { my @a = split('=',$txt); $cnt = scalar @a; if ($cnt > 2) { for ($i = 2; $i < $cnt; $i++) { $a[1] .= "="; $a[1] .= $a[$i]; } $cnt = 2; } if ($cnt == 1) { $h{$txt} = ''; } elsif ($cnt == 2) { $key = $a[0]; $val = strip_both_quotes($a[1]); $h{$key} = $val; } else { pgm_exit(1,"UGH: cnt=$cnt NOT HANDLED [$txt]! FIX ME!\n"); } } show_hash(\%h,'ATTR') if (VERB9()); return \%h; } sub search_for_find($$) { my ($rh,$file) = @_; my ($key,$val,$k2,$v2); foreach $key (keys %{$rh}) { $val = ${$rh}{$key}; if ($key =~ /$find/i) { prt("$file: Found key $key, with find [$find]\n"); show_hash($val,'VAL'); } else { foreach $k2 (keys %{$val}) { $v2 = ${$val}{$k2}; if ($k2 =~ /$find/i) { prt("$file: With key $key, and find [$find], fnd $k2 = $v2\n"); } elsif ($v2 =~ /$find/i) { prt("$file: With key $key, and find [$find], k2 $k2 = $v2 fnd\n"); } } } } } ## for file, read file lines with sub read_css_file($$$) { my ($rh,$file,$dbg) = @_; ## for URL, fetch, with sub fetch_url($$) { my ($url,$ra) = @_; ## and get ref hash = ## return read_css_lines(\@lines,$rh,$file,$dbg); sub process_in_file($) { my ($inf) = @_; my @lines = (); my $lncnt = load_file_lines($inf,\@lines); prt("Processing $lncnt lines, from [$inf]...\n"); my ($name,$dir) = fileparse($inf); my ($line,$inc,$lnn,$i,$len,$ch,$pc,$tag,$txt,$attr,$msg,$tmp,$name,$cnt); my ($rh,$href,$ff,$tmp); ut_fix_directory(\$dir); my $inquot = 0; my $incomm = 0; my $intable = 0; my $intag = 0; my $hadsp = 0; my $inbody = 0; my $intr = 0; my $tdcnt = 0; my $trcnt = 0; my $inscript = 0; my $endsc = 0; my $http = ''; my $lsttxt = ''; my $ptxt = ''; my $endtable = 0; my $eol = " "; # or "\n" $lnn = 0; $ch = ''; $tag = ''; $txt = ''; $attr = ''; my @list = (); my @csslns = (); my ($rcss,$csslns); my $dbg = 1; foreach $line (@lines) { chomp $line; $lnn++; $line = trim_all($line); $len = length($line); next if ($len == 0); for ($i = 0; $i < $len; $i++) { $pc = $ch; $ch = substr($line,$i,1); if ($intag) { if ($ch eq '>') { $msg = "<$tag"; $msg .= " $attr" if (length($attr)); $msg .= $ch; $intag = 0; if ($incomm) { if ($attr =~ /--$/) { $incomm = 0; $endsc = 1; } } elsif ($inscript) { if ($tag =~ /^\/script$/i) { $inscript = 0; $endsc = 1; } } elsif ($inbody) { if ($tag =~ /^\/body$/i) { $inbody = 0; } } elsif ($tag =~ /^body$/i) { $inbody = 1; } if ($inscript || $incomm || $endsc) { # no show } else { prt("$txt"); prt("$msg\n"); $rh = split_attrs($attr) if (length($attr)); if ($tag =~ /^link$/i) { if (defined ${$rh}{'href'}) { $href = ${$rh}{'href'}; my %h = (); my ($n,$d); if ($href =~ /^http:\/\//i) { $csslns = fetch_url($href,\@csslns,$dbg); # { my ($url,$ra) = @_; $tmp = $href; $tmp =~ s/^http:\/\///; ($n,$d) = fileparse($tmp); $rcss = read_css_lines(\@csslns,\%h,$n,$dbg); } else { $ff = $dir.$href; ($n,$d) = fileparse($href); $rcss = read_css_file(\%h,$ff,$dbg); } search_for_find(\%h,$n); } } } $txt = ''; $tag = ''; $attr = ''; $endsc = 0; } elsif ($hadsp) { $attr .= $ch; # add to attributes } elsif ($ch =~ /\s/) { $hadsp = 1; if ($tag =~ /^script$/i) { $inscript = 1; } } else { $tag .= $ch; if (($ch eq '-')&&($tag =~ /^!--/)) { #prt("$lnn: Begin comment - <$tag\n"); $incomm = 1; } } } else { if ($ch eq '<') { $intag = 1; $hadsp = 0; $attr = ''; $tag = ''; } else { $txt .= $ch; } } } # end of line if ($intag) { if ($hadsp) { ###$attr .= ' ' if (length($attr)); $attr .= $eol if (length($attr)); } else { $hadsp = 1; } } else { $txt .= $eol if (length($txt)); } } #write2file($http,$out_file); #prt("New HTML wirtten to $out_file\n"); } ######################################### ### MAIN ### parse_args(@ARGV); process_in_file($in_file); pgm_exit(0,""); ######################################## sub need_arg { my ($arg,@av) = @_; pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av); } sub parse_args { my (@av) = @_; my ($arg,$sarg); while (@av) { $arg = $av[0]; if ($arg =~ /^-/) { $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); if (($sarg =~ /^h/i)||($sarg eq '?')) { give_help(); pgm_exit(0,"Help exit(0)"); } elsif ($sarg =~ /^f/) { need_arg(@av); shift @av; $sarg = $av[0]; $find = $sarg; prt("Set FIND to [$find].\n") if (VERB1()); } elsif ($sarg =~ /^v/) { if ($sarg =~ /^v.*(\d+)$/) { $verbosity = $1; } else { while ($sarg =~ /^v/) { $verbosity++; $sarg = substr($sarg,1); } } prt("Verbosity = $verbosity\n") if (VERB1()); } elsif ($sarg =~ /^l/) { if ($sarg =~ /^ll/) { $load_log = 2; } else { $load_log = 1; } prt("Set to load log at end. ($load_log)\n") if (VERB1()); } elsif ($sarg =~ /^o/) { need_arg(@av); shift @av; $sarg = $av[0]; $out_file = $sarg; prt("Set out file to [$out_file].\n") if (VERB1()); } else { pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n"); } } else { $in_file = $arg; prt("Set input to [$in_file]\n") if (VERB1()); } shift @av; } if ($debug_on) { prtw("WARNING: DEBUG is ON\n"); if (length($in_file) == 0) { $in_file = $def_file; prt("Set DEFAULT input to [$in_file]\n"); } if (length($find) == 0) { $find = $def_find; } } if (length($in_file) == 0) { pgm_exit(1,"ERROR: No input files found in command!\n"); } if (! -f $in_file) { pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n"); } if (length($find) == 0) { pgm_exit(1,"ERROR: No class/id to find found in command!\n"); } } sub give_help { prt("$pgmname: version $VERS\n"); prt("Usage: $pgmname [options] in-file\n"); prt("Options:\n"); prt(" --help (-h or -?) = This help, and exit 0.\n"); prt(" --verb[n] (-v) = Bump [or set] verbosity. def=$verbosity\n"); prt(" --load (-l) = Load LOG at end. ($outfile)\n"); prt(" --find class (-f) = Class or id to find.\n"); prt(" --out (-o) = Write output to this file.\n"); } # eof - template.pl