#!/usr/bin/perl -w
# NAME: htmlclass.pl
# AIM: Get class list from html file
use strict;
use warnings;
use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use Cwd;
my $os = $^O;
my $perl_dir = '/home/geoff/bin';
my $PATH_SEP = '/';
my $temp_dir = '/tmp';
if ($os =~ /win/i) {
$perl_dir = 'C:\GTools\perl';
$temp_dir = $perl_dir;
$PATH_SEP = "\\";
}
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";
require 'lib_url.pl' or die "Unable to load 'lib_url.pl' Check paths in \@INC...\n";
require 'lib_css.pl' or die "Unable to load 'lib_css.pl' Check paths in \@INC...\n";
# log file stuff
our ($LF);
my $pgmname = $0;
if ($pgmname =~ /(\\|\/)/) {
my @tmpsp = split(/(\\|\/)/,$pgmname);
$pgmname = $tmpsp[-1];
}
my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt";
open_log($outfile);
# user variables
my $VERS = "0.0.1 2012-07-18";
my $load_log = 0;
my $in_file = '';
my $verbosity = 0;
my $out_file = $temp_dir.$PATH_SEP."temphtm.html";
my $find = '';
# ### DEBUG ###
my $debug_on = 1;
my $def_file = 'C:\FG\17\crossfeed\test\list.html';
my $def_find = 'header_flights';
#my $def_find = "wrapper";
### program variables
my @warnings = ();
my $cwd = cwd();
my $strip_bom = 1;
my $curr_file_bom = '';
my @BOM_list = (
[ "UTF-8", 3, [0xEF,0xBB,0xBF ] ], # 239 187 191
[ "UTF-16 (BE)", 2, [0xFE,0xFF ] ], # 254 255
[ "UTF-16 (LE)", 2, [0xFF,0xFE ] ], # 255 254
[ "UTF-32 (BE)", 4, [0x00,0x00,0xFE,0xFF] ], # 0 0 254 255
[ "UTF-32 (LE)", 4, [0xFF,0xFE,0x00,0x00] ], # 255 254 0 0
[ "UTF-7a" , 4, [0x2B,0x2F,0x76,0x38] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F
[ "UTF-7b" , 4, [0x2B,0x2F,0x76,0x39] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F
[ "UTF-7c" , 4, [0x2B,0x2F,0x76,0x2B] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F
[ "UTF-7d" , 4, [0x2B,0x2F,0x76,0x2F] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F
[ "UTF-1" , 3, [0xF7,0x64,0x4C ] ], # 247 100 76
[ "UTF-EBCDIC" , 4, [0xDD,0x73,0x66,0x73] ], # 221 115 102 115
[ "SCSU" , 3, [0x0E,0xFE,0xFF ] ], # 14 254 255
[ "BOCU-1" , 3, [0xFB,0xEE,0x28 ] ], # 251 238 40
[ "GB-18030" , 4, [0x84,0x31,0x95,0x33] ] # 132 49 149 51
);
sub VERB1() { return $verbosity >= 1; }
sub VERB2() { return $verbosity >= 2; }
sub VERB5() { return $verbosity >= 5; }
sub VERB9() { return $verbosity >= 9; }
sub show_warnings($) {
my ($val) = @_;
if (@warnings) {
prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
foreach my $itm (@warnings) {
prt("$itm\n");
}
prt("\n");
} else {
prt( "\nNo warnings issued.\n\n" ) if (VERB9());
}
}
sub pgm_exit($$) {
my ($val,$msg) = @_;
if (length($msg)) {
$msg .= "\n" if (!($msg =~ /\n$/));
prt($msg);
}
show_warnings($val);
close_log($outfile,$load_log);
exit($val);
}
sub prtw($) {
my ($tx) = shift;
$tx =~ s/\n$//;
prt("$tx\n");
push(@warnings,$tx);
}
# LOAD without a BOM
sub line_has_bom($$) {
my ($line,$rname) = @_;
my $max = scalar @BOM_list;
my $len = length($line);
my ($i,$j,$name,$cnt,$ra,$ch,$val);
for ($i = 0; $i < $max; $i++) {
$name = $BOM_list[$i][0]; # name
$cnt = $BOM_list[$i][1]; # length
$ra = $BOM_list[$i][2]; # ref array of values
if ($len > $cnt) { # make sure line length GT BOM
for ($j = 0; $j < $cnt; $j++) {
$ch = substr($line,$j,1); # extract CHAR
$val = ord($ch); # get VALUE
last if ($val != ${$ra}[$j]); # compare
}
if ($j == $cnt) { # if ALL values found
${$rname} = $name; # give back 'name'
return $cnt; # and return count
}
}
}
return 0; # no BOM found
}
sub remove_utf_bom($$) {
my ($ff,$ra) = @_;
my $line = ${$ra}[0]; # get first line
my $name = '';
my $len = line_has_bom($line,\$name);
if ($len) {
$curr_file_bom = substr($line,0,$len);
$line = substr($line,$len); # truncate line
${$ra}[0] = $line; # and return minus BOM
my ($nm,$dr) = fileparse($ff); # just show name
prt("NOTE: File [$nm] is $name encoding. BOM($len) removed.\n");
}
}
sub load_file_lines($$) {
my ($ff,$ra) = @_;
my $lncnt = 0;
$curr_file_bom = '';
if (open INF, "<$ff") {
@{$ra} = ;
close INF;
$lncnt = scalar @{$ra};
remove_utf_bom($ff,$ra) if ($strip_bom);
} else {
prtw("WARNING: Unable to open [$ff]!\n");
}
return $lncnt;
}
sub show_hash($$) {
my ($rh,$msg) = @_;
my ($key,$val);
my $cnt = 0;
my $hdr = "$msg [";
foreach $key (keys %{$rh}) {
$val = ${$rh}{$key};
prt("$hdr") if (length($hdr));
if (length($val)) {
prt("$key=$val ");
} else {
prt("$key ");
}
$cnt++;
$hdr = '';
}
prt("]\n");
}
sub split_attrs($) {
my $txt = shift;
my @arr = space_split($txt);
my ($cnt,$key,$val,$i);
my %h = ();
foreach $txt (@arr) {
my @a = split('=',$txt);
$cnt = scalar @a;
if ($cnt > 2) {
for ($i = 2; $i < $cnt; $i++) {
$a[1] .= "=";
$a[1] .= $a[$i];
}
$cnt = 2;
}
if ($cnt == 1) {
$h{$txt} = '';
} elsif ($cnt == 2) {
$key = $a[0];
$val = strip_both_quotes($a[1]);
$h{$key} = $val;
} else {
pgm_exit(1,"UGH: cnt=$cnt NOT HANDLED [$txt]! FIX ME!\n");
}
}
show_hash(\%h,'ATTR') if (VERB9());
return \%h;
}
sub search_for_find($$) {
my ($rh,$file) = @_;
my ($key,$val,$k2,$v2);
foreach $key (keys %{$rh}) {
$val = ${$rh}{$key};
if ($key =~ /$find/i) {
prt("$file: Found key $key, with find [$find]\n");
show_hash($val,'VAL');
} else {
foreach $k2 (keys %{$val}) {
$v2 = ${$val}{$k2};
if ($k2 =~ /$find/i) {
prt("$file: With key $key, and find [$find], fnd $k2 = $v2\n");
} elsif ($v2 =~ /$find/i) {
prt("$file: With key $key, and find [$find], k2 $k2 = $v2 fnd\n");
}
}
}
}
}
## for file, read file lines with sub read_css_file($$$) { my ($rh,$file,$dbg) = @_;
## for URL, fetch, with sub fetch_url($$) { my ($url,$ra) = @_;
## and get ref hash =
## return read_css_lines(\@lines,$rh,$file,$dbg);
sub process_in_file($) {
my ($inf) = @_;
my @lines = ();
my $lncnt = load_file_lines($inf,\@lines);
prt("Processing $lncnt lines, from [$inf]...\n");
my ($name,$dir) = fileparse($inf);
my ($line,$inc,$lnn,$i,$len,$ch,$pc,$tag,$txt,$attr,$msg,$tmp,$name,$cnt);
my ($rh,$href,$ff,$tmp);
ut_fix_directory(\$dir);
my $inquot = 0;
my $incomm = 0;
my $intable = 0;
my $intag = 0;
my $hadsp = 0;
my $inbody = 0;
my $intr = 0;
my $tdcnt = 0;
my $trcnt = 0;
my $inscript = 0;
my $endsc = 0;
my $http = '';
my $lsttxt = '';
my $ptxt = '';
my $endtable = 0;
my $eol = " "; # or "\n"
$lnn = 0;
$ch = '';
$tag = '';
$txt = '';
$attr = '';
my @list = ();
my @csslns = ();
my ($rcss,$csslns);
my $dbg = 1;
foreach $line (@lines) {
chomp $line;
$lnn++;
$line = trim_all($line);
$len = length($line);
next if ($len == 0);
for ($i = 0; $i < $len; $i++) {
$pc = $ch;
$ch = substr($line,$i,1);
if ($intag) {
if ($ch eq '>') {
$msg = "<$tag";
$msg .= " $attr" if (length($attr));
$msg .= $ch;
$intag = 0;
if ($incomm) {
if ($attr =~ /--$/) {
$incomm = 0;
$endsc = 1;
}
} elsif ($inscript) {
if ($tag =~ /^\/script$/i) {
$inscript = 0;
$endsc = 1;
}
} elsif ($inbody) {
if ($tag =~ /^\/body$/i) {
$inbody = 0;
}
} elsif ($tag =~ /^body$/i) {
$inbody = 1;
}
if ($inscript || $incomm || $endsc) {
# no show
} else {
prt("$txt");
prt("$msg\n");
$rh = split_attrs($attr) if (length($attr));
if ($tag =~ /^link$/i) {
if (defined ${$rh}{'href'}) {
$href = ${$rh}{'href'};
my %h = ();
my ($n,$d);
if ($href =~ /^http:\/\//i) {
$csslns = fetch_url($href,\@csslns,$dbg); # { my ($url,$ra) = @_;
$tmp = $href;
$tmp =~ s/^http:\/\///;
($n,$d) = fileparse($tmp);
$rcss = read_css_lines(\@csslns,\%h,$n,$dbg);
} else {
$ff = $dir.$href;
($n,$d) = fileparse($href);
$rcss = read_css_file(\%h,$ff,$dbg);
}
search_for_find(\%h,$n);
}
}
}
$txt = '';
$tag = '';
$attr = '';
$endsc = 0;
} elsif ($hadsp) {
$attr .= $ch; # add to attributes
} elsif ($ch =~ /\s/) {
$hadsp = 1;
if ($tag =~ /^script$/i) {
$inscript = 1;
}
} else {
$tag .= $ch;
if (($ch eq '-')&&($tag =~ /^!--/)) {
#prt("$lnn: Begin comment - <$tag\n");
$incomm = 1;
}
}
} else {
if ($ch eq '<') {
$intag = 1;
$hadsp = 0;
$attr = '';
$tag = '';
} else {
$txt .= $ch;
}
}
}
# end of line
if ($intag) {
if ($hadsp) {
###$attr .= ' ' if (length($attr));
$attr .= $eol if (length($attr));
} else {
$hadsp = 1;
}
} else {
$txt .= $eol if (length($txt));
}
}
#write2file($http,$out_file);
#prt("New HTML wirtten to $out_file\n");
}
#########################################
### MAIN ###
parse_args(@ARGV);
process_in_file($in_file);
pgm_exit(0,"");
########################################
sub need_arg {
my ($arg,@av) = @_;
pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av);
}
sub parse_args {
my (@av) = @_;
my ($arg,$sarg);
while (@av) {
$arg = $av[0];
if ($arg =~ /^-/) {
$sarg = substr($arg,1);
$sarg = substr($sarg,1) while ($sarg =~ /^-/);
if (($sarg =~ /^h/i)||($sarg eq '?')) {
give_help();
pgm_exit(0,"Help exit(0)");
} elsif ($sarg =~ /^f/) {
need_arg(@av);
shift @av;
$sarg = $av[0];
$find = $sarg;
prt("Set FIND to [$find].\n") if (VERB1());
} elsif ($sarg =~ /^v/) {
if ($sarg =~ /^v.*(\d+)$/) {
$verbosity = $1;
} else {
while ($sarg =~ /^v/) {
$verbosity++;
$sarg = substr($sarg,1);
}
}
prt("Verbosity = $verbosity\n") if (VERB1());
} elsif ($sarg =~ /^l/) {
if ($sarg =~ /^ll/) {
$load_log = 2;
} else {
$load_log = 1;
}
prt("Set to load log at end. ($load_log)\n") if (VERB1());
} elsif ($sarg =~ /^o/) {
need_arg(@av);
shift @av;
$sarg = $av[0];
$out_file = $sarg;
prt("Set out file to [$out_file].\n") if (VERB1());
} else {
pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
}
} else {
$in_file = $arg;
prt("Set input to [$in_file]\n") if (VERB1());
}
shift @av;
}
if ($debug_on) {
prtw("WARNING: DEBUG is ON\n");
if (length($in_file) == 0) {
$in_file = $def_file;
prt("Set DEFAULT input to [$in_file]\n");
}
if (length($find) == 0) {
$find = $def_find;
}
}
if (length($in_file) == 0) {
pgm_exit(1,"ERROR: No input files found in command!\n");
}
if (! -f $in_file) {
pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
}
if (length($find) == 0) {
pgm_exit(1,"ERROR: No class/id to find found in command!\n");
}
}
sub give_help {
prt("$pgmname: version $VERS\n");
prt("Usage: $pgmname [options] in-file\n");
prt("Options:\n");
prt(" --help (-h or -?) = This help, and exit 0.\n");
prt(" --verb[n] (-v) = Bump [or set] verbosity. def=$verbosity\n");
prt(" --load (-l) = Load LOG at end. ($outfile)\n");
prt(" --find class (-f) = Class or id to find.\n");
prt(" --out (-o) = Write output to this file.\n");
}
# eof - template.pl