#!/usr/bin/perl -w # NAME: genindex04.pl # AIM: Complete re-write - Given an input folder, generate a tempsitemap.htm of # a complete directory scan # 11/09/2014 - Add -X file - list to exclude # 06/09/2014 - Add -x *.htm* etc to be used with -a (all) # 10/12/2013 - Add alpha sorted jump list. # 18/11/2013 - Add description column, from description.csv file # 06/01/2012 geoff mclane http://geoffair.net/mperl use strict; use warnings; use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] ) use File::stat; use Cwd; my $os = $^O; my $perl_dir = '/home/geoff/bin'; my $PATH_SEP = '/'; my $temp_dir = '/tmp'; if ($os =~ /win/i) { $perl_dir = 'C:\GTools\perl'; $temp_dir = $perl_dir; $PATH_SEP = "\\"; } unshift(@INC, $perl_dir); require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n"; require 'lib_html.pl' or die "Unable to load 'lib_html.pl' Check paths in \@INC...\n"; # log file stuff our ($LF); my $pgmname = $0; if ($pgmname =~ /(\\|\/)/) { my @tmpsp = split(/(\\|\/)/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt"; open_log($outfile); # user variables my $VERS = "0.0.2 2016-08-29"; ###my $VERS = "0.0.1 2012-01-06"; my $load_log = 0; my $in_dir = ''; my $verbosity = 0; my $out_xml = 'tempsitemap.htm'; my $xclude_repo_dirs = 1; my @repo_dirs = qw( CVS .svn .git .hg ); my $recursive = 0; my $html_only = 1; my $blank = 0; my $colcount = 2; my $desccol = 0; # from file, this could be the title of the html is there is one my %descriptions = (); my $add_sorted = 0; my $load_browser = 1; my @excluded = (); my @descriptions = (); my $add_table_width = 0; # debug my $debug_on = 0; my $def_file = 'C:\GTools\perl'; ### program variables my @warnings = (); my $cwd = cwd(); # forward sub scan_directory($$); sub VERB1() { return $verbosity >= 1; } sub VERB2() { return $verbosity >= 2; } sub VERB5() { return $verbosity >= 5; } sub VERB9() { return $verbosity >= 9; } sub is_repo_directory($) { my $dir = shift; my ($test); foreach $test (@repo_dirs) { return 1 if ($dir eq $test); } return 0; } sub show_warnings($) { my ($val) = @_; if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS...\n" ); foreach my $itm (@warnings) { prt("$itm\n"); } prt("\n"); } else { prt( "\nNo warnings issued.\n\n" ) if (VERB9()); } } sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg); } show_warnings($val); close_log($outfile,$load_log); exit($val); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } sub process_in_file($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$inc,$lnn); $lnn = 0; foreach $line (@lines) { chomp $line; $lnn++; if ($line =~ /\s*#\s*include\s+(.+)$/) { $inc = $1; prt("$lnn: $inc\n"); } } } sub is_html_like($) { my $fil = shift; return 1 if ($fil =~ /\.html$/i); return 1 if ($fil =~ /\.htm$/i); return 1 if ($fil =~ /\.php$/i); return 1 if ($fil =~ /\.shtml$/i); return 1 if ($fil =~ /\.phtml$/i); return 0; } my $dbg_mww = 0; sub match_with_wild($$) { my ($fil1,$fil2) = @_; my $len1 = length($fil1); my $len2 = length($fil2); prt("match_with_wild: [$fil1] [$fil2] ") if ($dbg_mww); my ($i,$j,$c1,$c2); $i = 0; $j = 0; if (($len1 > 0) && ($len2 > 0)) { while (($i < $len1)&&($j < $len2)) { $c1 = substr($fil1,$i,1); $c2 = substr($fil2,$j,1); if (($c1 eq $c2)||($c1 eq '?')||($c2 eq '?')) { $i++; $j++; prt("$c1= ") if ($dbg_mww); } elsif ($c2 eq '*') { $i++; # any $c1 matches asterick if (($j + 1) < $len2) { # but if more, maybe time to step past '*' $c2 = substr($fil2,($j+1),1); if ($c1 eq $c2) { $j += 2; } } prt("$c1* ") if ($dbg_mww); } elsif ($c1 eq '*') { $j++; # any $c2 matches asterick if (($i + 1) < $len1) { # but if more, maybe time to step past '*' $c1 = substr($fil1,($i+1),1); if ($c1 eq $c2) { $i += 2; } } prt("$c2* ") if ($dbg_mww); } else { prt(" = 0 - [$c1] ne [$c2]\n") if ($dbg_mww); return 0; } } if (($i == $len1)&&($j == $len2)) { prt(" = 1 - both ran out of chars\n") if ($dbg_mww); return 1; # both ran out of chars } elsif (($i == $len1)&&($c2 eq '*')&&(($j + 1) == $len2)){ prt(" = 1 - first ran out and last is second $c2\n") if ($dbg_mww); return 1; # first ran out, and second is last '*' } elsif (($j == $len2)&&($c1 eq '*')&&(($i + 1) == $len1)){ prt(" = 1 - second ran out and last of first is $c1\n") if ($dbg_mww); return 1; # second ran out, and second is last '*' } prt(" = 0 - failed - no case\n") if ($dbg_mww); } elsif ($len1 > 0) { # 2nd is nul if ($fil1 eq '*') { prt(" = 1 - asterix matches nul\n") if ($dbg_mww); return 1; # nul matches asterix } prt(" = 0 - len1 > 0, but [$fil1]\n") if ($dbg_mww); } elsif ($len2 > 0) { # 1st is nul if ($fil2 eq '*') { prt(" = 1 - nul match asterix\n") if ($dbg_mww); return 1; # nul matches asterix } prt(" = 0 - len2 > 0, but [$fil1]\n") if ($dbg_mww); } else { prt(" = 0 - no case\n") if ($dbg_mww); } return 0; } # 20140911 - fix for wild like 'temp*' = 'temp*.*'; sub matches_wild($$) { # 20140911 - fix for wild like 'temp*' = 'temp*.*'; my ($fil,$wild) = @_; my ($n1,$d1,$e1) = fileparse( $fil, qr/\.[^.]*/ ); my ($n2,$d2,$e2) = fileparse( $wild, qr/\.[^.]*/ ); my $lcn1 = lc($n1); my $lcn2 = lc($n2); # strip . from extension $e1 =~ s/^\.//; $e2 =~ s/^\.//; my $lce1 = lc($e1); my $lce2 = lc($e2); # add * if no extent $lce1 = '*' if (length($lce1) == 0); $lce2 = '*' if (length($lce2) == 0); prt("matches_wild: [$n1] [$n2] and [$e1] [$e2]\n") if (VERB9()); return 1 if (($lcn1 eq $lcn2)&&($lce1 eq $lce2)); return 2 if (($lcn1 eq $lcn2)&&($lce2 eq '*')); return 3 if (($lcn2 eq '*')&&($lce1 eq $lce2)); return 4 if (match_with_wild($lcn1,$lcn2) && match_with_wild($lce1,$lce2)); return 0; } sub has_wild($) { my $txt = shift; my $len = length($txt); my ($i,$c); for ($i = 0; $i < $len; $i++) { $c = substr($txt,$i,1); return 1 if (($c eq '?')||($c eq '*')); } return 0; } sub is_in_excluded($) { my $file = shift; my ($xcl); foreach $xcl (@excluded) { return 1 if ($xcl eq $file); return 1 if (lc($xcl) eq lc($file)); if (has_wild($xcl)) { return 1 if (matches_wild($file,$xcl)); } } return 0; } sub get_html_title($) { my $ff = shift; my $title = ''; if (open FIL,"<$ff") { my @lines = ; close FIL; my ($line); $line = join(" ",@lines); my $ra = get_html_refarray($line); # my $content = shift; my $ta = get_whole_tag_array($ra,'title',0); # show_html_refarray($ta); my $tta = get_title_text($ta,0); $title = join(" ",@{$tta}); prt("File $ff has title: $title\n") if (VERB9()); } return $title; } sub scan_directory($$) { my ($dir,$rlist) = @_; if (!opendir(DIR,$dir)) { prtw("WARNING: Failed to open directory [$dir]\n"); } my @files = readdir(DIR); closedir(DIR); my ($item,$ff,$ishtml,$sb,$ft,$desc,$title); $dir .= $PATH_SEP if (!($dir =~ /(\\|\/)$/)); my @dirs = (); $desc = 'nbsp;'; foreach $item (@files) { next if (($item eq '.')||($item eq '..')); $ff = $dir.$item; if (-f $ff) { # next if ($item eq $out_xml); # skip self next if ($item =~ /\.bak$/i); # skip .bak $ishtml = is_html_like($item); next if ($html_only && !$ishtml); if (is_in_excluded($item)) { prt("User excluded [$ff]\n") if (VERB1()); next; } my ($n,$d,$e) = fileparse($item, qr/\.[^.]*/); $sb = stat($ff); $ft = $sb->mtime; if ($desccol) { if (defined $descriptions{$item}) { $desc = $descriptions{$item}; } else { $title = get_html_title($ff); prtw("WARNING: No desc for $item,$title\n"); if (length($title)) { push(@descriptions,"$item,$title"); $desc = $title; } else { $desc = ' '; } } } # 0 1 2 3 push(@{$rlist},[$ff,$e,$ft,$desc]); # got a FILE } elsif (-d $ff) { push(@dirs,$ff) if (!is_repo_directory($item)); # got a directory } else { prtw("WARNING: item [$ff] skipped!\n"); } } foreach $dir (@dirs) { scan_directory($dir,$rlist); } } sub mycmp_nc_n1 { my $nm1 = lc(${$a}[1]); my $nm2 = lc(${$b}[2]); return 1 if ($nm1 gt $nm2); return -1 if ($nm1 lt $nm2); return 0; } sub write_html($$) { my ($dir,$rlist) = @_; my $len = length($dir); my ($file,$cnt,$i,$html,$wrap,$cols,$rows,$mrow,$flen,$maxlen,$ind,$href,$tmp,$desc); my ($line); my $added_cnt = 0; $cnt = scalar @{$rlist}; if ($cnt == 0) { prt("No files to write, thus no html generated!\n"); return; } $html = "\n"; $html .= "\n"; $html .= " \n"; $html .= " \n"; $html .= " Site Map\n"; $html .= " \n"; $html .= " \n"; $html .= " \n"; $html .= "

Site Map - $cnt Files

\n"; $html .= "

index\n"; $html .= " end

\n"; if ($add_sorted) { $html .= "

Alpha sorted jump list

\n"; } $html .= "\n"; $html .= "
\n"; $html .= " $maxlen)&&($ind > 0)) { $dr = substr($file,0,$ind); $file = "$dr...$nm"; } push(@links,[$href,$nm,$file]); if ($desc eq 'Previous backup copy') { push(@backups,[$file,$href,$desc]); next; } $added_cnt++; $line = " \n"; $line .= " \n" if ($desccol); $html .= " \n" if ($cols == 0); $html .= $line; $cols++; if ($cols == $wrap) { $html .= " \n"; $cols = 0; $rows++; if ($rows == $mrow) { $tmp = $wrap; $tmp *= 2 if ($desccol); $html .= " \n"; $html .= " \n"; $html .= " \n"; $rows = 0; } } } my $cnt2 = scalar @backups; for ($i = 0; $i < $cnt2; $i++) { $file = $backups[$i][0]; $href = $backups[$i][1]; $desc = $backups[$i][2]; $line = " \n"; $line .= " \n" if ($desccol); $html .= " \n" if ($cols == 0); $html .= $line; $cols++; $added_cnt++; if ($cols == $wrap) { $html .= " \n"; $cols = 0; $rows++; if ($rows == $mrow) { $tmp = $wrap; $tmp *= 2 if ($desccol); $html .= " \n"; $html .= " \n"; $rows = 0; } } } if ($cols) { while ($cols < $wrap) { $html .= " "; $html .= " " if ($desccol); $cols++; } $html .= " \n"; } $html .= " \n"; $html .= "
"; if ($desc eq 'This file') { $line .= "$file"; } else { $line .= "$file"; } $line .= " $desc
\n"; $html .= " top end\n"; $html .= "
$file$desc
\n"; $html .= " top end\n"; $html .= "
  
\n"; $html .= "
\n"; $html .= "\n"; $cnt2 = scalar @links; if ($add_sorted && $cnt2) { $html .= "\n"; $line = "

Alpha sorted list: "; @links = sort mycmp_nc_n1 @links; for ($i = 0; $i < $cnt2; $i++) { $href = $links[$i][0]; $file = $links[$i][2]; $line .= "$file "; if (length($line) > 90) { $html .= "$line\n"; $line = ''; } } $html .= "$line" if (length($line)); $html .= "

\n"; } # maybe now another table by the extension, or whatevr my $date = lu_get_YYYYMMDD_hhmmss_UTC(time()); $html .= "

Done Site Map of $added_cnt of $cnt files, on ".$date." UTC, by $pgmname

\n"; $html .= " "; $html .= " \n"; $html .= "\n"; rename_2_old_bak($out_xml); write2file($html,$out_xml); prt("Site list written to $out_xml\n"); if ($load_log && VERB5()) { prt("=== HTML start ==========================================\n"); prt($html); prt("=== HTML end ==========================================\n"); } if ($load_browser) { if ($os =~ /win/i) { system($out_xml); } else { system("firefox $out_xml"); } } } sub mycmp_decend_n2 { return 1 if (${$a}[2] < ${$b}[2]); return -1 if (${$a}[2] > ${$b}[2]); return 0; } sub process_in_directory($) { my $dir = shift; opendir(DIR,$dir) || pgm_exit(1,"ERROR: Unable to open directory [$dir]!\n"); my @files = readdir(DIR); closedir(DIR); my $itemcnt = scalar @files; prt("Got $itemcnt items, from base directory [$dir]...\n"); my ($item,$ff,$ishtml,$sb,$ft,$desc); ut_fix_directory(\$dir); #$dir .= $PATH_SEP if (!($dir =~ /(\\|\/)$/)); my @file_list = (); my @dirs = (); $desc = ' '; foreach $item (@files) { next if (($item eq '.')||($item eq '..')); $ff = $dir.$item; if (-f $ff) { next if ($item eq $out_xml); # skip self next if ($item =~ /\.bak$/i); # skip .bak $ishtml = is_html_like($item); next if ($html_only && !$ishtml); if (is_in_excluded($item)) { prt("User excluded [$ff]\n") if (VERB1()); next; } my ($n,$d,$e) = fileparse($item, qr/\.[^.]*/); $sb = stat($ff); $ft = $sb->mtime; if ($desccol) { if (defined $descriptions{$item}) { $desc = $descriptions{$item}; } else { $desc = get_html_title($ff); # get 'title' prtw("WARNING: No decription for $item,$desc\n"); if (length($desc) == 0) { $desc = ' '; } else { push(@descriptions,"$item,$desc"); } } } # 0 1 2 3 push(@file_list,[$ff,$e,$ft,$desc]); # got a FILE } elsif (-d $ff) { push(@dirs,$ff) if (!is_repo_directory($item)); # got a directory - skip repos } else { prtw("WARNING: item [$ff] skipped!\n"); } } if ($recursive) { foreach $dir (@dirs) { scan_directory($dir,\@file_list); } } $itemcnt = scalar @file_list; my @arr = sort mycmp_decend_n2 @file_list; prt("Got TOTAL $itemcnt files, from directory $dir...\n"); write_html($dir,\@arr); ##write_html($dir,\@file_list); } sub show_descriptions() { my $cnt = scalar @descriptions; if (!$cnt) { return; } prt("Using document title, have following $cnt descriptions for consideration...\n"); prt(join("\n",@descriptions)."\n"); } ######################################### ### MAIN ### parse_args(@ARGV); set_show_warnings(0) if (!VERB2()); process_in_directory($in_dir); show_descriptions(); pgm_exit(0,""); ######################################## sub need_arg { my ($arg,@av) = @_; pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av); } sub load_descriptions($) { my $fil = shift; if (! open(FIL,"<$fil")) { pgm_exit(1,"ERROR: Unable to open description file [$fil]!\n"); } my @lines = ; close FIL; my $lncnt = scalar @lines; my ($i,$line,@arr,$cnt,$len,$tline,$i2,$file,$desc); my $dcnt = 0; for ($i = 1; $i < $lncnt; $i++) { $i2 = $i+1; $line = $lines[$i]; chomp $line; $tline = trim_all($line); $len = length($tline); next if ($len == 0); @arr = split(",",$line); $cnt = scalar @arr; if ($cnt == 2) { $file = $arr[0]; $desc = $arr[1]; $descriptions{$file} = $desc; $dcnt++; } else { prtw("$i2: Did NOT split into 2 [$line]! Got $cnt\n"); } } prt("Loaded $dcnt descriptions from $fil\n"); } sub parse_args { my (@av) = @_; my ($arg,$sarg,$tmp,@arr,$len); while (@av) { $arg = $av[0]; if ($arg =~ /^-/) { $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); if (($sarg =~ /^h/i)||($sarg eq '?')) { give_help(); pgm_exit(0,"Help exit(0)"); } elsif ($sarg =~ /^v/) { if ($sarg =~ /^v.*(\d+)$/) { $verbosity = $1; } else { while ($sarg =~ /^v/) { $verbosity++; $sarg = substr($sarg,1); } } prt("Verbosity = $verbosity\n") if (VERB1()); } elsif ($sarg =~ /^B/) { $tmp = 1; while ($sarg =~ /^[a-zA-Z]+/) { $sarg = substr($sarg,1); } if ($sarg =~ /^\d+$/) { $tmp = $sarg; } $load_browser = $tmp; prt("Set to load browser at end to $load_browser.\n") if (VERB1()); } elsif ($sarg =~ /^l/) { $load_log = 1; prt("Set to load log at end.\n") if (VERB1()); } elsif ($sarg =~ /^r/) { $recursive = 1; prt("Set recrusive to $recursive.\n") if (VERB1()); } elsif ($sarg =~ /^a/) { $html_only = 0; prt("Set load ALL files.\n") if (VERB1()); } elsif ($sarg =~ /^d/) { $desccol = 1; need_arg(@av); shift @av; $sarg = $av[0]; load_descriptions($sarg); prt("Set to add a description column.\n") if (VERB1()); } elsif ($sarg =~ /^b/) { $blank = 1; prt("Set to add _blank to href.\n") if (VERB1()); } elsif ($sarg =~ /^o/) { need_arg(@av); shift @av; $sarg = $av[0]; $out_xml = $sarg; prt("Set out file to [$out_xml].\n") if (VERB1()); } elsif ($sarg =~ /^c/) { need_arg(@av); shift @av; $sarg = $av[0]; if (($sarg =~ /^\d+$/) && ($sarg ne '0')) { $colcount = $sarg; prt("Set column count to [$colcount].\n") if (VERB1()); } else { pgm_exit(1,"ERROR: Column count must be 1 to nn! Not [$arg]!\n"); } } elsif ($sarg =~ /^s/) { $add_sorted = 1; prt("Add alpha sorted links.\n") if (VERB1()); } elsif ($sarg =~ /^w/) { $add_table_width = 1; prt("Add table 'width=\"100%\".\n") if (VERB1()); } elsif ($sarg =~ /^x/) { need_arg(@av); shift @av; $sarg = $av[0]; @arr = split(':',$sarg); foreach $tmp (@arr) { push(@excluded,$tmp); prt("Exclude file/files matching $tmp\n") if (VERB1()); } } elsif ($sarg =~ /^X/) { need_arg(@av); shift @av; $sarg = $av[0]; if (-f $sarg) { if (open FIL, "<$sarg") { @arr = ; close FIL; foreach $tmp (@arr) { chomp $tmp; $tmp = trim_all($tmp); $len = length($tmp); next if ($len == 0); next if ($tmp =~ /^\#/); push(@excluded,$tmp); prt("Exclude file/files matching $tmp\n") if (VERB1()); } } else { pgm_exit(1,"ERROR: Unable to 'open' file [$sarg]! Check name, location, spelling.\n"); } } else { pgm_exit(1,"ERROR: Unable to 'stat' file [$sarg]! Check name, location, spelling.\n"); } } else { pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n"); } } else { $in_dir = $arg; prt("Set input to [$in_dir]\n") if (VERB1()); if (! -d $in_dir) { pgm_exit(1,"ERROR: Unable to find in directory [$in_dir]! Check name, location...\n"); } } shift @av; } if ((length($in_dir) == 0) && $debug_on) { $in_dir = $def_file; prt("Set DEFAULT input to [$in_dir]\n"); } if (length($in_dir) == 0) { pgm_exit(1,"ERROR: No input directory found in command!\n"); } if (! -d $in_dir) { pgm_exit(1,"ERROR: Unable to find in directory [$in_dir]! Check name, location...\n"); } } sub give_help { prt("$pgmname: version $VERS\n"); prt("Usage: $pgmname [options] in-directory\n"); prt("Options:\n"); prt(" --help (-h or -?) = This help, and exit 0.\n"); prt(" --Browser (-B) = Load resulting HTML in a browser - (def=on) -B0 for off\n"); prt(" --verb[n] (-v) = Bump [or set] verbosity. def=$verbosity\n"); prt(" --load (-l) = Load LOG at end. ($outfile)\n"); prt(" --out (-o) = Write output to this file.\n"); prt(" --recursive (-r) = Recurse into subdirectories. (def=$recursive)\n"); prt(" --all (-a) = Include ALL files. Default is just html only like files. (def=$html_only)\n"); prt(" --blank (-b) = Add target=\"_blank\" to href. (def=$blank)\n"); prt(" --cols n (-c) = Set column count. (def=$colcount)\n"); prt(" --desc file.csv (-d) = Add description column, from csv file. (def=$desccol)\n"); prt(" --sort (-s) = Add alpha sorted jump list. (def=$add_sorted)\n"); prt(" --width (-w) = Add table `width=\"100%\"`. (def=$add_table_width)\n"); prt(" --xclude nm1[:nm2] (-x) = Exclude matching files. Can be ':' sep. list and wild, *,?.\n"); prt(" --Xclude file (-X) = Exclude file list, line separated. Lines begin # ignored\n"); prt(" Will scan the input directory, and build a $out_xml html file.\n"); } # eof - genindex04.pl