chklinks03.pl to HTML.

index -|- end

Generated: Sat Oct 24 16:35:11 2020 from chklinks03.pl 2019/05/08 112.5 KB. text copy

#!/perl -w
# NAME: chklinks03.pl
# AIM: Given a input FOLDER, check all the HTML found for a <a href="...."
# reference and make sure that reference EXISTS, either as a LOCAL file,
# or that an IP address can be obtained for the HOST if http://<something> ...
# AND check ALL image links <img src="..."...>, if it is a LOCAL file,
# and other 'link' items, like .zip, .txt, etc.
# 2019-05-06 - Review
# 17/10/2013 - Default is to follow page links. Added -nofollow to only do the page
# 22/05/2011 - More tidying, reducing output to 'essentials' unless -v2 or more
# 19/05/2011 - Fix of 20100312 fix, to find substitutes '.txt' files
# 02/04/2010 - Some more 'fixes' - continue 2010/04/04 - 2010/04/12...
# 2010/04/01 - Fix for 'remove_script' - to move to htmltools.pl
# 15/03/2010 - After a number of 'fixes', starting to feel SOLID ;=))
# 12/03/2010 - Special case - I have replaced large ZIPS, with a TEXT
# file, so do NOT give this WARNING, if such a .txt file exists.
# 11/03/2010 - Lots of tidying, and fixing... experimented with another collecthrefs2
# but found only difference is in the DUPLICATE that can be in htmltools::collecthrefs
# 13/11/2008 - some tidying, especially when from ONE FILE
# where there will be NO from|to references
# 18/06/2007 - Add some command parameters, and help
# 02/06/2007 - geoff mclane - geoffair.com/mperl/index.htm
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use File::Spec; # File::Spec->rel2abs($rel); # get ABSOLUTE form
use Socket;
use Cwd;    # for cwd()
my $os = $^O;
unshift(@INC, 'C:/GTools/perl');
# for htmltools, if functions used
my @imgs = ();
my @hrefs = ();
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n";
# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /\w{1}:\\.*/) {
    my @tmpsp = split(/\\/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $perl_base = 'C:/GTools/perl';
my $outfile = "$perl_base/temp.$pgmname.txt";
open_log($outfile);

my $currworkdir = cwd();
my $out_hrefs = $perl_base."\\temphrefs.txt";
my $out_hosts = $perl_base."\\temphosts.txt";
my $out_domains = $perl_base."\\tempdomains.txt";

# some FEATURES and USER variables
# my @excludes = qw( cvineng2.htm );
my @excludes = qw( desktop.ini php.ini blank.html blank.htm index3.htm );
my @splexcludes = qw( macpc tmp wedding );
my @def_indexes = qw( default.html default.htm  index.aspx
   index.htm index.html index.jsp index.php index.shtml  home.htm
   home.html welcome.html welcome.htm );

my $recurse = 1;        # recursive # follow links to other files, and check them also
my $ignfpd = 1;         # ignore FRONTPAGE folders
my $chkip = 0;          # check the IP address
my $check_host_ip = 0;  # check the host domain only IP
my $load_log = 0;       # load log file at end -ll to set.
my $show_no_index = 0;  # show when NOT html in folder, OR no 'index' type file
my $showhreflinks = 0;    # show a WARNING when an IMG, ICO, etc is a REMOTE link
my $showlinks = 0;    # show the links for each file
my $shownolinks = 1;    # show NO links for each file
my $showscripts = 0;    # show SCRIPT files
my $writeips = 1;       # write IP found to a file
my $refreships = 0;    # if $chkip, and $writeips, re-write NEW check file
my $shownohrefs = 0;    # show when NO HREF found in file
my $show_all_not_found = 0; # avoid repeating an image file not found
my $not_found_file = '';    # if file name given, write list to that file...
my $out_remote_links = 0;   # potential BIG LIST of remote LINKS
my $in_folder = "";

my $ipfile = "iplinks.txt";
my @ipsfound = ();
my $verbosity = 0;

my @html_ext = qw( .htm .html .shtml .php );
my @graf_ext = qw( .jpg .jpeg .gif .png .bmp .ico .mpg );
my @css_ext  = qw( .css );
my @script_ext = qw( .js .class .cgi );
my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived );
my @g_xclude_dir = ();
my @excused = ( '?dir=test', '?dir=.' );

# *** DEBUG ***
my $debug_on = 0;      # OFF for release - only to load default
# SET A DEFAULT INPUT FOLDER / FILE
##my $def_file = "C:\\HOMEPAGE\\FG\\index.html";
my $def_file = "";
my $def_in_folder = 'F:\Projects\html-tidy.org.api\tidy\tidylib_api_next';

# program variables
# NOTE: Each of these is a multidimensional array - see offset below
my @htm_files = ();    # store files found in folder
my @img_files = ();
my @css_files = ();
my @zip_files = ();
my @txt_files = ();
my @script_files = ();
my @other_files = ();
my @g_empty_folders = ();
my @g_folders_noind = ();
my %g_images_notfound = ();
my %g_dir_files = ();   # store files %hash->dir->%type->@files
my $g_active_file = '';
my $g_active_lnn  = '';
my $g_user_file = '';
my $single_file = 0;

# offsets in above arrays
my $of_ff = 0;    # full file name
my $of_hr = 1;    # array ref of href links
my $of_im = 2;    # array ref of image links
my $of_lk = 3;    # linked count
my $of_rh = 4;    # ref hash from 'collecthrefs2'
my $of_to = 5;    # links TO, but this is really already in $of_hr, after excluding in-file, and external links
my $of_fm = 6;  # links FROM
my $of2hr = 7;  # more or less duplicate of $of_hr
my $of2im = 8;  # more or less duplicate of $of_im
my $of_s1 = 9; # spare 1
my $of_s2 = 10; # spare 2
#                    0    1   2   3  4  5   6   7  8  9  10
# push(@{$arr_ref}, [$ff, '', '', 0, 0, '', '', 0, 0, 0, 0] );

my @donesrcs = ();
my @doneimgs = ();

my %ext_hash = ();
##my $cnt = 0;
##my $file = '';
my @warnings = ();    # list of errors, warnings during running
my @httprefs = ();    # set of HREF src values push(@httprefs, [$src, $fil, $lnnos] );
my @httpsrefs = ();
my @ftprefs = ();
my @mtrefs = ();
my %g_hrefs = ();   # full list of ALL http items found
my %g_hosts = ();   # list of http://domain.names
my %g_domains = (); # just domain names
my @scripts = ();
my $tot_imgcnt = 0;
my $homefile = '';
my $total_hrefs = 0;
my $total_imgs = 0;
my @missed = ();
my $excusecnt = 0;
my $hrflnkcnt = 0; # $showhreflinks
my $homeoffset = -1;
my @offsdone = ();
my @htmlinks = ();
my @missing_links = (); # store files with NO links

# debug only bits
my $dbg1 = 0; # prt( "[dbg1] Processed $inf folder finding $fcnt HTML files ...\n" ) if ($dbg1);
my $dbg1b = 0; # prt( "[dbg1b] Processing [$inf] folder...\n" ) if ($dbg1b);
my $dbg2 = 0;    # show ALL HREF entries ...
my $dbg3 = 0;    # show IP found ...
my $dbg4 = 0;    # show entered/exit script
my $dbg5 = 0;    # show 'ok' when found
my $dbg6 = 0;    # show processing lines
my $dbg7 = 0;    # show anchor count
my $dbg8 = 0;    # show unique anchor href
my $dbg9 = 0;    # show files with SCRIPTS
my $dbg10 = 0;    # show diag for get_img_srcs() ...
my $dbg11 = 0;    # in image processing show entered/exits script 
my $dbg12 = 0;    # in image processing show processing count
my $dbg13 = 0;    # in image processing show ok - found file
my $dbg14 = 0;    # in image processing show image count found
my $dbg15 = 0;    # in image processing show image count when NONE found
my $dbg16 = 0;    # show WARNINGS during run ...
my $dbg17 = 0;    # show MISSING or BLANK HREF in PHP file
my $dbg18 = 0;    # check_linkages: show 'ok', in 2nd link check
my $dbg19 = 0;    # check_local_links: show progress ...
my $dbg20 = 0;    # check_local_links: show ALL link COUNTS - NONE IS ALWAYS SHOWN ...
my $dbg21 = 0;    # check_local_links: show LINK when found ...
my $dbg22 = 0;    # mark_image_link: show comparing, and comparision ...
my $dbg23 = 0;    # mark_image_link: show count of new images marked ...
my $dbg24 = 0;    # show each image file being marked
my $dbg25 = 0;    # prt( "[dbg25] NO LINK FOUND HREF [$src]($msrc) in $totcnt file - $ff ($fnd) - ($lev)!\n" ) if ($dbg25);
my $dbg26 = 0;    # show EACH HTML FILE BEING PROCESSED
my $dbg27 = 0;    # show EACH extesnions, and counts
my $dbg28 = 0;    # show image links information ...
my $dbg29 = 0;    # show ZIP, TXT, CSS, SCRIPT and OTHER file links information ...
my $dbg30 = 0;    # show HTML HREF links information ...
my $dbg31 = 0;    # like $dbg20 - check_local_links: show ALL links - NONE IS ALWAYS SHOWN ...
my $dbg32 = 0;    # show missing during processing
my $dbg33 = 0;    # show HAS NO LINKS during processing
my $dbg34 = 0; # show prt( "REL PATH [$src] to UNIX PATH [$nusrc]
my $dbg35 = 0; # prt("[dbg35] Got anchor [$hrf]($len2) ...\n") if ($dbg35);
my $dbg36 = 0; # prt("[dbg36] Got file [$fil], with ext = [$ext] = $type\n") if ($dbg36);
my $dbg37 = 0; # prt("[dbg37] File [$fil] EXCLUDED!\n") if ($dbg37);
my $dbg38 = 0; # prt( "Got [$hr2] = [$txt]\n" );
my $dbg39 = 0; # prt( "[dbg39] Got [$hr2] = [$txt] - no inverted commas! [$fil]\n" ) if ($dbg39);
my $dbg40 = 0; # prt("[dbg40] mark_link fmfil=[$fmfil] fnd=[$fnd] src=[$src] lenv=[$lev]\n");
my $dbg41 = 0; # prt( "[dbg41] $i2: compare \n[$mfil] with \n[$msrc]\n" ) if ($dbg41);
my $dbg42 = 0; # prt("[dbg42] Processing file [$file]...\n") if ($dbg42);
my $dbg43 = 0; # prt("[dbg43] Got ".scalar @hr." hrefs, and ".scalar @is." image links.\n") if ($dbg43);
my $dbg44 = 0; # prt("[dbg44] Sending [$src] to 'mark_dir_link'...\n") if ($dbg44);
my $dbg45 = 0; # prt("[dbg45] LINK not found in HREF files, maybe IMAGES, zip, etc TO=[$src], FROM=[$fmfil]\n") if ($dbg45);
my $dbg46 = 0; # prt("[dbg46] LINK found, but not in HREF files! TO=[$src], FROM=[$fmfil]\n") if ($dbg46);

sub set_debug_val($) {
   my ($v) = shift;
   $dbg1 = $v$dbg1b = $v; $dbg2 = $v$dbg3 = $v$dbg4 = $v$dbg5 = $v$dbg6 = $v;
   $dbg7 = $v$dbg8 = $v$dbg9 = $v$dbg10 = $v; $dbg11 = $v; $dbg12 = $v;
   $dbg13 = $v; $dbg14 = $v; $dbg15 = $v; $dbg16 = $v; $dbg17 = $v; $dbg18 = $v;
   $dbg19 = $v; $dbg20 = $v; $dbg21 = $v; $dbg22 = $v; $dbg23 = $v; $dbg24 = $v;
   $dbg25 = $v; $dbg26 = $v; $dbg27 = $v; $dbg28 = $v; $dbg29 = $v; $dbg30 = $v;
   $dbg31 = $v; $dbg32 = $v; $dbg33 = $v; $dbg34 = $v; $dbg35 = $v; $dbg36 = $v;
   $dbg37 = $v; $dbg38 = $v; $dbg39 = $v; $dbg40 = $v; $dbg41 = $v; $dbg42 = $v;
   $dbg43 = $v; $dbg44 = $v; $dbg45 = $v; $dbg46 = $v;
}

sub set_debug_on() { set_debug_val(1); }
sub set_debug_off() { set_debug_val(0); }

# forward refs
sub mark_link($$$$);
sub trace_from_htm($$);
sub process_folder($);
sub is_same_file($$);
sub mark_dir_link($$$$);
sub is_htm_file_ext($);

sub pgm_exit($$) {
   my ($val,$msg) = @_;
   if (length($msg)) {
      $msg .= "\n" if !($msg =~ /\n$/);
      prt($msg);
   }
   #show_warnings($val);
   close_log($outfile,$load_log);
   exit($val);
}

sub show_warnings() {
    my $wcnt = scalar @warnings;
    if ($wcnt) {
        prt( "\nWARNINGS FOLLOW ($wcnt):\n" );
        foreach my $w (@warnings) {
            prt( "$w\n" );
        }
    } else {
        ##### prt( "No warnings ...\n" );
    }
}

sub prtw($) {
    my ($tx) = shift;
    $tx =~ s/\n$//;
    prt("$tx\n");
    push(@warnings,$tx);
}

sub u2d {
    my ($f) = shift;
    $f =~ s/\//\\/g;
    return $f;
}

sub VERB1() { return ($verbosity >= 1); }
sub VERB2() { return ($verbosity >= 2); }
sub VERB5() { return (($verbosity >=5) ? 1 : 0); }
sub VERB9() { return (($verbosity >=9) ? 1 : 0); }

sub sub_common_folder_dos {
    my ($f1, $f2) = @_;
    my $df1 = u2d($f1);
    my $df2 = u2d($f2);
    if ($os =~ /Win/) {
        $df1 = lc($df1);
        $df2 = lc($df2);
   }
   # paddle across, stopping at first difference
    my $off = 0;
    while ( substr($df1,$off,1) && substr($df2,$off,1) &&
            ( substr($df1,$off,1) eq substr($df2,$off,1) ) ) {
        $off++;
    }
    return substr($f1,$off);
}

sub sub_in_folder($) {
    my ($path) = shift;
    $path = sub_common_folder_dos($path,$in_folder);
    $path =~ s/^(\\|\/)//; # kick off any leading '\' or '/' - 2010-04-02
    return $path;
}

# 2010.03.16 - seek an 'index.htm' type file
sub is_def_index_file($) {
   my ($fil) = @_;
   foreach my $f (@def_indexes) {
      return 1 if ($f eq $fil);
   }
   return 0;
}

# 20100311 - only report MISSING LINKS on HTML files
sub is_link_type_file($) {
   my ($f) = @_;
    my ($n,$d,$e) = fileparse( $f, qr/\.[^.]*/ );
   return 1 if (is_htm_ext($e));
   return 0;
}

sub show_startup() {
    prt( "Checking $in_folder ..." );
    prt( " Using HOME file $homefile ..." ) if length($homefile);
    prt("\n");
    if (VERB5()) {
        prt( "\nOptions:\n" );
        prt( sprintf(" -checkips      - HREF check %s, \n", ($chkip ? "On" : "Off")) );
        prt( sprintf(" -showhreflinks - Show HREF %s, \n", ($showhreflinks ? "On" : "Off")) );
        prt( sprintf(" -showlinks     - Show links %s, \n", ($showlinks ? "On" : "Off")) );
        prt( sprintf(" -showscripts   - Show script files %s, \n", ($showscripts ? "On" : "Off")) );
        prt( sprintf(" -writeips      - Write HREF $ipfile %s, \n", ($writeips ? "On" : "Off")) );
        prt( sprintf(" -refreships    - Refresh HREF %s\n", ($refreships ? "On" : "Off")) );
        prt( sprintf(" -shownohrefs   - Show NO HREF found %s\n", ($shownohrefs ? "On" : "Off")) );
        prt( sprintf(" -nofollow      - Follow page links found %s\n", ($recurse ? "On" : "Off")) );
        if (@excludes) {
            prt( "Have ".scalar @excludes." excluded files - " );
            foreach my $ex (@excludes) {
                prt( "$ex " );
            }
            prt("\n");
        }
    }
    ###pgm_exit(1,"TEMP EXIT(1)");
}

# if (!is_in_array_ref0($tmp, \@hr))
sub is_in_array_ref0($$) {
   my ($test,$ra) = @_;
   my $lct1 = lc($test);
   my $ac = scalar @{$ra};
   for (my $i = 0; $i < $ac; $i++) {
      my $rt = ${$ra}[$i]; # extract array ref
      my $t = ${$rt}[0];   # extract HREF
      my $lct2 = lc($t);
      return 1 if ($lct1 eq $lct2);
   }
   return 0;
}

sub is_in_array_ref($$) {
   my ($test,$ra) = @_;
   my $lct1 = lc($test);
   my $ac = scalar @{$ra};
   for (my $i = 0; $i < $ac; $i++) {
      my $t = ${$ra}[$i]; # extract item
      my $lct2 = lc($t);
      return 1 if ($lct1 eq $lct2);
   }
   return 0;
}

sub get_anchor_hash_ref($$$) {
   my ($fank,$fil,$dbg) = @_;
   my %hash = ();
   my ($ank,$len,$i,$ch,$pc,$hr2,$txt);
   if ($fank =~ /<a\s+(.+)>$/) {
      $ank = trim_all($1);
      $len = length($ank);
      $ch = '';
      $hr2 = '';
      for ($i = 0; $i < $len; $i++) {
         $pc = $ch;
         $ch = substr($ank,$i,1);
         if ($ch =~ /\w/) {
            $hr2 .= $ch;   # accumulate \w chars - alphanumeric, including _
         } elsif (length($hr2)) {
            if (($ch ne '=') && ($ch =~ /\s/)) {
               $i++;
               for (; $i < $len; $i++) {
                  $ch = substr($ank,$i,1);
                  last if ($ch eq '=');
                  last if !($ch =~ /\s/);
               }
            }
            if ($ch eq '=') {
               # found our equal sign
               $i++; # move on...
               for (; $i < $len; $i++) {
                  $ch = substr($ank,$i,1);
                  last if ($ch =~ /('|")/);
                  last if !($ch =~ /\s/);
               }
               if (($ch eq '"')||($ch eq "'")) {
                  $pc = $ch;
                  $i++; # move on...
                  $txt = '';
                  for (; $i < $len; $i++) {
                     $ch = substr($ank,$i,1);
                     last if ($ch eq $pc);
                     $txt .= $ch;
                  }
                  if ($ch eq $pc) {
                     $hash{$hr2} = $txt;
                     prt( "[dbg38] Got [$hr2] = [$txt] [$fil]\n" ) if ($dbg38);
                  } else {
                     prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO END INVERTED COMMA! ($pc) [$fil]\n");
                     pgm_exit(1,"") if ($dbg);
                  }
               } else {
                  if (($ch =~ /\w/) && (($hr2 =~ /name/i)||($hr2 =~ /id/i))) {
                     # accept these WITHOUT inverted comma
                     $txt = $ch;
                     $i++; # MOVING ON
                     for (; $i < $len; $i++) {
                        $ch = substr($ank,$i,1);
                        last if !($ch =~ /\w/);
                        $txt .= $ch;
                     }
                     $hash{$hr2} = $txt;
                     prt( "[dbg39] Got [$hr2] = [$txt] - no inverted commas! [$fil]\n" ) if ($dbg39);
                  } else {
                     prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO START INVERTED COMMA! [$fil]\n");
                     pgm_exit(1,"") if ($dbg);
                  }
               }
            } else {
               prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO EQUAL SIGN! [$fil]\n");
               pgm_exit(1,"") if ($dbg);
            }
            $hr2 = '';
         }
      }
   }
   return \%hash;
}

# Collect HREF anchors from a TEXT stream
# 25/07/2007 - Skip over comments <!-- to -->
# return a reference array
sub collecthrefs2($$) {
    my ($txt,$fil) = @_;
    my $ntxt = '';
    my $len = length($txt);
    my ($ch,$pc);
    my $hrf = '';
    my ($i,$j,$len2,$hr2,$hfile);
   my @hr = ();
   my @nr = ();
    for ($i = 0; $i < $len; $i++) {
        $ch = substr($txt,$i,1);
        if ($ch eq '<') {
            $hrf = $ch;    # start a tag
            $i++;
         # go to the end of that tag
            for ( ; $i < $len; $i++) {
                $ch = substr($txt,$i,1);
                $hrf .= $ch;
                # 20100312 and 25/07/2007 watch OUT for COMMENTS - skip these
                if ($ch eq '-') {
                    if ($hrf eq '<!--') {
                        # we have START of a COMMENT - YUK!!!
                        $i++;    # move to NEXT, and EAT ALL THE COMMENT
                        for ( ; $i < $len; $i++) {
                            $ch = substr($txt,$i,1);
                            $hrf .= $ch;
                            if ($ch eq '>') {
                        # potential end, but ONLY if '-->' found
                                if ($hrf =~ /-->$/) {
                                    last;
                                }
                            }
                        }
                    }
                }
                if ($ch eq '>') {
                    last;
                }
            }
            if ($hrf =~ /^<a\s+/i) {
            # Got an ANCHOR - process it
            my $ahr = get_anchor_hash_ref($hrf,$fil,0); # 1 to exit on problem
            $len2 = length($hrf);
                prt("[dbg35] got anchor [$hrf]($len2)... file [$fil]\n") if ($dbg35);
            $hr2 = '';
            $hfile = '';   # href="file", if any
            # process 'anchor', finding 'href="something";
            for ($j = 0; $j < $len2; $j++) {
               $ch = substr($hrf,$j,1);
               if ($ch =~ /\w/) {
                  $hr2 .= $ch;   # accumulate \w chars - alphanumeric, including _
               } else {
                  # NOT alphanumeric, including _, so check what we have
                  if ($hr2 =~ /^href$/i) {
                     # got a HREF entry
                     # not sure this is CORRECT, but continue to find an equal, but ONLY if space
                     if (($ch ne '=')&&($ch =~ /\s/)) {
                        $j++;
                        for (; $j < $len2; $j++) {
                           $ch = substr($hrf,$j,1);
                           last if ($ch eq '=');
                           last if !($ch =~ /\s/); # ONLY eat spaces
                        }
                     }
                     if ($ch eq '=') {
                        $j++;
                        # only eat SPACES
                        for (; $j < $len2; $j++) {
                           $ch = substr($hrf,$j,1);
                           last if ($ch =~ /('|")/);
                           last if (!($ch =~ /\s/));
                        }
                        if (($ch =~ /('|")/) && ($j < $len2)) {
                           $pc = $ch;
                           $j++;    # bump PAST this 'inverted' comma
                           $hfile = ''; # clear the LINK name, file or http reference
                           for (; $j < $len2; $j++) {
                              $ch = substr($hrf,$j,1);
                              last if ($ch eq $pc);
                              $hfile .= $ch;
                           }
                           if (($ch eq $pc) && ($j < $len2)) {
                              # ok, it can be added
                              if (is_in_array_ref0($hfile, \@hr)) {
                                 # already in list
                                 $hfile = '';
                              } else {
                                     prt("[dbg35] Added HREF [$hfile]...\n") if ($dbg35);
                              }
                           } else {
                              prt("PROBLEM: No NEXT inverted comma ($pc) [$hrf]! [$fil]\n");
                              pgm_exit(1,"ERROR EXIT 1");
                           }
                        } else {
                           prtw("WARNING: No inverted comma [$hrf]! [$fil]\n");
                           $hfile = ''; # clear the LINK name, file or http reference
                           for (; $j < $len2; $j++) {
                              $ch = substr($hrf,$j,1);
                              last if (($ch =~ /\s/)||($ch eq '>'));    # stop only on NON SPACE, or '>'
                              $hfile .= $ch;
                           }
                           #pgm_exit(1,"ERROR EXIT 2");
                        }
                     } else {
                        prt("PROBLEM: No equal sign [$hrf]! [$fil]\n");
                        pgm_exit(1,"ERROR EXIT 3");
                     }
                     last; # end this scan of the anchor
                  }
                  $hr2 = '';
               }
            }
            if (length($hfile)) {
               push(@hr, [$hfile, $hrf, 0, '', $ahr]) ;
            } else {
               push(@nr, [$hfile, $hrf, 0, '', $ahr]) ;
            }
            }
        }
    }
    prt( "[dbg35] Collected ". scalar @hr . " HREF... [$fil]\n" ) if ($dbg35);
   my %h = ();
   $h{'href'} = \@hr;
   $h{'name'} = \@nr;
    return \%h;
}

sub secs_2_hhmmss($) {
    my ($secs) = shift;
    my $rt = '';
    my $mins = int($secs / 60);
    $secs = $secs - ($mins * 60);
    $secs = (int(($secs * 10) + 0.5)) / 10;
    if ($mins > 60) {
        my $hrs = int($mins / 60);
        $mins = $mins - ($hrs * 60);
        $mins = '0'.$mins if ($mins < 10);
        $secs = '0'.$secs if ($secs < 10);
        $rt = "$hrs:$mins:$secs";
    } else {
        $mins = '0'.$mins if ($mins < 10);
        $secs = '0'.$secs if ($secs < 10);
        $rt = "$mins:$secs";
    }
    return $rt;
}

sub show_time($$$$) {
    my ($totcnt, $lncnt, $bgntime, $msg) = @_;
    my ($currtime, $difftime, $persec, $remains, $remsecs, $tenths, $remtm, $elapsed);
    $currtime = time();
    $difftime = $currtime - $bgntime;
    $persec = $lncnt / $difftime;
    $remains = $totcnt - $lncnt;
    $remsecs = $remains / $persec;
    $tenths = (int(($persec * 100) + 0.05)) / 100;
    $remtm = secs_2_hhmmss($remsecs);
    $elapsed = secs_2_hhmmss($difftime);
    prt( "$elapsed Done $lncnt, at $tenths/sec, left $remains in $remtm - $msg\n" );
}

sub test_remove_script {
    my ($txt) = shift;
    my $ntxt = '';
    my $len = length($txt);
    my ($i,$ch,$pc,$bal,$ppc);
    $pc = '';
    $ch = '';
    for ($i = 0; $i < $len; $i++) {
        $ppc = $pc;
        $pc = $ch;
        $ch = substr($txt,$i,1);
        if (($ch eq '<')&&(substr($txt,$i) =~ /^<script/i)) {
            # start of <script
            $i++;
            for (; $i < $len; $i++) {
                $ppc = $pc;
                $pc = $ch;
                $ch = substr($txt,$i,1);
                last if ($ch eq '>');
            }
            $i++;
            # /*<![CDATA[*/ ... ONLY ]]> to close
            for (; $i < $len; $i++) {
                $ppc = $pc;
                $pc = $ch;
                $ch = substr($txt,$i,1);
                if ($ch eq '<') {
                    $bal = substr($txt,$i);
                    if ($bal =~ /^<\/script>/i) {
                        $i += 8;
                        last;
                    } elsif ($bal =~ /^<\!\[CDATA\[/) {
                        #pgm_exit(1,"Got CDATA\n");
                        $i += 9;    # eat CDATA
                        for (; $i < $len; $i++) {
                            $ppc = $pc;
                            $pc = $ch;
                            $ch = substr($txt,$i,1);
                            last if (($ch eq '>')&&($pc eq ']')&&($ppc eq ']'));
                        }
                    }
                }
            }
        } else {
            $ntxt .= $ch;
        }
    }
    return $ntxt;
}

sub dropscripts_from_array {
    my (@arr) = @_;
    my $txt = '';
    foreach my $ln (@arr) {
        chomp $ln;
        $txt .= ' {=*==*=} ' if (length($txt));
        $txt .= $ln;
    }
    $txt = test_remove_script( $txt );
    @arr = split( / \{=\*==\*=\} /, $txt );
    return @arr;
}

# ========================================================
# sub process_file_array() {
# uses global @htm_files array
#
# Primary process of EACH file found in the directories,
# and anchors are extracted and kept.
# $htm_files[$i][$of_hr] = \@hr;   # like $of_to, but is ALL HREF links
# $htm_files[$i][$of_im] = \@is;
# ========================================================
sub process_file_array() {
    my $max = scalar @htm_files;
    my $bgntime = time();
    my $msg = '';
    my $pmsg = '';
    my ($file,$cnth,$cntn,$refa,$sfil,$pcnt);
    #my $test_file = 'maroc-image1';
    #my $test_file = 'maroc-slide4';
    $pcnt = 0;
    prt("[v5] Process file array, for $max html files...\n") if (VERB5());
    for (my $i = 0; $i < $max; $i++) {
        $file = $htm_files[$i][$of_ff];
        $sfil = sub_in_folder($file);
        $pmsg = "[v5] File [$sfil]";
        prt("[dbg42] Processing file [$file]...\n") if ($dbg42);
        my ($nm,$dir,$ext) = fileparse( $file, qr/\.[^.]*/ );
        my $htot = 0;   # start anchor/href counter
        my $itot = 0;   # start image counter
        $pcnt++;
        if (open INF, "<$file") {
            my @lines = <INF>;
            close INF;
            @lines = drop_php_from_array( @lines ) if (lc($ext) eq '.php');
            # THIS IS USING htmltool.pl - get a single line of TEXT ...
            my $rawtxt = join( '', @lines ); # get whole text
            #my $ntxt = remove_script( $rawtxt );   # remove SCRIPTS <script...> ... ... ... </script>
            my $ntxt = test_remove_script( $rawtxt );   # remove SCRIPTS <script...> ... ... ... </script>
            my @is = ret_imgs_array($ntxt);
            $ntxt = trimblanklines($ntxt);
            @hrefs = (); # clear
            my @hr = ret_hrefs_array( $ntxt );
            my $rhash = collecthrefs2( $ntxt, $file );
            ### collecthrefs( $txt, 0 );
            ### collectimgs( $txt, 0 );
            # bump the counts of HREF and IMGS collected
            $itot = scalar @is;
            $htot = scalar @hr;
            # store the references ... that is a reference to an array
            $htm_files[$i][$of_hr] = \@hr;   # like $of_to, but is ALL HREF links
            $htm_files[$i][$of_im] = \@is;
            $htm_files[$i][$of_rh] = $rhash# store the reference hash to 2nd ref arrays 'href' & 'name'
            $refa = ${$rhash}{'href'};
            $cnth = scalar @{$refa};
            $refa = ${$rhash}{'name'};
            $cntn = scalar @{$refa};
            $pmsg .= " a=$htot, i=$itot, h=$cnth, n=$cntn";

            # new code to do some similar things, but while in the array
            # BUT IS REALLY JUST A DUPLICATE OF THE ABOVE
            # one day should choose ONE or the OTHER ONLY ;=)) it just wastes time
            ###@lines = drop_php_from_array( @lines ) if (lc($ext) eq '.php');
            #write2file((join("\n",@lines)."\n"),'tempbefore.htm');
            @lines = dropcomments_from_array(@lines);
            # 20100313 *MUST* drop scripts from line array, before partying
            #write2file((join("\n",@lines)."\n"),'tempbefore2.htm');
            @lines = dropscripts_from_array(@lines);
            #write2file((join("\n",@lines)."\n"),'tempafter.htm');
            #pgm_exit(1,"Check tempafter.htm");
            my @isrcs = get_img_srcs($file, @lines);
            $tot_imgcnt += check_images( $file, @isrcs );
            my @hsrcs = get_href_srcs($file, @lines);
            check_hrefs( $file, \@hsrcs );
            $htm_files[$i][$of2hr] = \@hsrcs;   # really just duplicate...
            $htm_files[$i][$of2im] = \@isrcs;   # should be removed!!!
            prt("[dbg43] Got $htot hrefs (h=$cnth,n=$cntn)(".scalar @hsrcs."), and $itot image links (".scalar @isrcs.").\n") if ($dbg43);
            $pmsg .= " h2=".scalar @hsrcs.", i2=".scalar @isrcs;
            prt("$pmsg\n") if (VERB5());
            #if ($file =~ /$test_file/) {
            #   pgm_exit(1,"CHECK THIS FILE [$ntxt]\n");
            #}
        }

        $total_hrefs += $htot;
        $total_imgs += $itot;
        if ((($pcnt % 100) == 0)||($max < 10)) {
            ###local $| = 1;
            ###prt( "\rDone $pcnt HTML files ..." );
            if ($max < 10) {
                prt( "Done $file HTML file ... href,other($htot,$itot)\n" );
            } else {
                $msg = "href,other ($total_hrefs,$total_imgs)";
                show_time( $max, $pcnt, $bgntime, $msg );
                #prt( "Done $pcnt HTML files ... href,other ($total_hrefs,$total_imgs)\n" );
            }
        }
    }
    prt( "Completed $pcnt HTML files ... Found $total_hrefs HREF, and $total_imgs IMG/OTHER tokens.\n" );
}

sub get_href_type($) {
    my ($src) = shift;
    if ($src =~ /^http:/i) {
        #push(@httprefs, [$src, $fil, $lnnos] );
        return 1; # remote HREF
    } elsif ($src =~ /^https:/i) {
        return 1; # remote HREF
        #push(@httpsrefs, [$src, $fil, $lnnos] );
    } elsif ($src =~ /^ftp:/i) {
        #push(@ftprefs, [$src, $fil, $lnnos] );
        return 3; # remote HREF
    } elsif ($src =~ /^mailto:/i) {
        #push(@mtrefs, [$src, $fil, $lnnos] );
        return 4; # remote HREF
    } elsif ( $src =~ /^javascript:/i ) {
        return 5; # a JAVASCRIPT HREF
    } elsif ($src =~ /^file:/i) {
        return 5; # remote HREF
    } elsif ( substr($src,0,1) eq '#') {
        # local in page HREF
        return 6;
    } else {
        my $ind = index($src,'#');
        $src = substr($src,0,$ind) if ( $ind != -1 );
        $ind = index($src,'?');
        $src = substr($src,0,$ind) if ( $ind != -1 );
        $src =~ s/\/$//;
        return 7 if (length($src));
    }
    return 0;
}

sub get_local_href($) {
    my ($src) = shift;
    my $ind = index($src,'#');
    $src = substr($src,0,$ind) if ( $ind != -1 );
    $ind = index($src,'?');
    $src = substr($src,0,$ind) if ( $ind != -1 );
    $src =~ s/\/$//;    # remove any TRAILING '/' char
    # 25/07/2007 - also 'convert' '%20' to space
    $src =~ s/%20/ /g;
    return $src;
}

sub dos_2_unix($) {
    my ($du) = shift;
    $du =~ s/\\/\//g;
    return $du;
}

### my @donesrcs = ();
sub in_done_srcs($) {
    my ($f) = shift;
    foreach my $fd (@donesrcs) {
        if ($fd eq $f) {
            return 1;
        }
    }
    return 0;
}
sub in_done_imgs($) {
    my ($f) = shift;
    foreach my $fd (@doneimgs) {
        if ($fd eq $f) {
            return 1;
        }
    }
    return 0;
}

sub fix_rel_unix_path($) {
    my ($path) = shift;
    $path = dos_2_unix($path);
    # pgm_exit(1,"ERROR: Passed PATH that starts relative! [$path]\n") if (($path =~ /^\.\./)||($path =~ /^\.(\\|\/)\.\./));
    my @a = split(/\//, $path);
    my $npath = '';
    my $max = scalar @a;
    my @na = ();
    for (my $i = 0; $i < $max; $i++) {
        my $p = $a[$i];
        if ($p eq '.') {
            # ignore this
        } elsif ($p eq '..') {
            if (@na) {
                pop @na;    # discard previous
            } else {
                prt( "WARNING: Got relative .. without previous!!! path=[$path]\n" );
            }
        } else {
            push(@na,$p);
        }
    }
    foreach my $pt (@na) {
        $npath .= "/" if length($npath);
        $npath .= $pt;
    }
    return $npath;
}

sub add_new_link($$) {
    my ($nlnk, $lnks) = @_;
    my @arr = split(',', $lnks);
    foreach my $lk (@arr) {
        if ($lk eq $nlnk) {
            return 0;
        }
    }
    return 1;
}

sub mark_image_link($$$$) {
    my ($fmfil, $fnd, $src, $lev) = @_;
    my $fcnt = scalar @img_files;
    my $msrc = lc(dos_2_unix($src));
    my $lnks = '';
    my ($val);
    prt( "[dbg22] Seeking [$msrc] in $fcnt images files ...\n" ) if ($dbg22);
    my ($i,$fil,$mfil);
    for ($i = 0; $i < $fcnt; $i++) {
        $fil = $img_files[$i][$of_ff];
        if (is_same_file($src,$fil)) {
            $val = $img_files[$i][$of_lk];
            $val++;
            # add image file linked to from what file
            $lnks = $img_files[$i][$of_fm];
            if (length($lnks) == 0) {
                $lnks = $fmfil;
            } elsif (add_new_link($fmfil, $lnks)) {
                $lnks .= ',';
                $lnks .= $fmfil;
            } else {
            $val--;  # reduce due to being a repeated link
         }
            $img_files[$i][$of_lk] = $val;
            $img_files[$i][$of_fm] = $lnks;
            prt( "[dbg28] IMG src in $fmfil, of $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg28);
            return 0;
        }
    }
    for ($i = 0; $i < $fcnt; $i++) {
        $fil = $img_files[$i][$of_ff];
        $mfil = lc(dos_2_unix($fil));
        prt( "[dbg22] Comparing to $mfil ...\n" ) if ($dbg22);
        if ($msrc eq $mfil) {
            $val = $img_files[$i][$of_lk];
            $val++;
            $img_files[$i][$of_lk] = $val;
            # add image file linked to from what file
            $lnks = $img_files[$i][$of_fm];
            if (length($lnks) == 0) {
                $lnks = $fmfil;
            } elsif (add_new_link($fmfil, $lnks)) {
                $lnks .= ',';
                $lnks .= $fmfil;
            }
            $img_files[$i][$of_fm] = $lnks;
            prt( "[dbg28] IMG src in $fmfil, of $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg28);
            return 0;
        }
    }
    prt( "[dbg21] $src - NOT FOUND![1]\n" ) if ($dbg21);

    return 1;
}

sub mark_other_links($$$$) {
    my ($fmfil, $fnd, $src, $lev) = @_;
    my $totcnt = 0;
    my $msrc = lc(dos_2_unix($src));
    my $fcnt = scalar @img_files;
    my $i = 0;
    my $lnks = '';
    my ($val);
    $totcnt += $fcnt;
    if (mark_image_link( $fmfil, $fnd, $src, $lev ) == 0) {
        return 0;
    }
    # maybe ZIP files
    $fcnt = scalar @zip_files;
    $totcnt += $fcnt;
    for ($i = 0; $i < $fcnt; $i++) {
        my $fil = $zip_files[$i][$of_ff];
        my $mfil = lc(dos_2_unix($fil));
        if ($msrc eq $mfil) {
            $val = $zip_files[$i][$of_lk];
            $val++;
            # add zip file linked to from what file
            $lnks = $zip_files[$i][$of_fm];
            if (length($lnks) == 0) {
                $lnks = $fmfil;
            } elsif (add_new_link($fmfil, $lnks)) {
                $lnks .= ',';
                $lnks .= $fmfil;
            } else {
            $val--;
         }
            $zip_files[$i][$of_lk] = $val;
            $zip_files[$i][$of_fm] = $lnks;
            prt( "ZIP link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29);
            return 0;
        }
    }
    # maybe TXT files
    $fcnt = scalar @txt_files;
    $totcnt += $fcnt;
    for ($i = 0; $i < $fcnt; $i++) {
        my $fil = $txt_files[$i][$of_ff];
        my $mfil = lc(dos_2_unix($fil));
        if ($msrc eq $mfil) {
            $val = $txt_files[$i][$of_lk];
            $val++;
            # add txt file linked to from what file
            $lnks = $txt_files[$i][$of_fm];
            if (length($lnks) == 0) {
                $lnks = $fmfil;
            } elsif (add_new_link($fmfil, $lnks)) {
                $lnks .= ',';
                $lnks .= $fmfil;
            } else {
            $val--;
         }
            $txt_files[$i][$of_lk] = $val;
            $txt_files[$i][$of_fm] = $lnks;
            prt( "TXT link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29);
            return 0;
        }
    }
    # maybe CSS files
    $fcnt = scalar @css_files;
    $totcnt += $fcnt;
    for ($i = 0; $i < $fcnt; $i++) {
        my $fil = $css_files[$i][$of_ff];
        my $mfil = lc(dos_2_unix($fil));
        if ($msrc eq $mfil) {
            $val = $css_files[$i][$of_lk];
            $val++;
            # add txt file linked to from what file
            $lnks = $css_files[$i][$of_fm];
            if (length($lnks) == 0) {
                $lnks = $fmfil;
            } elsif (add_new_link($fmfil, $lnks)) {
                $lnks .= ',';
                $lnks .= $fmfil;
            } else {
            $val--;
         }
            $css_files[$i][$of_lk] = $val;
            $css_files[$i][$of_fm] = $lnks;
            prt( "CSS link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29);
            return 0;
        }
    }
    # maybe SCRIPT files
    $fcnt = scalar @script_files;
    $totcnt += $fcnt;
    for ($i = 0; $i < $fcnt; $i++) {
        my $fil = $script_files[$i][$of_ff];
        my $mfil = lc(dos_2_unix($fil));
        if ($msrc eq $mfil) {
            $val = $script_files[$i][$of_lk];
            $val++;
            # add script file linked to from what file
            $lnks = $script_files[$i][$of_fm];
            if (length($lnks) == 0) {
                $lnks = $fmfil;
            } elsif (add_new_link($fmfil, $lnks)) {
                $lnks .= ',';
                $lnks .= $fmfil;
            } else {
            $val--;
         }
            $script_files[$i][$of_lk] = $val;
            $script_files[$i][$of_fm] = $lnks;
            prt( "SCRIPT link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29);
            return 0;
        }
    }
    # OK, OTHER
    $fcnt = scalar @other_files;
    $totcnt += $fcnt;
    for ($i = 0; $i < $fcnt; $i++) {
        my $fil = $other_files[$i][$of_ff];
        my $mfil = lc(dos_2_unix($fil));
        if ($msrc eq $mfil) {
            $val = $other_files[$i][$of_lk];
            $val++;
            # add script file linked to from what file
            $lnks = $other_files[$i][$of_fm];
            if (length($lnks) == 0) {
                $lnks = $fmfil;
            } elsif (add_new_link($fmfil, $lnks)) {
                $lnks .= ',';
                $lnks .= $fmfil;
            } else {
            $val--;
         }
            $other_files[$i][$of_lk] = $val;
            $other_files[$i][$of_fm] = $lnks;
            prt( "OTHER link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29);
            return 0;
        }
    }
    $totcnt += 1 if ($totcnt == 0);
    return $totcnt;
}

#######################################################################
# mark a link
# parameters:
# LINK IS IN FILE - $fmfil =  FROM
# Offset in $htm_files[$fnd]
# Link item source - $src  =  TO
# Level
# 2010/04/08 - If given a DIRECTORY, try to find an 'index' to link to...
sub mark_link($$$$) {
    my ($fmfil, $fnd, $src, $lev) = @_;
    my $fcnt = scalar @htm_files;
    my $msrc = lc(dos_2_unix($src));
    my $ff = $htm_files[$fnd][$of_ff];
    my $sfil = sub_in_folder($fmfil);
    my $ssrc = sub_in_folder($src);
    #prt("[dbg40]mark_link: for src=[$src] in fil=[$fmfil] off=[$fnd] lev=$lev ff=[$ff]\n") if ($dbg40);
    #prt("[dbg40]mark_link:$lev: for src=[$ssrc] from=[$sfil]($fnd) ff=[$ff]\n") if ($dbg40);
    prt("[dbg40]mark_link:$lev: TO=[$ssrc] from=[$sfil]($fnd)\n") if ($dbg40);
    my $i = 0;
    my $totcnt = $fcnt;
    my $fil = '';
    my $mfil = '';
    my $val = 0;
    my $lnks = '';
    my ($hrf,$j,$hrt,$tmp,$i2);
    if (-d $src) {
        prt("[dbg44] Sending [$src] to 'mark_dir_link'...\n") if ($dbg44);
        return mark_dir_link($fmfil, $fnd, $src, $lev);
    }
    for ($i = 0; $i < $fcnt; $i++) {
        $i2 = $i + 1;
        if ($i != $fnd) {
            $fil = $htm_files[$i][$of_ff];
            $mfil = lc(dos_2_unix($fil));
            prt( "[dbg41] $i2: compare \n[$mfil] with \n[$msrc]\n" ) if ($dbg41);   # VERY NOISY
            if (($msrc eq $mfil) || is_same_file($src,$fil)) {
                # FOUND the TO file in LIST - MARK IT
                $val = $htm_files[$i][$of_lk];
                $val++;
                # add HTML file linked to from what file
                $lnks = $htm_files[$i][$of_fm];
                if (length($lnks) == 0) {
                    $lnks = $fmfil;
                } elsif (add_new_link($fmfil, $lnks)) {
                    $lnks .= ',';
                    $lnks .= $fmfil;
                } else {
                    $val--; # already COUNTED
                }
                $htm_files[$i][$of_lk] = $val;
                $htm_files[$i][$of_fm] = $lnks; # add to LINKS
                prt( "[dbg30] SET link to $fil($i) $val ok [lnks=$lnks] in $fmfil\n" ) if ($dbg30);
                ###prt( "$ff ($fnd) linked to $fil ($i) $val\n" ) if ($dbg21);
                my $hr = $htm_files[$i][$of_hr];    # extract HREF ref.array - all LINKS to another!
                my $im = $htm_files[$i][$of_im];    # extract IMAGE ref.array
                # my $hr2 = $htm_files[$i][$of_rh]; # extract the NEW ANCHOR collection
                # $htm_files[$i][$of_rh] = $rhash;  # store the reference hash to 2 ref arrays 'href' & 'name'
                my $rhash = $htm_files[$i][$of_rh];
                my $hr2 = ${$rhash}{'href'}; # extract the NEW ANCHOR collection
                my $hrc = scalar @{$hr};
                my $imc = scalar @{$im};            # get count of images
                my $hr2c = scalar @{$hr2};
                if ($hrc != $hr2c) {
                    # ====== this is mainly diagnostic ======
                    ##prt("WARNING: Counts old [$hrc], new [$hr2c] in [$fil]\n");
                    my %th1 = ();
                    for ($j = 0; $j < $hrc; $j++) {
                        $hrf = ${$hr}[$j];
                        $th1{$hrf} = 1;
                    }
                    $tmp = scalar keys(%th1);
                    if ($tmp == $hr2c) {
                        ##prt("WARNING: Solved when dupes removed! New counts old [$tmp] = new [$hr2c]\n");
                    } else {
                        prt("PROBLEM: Counts old [$hrc], new [$hr2c] in [$fil]\n");
                        prt("List of $hrc OLD\n");
                        for ($j = 0; $j < $hrc; $j++) {
                            $hrf = ${$hr}[$j];
                            $hrt = get_href_type($hrf);
                            $val = is_in_array_ref0($hrf,$hr2);
                            $tmp = ($val ? "Yes" : "No");
                            prt("[$hrf] ($hrt) $tmp\n");
                        }
                        prt("List of $hr2c NEW\n");
                        for ($j = 0; $j < $hr2c; $j++) {
                            $hrf = ${$hr2}[$j][0];
                            $hrt = get_href_type($hrf); 
                            $val = is_in_array_ref($hrf,$hr);
                            $tmp = ($val ? "Yes" : "No");
                            prt("[$hrf] ($hrt) $tmp anchor=<".${$hr2}[$j][1].">\n");
                        }
                        pgm_exit(1,"What is different/missing...\n");
                    }
                    # ====== above is mainly diagnostic ======
                }
                # found the file LINKED TO...
                my ($itmnam, $itmdir) = fileparse($fil);    # get name and path
                $itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/);
                prt( "$lev [$fil] has $hrc href, $imc img/other links ..\n" ) if ($dbg19 || $dbg26);
                # now process links to links...
                for ($j = 0; $j < $hrc; $j++) {
                    $hrf = ${$hr}[$j];
                    $hrt = get_href_type($hrf); 
                    if ($hrt == 7) {
                        my $nsrc = fix_rel_unix_path($itmdir.get_local_href($hrf));
                        if ( !in_done_srcs($nsrc) ) {
                            push(@donesrcs, $nsrc);    # put it in DONE list
                            mark_link( $fil, $i, $nsrc, $lev + 1 );    # and MARK its links now
                        }
                    }
                }
                $val = 0;
                prt( "$fil - Checking $imc images files ...\n") if ($dbg24);
                for ($j = 0; $j < $imc; $j++) {        # do each, in this linked file
                    my $img = ${$im}[$j];            # get the image string
                    my $isrc = $itmdir.$img;        # join it with the path
                    my $nisrc = fix_rel_unix_path($isrc);    # fix rel, and force unix path
                    prt( "Marking [$nisrc] - ".($j+1)." of $imc img/other links ..\n" ) if ($dbg19 || $dbg26);
                    if ( !in_done_imgs($nisrc) ) {
                        push(@doneimgs, $nisrc);    # put it in DONE list
                        mark_other_links( $fil, $j, $nisrc, 0 );    # and MARK the link in @img_files
                        $val++;
                    } else {
                        prt( "Already IN doneimgs ...\n" ) if ($dbg19 || $dbg26);
                    }
                }
                prt( "$fil - Marked $val of $imc images files ...\n") if ($val && $dbg24);
                return 0;
            }
        }   # do NOT find self
    }
    # hmmmm, LINK not found in HREF files, maybe IMAGES, zip, etc ...
    prt("[dbg45] LINK not found in HREF files, maybe IMAGES, zip, etc TO=[$src], FROM=[$fmfil]\n") if ($dbg45);
    if (is_htm_file_ext($src)) {
        # 19/05/2011 - BUt if the file does EXIST, pobably just means it is outside the start point
        if (-f $src) {
            prt("[dbg46] LINK found, but not in HREF files! TO=[$src], FROM=[$fmfil]\n") if ($dbg46);
        } else {
            prtw("WARNING: LINK not found in HREF files! TO=[$src], FROM=[$fmfil]\n");
        }
    }
    $val = mark_other_links( $fmfil, $fnd, $src, $lev );
    if ($val) {
        $totcnt += $val;
        prt( "[dbg25] NO LINK FOUND to [$src]($msrc) in $totcnt file - $ff ($fnd) - ($lev)!\n" ) if ($dbg25);
        return 1;
    }
    return 0;   # SUCCESS - FOUND THE LINK
}

# ============================================================
# sub seek_dir_index_file($$$$) {
# Seek the first 'index' type file for a DIRECTORY
# IN - ($directory, $resultref);
# OUT - 0 = SUCCESS, and result reference contains file
#       1 = FAILED   no 'index' type found
#       2 = FAILED   directory NOT in %g_dir_file
# ============================================================
sub seek_dir_index_file($$) {
    my ($nusrc, $rind) = @_;
    my $hr = \%g_dir_files;
    my ($dhr,$v2,$i,$ii,$cnt,$v3,$icnt,$ff,$k2);
    if (defined ${$hr}{$nusrc}) {
        $dhr = ${$hr}{$nusrc};  # get HASH reference for this DIRECTORY (in unix form)
        foreach $k2 (sort keys %{$dhr}) {
            # for each type HTML, CSS, etc...
            $v2 = ${$dhr}{$k2};     # extract array reference for this TYPE
            $cnt = scalar @{$v2};   # count to process
            # prt(" Type $k2 = $cnt\n");
            for ($i = 0; $i < $cnt; $i++) {
                $v3 = ${$v2}[$i][0];    # get the file name
                $ii = ${$v2}[$i][1];    # and if it is 'index' type
                # prt("  File: [$v3] ");
                if ($ii) {  # FOUND an 'index; type
                    # the $v3 file name is an INDEX type file
                    $ff = $nusrc.'/'.$v3;   # build FULL FILE name
                    $icnt++;
                    #prt(" ('index' type)");
                    ${$rind} = $ff;
                    return 0;  # SUCCESS
                }
                # prt("\n");
            }
        }
        return 1;
    } else {
        pgm_exit(1,"ERROR: Why is [$nusrc] NOT in directory/file reference?\n");
    }
    return 2;   # FAILED!!!
}


# 2010/04/08 - If given a DIRECTORY, try to find an 'index' to link to...
# $fil = $htm_files[$fnd][$of_ff];    # extract FULL PATH name of file CONTAINING the HREF...
# ($itmnam, $itmdir) = fileparse($fil);    # get name and path
# $itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/);
# my $lref = get_local_href($hrf);
# $src = $itmdir.$lref;
# my $nusrc = fix_rel_unix_path($src);
# $nusrc already tested as a DIRECTORY - did I keep DIRECTORY LISTS/
#sub show_dir_files($) {
#    my ($hr) = @_;  # = \%g_dir_files
#    my ($key,$val,$k2,$v2,$v3,$dir,$cnt,$i,$ii,$icnt);
#    foreach $key (sort keys %{$hr}) {
#        $dir = sub_in_folder($key);
#        $dir = '<root>' if (length($dir)==0);
#        $val = ${$hr}{$key};    # extract hash ref BY TYPE
#        prt("Directory: [$dir]\n");
#        $icnt = 0;
#        foreach $k2 (sort keys %{$val}) {
#            # for each type
#            $v2 = ${$val}{$k2};
#            $cnt = scalar @{$v2};
#            prt(" Type $k2 = $cnt\n");
#            for ($i = 0; $i < $cnt; $i++) {
#                $v3 = ${$v2}[$i][0];
#                $ii = ${$v2}[$i][1];
#                prt("  File: [$v3] ");
#                if ($ii) {
#                    $icnt++;
#                    prt(" ('index' type)");
#                }
#                prt("\n");
#            }
#        }
#        if (!$icnt) {
#            prt("Directory: [$dir] has NO 'index' type file.\n");
#        } elsif ($icnt > 1) {
#            prt("Directory: [$dir] has $icnt 'index' type files.\n");
#        }
#    }
#    pgm_exit(1,"Debug run show of directories and files...\n");
#}
sub mark_dir_link($$$$) {
    my ($fil, $fnd, $nusrc, $lev) = @_;
    my $fcnt = scalar @htm_files;
    my $hr = \%g_dir_files;
    my ($dhr,$v2,$i,$ii,$cnt,$v3,$icnt,$ff,$k2);
    if (defined ${$hr}{$nusrc}) {
        $dhr = ${$hr}{$nusrc};  # get HASH reference for this DIRECTORY (in unix form)
        foreach $k2 (sort keys %{$dhr}) {
            # for each type HTML, CSS, etc...
            $v2 = ${$dhr}{$k2};     # extract array reference for this TYPE
            $cnt = scalar @{$v2};   # count to process
            # prt(" Type $k2 = $cnt\n");
            for ($i = 0; $i < $cnt; $i++) {
                $v3 = ${$v2}[$i][0];    # get the file name
                $ii = ${$v2}[$i][1];    # and if it is 'index' type
                # prt("  File: [$v3] ");
                if ($ii) {  # FOUND an 'index; type
                    # the $v3 file name is an INDEX type file
                    $ff = $nusrc.'/'.$v3;   # build FULL FILE name
                    $icnt++;
                    #prt(" ('index' type)");
                    return mark_link($fil,$fnd,$ff,0);  # and mark this file
                }
                # prt("\n");
            }
        }
    } else {
        pgm_exit(1,"ERROR: Why is [$nusrc] NOT in directory/file reference?\n");
    }
    return 1;   # FAILED!!!
}

sub get_links_stg($) {
   my ($lnks) = @_;
   my @arr = split(',',$lnks);
   my $nlnks = '';
   foreach my $l (@arr) {
      my $sl = sub_in_folder($l);
      $nlnks .= ',' if (length($nlnks));
      $nlnks .= $sl;
   }
   return $nlnks;
}

###########################################################################
# show link count, and links, in passed multidimensional file array
#
# If showlinks (or $dbg20 or $dbg31) is ON, shows internal LINKS
# NOTE: Presently DOES NOT get all LINKS??? BAH!!!
###########################################################################

sub show_link_counts($$) {
    my ($m, $hf) = @_;
    my $fcnt = scalar @{$hf};
    my $mcnt = 0;
    my $mss = "Checking LINKS for $fcnt $m files ...\n"; 
    my ($i,$fil,$hrt,$sfil,$lnks,$min,$len,$msg,$i2);
    if ($fcnt) {
      $min = 0;
        for ($i = 0; $i < $fcnt; $i++) {
            $fil = ${$hf}[$i][$of_ff];
         $sfil = sub_in_folder($fil);
         $len = length($sfil);
         $min = $len if ($len > $min);
      }
        for ($i = 0; $i < $fcnt; $i++) {
         $i2 = $i + 1;
            $fil = ${$hf}[$i][$of_ff];
            $hrt = ${$hf}[$i][$of_lk];
            $lnks = ${$hf}[$i][$of_fm];
         $sfil = sub_in_folder($fil);
            if ($hrt) {
                if ($dbg20 || $dbg31 || $showlinks) {
                    prt( $mss ) if (length($mss));
                    $mss = '';
               $sfil .= ' ' while (length($sfil) < $min);
               $msg = sprintf("%3d:",$i2);
                    if ($dbg31 || $showlinks) {
                        prt( "$msg $sfil has $hrt link(s) [".get_links_stg($lnks)."]\n" );
                    } else {
                        prt( "$msg $sfil has $hrt link(s)\n" );
                    }
                }
            } else {
            # expect NONE for most types - only check HTML
            if ( is_link_type_file($fil) ) {
               prt( $mss ) if (length($mss));
               $mss = '';
               prtw("WARNING: $sfil($i) HAS NO LINKS!") if ($dbg33 || $showlinks);
               push(@missing_links,$fil);
            }

            $mcnt++; # count a MISSED item
            if ($shownolinks || $showlinks) {
               prt( $mss ) if (length($mss));
               $mss = '';
               prt( "$i: [$sfil] has NO links\n" );
            }
            }
        }
        if ($mcnt) {
            prt( $mss ) if (length($mss));
         $mss = '';
            prt( "Done LINKS for $fcnt $m files ... MISSED $mcnt!!!\n" );
        }
    } else {
        prt( "There are NO $m files ...\n" ) if ($dbg20 || $showlinks);
    }
}

sub in_excused($) {
    my ($tx) = shift;
    foreach my $t (@excused) {
        if ($t eq $tx) {
            return 1;
        }
    }
    return 0;
}

sub is_same_file($$) {
    my ($fil1,$fil2) = @_;
    #prt("Comparing -\n[$fil1] to\n[$fil2]\n");
    if ($fil1 =~ /^\.(\\|\/)/) {
        $fil1 = $currworkdir.substr($fil1,1);
    }
    if ($fil2 =~ /^\.(\\|\/)/) {
        $fil2 = $currworkdir.substr($fil2,1);
    }
    #prt("Comparing -\n[$fil1] to\n[$fil2]\n");
    my $len = length($fil1);
    return 0 if (length($fil2) != $len);    # NOT same length
    return 1 if ($fil1 eq $fil2);
    my $ufil1 = dos_2_unix($fil1);
    my $ufil2 = dos_2_unix($fil2);
    #prt("Comparing UNIX -\n[$ufil1] to\n[$ufil2]\n");
    return 1 if ($ufil1 eq $ufil2);
    my ($i,$ch1,$ch2);
    for ($i = 0; $i < $len; $i++) {
        $ch1 = substr($fil1,$i,1);
        $ch2 = substr($fil2,$i,1);
        if ($ch1 ne $ch2) {
            #prt("Not same on char [$ch1] vs [$ch2] ($i)\n");
            return 0;
        }
    }
    return 1 if ($i == $len);
    return 0;
}

# ==========================================================
# using the given (home) PAGE, try to TRACE ALL LINKS
# not really meaingful unless it is a main 'index' type file
# ===========================================================
sub check_local_links($) {
    my ($hf) = shift;
    my $fcnt = scalar @htm_files;
    my ($hfnm,$hfdir) = fileparse($hf);
    my $lchf = lc($hfnm);
    my $fnd = get_home_offset($hf);
    my ($fil,$nm,$dir,$ext);
    if ($hfdir eq ".\\") {
        $hfdir = $in_folder."\\";
    }
    my $itmdir = '';
    my $itmnam = '';
    my $i = 0;
    my $i2 = 0;
    prt( "Checking local links, for $fcnt files, from $hf ...\n") if (VERB9());
    if ($fnd == -1) {
        prt( "WARNING: check_local_links: Unable to find [$hf] ...\n" );
        return 1;
    }
    # process item 1 ...
    my $pcnt = 1;
    my $hr = $htm_files[$fnd][$of_hr];
    my $im = $htm_files[$fnd][$of_im];
    my $hrc = scalar @{$hr};
    my $imc = scalar @{$im};
    my $hrf = '';
    my $img = '';
    my $hrt = 0;
    my $src = '';
    my $nsrc = '';
    my $ff = '';
    my $shwerr = 0;
    my $emsg = '';
    my ($msg,$sfil);
    $fil = $htm_files[$fnd][$of_ff];    # extract FULL PATH name of file CONTAINING the HREF...
    $htm_files[$fnd][$of_lk] = 1;       # mark as DONE
    ($itmnam, $itmdir) = fileparse($fil);    # get name and path
    $itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/);
    $sfil = sub_in_folder($fil);
    prt( "HOME [$fil] has $hrc href, $imc img/other links ..\n" ) if ($dbg19 || $dbg26);
    for ($i = 0; $i < $hrc; $i++) {
        $hrf = ${$hr}[$i];
        $hrt = get_href_type($hrf); 
        $i2 = $i + 1;
        $shwerr = 0;
        $emsg = "IN[$sfil] $i2 HREF [$hrf]$hrt";
        if ($hrt == 0) {
            if (in_excused($hrf)) {
                $excusecnt++;
            } else {
                $emsg .= " CHECK UNKNOWN ****** CHECK ME ******[1]";
                push(@warnings, "WARNING: $emsg!" );
                $shwerr = 1;
            }
        } elsif ($hrt < 5) {
            $emsg .= " REMOTE";
        } elsif ($hrt == 5) {
            $emsg .= " JAVASCRIPT";
        } elsif ($hrt == 6) {
            $emsg .= " IN PAGE";
        } else {
            my $lref = get_local_href($hrf);
            $src = $itmdir.$lref;
            my $nusrc = fix_rel_unix_path($src);
            prt( "[dbg34] REL PATH [$src] to UNIX PATH [$nusrc] hrt=$hrt file=[$fil]\n" ) if ($dbg34);
            push(@donesrcs, $nusrc);    # put it in DONE list
            # mark link - FROM $fil
            if (is_same_file($fil,$nusrc)) {
                $emsg .= " LINK to SELF ok";
            } elsif (-d $nusrc) {
                if ( mark_dir_link( $fil, $fnd, $nusrc, 0 ) ) {
                    pgm_exit(1,"CODE ERROR: To be done mark links for DIRECTORY! [$nusrc]\n");
                }
            } else {
                if ( mark_link( $fil, $fnd, $nusrc, 0 ) ) {
                    $emsg .= " SITE REF [$nusrc] ***NO IN-SITE LINK***???";
                    $msg = "$i2 [$fil] HREF [$hrf]$hrt SITE REF [$nusrc] ***NO IN-SITE LINK***???";
                    if (-f $src) {
                        $msg .= "\n*** BUT FILE EXISTS [$src] ***";
                        $emsg .= "\n*** BUT FILE EXISTS [$src] ***";
                        # 19/05/2011 - do NOT post this warning
                        # push(@warnings, "WARNING: Local HREF [$lref] in [$fil] OUTSIDE WEB! but EXISTS!" );
                    } else {
                        push(@missed, $msg );
                        $shwerr = 1;
                    }
                } else {
                    $emsg .= " SITE REF [$src] ok" if ($dbg19);
                }
            }
        }
        prt( "$emsg\n" ) if ($dbg19 || $shwerr || VERB5());
    }

    prt( "HOME - Marking $imc images files ...\n") if ($dbg24);
    for ($i = 0; $i < $imc; $i++) {
        $img = ${$im}[$i];
        $src = $itmdir.$img;
        $nsrc = fix_rel_unix_path($src);
        prt( "HOME $fil - Mark $src ($nsrc) image ...\n" ) if ($dbg24);
        push(@doneimgs, $nsrc);    # put it in DONE list
        mark_other_links( $fil, $i, $nsrc, 0 );
    }
    return 0;
}

sub offset_done($$) {
    my ($off, @done) = @_;
    foreach my $num (@done) {
        if ($off == $num) {
            return 1;
        }
    }
    return 0;
}

sub trace_from_htm($$) {
    my ($hf, $lev) = @_;
    my $fnd = get_offset_of_htm($hf);
    my $msg = '';
    if (($fnd != -1) && !offset_done($fnd,@offsdone)) {
        push(@offsdone,$fnd);
        my $hr = $htm_files[$fnd][$of_hr];
        my $hrc = scalar @{$hr};
        my @offsets = ();
        my($itmnam, $itmdir) = fileparse($hf);    # get name and path
        $itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/);
        for (my $i = 0; $i < $hrc; $i++) {
            my $hrf = ${$hr}[$i];
            my $hrt = get_href_type($hrf); 
            if ($hrt == 7) {
                #my $src = fix_rel_unix_path($itmdir.get_local_href($hrf));
                my $src = $itmdir.get_local_href($hrf);
                push(@offsets,$src);
                trace_from_htm($src, ($lev + 1));
            }
        }
        $hrc = scalar @offsets;
        my $cnt = $lev;
        $msg = sprintf("%4d ", $lev);
        prt( $msg );
        while($cnt) {
            prt( ' ' );
            $msg .= ' ';
            $cnt--;
        }
        prt( "$hf links to $hrc files ...\n" );
        $msg .= "$hf links to $hrc files ...";
        push(@htmlinks, [$lev, $msg]);
        foreach my $fil (@offsets) {
            $cnt = $lev;
            $msg = sprintf("%4d ", $lev);
            prt( $msg );
            while($cnt) {
                prt( ' ' );
                $msg .= ' ';
                $cnt--;
            }
            prt( "$fil\n" );
            $msg .= $fil;
            push(@htmlinks, [$lev, $msg]);
        }
    }
}


sub get_offset_of_htm($) {
    my ($hf) = shift;
    my $fcnt = scalar @htm_files;
    my ($hfnm,$hfdir) = fileparse($hf);
    if ($hfdir eq ".\\") {
        $hfdir = $in_folder."\\";
    }
    my $lchf = lc($hfnm);
    my $fnd = -1;
    for (my $i = 0; $i < $fcnt; $i++) {
        my $fil = $htm_files[$i][$of_ff];
        my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
        if (lc($nm.$ext) eq $lchf) {
            # have at least the NAME, but maybe not the FOLDER
            if (lc($hfdir) eq lc($dir)) {
                $fnd = $i;
                last;
            }
        }
    }
    return $fnd;
}


sub get_home_offset($) {
    if ($homeoffset != -1) {
        return $homeoffset;
    }
    my ($hf) = shift;
    prt( "Getting offset of HOME file [$hf]... ");
    my $fnd = get_offset_of_htm($hf);
    if ($fnd == -1) {
        prtw("WARNING: Unable to find [$hf] ...\n" );
    } else {
        prt( "index $fnd\n" );
    }
    $homeoffset = $fnd# set HOME file offset in list...
    return $homeoffset;
}


#############################################################################
# check linkages
#############################################################################
sub check_linkages($) {
    my ($hf) = shift;
    my $fcnt = scalar @htm_files;
    my ($hfnm,$hfdir) = fileparse($hf);
    my $lchf = lc($hfnm);
    my $fnd = get_home_offset($hf);
    my ($fil,$nm,$dir,$ext);
    if ($hfdir eq ".\\") {
        $hfdir = $in_folder."\\";
    }
    my $itmdir = '';
    my $itmnam = '';
    my $i = 0;
    my $i2 = 0;
    my ($tmp);
    prt( "Re-checking HREF and IMG/OTHER links, for $fcnt files ...\n") if (VERB9());
    if ($fnd == -1) {
        prt( "WARNING: Unable to find [$hf] ...\n" );
        push(@warnings, "WARNING: Unable to find [$hf] ...");
        return 1;
    }
    # process item 1 ...
    my $pcnt = 1;
    my $hr = $htm_files[$fnd][$of_hr];
    my $im = $htm_files[$fnd][$of_im];
    my $hrc = scalar @{$hr};
    my $imc = scalar @{$im};
    my $hrf = '';
    my $img = '';
    my $hrt = 0;
    my $src = '';
    my $ff = '';
    my $shwerr = 0;
    my $emsg = '';
    $fil = $htm_files[$fnd][$of_ff];    # extract FULL PATH name of file ...
    ($itmnam, $itmdir) = fileparse($fil);    # get name and path
    $itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/);
    prt( "\n" ) if ($dbg18);
    prt( "$pcnt [$fil] has $hrc href, and $imc image links ..\n" ) if ($dbg18);
    for ($i = 0; $i < $hrc; $i++) {
        $hrf = ${$hr}[$i];
        $hrt = get_href_type($hrf); 
        $i2 = $i + 1;
        $shwerr = 0;
        $emsg = "HH[$fil] ";
        $emsg .= "$i2 HREF [$hrf]$hrt";
        if ($hrt == 0) {
            if (in_excused($hrf)) {
                $excusecnt++;
            } else {
                $emsg .= " CHECK UNKNOWN ****** CHECK ME ******[2]";
                push(@warnings, "WARNING: $emsg" );
                $shwerr = 1;
            }
        } elsif ($hrt < 5) {
            $emsg .= " REMOTE";
        } elsif ($hrt == 5) {
            $emsg .= " JAVASCRIPT";
        } elsif ($hrt == 6) {
            $emsg .= " LOCAL";
        } else {
            $src = $itmdir.get_local_href($hrf);
            if (-f $src) {
                $emsg .= " SITE REF [$src] ok";
            } elsif (-d $src) {
                   $emsg .= " SITE REF [$src] okd (as .txt)";
            } else {
                # special sometimes replacement with a TEXT file
                $tmp = $src.".txt";
                if (-f $tmp) {
                    $emsg .= " SITE REF [$src] ok (as .txt)";
                } else {
                   $emsg .= " SITE REF [$src] ***MISSING***?[1]";
                       push(@missed, $emsg );
                       $shwerr = 1;
                }
            }
        }
        prt( "$emsg\n" ) if ($dbg18 || $shwerr || VERB9() );
    }

    # From this beginning
    for (my $j = 0; $j < $fcnt; $j++) {
        $fil = $htm_files[$j][$of_ff];
        ($itmnam, $itmdir) = fileparse($fil);    # get name and path
        $itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/);
        if ($j != $fnd) {
            $pcnt++;
            $hr = $htm_files[$j][$of_hr];
            $im = $htm_files[$j][$of_im];
            $hrc = scalar @{$hr};
            $imc = scalar @{$im};
            prt( "\n" ) if ($dbg18);
            prt( "$pcnt [$fil] has $hrc href, and $imc image links ..\n" ) if ($dbg18 || VERB9());
            for ($i = 0; $i < $hrc; $i++) {
                $hrf = ${$hr}[$i];
                $hrt = get_href_type($hrf); 
                $i2 = $i + 1;
                $shwerr = 0;
                $emsg = "HF[".sub_in_folder($fil)."] ";
                $emsg .= "$i2 HREF [$hrf]$hrt";
                if ($hrt == 0) {
                    if (in_excused($hrf)) {
                        $excusecnt++;
                    } else {
                        $emsg .= " CHECK UNKNOWN ****** CHECK ME ******[3]";
                        push(@warnings, "WARNING: $emsg");
                        $shwerr = 1;
                    }
                } elsif ($hrt < 5) {
                    $emsg .= " REMOTE";
                } elsif ($hrt == 5) {
                    $emsg .= " JAVASCRIPT";
                } elsif ($hrt == 6) {
                    $emsg .= " LOCAL";
                } else {
                    $src = get_local_href($hrf);
                    if ($src eq '.') {
                        if (length($homefile)) { # && ($fdir eq $in_folder)
                            $src = $homefile;    # translate a DOT to HOME FILE
                        }
                    }
                    $ff = $itmdir.$src;
                    if (-f $ff) {
                        $emsg .= " SITE REF [$ff] ok";
                    } elsif (-d $ff) {
                        $emsg .= " SITE REF [$ff] okd";
                        # maybe check if there is an 'index' type file
                        my $res = fix_rel_unix_path($ff);
                        if (seek_dir_index_file($res,\$res)) {
                            $emsg .= " BUT NO 'index' found in [$ff]";
                            $shwerr = 1;
                            push(@missed, $emsg );
                        } else {
                            $emsg .= " found 'index' [$res]";
                        }
                    } else {
                        $tmp = $ff.".txt";
                        if (-f $tmp) {
                           $emsg .= " SITE REF [$ff] ok (as .txt)";
                        } else {
                           $emsg .= " SITE REF [$src][$ff] ***MISSING***?[3]";
                           push(@missed, $emsg );
                           $shwerr = 1;
                        }
                    }
                }
                prt( "$emsg\n" ) if ($dbg18 || ($shwerr && $dbg32) || VERB9());
            }
            for (my $i = 0; $i < $imc; $i++) {
                $img = ${$im}[$i];
                my $sfil = sub_in_folder($fil);
                my $simg = sub_in_folder($img);
                $emsg = "IF[$sfil] [$simg] ";
                if ($img =~ /^http:\/\/.*/i) {
                    if ($showhreflinks) {
                        prtw("WARNING: IMG link is HREF $emsg [1]");
                    } else {
                        $hrflnkcnt++;
                    }
                } else {
                    # 25/07/2007 - deal with '%20' in text
                    $img =~ s/%20/ /g;
                    $src = $itmdir.$img;
                    $shwerr = 0;
                    if (-f $src) {
                        $emsg .= " IMG ok";
                    } else {
                  $tmp = $src.".txt";
                  if (-f $tmp) {
                           $emsg .= " IMG ok (as .txt)";
                  } else {
                     $emsg .= " IMG ***MISSING***?[5]";
                           push(@missed, $emsg );
                           $shwerr = 1;
                  }
                    }
                }
                prt( "$emsg\n" ) if ($dbg18 || ($shwerr && $dbg32) || VERB9());
            }
        }
    }
    return 0;
}

sub check_images($$) {
   my ($ifile, @srcs) = @_;
    my ($nm, $dir) = fileparse($ifile);
    my $scnt = scalar @srcs;
   my ($tmp);
    if ($scnt) {
      prt( "Found $scnt imgs in $nm ...\n" ) if ($dbg14);
      for (my $i = 0; $i < $scnt; $i++) {
         my $src = $srcs[$i][0];
         my $lnn = $srcs[$i][1];
         if ($src =~ /^http:\/\//i) {
            # remote HREF
            } else {
                # 25/07/2007 - deal with '%20' to space
                $src =~ s/%20/ /g;
                my $ff = $dir.$src;
                if ( -f $ff ) {
                    prt( "$src - ok\n" ) if ($dbg13);
                } else {
               $tmp = $ff.".txt";
               if (-f $tmp) {
                  prt( "$src - ok (as .txt)\n" ) if ($dbg13);
               } else {
                  if ($show_all_not_found) {
                     prtw("WARNING: [$src] $ifile:$lnn NOT FOUND![2]a");
                  } elsif (!defined $g_images_notfound{$src}) {
                     prtw("WARNING: [$src] $ifile:$lnn NOT FOUND![2]b");
                     $g_images_notfound{$src} = "$ifile,$lnn";
                  }
               }
                }
            }
        } # for image count
    } else {
        prt( "Found NO imgs in [$ifile] ...\n" ) if ($dbg15);
    }
    return $scnt;
}

sub get_img_srcs($$) {
    my ($fil, @lns) = @_;
    my $lc = scalar @lns;
    my $scnt = 0;
    my ($nm,$dir) = fileparse( $fil );
    prt( "Processing $lc lines from [$nm] dir=[$dir]...\n" ) if ($dbg12);
    my @isrc = ();
    my $ln = '';
    my $bal = '';
    my $inscript = 0;
    my $msg = '';
    my $bgnln = 0;
    my $lnnos = '';
    for (my $i = 0; $i < $lc; $i++) {
        $ln = $bal;
        $bal = '';
        $ln .= $lns[$i];
        chomp $ln;
        prt( "$i [$ln] ...\n" ) if ($dbg10);
        if ($inscript) {
            if ($ln =~ /<\/script>/i) {
                $inscript =0;
                prt( "EXIT a SCRIPT ...\n" ) if ($dbg11);
            }
            next;
        }
        if ( $ln =~ /<img\s+(.*)/i ) {
            my $iln = $1;
            if ( $ln =~ /<script.*>/i ) {
                $msg = "WARNING: Also found SCRIPT in IMG line ...[$ln]";
                push(@warnings, $msg);
                prt( "$msg\n" ) if ($dbg16);
            }
            prt( "[dbg10] Found [$iln] ...\n" ) if ($dbg10);
            $bgnln = $i;
            while ( !($iln =~ />/) && ($i < $lc)) {
                $i++;
                my $nxln = $lns[$i];
                chomp $nxln;
                prt( "[dbg10] Adding [$nxln] ...\n" ) if ($dbg10);
                $iln .= ' ' if !($iln =~ /=$/);
                $iln .= $nxln;
            }
            $lnnos = "$bgnln:$i";
            my $ind = index($iln, '>');
            if ($ind != -1) {
                $bal = substr($iln, $ind+1);
                $iln = substr($iln, 0, $ind+1);
            }
            $iln = trim_all($iln);
            #if ($iln =~ /src=\"(.+)\"/i) {
            if ($iln =~ /src=\s*\"(\S+)\"/i) {
                prt( "[dbg10] SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10);
                push(@isrc, [$1, $lnnos, $fil]);
                $scnt++;
            } elsif ($iln =~ /src=\s*(\S+)/i) {    # without QUOTES
                prt( "[dbg10] SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10);
                push(@isrc, [$1, $lnnos, $fil]);
                $scnt++;
            } elsif ($iln =~ /src=\s*\'(\S+)\'/i) {    # single QUOTES
                prt( "SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10);
                push(@isrc, [$1, $lnnos, $fil]);
                $scnt++;
            } else {
                $msg = "WARNING: SRC NOT FOUND in [$iln]$fil:$lnnos...";
                push(@warnings, $msg);
                prt( "$msg\n" ) if ($dbg16);
            }
        } elsif ( $ln =~ /<script.*>/i ) {
            $inscript = 1;
            prt( "Entered a SCRIPT ...\n" ) if ($dbg11);
            if ($ln =~ /<\/script>/i) {
                $inscript =0;
                prt( "EXIT a SCRIPT ...\n" ) if ($dbg11);
            }
        }
    }
    prt( "Returning $scnt img sources ...\n") if ($dbg10);
    #$dbg12 = $sdbg12;
    #$dbg11 = $sdbg11;
    #$dbg16 = $sdbg16;
    #$dbg10 = $sdbg10;
    return @isrc;
}

sub get_text_sub($) {
    my $ff = shift;     # full file name
    my @arr = split(/\./,$ff); # split on DOT
    $arr[-1] = 'txt';   # make last 'txt'
    my $nf = '';    # start a new file name
    my ($part);
    foreach $part (@arr) {
        $nf .= '.' if (length($nf));
        $nf .= $part;
    }
    return $nf;
}


# ================================================================
# sub check_hrefs($$)
# called from primary file processing - sub process_file_array()
# split passed 'anchor' list into 'type' of link
# and store in global @httprefs, @httpsrefs, @ftprefs, @mtrefs
# skipping javascript:, and checking file/dir for file links
# The global @httprefs is later used for an IP check, if requested.
# ================================================================
sub check_hrefs($$) {
    my ($fil, $rsrcs) = @_;
    my ($fnm,$fdir,$fext) = fileparse( $fil, qr/\.[^.]*/ );
    $fdir = $currworkdir.'/' if ($fdir =~ /^\.(\\|\/)/);    # 20100401 - if local use current directory
    my $scnt = scalar @{$rsrcs};
    my $isphp = (lc($fext) eq '.php');
    my $sfil = sub_in_folder($fil);
    if ($scnt) {
        prt( "Found $scnt anchor href= in $fnm$fext ...\n" ) if ($dbg7);
        for (my $i = 0; $i < $scnt; $i++) {
            my $orgsrc = ${$rsrcs}[$i][0];
            my $lnnos  = ${$rsrcs}[$i][1];
            my $src = $orgsrc;
            if ($src =~ /^http:/i) {
                # remote HREF
                push(@httprefs, [$src, $fil, $lnnos] );
            } elsif ($src =~ /^https:/i) {
                # remote HREF
                push(@httpsrefs, [$src, $fil, $lnnos] );
            } elsif ($src =~ /^ftp:/i) {
                # remote HREF
                push(@ftprefs, [$src, $fil, $lnnos] );
            } elsif ($src =~ /^mailto:/i) {
                # remote HREF
                push(@mtrefs, [$src, $fil, $lnnos] );
            } elsif ( $src =~ /^#/ ) {
                # local in page HREF
            } elsif ( $src =~ /^javascript:/i ) {
                # a JAVASCRIPT HREF
            } else {
                my $ind = index($src,'#');
                if ( $ind != -1 ) {
                    $src = substr($src,0,$ind);
                }
                $ind = index($src,'?');
                if ( $ind != -1 ) {
                    $src = substr($src,0,$ind);
                }
                $src =~ s/\/$//;
                if (length($src)) {
                    if ($src eq '.') {    # HREF is just a DOT
                        if (length($homefile)) { # && ($fdir eq $in_folder)
                            $src = $homefile;    # translate a DOT to HOME FILE
                        }
                    }
                    # 25/07/2007 - deal with '%20' back to space
                    $src =~ s/%20/ /g;
                    my $ff = $fdir.$src;
                    if ( -f $ff ) {
                        prt( "[dbg5] $src - ok\n" ) if ($dbg5);
                    } elsif (-d $ff) {
                        prt( "[dbg5] $src - okd\n" ) if ($dbg5);
                    } else {
                          # 20100312 - Special case - I have replaced large ZIPS, and others, with a TEXT
                          # file, so do NOT give this WARNING, if such a .txt file exists.
                          my $tmpf = $ff.".txt";
                          if ( -f $tmpf ) {
                              prt( "[dbg5] $src - ok AS $tmpf REPLACEMENT FILE!\n" ) if ($dbg5);
                          } else {
                              $tmpf = get_text_sub($ff);
                              if ( -f $tmpf ) {
                                  prt( "[dbg5] $src - ok AS $tmpf REPLACEMENT FILE!2\n" ) if ($dbg5);
                              } else {
                                  if ($orgsrc eq $src) {
                                      prtw("WARNING: href [$orgsrc] file [$ff] NOT FOUND![3]s in [$sfil]$lnnos\n");
                                  } else {
                                      prtw("WARNING: href [$orgsrc] [$src] file [$ff] NOT FOUND![3] in [$sfil]$lnnos\n");
                                  }
                              }
                          }
                    }
                } else {
                    if ($isphp) {
                        prt( "Found BLANK HREF [$orgsrc] in PHP [$sfil]...\n" ) if ($dbg17);
                    } else {
                        prtw("WARNING: Found BLANK HREF [$orgsrc] in [$sfil] ...");
                    }
                }
            }
        }
    } else {
        if ($isphp) {
            prt( "Found NO HREFs in PHP [$sfil]...\n" ) if ($dbg17);
        } else {
            prt( "NO HREF FOUND in [$sfil]...\n" ) if ($shownohrefs);
        }
    }
}

############################################################
# Only used is $chkip = 1;
# Show IP Address
# uses sockets, gethostbyname
# Return 0, if can NOT be resolved.
# else the number of IP addresses resolved.
############################################################
sub checkIPAddress($) {
    my ($nm) = shift;
    my @addr = gethostbyname($nm);
    my $cnt = 0;
    if( !@addr ) {
        my $err = $!;
        $err =~ s/\n/ /g;
        $err =~ s/^\s+//;
        $err =~ s/\s+$//;
        if ($nm =~ /:\d+$/) {
            my $nm2 = $nm;
            $nm2 =~ s/:\d+$//;
            @addr = gethostbyname($nm2);
            if (@addr) {
                $nm = $nm2;
            } else {
                my $err2 = $1;
                $err2 =~ s/\n/ /g;
                $err2 =~ s/^\s+//;
                $err2 =~ s/\s+$//;
                prt( "Can't resolve $nm: $err!\nNor $nm2: $err2" );
                return 0;
            }
        } else {
            prt( "Can't resolve $nm: $err!\n" );
            return 0;
        }
    }
    @addr = map { inet_ntoa($_) } @addr[4 .. $#addr];
    foreach my $k (@addr) {
        $cnt++;
        prt( "$cnt: $nm resolves to IP [$k]\n" ) if ($dbg3);
    }
    return $cnt;
}

sub getIPAddress($) {
    my ($uri) = shift;
    my $packed_ip = gethostbyname($uri);
    my $ip_address = 'UNKNOWN';
    if (defined $packed_ip) {
        $ip_address = inet_ntoa($packed_ip);
    }
    return $ip_address;
}

sub show_uri_ip($$) {
    my ($u,$l) = @_;
    my $msg = "[$u]";
    $msg .= ' ' while (length($msg) < $l);
    if (checkIPAddress($u)) {
        my $ip = getIPAddress($u);
        prt("$msg ok, on IP [$ip]\n");
    } else {
        prt("$msg FAILED\n");
    }
}

################################################
# Add to @scripts multidimensional array,
# if NOT already in there, when on the line
# numbers are added.
###############################################
sub add_2_scripts($$) {
    my ($fil, $lns) = @_;
    my $sc = scalar @scripts;
    for (my $i = 0; $i < $sc; $i++) {
        my $cf = $scripts[$i][0];
        if ($cf eq $fil) {
            my $lc = $scripts[$i][1];
            $lc .= ":$lns";
            $scripts[$i][1] = $lc;
            return 0;
        }
    }
    push(@scripts, [$fil, $lns]);
    return 1;
}

####################################################
# Get HREF sources
# Given an ARRAY of file lines, check for
# anchor href="something" ...
# Return the "something" in an array
####################################################
sub get_href_srcs($$) {
    my ($fil, @lns) = @_;
    my $lc = scalar @lns;
    my $scnt = 0;
    my $slns = 0;    # count the SCRIPT lines
    my ($nm,$dir) = fileparse( $fil );
    prt( "Processing $lc lines from [$nm] dir=[$dir]...\n" ) if ($dbg6);
    my @isrc = ();
    my $ln = '';
    my $bal = '';
    my $inscript = 0;
    $slns = 0;
    my $bgnln = 0;
    my $endln = 0;
    for (my $i = 0; $i < $lc; $i++) {
        $ln = $bal;
        $bal = '';
        $ln .= $lns[$i];
        chomp $ln;
        prt( "$i [$ln] ...\n" ) if ($dbg10);
        if ($inscript) {
            if ($ln =~ /<\/script>/i) {
                $inscript =0;
                prt( "EXIT a SCRIPT ...\n" ) if ($dbg4);
                add_2_scripts( $fil, $slns );
                $slns = 0;
                next;
            }
            $slns++;
            next;
        }
        if ( $ln =~ /<a\s+(.*)/i ) {
            my $iln = $1;
            prt( "Found [$iln] ...\n" ) if ($dbg10);
            $bgnln = $i;
            while ( !($iln =~ />/) && ($i < $lc)) {
                $i++;
                my $nxln = $lns[$i];
                chomp $nxln;
                prt( "Adding [$nxln] ...\n" ) if ($dbg10);
                $iln .= ' ' if !($iln =~ /=$/);
                $iln .= $nxln;
            }
            $endln = $i;
            my $ind = index($iln, '>');
            if ($ind != -1) {
                $bal = substr($iln, $ind+1);
                $iln = substr($iln, 0, $ind+1);
            }
            #if ($iln =~ /src=\"(.+)\"/i) {
            if ($iln =~ /href\s*=\s*\"(\S+)\"/i) {
                prt( "HREF = $1\nIn line [$iln]...\n" ) if ($dbg10);
                push(@isrc, [$1, "$bgnln:$endln"] );
                $scnt++;
            } else {
                # hmmm... NO 'href' in his line
                if (( $iln =~ /name=\s*\"(\S+)\"/i )||( $iln =~ /name=(\S+)/i )) {
                    # ignore BOOKMARKS
                } elsif (( $iln =~ /id=\s*\"(\S+)\"/i )||( $iln =~ /id=(\S+)/i )) {
                    # ignore BOOKMARKS
                } else {
                    prtw("WARNING:$fil:$bgnln: HREF NOT FOUND in [$iln] [$ln] [$fil]");
                }
            }
        } elsif ( $ln =~ /<script.*>/i ) {
            $inscript = 1;
            prt( "Entered a SCRIPT ...\n" ) if ($dbg4);
            $slns = 0;
            $ln = substr($ln, 7);
            if ($ln =~ /<\/script>/i) {
                $inscript =0;
                prt( "EXIT a SCRIPT ...\n" ) if ($dbg4);
                add_2_scripts( $fil, 1 );
                $slns = 0;
            }
        }
    }
    if ($inscript) {
        my $msg = "WARNING: EXIT WHILE IN SCRIPT in [$fil]...";
        push(@warnings, $msg);
        prt( "$msg\n" ) if ($dbg16);
    }
    prt( "Returning $scnt HREF sources ...\n") if ($dbg10);
    return @isrc;
}


#########################################################
# Passed an array REF of extensions,
# check if this is one of them?
#########################################################
sub is_this_extent($$) {
   my ($ext, $rex) = @_;
   my $lcx = lc($ext);
   foreach my $x (@{$rex}) {
      return 1 if ($lcx eq lc($x));
   }
   return 0;
}

############################################
# only looking for HTM, HTML, PHP,
# could be extended to others maybe ...
############################################

# test an EXTENSION, or form '.htm'...
sub is_htm_ext($) {
   my ($ext) = shift;
   return( is_this_extent($ext,\@html_ext) );
}
sub is_graf_ext($) {
   my ($ext) = shift;
   return( is_this_extent($ext,\@graf_ext) );
}
sub is_zip_ext($) {
   my ($ext) = shift;
    my @arr = qw( .zip .gz );
   return( is_this_extent($ext,\@arr) );
}
sub is_css_ext($) {
    my ($ext) = shift;
    return( is_this_extent($ext, \@css_ext) );
}
sub is_txt_ext($) {
    my ($ext) = shift;
    my @arr = qw( .txt );
    return( is_this_extent($ext, \@arr) );
}
sub is_script_ext($) {
    my ($fil) = shift;
    return( is_this_extent($fil, \@script_ext) );
}

# test a FILE/PATH extension
sub is_htm_file_ext($) {
    my ($fil) = shift;
    my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
    return( is_htm_ext($e) );
}
sub is_graphic_file_ext($) {
    my ($fil) = shift;
    my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
    return( is_graf_ext($e) );
}
sub is_zip_file_ext($) {
    my ($fil) = shift;
    my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
    return( is_zip_ext($e) );
}
sub is_css_file_ext($) {
    my ($fil) = shift;
    my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
    return( is_css_ext($e) );
}
sub is_txt_file_ext($) {
    my ($fil) = shift;
    my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
    return( is_txt_ext($e) );
}
sub is_script_file_ext($) {
    my ($fil) = shift;
    my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
    return( is_script_ext($e) );
}

################################################
# my $ignfpd = 1;    # ignore FRONTPAGE folders
################################################
sub is_fp_folder($) {
    my ($inf) = shift;
    foreach my $fil (@fpfolders) {
        if (lc($inf) eq lc($fil)) {
            return 1;
        }
    }
    return 0;
}
sub is_xclude_folder($) {
    my ($inf) = shift;
    foreach my $fil (@g_xclude_dir) {
        if (lc($inf) eq lc($fil)) {
            return 1;
        }
    }
    return 0;
}

####################################
# Check if FILE is in EXCLUDE list
####################################
sub in_excludes($) {
    my ($fil) = shift;
    my $lcf = lc($fil);
    foreach my $f (@excludes) {
        if (lc($f) eq $lcf) {
            return 1;
        }
    }
    return 0;
}

sub in_spl_excludes($) {
    my ($fldr) = shift;
    my $lfldr = lc($fldr);
    foreach my $f (@splexcludes) {
        if (lc($f) eq $lfldr) {
            return 1;
        }
    }

    return 0;
}

####################################################################
# process_folder(folder) 
# Main DIRECTORY processing function
#
# Open the FOLDER given, and collect ALL files found,
# iterate into sub-directories, if $recurse is non-zero,
# and it is NOT a special FRONTPAGE (hidden) FOLDER.
#
# Files are collected into multidemensional arrays, and
# %g_dir_files - store per directory, per type array of files
####################################################################
sub process_folder($) {
    my ($inf) = shift;
    my $fcnt = 0;  # start with NO 'html' files
    my ($fil,$ff,$fnd,$dhr,$far,$isind);
    my $rg_dir_files = \%g_dir_files;
    prt( "[dbg1b] Processing [$inf] folder...\n" ) if ($dbg1b);
    my @dirs = ();
    my $uinf = dos_2_unix($inf);    # keep HASH in UNIX form
    $fnd = 0;   # count of 'index.htm' type files found in folder
    if ( opendir( DIR, $inf ) ) {
        my @files = readdir(DIR);
        closedir DIR;
        $fcnt = scalar @files;
        #prt("Processing $fcnt files from [$inf]...\n");
        $fcnt = 0;
        ${$rg_dir_files}{$uinf} = {} if (!defined ${$rg_dir_files}{$uinf});   # store files %hash->dir->%type->@files
        foreach $fil (@files) {
            next if (($fil eq ".")||($fil eq ".."));
            $isind = 0;
            $ff = $inf."\\".$fil;
            if ( -d $ff ) {
                if ($recurse) {
                    next if ($ignfpd && is_fp_folder($fil));    # ignore FRONTPAGE folders
                    next if (@splexcludes && in_spl_excludes($fil));
                    next if (is_xclude_folder($fil));
                    #process_folder( $ff );
                    push(@dirs,$ff);
                }
            } else {
                # NOTE: multidimensional arrays pushed - offsets into arrays
                # my $of_ff = 0;    # 1 - full file name
                # my $of_hr = 1;    # 2 - array ref of href links
                # my $of_im = 2;    # 3 - array ref of image links
                # my $of_lk = 3;    # 4 - linked count
                # my $of_rh = 4;    # 5 - contains 'collecthrefs2' collections
                # my $of_to = 5;    # links TO
                # my $of_fm = 6;  # links FROM
                # my $of2hr = 7;  # more or less duplicate of $of_hr
                # my $of2im = 8;  # more or less duplicate of $of_im
                if ( in_excludes($fil) ) {  # NOT in @excludes
                    prt("[dbg37] File [$fil] EXCLUDED!\n") if ($dbg37);
                } else {
                    $dhr = ${$rg_dir_files}{$uinf}; # extract hash ref for this (unix) path
                    my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
                    if (defined $ext_hash{$ext}) {
                        $ext_hash{$ext}++;
                    } else {
                        $ext_hash{$ext} = 1;
                    }
                    my $arr_ref = \@other_files;
                    my $type = "Other";
                    if (is_htm_file_ext($fil)) {
                        $arr_ref = \@htm_files;
                        $fcnt++;
                        $isind = is_def_index_file($fil);
                        $fnd++ if ($isind);
                        $type = "HTML";
                    } elsif (is_graphic_file_ext($fil)) {
                        $arr_ref = \@img_files;
                        $type = "IMG";
                    } elsif (is_zip_file_ext($fil)) {
                        $arr_ref = \@zip_files;
                        $type = "ZIP";
                    } elsif (is_css_file_ext($fil)) {
                        $arr_ref = \@css_files;
                        $type = "CSS";
                    } elsif (is_txt_file_ext($fil)) {
                        $arr_ref = \@txt_files;
                        $type = "TEXT";
                    } elsif (is_script_file_ext($fil)) {
                        $arr_ref = \@script_files;
                        $type = "SCRIPT";
                    }
                    # ==========================================================
                    #                  0    1   2   3  4  5   6   7  8  9  10
                    push(@{$arr_ref}, [$ff, '', '', 0, 0, '', '', 0, 0, 0, 0] );
                    # ==========================================================
                    ${$dhr}{$type} = [] if (!defined ${$dhr}{$type});
                    $far = ${$dhr}{$type};
                    push(@{$far},[$fil,$isind,0,[],[],0]);
                    prt("[dbg36] Got file [$fil], with ext = [$ext] = $type\n") if ($dbg36);
                }
            }
        }
        prt( "[dbg1] Processed $inf folder finding $fcnt HTML files ...\n" ) if ($dbg1);
        if ($fcnt) {
            # got some, but any 'index.htm' type
            push(@g_folders_noind, $inf) if (!$fnd);
        } else { # NO 'html' files found
            push(@g_empty_folders, $inf); # store the EMPTY folder
        }
    } else {
        pgm_exit(1,"ERROR: Failed to open folder [$inf]!\n" );
    }
    if (@dirs) {
        # process the subdirectories, if any to process
        foreach $ff (@dirs) {
            process_folder( $ff );
        }
    }
}

##############################################
# Just to show the COUNTS in the ARRAYS
##############################################
sub show_found_counts() {
    my ($cnt);
    if (VERB1()) {
        $cnt = scalar @htm_files;
        prt( "show_found_counts() $cnt HTML, ");
        $cnt = scalar @img_files;
        prt( "$cnt images, " );
        $cnt = scalar @css_files;
        prt( "$cnt css, " );
        $cnt = scalar @zip_files;
        prt( "$cnt zip, " );
        $cnt = scalar @txt_files;
        prt( "$cnt txt, " );
        $cnt = scalar @script_files;
        prt( "$cnt script, " );
        $cnt = scalar @other_files;
        prt( "and $cnt others ...\n" );
    }
    $cnt = scalar keys %ext_hash;
    if ($dbg27 || VERB5()) {
        prt( "$cnt extensions, and each count ...\n" );
        foreach my $key (keys %ext_hash) {
            my $val = $ext_hash{$key};
            prt( "$val $key ");
        }
        prt("\n");
    }

   my $cnt1 = scalar @g_empty_folders;
   my $cnt2 = scalar @g_folders_noind;
   prtw("WARNING: Got $cnt1 folders with NO HTML type files, and $cnt2 with no 'index.htm'\n") if (($cnt1 || $cnt2) && $show_no_index);
}

# @ipsfound = <INF>;
sub in_ips_found($) {
    my ($ip) = shift;
    my $lcip = lc($ip);
    foreach my $i (@ipsfound) {
        chomp $i;
        if (lc($i) eq $lcip) {
            return 1;
        }
    }
    return 0;
}

sub trim_href($) {
    my ($fh) = shift;
    my $h = $fh;
    my $ind = index($h,'#');
    $h = substr($h,0,$ind) if ($ind > 0);
    $ind = index($h,'?');
    $h = substr($h,0,$ind) if ($ind > 0);
    $h =~ s/\/$//;  # remove any trailing '/'
    if ($h =~ /\.$/) {
        $h =~ s/\.$//;  # remove any trailing '.' - but wonder what this is
        prt("PROBLEM: File [$g_active_file]$g_active_lnn\n");
        pgm_exit(1,"Check out this trailing '.', from [$fh]!\n");
    }
    return $h;
}

sub get_host_link($) {
    my ($h) = shift;
    if ($h =~ /^(\w+:\/\/[\w\.-]+\.[A-Z]{2,4})/i) {
        return "$1";
    }
    return $h;
}

sub get_host_link13($) {
    my ($uri) = shift;
    # print        "$1,$2,       $3,$4,        $5,     $6,$7,      $8, $9\n"
    if ($uri =~ m{^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?}) {
        return $1.$3;
    }
    return $uri;
}

sub get_host_link4($) {
    my ($uri) = shift;
    # print        "$1,$2,       $3,$4,        $5,     $6,$7,      $8, $9\n"
    if ($uri =~ m{^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?}) {
        return $4;
    }
    return $uri;
}

sub get_domain_name($$) {
    my ($uri,$rw) = shift;
    $uri = get_host_link4($uri);
    $uri =~ s/^www(\d*)\.// if ($rw);
    return $uri;
}

sub show_remote_links() {
    my $rh = \%g_hrefs;
    my $rh2 = \%g_hosts;
    my $rd = \%g_domains;
    my ($len,$key,$val,$href,$cnt,$chk,$dom);
    my $min = 0;
    my @hosts = ();
    my @domains = ();
    foreach $key (keys %{$rh}) {
        $len = length($key);
        $min = $len if ($len > $min);
    }
    $min += 2;
    $min = 65 if ($min > 65);
    my $icnt = 0;
    foreach $key (sort keys %{$rh}) {
        $val = ${$rh}{$key};
        # still to decide which is best???
        $href = get_host_link($key);
        #my $href13 = get_host_link13($key);
        #if ($href ne $href13) {
        #    pgm_exit(1, "CHECK: from [$key], got \n [$href] and \n [$href13] FIX CODE!\n");
        #}
        $chk = '';
        if (defined ${$rh2}{$href}) {
            ${$rh2}{$href}++;
        } else {
            ${$rh2}{$href} = 1;
            push(@hosts,$href);
            $dom = lc(get_domain_name($key,0));
            if (!defined ${$rd}{$dom}) {
                ${$rd}{$dom} = 1;
                push(@domains,$dom)
            }
            if ($check_host_ip) {
                if (checkIPAddress($href)) {
                    $chk = 'ok';
                } else {
                    $chk = 'NA';
                }
            }
        }
        $key = '['.$key.']';
        $key .= ' ' while (length($key) < $min);
        $icnt++;
        $cnt = ${$rh2}{$href};
        prt( "$icnt: $key ($cnt)$chk in $val\n" ) if ($out_remote_links);
    }
    if (@hosts) {
        if ($os =~ /Win/) {
            $out_hosts = path_u2d($out_hosts);
        }
        write2file(join("\n",sort(@hosts))."\n",$out_hosts);
        prt("Full list of HOSTS written to [$out_hosts]...\n") if (VERB2());
    }
    if (@domains) {
        if ($os =~ /Win/) {
            $out_domains = path_u2d($out_domains);
        }
        write2file(join("\n",sort(@domains))."\n",$out_domains);
        prt("Full list of Domains written  [$out_domains]...\n") if (VERB2());
    }

    $icnt = 0;
    foreach $key (sort keys %{$rh2}) {
        $val = ${$rh2}{$key};
        $icnt++;
        prt("$icnt: $key $val\n") if ($out_remote_links);
    }
}

#######################################################
# Process the HTTP HREF sources
# if $chkip = 1; then attempt to resolve the IP
# addresses from the host name.
# push(@httprefs, [$src, $fil, $lnnos] );
#######################################################
sub process_host_array() {
    my $hcnt = scalar @httprefs;
    my $newcnt = 0;
    my ($val,$msg,$file,$i,$href,$lnn,$nm,$dir,$key,$icnt,$rawhr);
    my @hrefsarr = ();
    $icnt = 0;
    if ($hcnt) {
        prt( "Found total $hcnt remote 'http' HREF entries ...\n" ) if (VERB2());
        for ($i = 0; $i < $hcnt; $i++) {
            $lnn = $httprefs[$i][2];    # get file LINE NUMBER
            $file = $httprefs[$i][1];   # get FULL FILE NAME
            $g_active_file = $file;
            $g_active_lnn  = $lnn;
            # $href = $httprefs[$i][0];
            # should remove any hash like '#pos', and any query like '?a=b&c=d'
            $rawhr = $httprefs[$i][0];
            push(@hrefsarr,$rawhr);
            $href = trim_href($rawhr);
            ($nm,$dir) = fileparse($file);
            if (defined( $g_hrefs{$href} )) {
                $val = $g_hrefs{$href};
                $val .= ' '.$file;
            } else {
                $val = $file;
                $newcnt++;
            }
            $val .= ":$lnn";
            $g_hrefs{$href} = $val;
            prt( "$href in [$file]$lnn\n" ) if ($dbg2);
        }
        if (@hrefsarr) {
            my %dupes = ();
            foreach $href (@hrefsarr) {
                $dupes{$href} = 1;
            }
            @hrefsarr = keys(%dupes);
        }
        if (@hrefsarr) {
            if ($os =~ /Win/) {
                $out_hrefs = path_u2d($out_hrefs);
            }
            write2file(join("\n",sort(@hrefsarr))."\n",$out_hrefs);
            prt("Full list of HREFS written to [$out_hrefs] file.\n") if (VERB2());
        }
        $hcnt = scalar keys(%g_hrefs);
        show_remote_links();
        prt( "Found $hcnt different remote entries ...($newcnt) " );
        if ($chkip) {
            prt("and now checking for valid IP\n");
            my $inips = 0;
            prt( "Checking $hcnt IP addresses ... " );
            if ( !$refreships && ( -f $ipfile)) {
                if (open INF, "<$ipfile") {
                    @ipsfound = <INF>;
                    close INF;
                    prt( "Have ".scalar @ipsfound." in $ipfile" );
                } else {
                    prt( "Warning: Failed to open $ipfile" );
                }
            }
            prt("\n");
            $icnt = 0;
            foreach $key (keys %g_hrefs) {
                $val = $g_hrefs{$key};
                $icnt++;
                prt( "$key in $val\n" ) if ($dbg8);
                if ($key =~ /^http:\/\//i) {
                    my $hkey = substr($key, 7); # REMOVE protocol string
                    my @arr = split( /\//, $hkey ); # and get just the domain name
                    $hkey = $arr[0];    # but this may not work with '#', or queries '?'
                    # ***TBF*** TO BE FIXED
                    if ( !in_ips_found($hkey) ) {
                        if (checkIPAddress( $hkey ) == 0) {
                            $msg = "FAILED: NO IP FOR HOST [$hkey][$val]";
                            push(@warnings, $msg);
                            prt( "$msg\n" ) if ($dbg16);
                        } elsif ($writeips) {
                            push(@ipsfound,"$hkey\n");
                        }
                    } else {
                        $inips++;
                    }
                }
                if (($icnt % 100) == 0) {
                    prt( "Done $icnt IP Addresses ...\n" );
                }
            }
            prt( "Completed $icnt IP Addresses ... " );
            if ($writeips) {
                $val = join("\n", sort @ipsfound);
                $val = trimblanklines($val);
                write2file($val, $ipfile);
                prt( "$inips in previous. Written ".scalar @ipsfound." to $ipfile" );
            }
            prt("\n");
        } else {
            prt(", but IP checking is OFF! (add -checkips)\n");
        }
    }
}

sub write_missing($$) {
    my ($fil,$rh) = @_; # ($not_found_file,\%g_images_notfound) if (length($not_found_file));
    my ($key,$list);
    $list = '';
    foreach $key (keys %{$rh}) {
        $list .= "\n" if (length($list));
        $list .= $key;
    }
    if (length($list)) {
        $list .= "\n";
        write2file($list,$fil);
        prt("Written missing list to [$fil]\n");
    }
}

sub show_scripts() {
    my ($val,$scnt,$file);
    $scnt = scalar @scripts;
    if ($scnt && ($dbg9 || $showscripts)) {
        prt( "Got $scnt files containing SCRIPTS ...\n" );
        # push(@scripts, [$fil, $lns]);
        for (my $i = 0; $i < $scnt; $i++) {
            $file = $scripts[$i][0];
            $val = $scripts[$i][1];
            prt( "$file $val\n" );
        }
    }
}

sub show_dir_files($) {
    my ($hr) = @_# = \%g_dir_files
    my ($key,$val,$k2,$v2,$v3,$dir,$cnt,$i,$ii,$icnt);
    foreach $key (sort keys %{$hr}) {
        $dir = sub_in_folder($key);
        $dir = '<root>' if (length($dir)==0);
        $val = ${$hr}{$key};    # extract hash ref BY TYPE
        prt("Directory: [$dir]\n");
        $icnt = 0;
        foreach $k2 (sort keys %{$val}) {
            # for each type
            $v2 = ${$val}{$k2};
            $cnt = scalar @{$v2};
            prt(" Type $k2 = $cnt\n");
            for ($i = 0; $i < $cnt; $i++) {
                $v3 = ${$v2}[$i][0];
                $ii = ${$v2}[$i][1];
                prt("  File: [$v3] ");
                if ($ii) {
                    $icnt++;
                    prt(" ('index' type)");
                }
                prt("\n");
            }
        }
        if (!$icnt) {
            prt("Directory: [$dir] has NO 'index' type file.\n");
        } elsif ($icnt > 1) {
            prt("Directory: [$dir] has $icnt 'index' type files.\n");
        }
    }
    pgm_exit(1,"Debug run show of directories and files...\n");
}

# ####################################################
### MAIN ###
parse_args(@ARGV);

if (length($in_folder) == 0) {
    mydie( "No input folder (or file) given/found in command ...\n" );
}

if (-f $in_folder) {
    #$g_user_file = File::Spec->rel2abs($in_folder);
    $g_user_file = $in_folder;
    ($homefile, $in_folder) = fileparse($in_folder);
    $in_folder =~ s/[\\\/]$//;
    $in_folder = $currworkdir if ($in_folder eq '.'); # 2010-04-02 - use FULL work directory if just '.'
    $single_file = 1;
}

# set_debug_on();
# set_htools_dbg_on();
show_startup();

if ( !$recurse && length($g_user_file) && (-f $g_user_file)) {
    #                  0             1   2   3  4  5   6   7  8  9  10
    push(@htm_files, [$g_user_file, '', '', 0, 0, '', '', 0, 0, 0, 0] );
} else {
    process_folder($in_folder);
}

###show_dir_files(\%g_dir_files) if ($debug_on);

show_found_counts();

process_file_array();   # main file processing - each HTML file found is processed

process_host_array();   

if (length($homefile)) {
    ###trace_from_htm( $homefile, 0 );
    check_linkages( $homefile );
    check_local_links( $homefile );
    show_link_counts("HTML   ", \@htm_files);
    show_link_counts("IMG    ", \@img_files);
    show_link_counts("CSS    ", \@css_files);
    show_link_counts("ZIP    ", \@zip_files);
    show_link_counts("TXT    ", \@txt_files);
    show_link_counts("Script ", \@script_files);
    show_link_counts("Other  ", \@other_files);
}

show_scripts();
my $mlcnt = scalar @missing_links;
if ($mlcnt && !$dbg33) {
   prtw("WARNING: Found $mlcnt files with NO LINKS!\n");
}

##############################################################
prt( "\n###### SHOW RESULTS ########\n" );
prt( "WARNING: $hrflnkcnt images by HREF not shown! (use -showhreflinks)\n" ) if ($hrflnkcnt);
show_warnings();
if (@missed) {
    prt( "\nMISSING FOLLOW: ".scalar @missed."\n" );
    foreach my $mfile (@missed) {
        prt( "$mfile\n");
    }
}

prt( "###### END RESULTS ########\n" );
write_missing($not_found_file,\%g_images_notfound) if (length($not_found_file));
##############################################################
pgm_exit(0,"Normal exit(0)\n");

##################################

sub show_help {
    prt( "$pgmname [Options] input-folder or home-file-name.\n" );
    prt( "Purpose: To take a folder, or home file in a folder, and\n" );
    prt( "check the assume local Web Site for internal consistency.\n" );
    prt( "Options:\n" );
    prt( " -checkips        = Will check the IP resolution of REMOTE HREF items.\n" );
    prt( " -showhreflinks   = Show a WARNING when an IMG, ICO, etc is a REMOTE link\n" );
    prt( " -showlinks       = Show the links for each file ...\n" );
    prt( " -showremote      = Show remote links, and file containing link.\n" );
    prt( " -showscripts     = Show SCRIPT files ...\n" );
    prt( " -shownoindex     = Show folders with NO 'index' type file, and/or NO html types files.\n");
    prt( " -writeips        = Write HREF of IP found to a file ...\n" );
    prt( " -refreships      = If -checkips, and -writeips, re-write NEW check file...\n" );
    prt( " -ipfile out-file = Set HREF output file. Default is [$ipfile].\n" );
    prt( " -ignore in-file  = Ignore this file. Repeat for more. use '.none.' to reset list.\n" );
    prt( " -shownohrefs     = Show when NO HREF found in a file.\n" );
    prt( " -loadlog (-ll)   = Load LOG file at end.\n");
    prt( " -showallnotfound = Show ALL images not found. Default to show only 'different'.\n");
    prt( " -nffile out-file = Write not found files to file name given.\n");
    prt( " -v[NN]           = Bump verbal, or set to 'NN'.\n" );
    prt( " -x <dir>         = Exclude directory. '.reset.' clears the list.\n" );
    prt( " -load-log   (-l) = load log with output at end.\n");
    prt( " -nofollow        = Do not follow page links.\n");
    prt( "If an input-folder given, then no trace of internal links will be done.\n" );
    prt( "If a home file name is given, the folder used will be of that file.\n" );
    prt( "All arguments can also be given with a '--' prefix, if that 'feels' better ;=))\n");
    prt( "Following are the current default settings ...\n" );
    $verbosity += 5;
    show_startup();
    mydie("                                                      Happy link checking ;=))\n");
}

# Ensure argument exists, or die.
sub require_arg {
    my ($arg, @arglist) = @_;
    mydie( "ERROR: no argument given for option '$arg' ...\n" )
        if ! @arglist;
}

##########################################################
# Parse USER input
# Largerly still to be done
##########################################################
sub parse_args {
    my (@av) = @_;
    my ($arg,$narg,$sarg);
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            if (($sarg eq 'h')||($sarg eq 'help')||($sarg eq '?')) {
                prt("Showing help...\n");
                show_help();
            } elsif ($sarg =~ /^v/) {
                if ($sarg =~ /^v.*(\d+)$/) {
                    $verbosity = $1;
                } else {
                    $sarg = substr($sarg,1);
                    while ($sarg =~ /^v/) {
                        $sarg = substr($sarg,1);
                        $verbosity++;
                    }
                }
                prt( "Set verbal to $verbosity\n" ) if (VERB1());
            } elsif ($sarg eq 'checkips') {
                $chkip = 1;
                prt( "Will check IP of REMOTE HREF items.\n" );
            } elsif ($sarg eq 'showhreflinks') {
                $showhreflinks = 1;
                prt( "Show a WARNING when an IMG, ICO, etc is a REMOTE link\n" );
            } elsif ($sarg eq 'showlinks') {
                $showlinks = 1;
                prt( "Show the links for each file ...\n" );
            } elsif ($sarg eq 'showremote') {
                $out_remote_links = 1;
                prt( "Show remote links, and each file...\n" );
            } elsif ($sarg eq 'showscripts') {
                $showscripts = 1;
                prt( "Show SCRIPT files ...\n" );
            } elsif ($sarg eq 'writeips') {
                $writeips = 1;
                prt( "Write HREF of IP found to a file ...\n" );
            } elsif ($sarg eq 'refreships') {
                $refreships = 1;
                prt( "If -checkips, and -writeips, re-write NEW check file...\n" );
            } elsif ($sarg eq 'ipfile') {
                require_arg(@av);
                shift @av;
                $ipfile = $av[0];
                prt( "HREF output set to $ipfile ...\n" );
            } elsif ($sarg eq 'ignore') {
                require_arg(@av);
                shift @av;
                $arg = $av[0];
                if ($arg eq '.none.') {
                    @excludes = ();
                    prt( "Reset EXCLUDES array ...\n" );
                } else {
                    push(@excludes, $arg);
                    prt( "Added file [$arg] to EXCLUDES ...\n" );
                }
            } elsif ($sarg eq 'x') {
                require_arg(@av);
                shift @av;
                $arg = $av[0];
                if ($arg eq '.reset.') {
                    @g_xclude_dir = ();
                    prt( "Reset EXCLUDE directory array ...\n" );
                } else {
                    push(@g_xclude_dir, $arg);
                    prt( "Added file [$arg] to EXCLUDE directories ...\n" );
                }
            } elsif ($sarg eq 'shownoindex') {
                $show_no_index = 1;
                prt("Set to show folders with NO 'index' type file, and/or NO html types files.\n");
            } elsif (($sarg eq 'll')||($sarg eq 'loadlog')) {
                $load_log = 1;
                prt("Set to load log at end.\n");
            } elsif ($sarg eq 'showallnotfound') {
                prt( "Set to show ALL images not found.\n");
                $show_all_not_found = 1;
            } elsif ($sarg eq 'nffile') {
                require_arg(@av);
                shift @av;
                $narg = $av[0];
                $not_found_file = $narg;    # if file name given, write list to that file...
                prt("Will write missing to file [$narg]\n");
            } elsif ($sarg eq 'debug') {
                prt("Setting BIG debug ON...\n");
                set_debug_on();
                set_htools_dbg_on();
                $load_log = 1;
            } elsif ($sarg =~ /^l/) {
                $load_log = 1;
                prt("Set to load log at end.\n");
            } elsif ($sarg eq 'nofollow') {
                $recurse = 0;
                prt("Set to NOT follow links\n");
            } else {
                mydie( "ERROR: Unknown argument [$arg] ...\n" );
            }
        } else {
            # no leading '-'
            $in_folder = $av[0];
            prt( "Input folder set to [$in_folder]...\n" ) if (VERB2());
        }
        shift @av;
    }
    # check the INPUT folder
   if ( $debug_on ) {
       prtw("WARNING: DEBUG is ON!\n");
       if (length($in_folder) == 0) {
           if (length($def_in_folder) && (-d $def_in_folder)) {
                $in_folder = $def_in_folder;
                prt("[debug_on] Input folder set to DEFAULT [$in_folder]...\n");
           } elsif (length($def_file) && (-f $def_file)) {
                $in_folder = $def_file;
                prt("[debug_on] Input folder set to DEFAULT [$in_folder]...\n");
          }
       }
      $load_log = 1;
   }
   if (length($in_folder) == 0) {
        mydie( "ERROR: No VALID FOLDER OR FILE NAME found in command\n" );
   }
    if ( !( (-d $in_folder) || (-f $in_folder ) ) ) {
        mydie( "ERROR: [$in_folder] is NOT VALID FOLDER OR FILE NAME\n" );
    }
    # pgm_exit(1,"Current work directory: [$currworkdir]...\n");
}

# eof - chklinks03.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional