#!/perl -w
# NAME: chklinks03.pl
# AIM: Given a input FOLDER, check all the HTML found for a ...
# AND check ALL image links , if it is a LOCAL file,
# and other 'link' items, like .zip, .txt, etc.
# 2019-05-06 - Review
# 17/10/2013 - Default is to follow page links. Added -nofollow to only do the page
# 22/05/2011 - More tidying, reducing output to 'essentials' unless -v2 or more
# 19/05/2011 - Fix of 20100312 fix, to find substitutes '.txt' files
# 02/04/2010 - Some more 'fixes' - continue 2010/04/04 - 2010/04/12...
# 2010/04/01 - Fix for 'remove_script' - to move to htmltools.pl
# 15/03/2010 - After a number of 'fixes', starting to feel SOLID ;=))
# 12/03/2010 - Special case - I have replaced large ZIPS, with a TEXT
# file, so do NOT give this WARNING, if such a .txt file exists.
# 11/03/2010 - Lots of tidying, and fixing... experimented with another collecthrefs2
# but found only difference is in the DUPLICATE that can be in htmltools::collecthrefs
# 13/11/2008 - some tidying, especially when from ONE FILE
# where there will be NO from|to references
# 18/06/2007 - Add some command parameters, and help
# 02/06/2007 - geoff mclane - geoffair.com/mperl/index.htm
use strict;
use warnings;
use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use File::Spec; # File::Spec->rel2abs($rel); # get ABSOLUTE form
use Socket;
use Cwd; # for cwd()
my $os = $^O;
unshift(@INC, 'C:/GTools/perl');
# for htmltools, if functions used
my @imgs = ();
my @hrefs = ();
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n";
# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /\w{1}:\\.*/) {
my @tmpsp = split(/\\/,$pgmname);
$pgmname = $tmpsp[-1];
}
my $perl_base = 'C:/GTools/perl';
my $outfile = "$perl_base/temp.$pgmname.txt";
open_log($outfile);
my $currworkdir = cwd();
my $out_hrefs = $perl_base."\\temphrefs.txt";
my $out_hosts = $perl_base."\\temphosts.txt";
my $out_domains = $perl_base."\\tempdomains.txt";
# some FEATURES and USER variables
# my @excludes = qw( cvineng2.htm );
my @excludes = qw( desktop.ini php.ini blank.html blank.htm index3.htm );
my @splexcludes = qw( macpc tmp wedding );
my @def_indexes = qw( default.html default.htm index.aspx
index.htm index.html index.jsp index.php index.shtml home.htm
home.html welcome.html welcome.htm );
my $recurse = 1; # recursive # follow links to other files, and check them also
my $ignfpd = 1; # ignore FRONTPAGE folders
my $chkip = 0; # check the IP address
my $check_host_ip = 0; # check the host domain only IP
my $load_log = 0; # load log file at end -ll to set.
my $show_no_index = 0; # show when NOT html in folder, OR no 'index' type file
my $showhreflinks = 0; # show a WARNING when an IMG, ICO, etc is a REMOTE link
my $showlinks = 0; # show the links for each file
my $shownolinks = 1; # show NO links for each file
my $showscripts = 0; # show SCRIPT files
my $writeips = 1; # write IP found to a file
my $refreships = 0; # if $chkip, and $writeips, re-write NEW check file
my $shownohrefs = 0; # show when NO HREF found in file
my $show_all_not_found = 0; # avoid repeating an image file not found
my $not_found_file = ''; # if file name given, write list to that file...
my $out_remote_links = 0; # potential BIG LIST of remote LINKS
my $in_folder = "";
my $ipfile = "iplinks.txt";
my @ipsfound = ();
my $verbosity = 0;
my @html_ext = qw( .htm .html .shtml .php );
my @graf_ext = qw( .jpg .jpeg .gif .png .bmp .ico .mpg );
my @css_ext = qw( .css );
my @script_ext = qw( .js .class .cgi );
my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived );
my @g_xclude_dir = ();
my @excused = ( '?dir=test', '?dir=.' );
# *** DEBUG ***
my $debug_on = 0; # OFF for release - only to load default
# SET A DEFAULT INPUT FOLDER / FILE
##my $def_file = "C:\\HOMEPAGE\\FG\\index.html";
my $def_file = "";
my $def_in_folder = 'F:\Projects\html-tidy.org.api\tidy\tidylib_api_next';
# program variables
# NOTE: Each of these is a multidimensional array - see offset below
my @htm_files = (); # store files found in folder
my @img_files = ();
my @css_files = ();
my @zip_files = ();
my @txt_files = ();
my @script_files = ();
my @other_files = ();
my @g_empty_folders = ();
my @g_folders_noind = ();
my %g_images_notfound = ();
my %g_dir_files = (); # store files %hash->dir->%type->@files
my $g_active_file = '';
my $g_active_lnn = '';
my $g_user_file = '';
my $single_file = 0;
# offsets in above arrays
my $of_ff = 0; # full file name
my $of_hr = 1; # array ref of href links
my $of_im = 2; # array ref of image links
my $of_lk = 3; # linked count
my $of_rh = 4; # ref hash from 'collecthrefs2'
my $of_to = 5; # links TO, but this is really already in $of_hr, after excluding in-file, and external links
my $of_fm = 6; # links FROM
my $of2hr = 7; # more or less duplicate of $of_hr
my $of2im = 8; # more or less duplicate of $of_im
my $of_s1 = 9; # spare 1
my $of_s2 = 10; # spare 2
# 0 1 2 3 4 5 6 7 8 9 10
# push(@{$arr_ref}, [$ff, '', '', 0, 0, '', '', 0, 0, 0, 0] );
my @donesrcs = ();
my @doneimgs = ();
my %ext_hash = ();
##my $cnt = 0;
##my $file = '';
my @warnings = (); # list of errors, warnings during running
my @httprefs = (); # set of HREF src values push(@httprefs, [$src, $fil, $lnnos] );
my @httpsrefs = ();
my @ftprefs = ();
my @mtrefs = ();
my %g_hrefs = (); # full list of ALL http items found
my %g_hosts = (); # list of http://domain.names
my %g_domains = (); # just domain names
my @scripts = ();
my $tot_imgcnt = 0;
my $homefile = '';
my $total_hrefs = 0;
my $total_imgs = 0;
my @missed = ();
my $excusecnt = 0;
my $hrflnkcnt = 0; # $showhreflinks
my $homeoffset = -1;
my @offsdone = ();
my @htmlinks = ();
my @missing_links = (); # store files with NO links
# debug only bits
my $dbg1 = 0; # prt( "[dbg1] Processed $inf folder finding $fcnt HTML files ...\n" ) if ($dbg1);
my $dbg1b = 0; # prt( "[dbg1b] Processing [$inf] folder...\n" ) if ($dbg1b);
my $dbg2 = 0; # show ALL HREF entries ...
my $dbg3 = 0; # show IP found ...
my $dbg4 = 0; # show entered/exit script
my $dbg5 = 0; # show 'ok' when found
my $dbg6 = 0; # show processing lines
my $dbg7 = 0; # show anchor count
my $dbg8 = 0; # show unique anchor href
my $dbg9 = 0; # show files with SCRIPTS
my $dbg10 = 0; # show diag for get_img_srcs() ...
my $dbg11 = 0; # in image processing show entered/exits script
my $dbg12 = 0; # in image processing show processing count
my $dbg13 = 0; # in image processing show ok - found file
my $dbg14 = 0; # in image processing show image count found
my $dbg15 = 0; # in image processing show image count when NONE found
my $dbg16 = 0; # show WARNINGS during run ...
my $dbg17 = 0; # show MISSING or BLANK HREF in PHP file
my $dbg18 = 0; # check_linkages: show 'ok', in 2nd link check
my $dbg19 = 0; # check_local_links: show progress ...
my $dbg20 = 0; # check_local_links: show ALL link COUNTS - NONE IS ALWAYS SHOWN ...
my $dbg21 = 0; # check_local_links: show LINK when found ...
my $dbg22 = 0; # mark_image_link: show comparing, and comparision ...
my $dbg23 = 0; # mark_image_link: show count of new images marked ...
my $dbg24 = 0; # show each image file being marked
my $dbg25 = 0; # prt( "[dbg25] NO LINK FOUND HREF [$src]($msrc) in $totcnt file - $ff ($fnd) - ($lev)!\n" ) if ($dbg25);
my $dbg26 = 0; # show EACH HTML FILE BEING PROCESSED
my $dbg27 = 0; # show EACH extesnions, and counts
my $dbg28 = 0; # show image links information ...
my $dbg29 = 0; # show ZIP, TXT, CSS, SCRIPT and OTHER file links information ...
my $dbg30 = 0; # show HTML HREF links information ...
my $dbg31 = 0; # like $dbg20 - check_local_links: show ALL links - NONE IS ALWAYS SHOWN ...
my $dbg32 = 0; # show missing during processing
my $dbg33 = 0; # show HAS NO LINKS during processing
my $dbg34 = 0; # show prt( "REL PATH [$src] to UNIX PATH [$nusrc]
my $dbg35 = 0; # prt("[dbg35] Got anchor [$hrf]($len2) ...\n") if ($dbg35);
my $dbg36 = 0; # prt("[dbg36] Got file [$fil], with ext = [$ext] = $type\n") if ($dbg36);
my $dbg37 = 0; # prt("[dbg37] File [$fil] EXCLUDED!\n") if ($dbg37);
my $dbg38 = 0; # prt( "Got [$hr2] = [$txt]\n" );
my $dbg39 = 0; # prt( "[dbg39] Got [$hr2] = [$txt] - no inverted commas! [$fil]\n" ) if ($dbg39);
my $dbg40 = 0; # prt("[dbg40] mark_link fmfil=[$fmfil] fnd=[$fnd] src=[$src] lenv=[$lev]\n");
my $dbg41 = 0; # prt( "[dbg41] $i2: compare \n[$mfil] with \n[$msrc]\n" ) if ($dbg41);
my $dbg42 = 0; # prt("[dbg42] Processing file [$file]...\n") if ($dbg42);
my $dbg43 = 0; # prt("[dbg43] Got ".scalar @hr." hrefs, and ".scalar @is." image links.\n") if ($dbg43);
my $dbg44 = 0; # prt("[dbg44] Sending [$src] to 'mark_dir_link'...\n") if ($dbg44);
my $dbg45 = 0; # prt("[dbg45] LINK not found in HREF files, maybe IMAGES, zip, etc TO=[$src], FROM=[$fmfil]\n") if ($dbg45);
my $dbg46 = 0; # prt("[dbg46] LINK found, but not in HREF files! TO=[$src], FROM=[$fmfil]\n") if ($dbg46);
sub set_debug_val($) {
my ($v) = shift;
$dbg1 = $v; $dbg1b = $v; $dbg2 = $v; $dbg3 = $v; $dbg4 = $v; $dbg5 = $v; $dbg6 = $v;
$dbg7 = $v; $dbg8 = $v; $dbg9 = $v; $dbg10 = $v; $dbg11 = $v; $dbg12 = $v;
$dbg13 = $v; $dbg14 = $v; $dbg15 = $v; $dbg16 = $v; $dbg17 = $v; $dbg18 = $v;
$dbg19 = $v; $dbg20 = $v; $dbg21 = $v; $dbg22 = $v; $dbg23 = $v; $dbg24 = $v;
$dbg25 = $v; $dbg26 = $v; $dbg27 = $v; $dbg28 = $v; $dbg29 = $v; $dbg30 = $v;
$dbg31 = $v; $dbg32 = $v; $dbg33 = $v; $dbg34 = $v; $dbg35 = $v; $dbg36 = $v;
$dbg37 = $v; $dbg38 = $v; $dbg39 = $v; $dbg40 = $v; $dbg41 = $v; $dbg42 = $v;
$dbg43 = $v; $dbg44 = $v; $dbg45 = $v; $dbg46 = $v;
}
sub set_debug_on() { set_debug_val(1); }
sub set_debug_off() { set_debug_val(0); }
# forward refs
sub mark_link($$$$);
sub trace_from_htm($$);
sub process_folder($);
sub is_same_file($$);
sub mark_dir_link($$$$);
sub is_htm_file_ext($);
sub pgm_exit($$) {
my ($val,$msg) = @_;
if (length($msg)) {
$msg .= "\n" if !($msg =~ /\n$/);
prt($msg);
}
#show_warnings($val);
close_log($outfile,$load_log);
exit($val);
}
sub show_warnings() {
my $wcnt = scalar @warnings;
if ($wcnt) {
prt( "\nWARNINGS FOLLOW ($wcnt):\n" );
foreach my $w (@warnings) {
prt( "$w\n" );
}
} else {
##### prt( "No warnings ...\n" );
}
}
sub prtw($) {
my ($tx) = shift;
$tx =~ s/\n$//;
prt("$tx\n");
push(@warnings,$tx);
}
sub u2d {
my ($f) = shift;
$f =~ s/\//\\/g;
return $f;
}
sub VERB1() { return ($verbosity >= 1); }
sub VERB2() { return ($verbosity >= 2); }
sub VERB5() { return (($verbosity >=5) ? 1 : 0); }
sub VERB9() { return (($verbosity >=9) ? 1 : 0); }
sub sub_common_folder_dos {
my ($f1, $f2) = @_;
my $df1 = u2d($f1);
my $df2 = u2d($f2);
if ($os =~ /Win/) {
$df1 = lc($df1);
$df2 = lc($df2);
}
# paddle across, stopping at first difference
my $off = 0;
while ( substr($df1,$off,1) && substr($df2,$off,1) &&
( substr($df1,$off,1) eq substr($df2,$off,1) ) ) {
$off++;
}
return substr($f1,$off);
}
sub sub_in_folder($) {
my ($path) = shift;
$path = sub_common_folder_dos($path,$in_folder);
$path =~ s/^(\\|\/)//; # kick off any leading '\' or '/' - 2010-04-02
return $path;
}
# 2010.03.16 - seek an 'index.htm' type file
sub is_def_index_file($) {
my ($fil) = @_;
foreach my $f (@def_indexes) {
return 1 if ($f eq $fil);
}
return 0;
}
# 20100311 - only report MISSING LINKS on HTML files
sub is_link_type_file($) {
my ($f) = @_;
my ($n,$d,$e) = fileparse( $f, qr/\.[^.]*/ );
return 1 if (is_htm_ext($e));
return 0;
}
sub show_startup() {
prt( "Checking $in_folder ..." );
prt( " Using HOME file $homefile ..." ) if length($homefile);
prt("\n");
if (VERB5()) {
prt( "\nOptions:\n" );
prt( sprintf(" -checkips - HREF check %s, \n", ($chkip ? "On" : "Off")) );
prt( sprintf(" -showhreflinks - Show HREF %s, \n", ($showhreflinks ? "On" : "Off")) );
prt( sprintf(" -showlinks - Show links %s, \n", ($showlinks ? "On" : "Off")) );
prt( sprintf(" -showscripts - Show script files %s, \n", ($showscripts ? "On" : "Off")) );
prt( sprintf(" -writeips - Write HREF $ipfile %s, \n", ($writeips ? "On" : "Off")) );
prt( sprintf(" -refreships - Refresh HREF %s\n", ($refreships ? "On" : "Off")) );
prt( sprintf(" -shownohrefs - Show NO HREF found %s\n", ($shownohrefs ? "On" : "Off")) );
prt( sprintf(" -nofollow - Follow page links found %s\n", ($recurse ? "On" : "Off")) );
if (@excludes) {
prt( "Have ".scalar @excludes." excluded files - " );
foreach my $ex (@excludes) {
prt( "$ex " );
}
prt("\n");
}
}
###pgm_exit(1,"TEMP EXIT(1)");
}
# if (!is_in_array_ref0($tmp, \@hr))
sub is_in_array_ref0($$) {
my ($test,$ra) = @_;
my $lct1 = lc($test);
my $ac = scalar @{$ra};
for (my $i = 0; $i < $ac; $i++) {
my $rt = ${$ra}[$i]; # extract array ref
my $t = ${$rt}[0]; # extract HREF
my $lct2 = lc($t);
return 1 if ($lct1 eq $lct2);
}
return 0;
}
sub is_in_array_ref($$) {
my ($test,$ra) = @_;
my $lct1 = lc($test);
my $ac = scalar @{$ra};
for (my $i = 0; $i < $ac; $i++) {
my $t = ${$ra}[$i]; # extract item
my $lct2 = lc($t);
return 1 if ($lct1 eq $lct2);
}
return 0;
}
sub get_anchor_hash_ref($$$) {
my ($fank,$fil,$dbg) = @_;
my %hash = ();
my ($ank,$len,$i,$ch,$pc,$hr2,$txt);
if ($fank =~ /$/) {
$ank = trim_all($1);
$len = length($ank);
$ch = '';
$hr2 = '';
for ($i = 0; $i < $len; $i++) {
$pc = $ch;
$ch = substr($ank,$i,1);
if ($ch =~ /\w/) {
$hr2 .= $ch; # accumulate \w chars - alphanumeric, including _
} elsif (length($hr2)) {
if (($ch ne '=') && ($ch =~ /\s/)) {
$i++;
for (; $i < $len; $i++) {
$ch = substr($ank,$i,1);
last if ($ch eq '=');
last if !($ch =~ /\s/);
}
}
if ($ch eq '=') {
# found our equal sign
$i++; # move on...
for (; $i < $len; $i++) {
$ch = substr($ank,$i,1);
last if ($ch =~ /('|")/);
last if !($ch =~ /\s/);
}
if (($ch eq '"')||($ch eq "'")) {
$pc = $ch;
$i++; # move on...
$txt = '';
for (; $i < $len; $i++) {
$ch = substr($ank,$i,1);
last if ($ch eq $pc);
$txt .= $ch;
}
if ($ch eq $pc) {
$hash{$hr2} = $txt;
prt( "[dbg38] Got [$hr2] = [$txt] [$fil]\n" ) if ($dbg38);
} else {
prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO END INVERTED COMMA! ($pc) [$fil]\n");
pgm_exit(1,"") if ($dbg);
}
} else {
if (($ch =~ /\w/) && (($hr2 =~ /name/i)||($hr2 =~ /id/i))) {
# accept these WITHOUT inverted comma
$txt = $ch;
$i++; # MOVING ON
for (; $i < $len; $i++) {
$ch = substr($ank,$i,1);
last if !($ch =~ /\w/);
$txt .= $ch;
}
$hash{$hr2} = $txt;
prt( "[dbg39] Got [$hr2] = [$txt] - no inverted commas! [$fil]\n" ) if ($dbg39);
} else {
prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO START INVERTED COMMA! [$fil]\n");
pgm_exit(1,"") if ($dbg);
}
}
} else {
prtw("PROBLEM: got [$hr2]. At pos $i in [$ank], from [$fank], and NO EQUAL SIGN! [$fil]\n");
pgm_exit(1,"") if ($dbg);
}
$hr2 = '';
}
}
}
return \%hash;
}
# Collect HREF anchors from a TEXT stream
# 25/07/2007 - Skip over comments
# return a reference array
sub collecthrefs2($$) {
my ($txt,$fil) = @_;
my $ntxt = '';
my $len = length($txt);
my ($ch,$pc);
my $hrf = '';
my ($i,$j,$len2,$hr2,$hfile);
my @hr = ();
my @nr = ();
for ($i = 0; $i < $len; $i++) {
$ch = substr($txt,$i,1);
if ($ch eq '<') {
$hrf = $ch; # start a tag
$i++;
# go to the end of that tag
for ( ; $i < $len; $i++) {
$ch = substr($txt,$i,1);
$hrf .= $ch;
# 20100312 and 25/07/2007 watch OUT for COMMENTS - skip these
if ($ch eq '-') {
if ($hrf eq '' found
if ($hrf =~ /-->$/) {
last;
}
}
}
}
}
if ($ch eq '>') {
last;
}
}
if ($hrf =~ /^')); # stop only on NON SPACE, or '>'
$hfile .= $ch;
}
#pgm_exit(1,"ERROR EXIT 2");
}
} else {
prt("PROBLEM: No equal sign [$hrf]! [$fil]\n");
pgm_exit(1,"ERROR EXIT 3");
}
last; # end this scan of the anchor
}
$hr2 = '';
}
}
if (length($hfile)) {
push(@hr, [$hfile, $hrf, 0, '', $ahr]) ;
} else {
push(@nr, [$hfile, $hrf, 0, '', $ahr]) ;
}
}
}
}
prt( "[dbg35] Collected ". scalar @hr . " HREF... [$fil]\n" ) if ($dbg35);
my %h = ();
$h{'href'} = \@hr;
$h{'name'} = \@nr;
return \%h;
}
sub secs_2_hhmmss($) {
my ($secs) = shift;
my $rt = '';
my $mins = int($secs / 60);
$secs = $secs - ($mins * 60);
$secs = (int(($secs * 10) + 0.5)) / 10;
if ($mins > 60) {
my $hrs = int($mins / 60);
$mins = $mins - ($hrs * 60);
$mins = '0'.$mins if ($mins < 10);
$secs = '0'.$secs if ($secs < 10);
$rt = "$hrs:$mins:$secs";
} else {
$mins = '0'.$mins if ($mins < 10);
$secs = '0'.$secs if ($secs < 10);
$rt = "$mins:$secs";
}
return $rt;
}
sub show_time($$$$) {
my ($totcnt, $lncnt, $bgntime, $msg) = @_;
my ($currtime, $difftime, $persec, $remains, $remsecs, $tenths, $remtm, $elapsed);
$currtime = time();
$difftime = $currtime - $bgntime;
$persec = $lncnt / $difftime;
$remains = $totcnt - $lncnt;
$remsecs = $remains / $persec;
$tenths = (int(($persec * 100) + 0.05)) / 100;
$remtm = secs_2_hhmmss($remsecs);
$elapsed = secs_2_hhmmss($difftime);
prt( "$elapsed Done $lncnt, at $tenths/sec, left $remains in $remtm - $msg\n" );
}
sub test_remove_script {
my ($txt) = shift;
my $ntxt = '';
my $len = length($txt);
my ($i,$ch,$pc,$bal,$ppc);
$pc = '';
$ch = '';
for ($i = 0; $i < $len; $i++) {
$ppc = $pc;
$pc = $ch;
$ch = substr($txt,$i,1);
if (($ch eq '<')&&(substr($txt,$i) =~ /^
my $ntxt = test_remove_script( $rawtxt ); # remove SCRIPTS ... ... ...
my @is = ret_imgs_array($ntxt);
$ntxt = trimblanklines($ntxt);
@hrefs = (); # clear
my @hr = ret_hrefs_array( $ntxt );
my $rhash = collecthrefs2( $ntxt, $file );
### collecthrefs( $txt, 0 );
### collectimgs( $txt, 0 );
# bump the counts of HREF and IMGS collected
$itot = scalar @is;
$htot = scalar @hr;
# store the references ... that is a reference to an array
$htm_files[$i][$of_hr] = \@hr; # like $of_to, but is ALL HREF links
$htm_files[$i][$of_im] = \@is;
$htm_files[$i][$of_rh] = $rhash; # store the reference hash to 2nd ref arrays 'href' & 'name'
$refa = ${$rhash}{'href'};
$cnth = scalar @{$refa};
$refa = ${$rhash}{'name'};
$cntn = scalar @{$refa};
$pmsg .= " a=$htot, i=$itot, h=$cnth, n=$cntn";
# new code to do some similar things, but while in the array
# BUT IS REALLY JUST A DUPLICATE OF THE ABOVE
# one day should choose ONE or the OTHER ONLY ;=)) it just wastes time
###@lines = drop_php_from_array( @lines ) if (lc($ext) eq '.php');
#write2file((join("\n",@lines)."\n"),'tempbefore.htm');
@lines = dropcomments_from_array(@lines);
# 20100313 *MUST* drop scripts from line array, before partying
#write2file((join("\n",@lines)."\n"),'tempbefore2.htm');
@lines = dropscripts_from_array(@lines);
#write2file((join("\n",@lines)."\n"),'tempafter.htm');
#pgm_exit(1,"Check tempafter.htm");
my @isrcs = get_img_srcs($file, @lines);
$tot_imgcnt += check_images( $file, @isrcs );
my @hsrcs = get_href_srcs($file, @lines);
check_hrefs( $file, \@hsrcs );
$htm_files[$i][$of2hr] = \@hsrcs; # really just duplicate...
$htm_files[$i][$of2im] = \@isrcs; # should be removed!!!
prt("[dbg43] Got $htot hrefs (h=$cnth,n=$cntn)(".scalar @hsrcs."), and $itot image links (".scalar @isrcs.").\n") if ($dbg43);
$pmsg .= " h2=".scalar @hsrcs.", i2=".scalar @isrcs;
prt("$pmsg\n") if (VERB5());
#if ($file =~ /$test_file/) {
# pgm_exit(1,"CHECK THIS FILE [$ntxt]\n");
#}
}
$total_hrefs += $htot;
$total_imgs += $itot;
if ((($pcnt % 100) == 0)||($max < 10)) {
###local $| = 1;
###prt( "\rDone $pcnt HTML files ..." );
if ($max < 10) {
prt( "Done $file HTML file ... href,other($htot,$itot)\n" );
} else {
$msg = "href,other ($total_hrefs,$total_imgs)";
show_time( $max, $pcnt, $bgntime, $msg );
#prt( "Done $pcnt HTML files ... href,other ($total_hrefs,$total_imgs)\n" );
}
}
}
prt( "Completed $pcnt HTML files ... Found $total_hrefs HREF, and $total_imgs IMG/OTHER tokens.\n" );
}
sub get_href_type($) {
my ($src) = shift;
if ($src =~ /^http:/i) {
#push(@httprefs, [$src, $fil, $lnnos] );
return 1; # remote HREF
} elsif ($src =~ /^https:/i) {
return 1; # remote HREF
#push(@httpsrefs, [$src, $fil, $lnnos] );
} elsif ($src =~ /^ftp:/i) {
#push(@ftprefs, [$src, $fil, $lnnos] );
return 3; # remote HREF
} elsif ($src =~ /^mailto:/i) {
#push(@mtrefs, [$src, $fil, $lnnos] );
return 4; # remote HREF
} elsif ( $src =~ /^javascript:/i ) {
return 5; # a JAVASCRIPT HREF
} elsif ($src =~ /^file:/i) {
return 5; # remote HREF
} elsif ( substr($src,0,1) eq '#') {
# local in page HREF
return 6;
} else {
my $ind = index($src,'#');
$src = substr($src,0,$ind) if ( $ind != -1 );
$ind = index($src,'?');
$src = substr($src,0,$ind) if ( $ind != -1 );
$src =~ s/\/$//;
return 7 if (length($src));
}
return 0;
}
sub get_local_href($) {
my ($src) = shift;
my $ind = index($src,'#');
$src = substr($src,0,$ind) if ( $ind != -1 );
$ind = index($src,'?');
$src = substr($src,0,$ind) if ( $ind != -1 );
$src =~ s/\/$//; # remove any TRAILING '/' char
# 25/07/2007 - also 'convert' '%20' to space
$src =~ s/%20/ /g;
return $src;
}
sub dos_2_unix($) {
my ($du) = shift;
$du =~ s/\\/\//g;
return $du;
}
### my @donesrcs = ();
sub in_done_srcs($) {
my ($f) = shift;
foreach my $fd (@donesrcs) {
if ($fd eq $f) {
return 1;
}
}
return 0;
}
sub in_done_imgs($) {
my ($f) = shift;
foreach my $fd (@doneimgs) {
if ($fd eq $f) {
return 1;
}
}
return 0;
}
sub fix_rel_unix_path($) {
my ($path) = shift;
$path = dos_2_unix($path);
# pgm_exit(1,"ERROR: Passed PATH that starts relative! [$path]\n") if (($path =~ /^\.\./)||($path =~ /^\.(\\|\/)\.\./));
my @a = split(/\//, $path);
my $npath = '';
my $max = scalar @a;
my @na = ();
for (my $i = 0; $i < $max; $i++) {
my $p = $a[$i];
if ($p eq '.') {
# ignore this
} elsif ($p eq '..') {
if (@na) {
pop @na; # discard previous
} else {
prt( "WARNING: Got relative .. without previous!!! path=[$path]\n" );
}
} else {
push(@na,$p);
}
}
foreach my $pt (@na) {
$npath .= "/" if length($npath);
$npath .= $pt;
}
return $npath;
}
sub add_new_link($$) {
my ($nlnk, $lnks) = @_;
my @arr = split(',', $lnks);
foreach my $lk (@arr) {
if ($lk eq $nlnk) {
return 0;
}
}
return 1;
}
sub mark_image_link($$$$) {
my ($fmfil, $fnd, $src, $lev) = @_;
my $fcnt = scalar @img_files;
my $msrc = lc(dos_2_unix($src));
my $lnks = '';
my ($val);
prt( "[dbg22] Seeking [$msrc] in $fcnt images files ...\n" ) if ($dbg22);
my ($i,$fil,$mfil);
for ($i = 0; $i < $fcnt; $i++) {
$fil = $img_files[$i][$of_ff];
if (is_same_file($src,$fil)) {
$val = $img_files[$i][$of_lk];
$val++;
# add image file linked to from what file
$lnks = $img_files[$i][$of_fm];
if (length($lnks) == 0) {
$lnks = $fmfil;
} elsif (add_new_link($fmfil, $lnks)) {
$lnks .= ',';
$lnks .= $fmfil;
} else {
$val--; # reduce due to being a repeated link
}
$img_files[$i][$of_lk] = $val;
$img_files[$i][$of_fm] = $lnks;
prt( "[dbg28] IMG src in $fmfil, of $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg28);
return 0;
}
}
for ($i = 0; $i < $fcnt; $i++) {
$fil = $img_files[$i][$of_ff];
$mfil = lc(dos_2_unix($fil));
prt( "[dbg22] Comparing to $mfil ...\n" ) if ($dbg22);
if ($msrc eq $mfil) {
$val = $img_files[$i][$of_lk];
$val++;
$img_files[$i][$of_lk] = $val;
# add image file linked to from what file
$lnks = $img_files[$i][$of_fm];
if (length($lnks) == 0) {
$lnks = $fmfil;
} elsif (add_new_link($fmfil, $lnks)) {
$lnks .= ',';
$lnks .= $fmfil;
}
$img_files[$i][$of_fm] = $lnks;
prt( "[dbg28] IMG src in $fmfil, of $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg28);
return 0;
}
}
prt( "[dbg21] $src - NOT FOUND![1]\n" ) if ($dbg21);
return 1;
}
sub mark_other_links($$$$) {
my ($fmfil, $fnd, $src, $lev) = @_;
my $totcnt = 0;
my $msrc = lc(dos_2_unix($src));
my $fcnt = scalar @img_files;
my $i = 0;
my $lnks = '';
my ($val);
$totcnt += $fcnt;
if (mark_image_link( $fmfil, $fnd, $src, $lev ) == 0) {
return 0;
}
# maybe ZIP files
$fcnt = scalar @zip_files;
$totcnt += $fcnt;
for ($i = 0; $i < $fcnt; $i++) {
my $fil = $zip_files[$i][$of_ff];
my $mfil = lc(dos_2_unix($fil));
if ($msrc eq $mfil) {
$val = $zip_files[$i][$of_lk];
$val++;
# add zip file linked to from what file
$lnks = $zip_files[$i][$of_fm];
if (length($lnks) == 0) {
$lnks = $fmfil;
} elsif (add_new_link($fmfil, $lnks)) {
$lnks .= ',';
$lnks .= $fmfil;
} else {
$val--;
}
$zip_files[$i][$of_lk] = $val;
$zip_files[$i][$of_fm] = $lnks;
prt( "ZIP link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29);
return 0;
}
}
# maybe TXT files
$fcnt = scalar @txt_files;
$totcnt += $fcnt;
for ($i = 0; $i < $fcnt; $i++) {
my $fil = $txt_files[$i][$of_ff];
my $mfil = lc(dos_2_unix($fil));
if ($msrc eq $mfil) {
$val = $txt_files[$i][$of_lk];
$val++;
# add txt file linked to from what file
$lnks = $txt_files[$i][$of_fm];
if (length($lnks) == 0) {
$lnks = $fmfil;
} elsif (add_new_link($fmfil, $lnks)) {
$lnks .= ',';
$lnks .= $fmfil;
} else {
$val--;
}
$txt_files[$i][$of_lk] = $val;
$txt_files[$i][$of_fm] = $lnks;
prt( "TXT link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29);
return 0;
}
}
# maybe CSS files
$fcnt = scalar @css_files;
$totcnt += $fcnt;
for ($i = 0; $i < $fcnt; $i++) {
my $fil = $css_files[$i][$of_ff];
my $mfil = lc(dos_2_unix($fil));
if ($msrc eq $mfil) {
$val = $css_files[$i][$of_lk];
$val++;
# add txt file linked to from what file
$lnks = $css_files[$i][$of_fm];
if (length($lnks) == 0) {
$lnks = $fmfil;
} elsif (add_new_link($fmfil, $lnks)) {
$lnks .= ',';
$lnks .= $fmfil;
} else {
$val--;
}
$css_files[$i][$of_lk] = $val;
$css_files[$i][$of_fm] = $lnks;
prt( "CSS link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29);
return 0;
}
}
# maybe SCRIPT files
$fcnt = scalar @script_files;
$totcnt += $fcnt;
for ($i = 0; $i < $fcnt; $i++) {
my $fil = $script_files[$i][$of_ff];
my $mfil = lc(dos_2_unix($fil));
if ($msrc eq $mfil) {
$val = $script_files[$i][$of_lk];
$val++;
# add script file linked to from what file
$lnks = $script_files[$i][$of_fm];
if (length($lnks) == 0) {
$lnks = $fmfil;
} elsif (add_new_link($fmfil, $lnks)) {
$lnks .= ',';
$lnks .= $fmfil;
} else {
$val--;
}
$script_files[$i][$of_lk] = $val;
$script_files[$i][$of_fm] = $lnks;
prt( "SCRIPT link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29);
return 0;
}
}
# OK, OTHER
$fcnt = scalar @other_files;
$totcnt += $fcnt;
for ($i = 0; $i < $fcnt; $i++) {
my $fil = $other_files[$i][$of_ff];
my $mfil = lc(dos_2_unix($fil));
if ($msrc eq $mfil) {
$val = $other_files[$i][$of_lk];
$val++;
# add script file linked to from what file
$lnks = $other_files[$i][$of_fm];
if (length($lnks) == 0) {
$lnks = $fmfil;
} elsif (add_new_link($fmfil, $lnks)) {
$lnks .= ',';
$lnks .= $fmfil;
} else {
$val--;
}
$other_files[$i][$of_lk] = $val;
$other_files[$i][$of_fm] = $lnks;
prt( "OTHER link in $fmfil, to $fil ($i) $val ok [lnks=$lnks]\n" ) if ($dbg29);
return 0;
}
}
$totcnt += 1 if ($totcnt == 0);
return $totcnt;
}
#######################################################################
# mark a link
# parameters:
# LINK IS IN FILE - $fmfil = FROM
# Offset in $htm_files[$fnd]
# Link item source - $src = TO
# Level
# 2010/04/08 - If given a DIRECTORY, try to find an 'index' to link to...
sub mark_link($$$$) {
my ($fmfil, $fnd, $src, $lev) = @_;
my $fcnt = scalar @htm_files;
my $msrc = lc(dos_2_unix($src));
my $ff = $htm_files[$fnd][$of_ff];
my $sfil = sub_in_folder($fmfil);
my $ssrc = sub_in_folder($src);
#prt("[dbg40]mark_link: for src=[$src] in fil=[$fmfil] off=[$fnd] lev=$lev ff=[$ff]\n") if ($dbg40);
#prt("[dbg40]mark_link:$lev: for src=[$ssrc] from=[$sfil]($fnd) ff=[$ff]\n") if ($dbg40);
prt("[dbg40]mark_link:$lev: TO=[$ssrc] from=[$sfil]($fnd)\n") if ($dbg40);
my $i = 0;
my $totcnt = $fcnt;
my $fil = '';
my $mfil = '';
my $val = 0;
my $lnks = '';
my ($hrf,$j,$hrt,$tmp,$i2);
if (-d $src) {
prt("[dbg44] Sending [$src] to 'mark_dir_link'...\n") if ($dbg44);
return mark_dir_link($fmfil, $fnd, $src, $lev);
}
for ($i = 0; $i < $fcnt; $i++) {
$i2 = $i + 1;
if ($i != $fnd) {
$fil = $htm_files[$i][$of_ff];
$mfil = lc(dos_2_unix($fil));
prt( "[dbg41] $i2: compare \n[$mfil] with \n[$msrc]\n" ) if ($dbg41); # VERY NOISY
if (($msrc eq $mfil) || is_same_file($src,$fil)) {
# FOUND the TO file in LIST - MARK IT
$val = $htm_files[$i][$of_lk];
$val++;
# add HTML file linked to from what file
$lnks = $htm_files[$i][$of_fm];
if (length($lnks) == 0) {
$lnks = $fmfil;
} elsif (add_new_link($fmfil, $lnks)) {
$lnks .= ',';
$lnks .= $fmfil;
} else {
$val--; # already COUNTED
}
$htm_files[$i][$of_lk] = $val;
$htm_files[$i][$of_fm] = $lnks; # add to LINKS
prt( "[dbg30] SET link to $fil($i) $val ok [lnks=$lnks] in $fmfil\n" ) if ($dbg30);
###prt( "$ff ($fnd) linked to $fil ($i) $val\n" ) if ($dbg21);
my $hr = $htm_files[$i][$of_hr]; # extract HREF ref.array - all LINKS to another!
my $im = $htm_files[$i][$of_im]; # extract IMAGE ref.array
# my $hr2 = $htm_files[$i][$of_rh]; # extract the NEW ANCHOR collection
# $htm_files[$i][$of_rh] = $rhash; # store the reference hash to 2 ref arrays 'href' & 'name'
my $rhash = $htm_files[$i][$of_rh];
my $hr2 = ${$rhash}{'href'}; # extract the NEW ANCHOR collection
my $hrc = scalar @{$hr};
my $imc = scalar @{$im}; # get count of images
my $hr2c = scalar @{$hr2};
if ($hrc != $hr2c) {
# ====== this is mainly diagnostic ======
##prt("WARNING: Counts old [$hrc], new [$hr2c] in [$fil]\n");
my %th1 = ();
for ($j = 0; $j < $hrc; $j++) {
$hrf = ${$hr}[$j];
$th1{$hrf} = 1;
}
$tmp = scalar keys(%th1);
if ($tmp == $hr2c) {
##prt("WARNING: Solved when dupes removed! New counts old [$tmp] = new [$hr2c]\n");
} else {
prt("PROBLEM: Counts old [$hrc], new [$hr2c] in [$fil]\n");
prt("List of $hrc OLD\n");
for ($j = 0; $j < $hrc; $j++) {
$hrf = ${$hr}[$j];
$hrt = get_href_type($hrf);
$val = is_in_array_ref0($hrf,$hr2);
$tmp = ($val ? "Yes" : "No");
prt("[$hrf] ($hrt) $tmp\n");
}
prt("List of $hr2c NEW\n");
for ($j = 0; $j < $hr2c; $j++) {
$hrf = ${$hr2}[$j][0];
$hrt = get_href_type($hrf);
$val = is_in_array_ref($hrf,$hr);
$tmp = ($val ? "Yes" : "No");
prt("[$hrf] ($hrt) $tmp anchor=<".${$hr2}[$j][1].">\n");
}
pgm_exit(1,"What is different/missing...\n");
}
# ====== above is mainly diagnostic ======
}
# found the file LINKED TO...
my ($itmnam, $itmdir) = fileparse($fil); # get name and path
$itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/);
prt( "$lev [$fil] has $hrc href, $imc img/other links ..\n" ) if ($dbg19 || $dbg26);
# now process links to links...
for ($j = 0; $j < $hrc; $j++) {
$hrf = ${$hr}[$j];
$hrt = get_href_type($hrf);
if ($hrt == 7) {
my $nsrc = fix_rel_unix_path($itmdir.get_local_href($hrf));
if ( !in_done_srcs($nsrc) ) {
push(@donesrcs, $nsrc); # put it in DONE list
mark_link( $fil, $i, $nsrc, $lev + 1 ); # and MARK its links now
}
}
}
$val = 0;
prt( "$fil - Checking $imc images files ...\n") if ($dbg24);
for ($j = 0; $j < $imc; $j++) { # do each, in this linked file
my $img = ${$im}[$j]; # get the image string
my $isrc = $itmdir.$img; # join it with the path
my $nisrc = fix_rel_unix_path($isrc); # fix rel, and force unix path
prt( "Marking [$nisrc] - ".($j+1)." of $imc img/other links ..\n" ) if ($dbg19 || $dbg26);
if ( !in_done_imgs($nisrc) ) {
push(@doneimgs, $nisrc); # put it in DONE list
mark_other_links( $fil, $j, $nisrc, 0 ); # and MARK the link in @img_files
$val++;
} else {
prt( "Already IN doneimgs ...\n" ) if ($dbg19 || $dbg26);
}
}
prt( "$fil - Marked $val of $imc images files ...\n") if ($val && $dbg24);
return 0;
}
} # do NOT find self
}
# hmmmm, LINK not found in HREF files, maybe IMAGES, zip, etc ...
prt("[dbg45] LINK not found in HREF files, maybe IMAGES, zip, etc TO=[$src], FROM=[$fmfil]\n") if ($dbg45);
if (is_htm_file_ext($src)) {
# 19/05/2011 - BUt if the file does EXIST, pobably just means it is outside the start point
if (-f $src) {
prt("[dbg46] LINK found, but not in HREF files! TO=[$src], FROM=[$fmfil]\n") if ($dbg46);
} else {
prtw("WARNING: LINK not found in HREF files! TO=[$src], FROM=[$fmfil]\n");
}
}
$val = mark_other_links( $fmfil, $fnd, $src, $lev );
if ($val) {
$totcnt += $val;
prt( "[dbg25] NO LINK FOUND to [$src]($msrc) in $totcnt file - $ff ($fnd) - ($lev)!\n" ) if ($dbg25);
return 1;
}
return 0; # SUCCESS - FOUND THE LINK
}
# ============================================================
# sub seek_dir_index_file($$$$) {
# Seek the first 'index' type file for a DIRECTORY
# IN - ($directory, $resultref);
# OUT - 0 = SUCCESS, and result reference contains file
# 1 = FAILED no 'index' type found
# 2 = FAILED directory NOT in %g_dir_file
# ============================================================
sub seek_dir_index_file($$) {
my ($nusrc, $rind) = @_;
my $hr = \%g_dir_files;
my ($dhr,$v2,$i,$ii,$cnt,$v3,$icnt,$ff,$k2);
if (defined ${$hr}{$nusrc}) {
$dhr = ${$hr}{$nusrc}; # get HASH reference for this DIRECTORY (in unix form)
foreach $k2 (sort keys %{$dhr}) {
# for each type HTML, CSS, etc...
$v2 = ${$dhr}{$k2}; # extract array reference for this TYPE
$cnt = scalar @{$v2}; # count to process
# prt(" Type $k2 = $cnt\n");
for ($i = 0; $i < $cnt; $i++) {
$v3 = ${$v2}[$i][0]; # get the file name
$ii = ${$v2}[$i][1]; # and if it is 'index' type
# prt(" File: [$v3] ");
if ($ii) { # FOUND an 'index; type
# the $v3 file name is an INDEX type file
$ff = $nusrc.'/'.$v3; # build FULL FILE name
$icnt++;
#prt(" ('index' type)");
${$rind} = $ff;
return 0; # SUCCESS
}
# prt("\n");
}
}
return 1;
} else {
pgm_exit(1,"ERROR: Why is [$nusrc] NOT in directory/file reference?\n");
}
return 2; # FAILED!!!
}
# 2010/04/08 - If given a DIRECTORY, try to find an 'index' to link to...
# $fil = $htm_files[$fnd][$of_ff]; # extract FULL PATH name of file CONTAINING the HREF...
# ($itmnam, $itmdir) = fileparse($fil); # get name and path
# $itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/);
# my $lref = get_local_href($hrf);
# $src = $itmdir.$lref;
# my $nusrc = fix_rel_unix_path($src);
# $nusrc already tested as a DIRECTORY - did I keep DIRECTORY LISTS/
#sub show_dir_files($) {
# my ($hr) = @_; # = \%g_dir_files
# my ($key,$val,$k2,$v2,$v3,$dir,$cnt,$i,$ii,$icnt);
# foreach $key (sort keys %{$hr}) {
# $dir = sub_in_folder($key);
# $dir = '' if (length($dir)==0);
# $val = ${$hr}{$key}; # extract hash ref BY TYPE
# prt("Directory: [$dir]\n");
# $icnt = 0;
# foreach $k2 (sort keys %{$val}) {
# # for each type
# $v2 = ${$val}{$k2};
# $cnt = scalar @{$v2};
# prt(" Type $k2 = $cnt\n");
# for ($i = 0; $i < $cnt; $i++) {
# $v3 = ${$v2}[$i][0];
# $ii = ${$v2}[$i][1];
# prt(" File: [$v3] ");
# if ($ii) {
# $icnt++;
# prt(" ('index' type)");
# }
# prt("\n");
# }
# }
# if (!$icnt) {
# prt("Directory: [$dir] has NO 'index' type file.\n");
# } elsif ($icnt > 1) {
# prt("Directory: [$dir] has $icnt 'index' type files.\n");
# }
# }
# pgm_exit(1,"Debug run show of directories and files...\n");
#}
sub mark_dir_link($$$$) {
my ($fil, $fnd, $nusrc, $lev) = @_;
my $fcnt = scalar @htm_files;
my $hr = \%g_dir_files;
my ($dhr,$v2,$i,$ii,$cnt,$v3,$icnt,$ff,$k2);
if (defined ${$hr}{$nusrc}) {
$dhr = ${$hr}{$nusrc}; # get HASH reference for this DIRECTORY (in unix form)
foreach $k2 (sort keys %{$dhr}) {
# for each type HTML, CSS, etc...
$v2 = ${$dhr}{$k2}; # extract array reference for this TYPE
$cnt = scalar @{$v2}; # count to process
# prt(" Type $k2 = $cnt\n");
for ($i = 0; $i < $cnt; $i++) {
$v3 = ${$v2}[$i][0]; # get the file name
$ii = ${$v2}[$i][1]; # and if it is 'index' type
# prt(" File: [$v3] ");
if ($ii) { # FOUND an 'index; type
# the $v3 file name is an INDEX type file
$ff = $nusrc.'/'.$v3; # build FULL FILE name
$icnt++;
#prt(" ('index' type)");
return mark_link($fil,$fnd,$ff,0); # and mark this file
}
# prt("\n");
}
}
} else {
pgm_exit(1,"ERROR: Why is [$nusrc] NOT in directory/file reference?\n");
}
return 1; # FAILED!!!
}
sub get_links_stg($) {
my ($lnks) = @_;
my @arr = split(',',$lnks);
my $nlnks = '';
foreach my $l (@arr) {
my $sl = sub_in_folder($l);
$nlnks .= ',' if (length($nlnks));
$nlnks .= $sl;
}
return $nlnks;
}
###########################################################################
# show link count, and links, in passed multidimensional file array
#
# If showlinks (or $dbg20 or $dbg31) is ON, shows internal LINKS
# NOTE: Presently DOES NOT get all LINKS??? BAH!!!
###########################################################################
sub show_link_counts($$) {
my ($m, $hf) = @_;
my $fcnt = scalar @{$hf};
my $mcnt = 0;
my $mss = "Checking LINKS for $fcnt $m files ...\n";
my ($i,$fil,$hrt,$sfil,$lnks,$min,$len,$msg,$i2);
if ($fcnt) {
$min = 0;
for ($i = 0; $i < $fcnt; $i++) {
$fil = ${$hf}[$i][$of_ff];
$sfil = sub_in_folder($fil);
$len = length($sfil);
$min = $len if ($len > $min);
}
for ($i = 0; $i < $fcnt; $i++) {
$i2 = $i + 1;
$fil = ${$hf}[$i][$of_ff];
$hrt = ${$hf}[$i][$of_lk];
$lnks = ${$hf}[$i][$of_fm];
$sfil = sub_in_folder($fil);
if ($hrt) {
if ($dbg20 || $dbg31 || $showlinks) {
prt( $mss ) if (length($mss));
$mss = '';
$sfil .= ' ' while (length($sfil) < $min);
$msg = sprintf("%3d:",$i2);
if ($dbg31 || $showlinks) {
prt( "$msg $sfil has $hrt link(s) [".get_links_stg($lnks)."]\n" );
} else {
prt( "$msg $sfil has $hrt link(s)\n" );
}
}
} else {
# expect NONE for most types - only check HTML
if ( is_link_type_file($fil) ) {
prt( $mss ) if (length($mss));
$mss = '';
prtw("WARNING: $sfil($i) HAS NO LINKS!") if ($dbg33 || $showlinks);
push(@missing_links,$fil);
}
$mcnt++; # count a MISSED item
if ($shownolinks || $showlinks) {
prt( $mss ) if (length($mss));
$mss = '';
prt( "$i: [$sfil] has NO links\n" );
}
}
}
if ($mcnt) {
prt( $mss ) if (length($mss));
$mss = '';
prt( "Done LINKS for $fcnt $m files ... MISSED $mcnt!!!\n" );
}
} else {
prt( "There are NO $m files ...\n" ) if ($dbg20 || $showlinks);
}
}
sub in_excused($) {
my ($tx) = shift;
foreach my $t (@excused) {
if ($t eq $tx) {
return 1;
}
}
return 0;
}
sub is_same_file($$) {
my ($fil1,$fil2) = @_;
#prt("Comparing -\n[$fil1] to\n[$fil2]\n");
if ($fil1 =~ /^\.(\\|\/)/) {
$fil1 = $currworkdir.substr($fil1,1);
}
if ($fil2 =~ /^\.(\\|\/)/) {
$fil2 = $currworkdir.substr($fil2,1);
}
#prt("Comparing -\n[$fil1] to\n[$fil2]\n");
my $len = length($fil1);
return 0 if (length($fil2) != $len); # NOT same length
return 1 if ($fil1 eq $fil2);
my $ufil1 = dos_2_unix($fil1);
my $ufil2 = dos_2_unix($fil2);
#prt("Comparing UNIX -\n[$ufil1] to\n[$ufil2]\n");
return 1 if ($ufil1 eq $ufil2);
my ($i,$ch1,$ch2);
for ($i = 0; $i < $len; $i++) {
$ch1 = substr($fil1,$i,1);
$ch2 = substr($fil2,$i,1);
if ($ch1 ne $ch2) {
#prt("Not same on char [$ch1] vs [$ch2] ($i)\n");
return 0;
}
}
return 1 if ($i == $len);
return 0;
}
# ==========================================================
# using the given (home) PAGE, try to TRACE ALL LINKS
# not really meaingful unless it is a main 'index' type file
# ===========================================================
sub check_local_links($) {
my ($hf) = shift;
my $fcnt = scalar @htm_files;
my ($hfnm,$hfdir) = fileparse($hf);
my $lchf = lc($hfnm);
my $fnd = get_home_offset($hf);
my ($fil,$nm,$dir,$ext);
if ($hfdir eq ".\\") {
$hfdir = $in_folder."\\";
}
my $itmdir = '';
my $itmnam = '';
my $i = 0;
my $i2 = 0;
prt( "Checking local links, for $fcnt files, from $hf ...\n") if (VERB9());
if ($fnd == -1) {
prt( "WARNING: check_local_links: Unable to find [$hf] ...\n" );
return 1;
}
# process item 1 ...
my $pcnt = 1;
my $hr = $htm_files[$fnd][$of_hr];
my $im = $htm_files[$fnd][$of_im];
my $hrc = scalar @{$hr};
my $imc = scalar @{$im};
my $hrf = '';
my $img = '';
my $hrt = 0;
my $src = '';
my $nsrc = '';
my $ff = '';
my $shwerr = 0;
my $emsg = '';
my ($msg,$sfil);
$fil = $htm_files[$fnd][$of_ff]; # extract FULL PATH name of file CONTAINING the HREF...
$htm_files[$fnd][$of_lk] = 1; # mark as DONE
($itmnam, $itmdir) = fileparse($fil); # get name and path
$itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/);
$sfil = sub_in_folder($fil);
prt( "HOME [$fil] has $hrc href, $imc img/other links ..\n" ) if ($dbg19 || $dbg26);
for ($i = 0; $i < $hrc; $i++) {
$hrf = ${$hr}[$i];
$hrt = get_href_type($hrf);
$i2 = $i + 1;
$shwerr = 0;
$emsg = "IN[$sfil] $i2 HREF [$hrf]$hrt";
if ($hrt == 0) {
if (in_excused($hrf)) {
$excusecnt++;
} else {
$emsg .= " CHECK UNKNOWN ****** CHECK ME ******[1]";
push(@warnings, "WARNING: $emsg!" );
$shwerr = 1;
}
} elsif ($hrt < 5) {
$emsg .= " REMOTE";
} elsif ($hrt == 5) {
$emsg .= " JAVASCRIPT";
} elsif ($hrt == 6) {
$emsg .= " IN PAGE";
} else {
my $lref = get_local_href($hrf);
$src = $itmdir.$lref;
my $nusrc = fix_rel_unix_path($src);
prt( "[dbg34] REL PATH [$src] to UNIX PATH [$nusrc] hrt=$hrt file=[$fil]\n" ) if ($dbg34);
push(@donesrcs, $nusrc); # put it in DONE list
# mark link - FROM $fil
if (is_same_file($fil,$nusrc)) {
$emsg .= " LINK to SELF ok";
} elsif (-d $nusrc) {
if ( mark_dir_link( $fil, $fnd, $nusrc, 0 ) ) {
pgm_exit(1,"CODE ERROR: To be done mark links for DIRECTORY! [$nusrc]\n");
}
} else {
if ( mark_link( $fil, $fnd, $nusrc, 0 ) ) {
$emsg .= " SITE REF [$nusrc] ***NO IN-SITE LINK***???";
$msg = "$i2 [$fil] HREF [$hrf]$hrt SITE REF [$nusrc] ***NO IN-SITE LINK***???";
if (-f $src) {
$msg .= "\n*** BUT FILE EXISTS [$src] ***";
$emsg .= "\n*** BUT FILE EXISTS [$src] ***";
# 19/05/2011 - do NOT post this warning
# push(@warnings, "WARNING: Local HREF [$lref] in [$fil] OUTSIDE WEB! but EXISTS!" );
} else {
push(@missed, $msg );
$shwerr = 1;
}
} else {
$emsg .= " SITE REF [$src] ok" if ($dbg19);
}
}
}
prt( "$emsg\n" ) if ($dbg19 || $shwerr || VERB5());
}
prt( "HOME - Marking $imc images files ...\n") if ($dbg24);
for ($i = 0; $i < $imc; $i++) {
$img = ${$im}[$i];
$src = $itmdir.$img;
$nsrc = fix_rel_unix_path($src);
prt( "HOME $fil - Mark $src ($nsrc) image ...\n" ) if ($dbg24);
push(@doneimgs, $nsrc); # put it in DONE list
mark_other_links( $fil, $i, $nsrc, 0 );
}
return 0;
}
sub offset_done($$) {
my ($off, @done) = @_;
foreach my $num (@done) {
if ($off == $num) {
return 1;
}
}
return 0;
}
sub trace_from_htm($$) {
my ($hf, $lev) = @_;
my $fnd = get_offset_of_htm($hf);
my $msg = '';
if (($fnd != -1) && !offset_done($fnd,@offsdone)) {
push(@offsdone,$fnd);
my $hr = $htm_files[$fnd][$of_hr];
my $hrc = scalar @{$hr};
my @offsets = ();
my($itmnam, $itmdir) = fileparse($hf); # get name and path
$itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/);
for (my $i = 0; $i < $hrc; $i++) {
my $hrf = ${$hr}[$i];
my $hrt = get_href_type($hrf);
if ($hrt == 7) {
#my $src = fix_rel_unix_path($itmdir.get_local_href($hrf));
my $src = $itmdir.get_local_href($hrf);
push(@offsets,$src);
trace_from_htm($src, ($lev + 1));
}
}
$hrc = scalar @offsets;
my $cnt = $lev;
$msg = sprintf("%4d ", $lev);
prt( $msg );
while($cnt) {
prt( ' ' );
$msg .= ' ';
$cnt--;
}
prt( "$hf links to $hrc files ...\n" );
$msg .= "$hf links to $hrc files ...";
push(@htmlinks, [$lev, $msg]);
foreach my $fil (@offsets) {
$cnt = $lev;
$msg = sprintf("%4d ", $lev);
prt( $msg );
while($cnt) {
prt( ' ' );
$msg .= ' ';
$cnt--;
}
prt( "$fil\n" );
$msg .= $fil;
push(@htmlinks, [$lev, $msg]);
}
}
}
sub get_offset_of_htm($) {
my ($hf) = shift;
my $fcnt = scalar @htm_files;
my ($hfnm,$hfdir) = fileparse($hf);
if ($hfdir eq ".\\") {
$hfdir = $in_folder."\\";
}
my $lchf = lc($hfnm);
my $fnd = -1;
for (my $i = 0; $i < $fcnt; $i++) {
my $fil = $htm_files[$i][$of_ff];
my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
if (lc($nm.$ext) eq $lchf) {
# have at least the NAME, but maybe not the FOLDER
if (lc($hfdir) eq lc($dir)) {
$fnd = $i;
last;
}
}
}
return $fnd;
}
sub get_home_offset($) {
if ($homeoffset != -1) {
return $homeoffset;
}
my ($hf) = shift;
prt( "Getting offset of HOME file [$hf]... ");
my $fnd = get_offset_of_htm($hf);
if ($fnd == -1) {
prtw("WARNING: Unable to find [$hf] ...\n" );
} else {
prt( "index $fnd\n" );
}
$homeoffset = $fnd; # set HOME file offset in list...
return $homeoffset;
}
#############################################################################
# check linkages
#############################################################################
sub check_linkages($) {
my ($hf) = shift;
my $fcnt = scalar @htm_files;
my ($hfnm,$hfdir) = fileparse($hf);
my $lchf = lc($hfnm);
my $fnd = get_home_offset($hf);
my ($fil,$nm,$dir,$ext);
if ($hfdir eq ".\\") {
$hfdir = $in_folder."\\";
}
my $itmdir = '';
my $itmnam = '';
my $i = 0;
my $i2 = 0;
my ($tmp);
prt( "Re-checking HREF and IMG/OTHER links, for $fcnt files ...\n") if (VERB9());
if ($fnd == -1) {
prt( "WARNING: Unable to find [$hf] ...\n" );
push(@warnings, "WARNING: Unable to find [$hf] ...");
return 1;
}
# process item 1 ...
my $pcnt = 1;
my $hr = $htm_files[$fnd][$of_hr];
my $im = $htm_files[$fnd][$of_im];
my $hrc = scalar @{$hr};
my $imc = scalar @{$im};
my $hrf = '';
my $img = '';
my $hrt = 0;
my $src = '';
my $ff = '';
my $shwerr = 0;
my $emsg = '';
$fil = $htm_files[$fnd][$of_ff]; # extract FULL PATH name of file ...
($itmnam, $itmdir) = fileparse($fil); # get name and path
$itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/);
prt( "\n" ) if ($dbg18);
prt( "$pcnt [$fil] has $hrc href, and $imc image links ..\n" ) if ($dbg18);
for ($i = 0; $i < $hrc; $i++) {
$hrf = ${$hr}[$i];
$hrt = get_href_type($hrf);
$i2 = $i + 1;
$shwerr = 0;
$emsg = "HH[$fil] ";
$emsg .= "$i2 HREF [$hrf]$hrt";
if ($hrt == 0) {
if (in_excused($hrf)) {
$excusecnt++;
} else {
$emsg .= " CHECK UNKNOWN ****** CHECK ME ******[2]";
push(@warnings, "WARNING: $emsg" );
$shwerr = 1;
}
} elsif ($hrt < 5) {
$emsg .= " REMOTE";
} elsif ($hrt == 5) {
$emsg .= " JAVASCRIPT";
} elsif ($hrt == 6) {
$emsg .= " LOCAL";
} else {
$src = $itmdir.get_local_href($hrf);
if (-f $src) {
$emsg .= " SITE REF [$src] ok";
} elsif (-d $src) {
$emsg .= " SITE REF [$src] okd (as .txt)";
} else {
# special sometimes replacement with a TEXT file
$tmp = $src.".txt";
if (-f $tmp) {
$emsg .= " SITE REF [$src] ok (as .txt)";
} else {
$emsg .= " SITE REF [$src] ***MISSING***?[1]";
push(@missed, $emsg );
$shwerr = 1;
}
}
}
prt( "$emsg\n" ) if ($dbg18 || $shwerr || VERB9() );
}
# From this beginning
for (my $j = 0; $j < $fcnt; $j++) {
$fil = $htm_files[$j][$of_ff];
($itmnam, $itmdir) = fileparse($fil); # get name and path
$itmdir = $currworkdir.'/' if ($itmdir =~ /^\.(\\|\/)$/);
if ($j != $fnd) {
$pcnt++;
$hr = $htm_files[$j][$of_hr];
$im = $htm_files[$j][$of_im];
$hrc = scalar @{$hr};
$imc = scalar @{$im};
prt( "\n" ) if ($dbg18);
prt( "$pcnt [$fil] has $hrc href, and $imc image links ..\n" ) if ($dbg18 || VERB9());
for ($i = 0; $i < $hrc; $i++) {
$hrf = ${$hr}[$i];
$hrt = get_href_type($hrf);
$i2 = $i + 1;
$shwerr = 0;
$emsg = "HF[".sub_in_folder($fil)."] ";
$emsg .= "$i2 HREF [$hrf]$hrt";
if ($hrt == 0) {
if (in_excused($hrf)) {
$excusecnt++;
} else {
$emsg .= " CHECK UNKNOWN ****** CHECK ME ******[3]";
push(@warnings, "WARNING: $emsg");
$shwerr = 1;
}
} elsif ($hrt < 5) {
$emsg .= " REMOTE";
} elsif ($hrt == 5) {
$emsg .= " JAVASCRIPT";
} elsif ($hrt == 6) {
$emsg .= " LOCAL";
} else {
$src = get_local_href($hrf);
if ($src eq '.') {
if (length($homefile)) { # && ($fdir eq $in_folder)
$src = $homefile; # translate a DOT to HOME FILE
}
}
$ff = $itmdir.$src;
if (-f $ff) {
$emsg .= " SITE REF [$ff] ok";
} elsif (-d $ff) {
$emsg .= " SITE REF [$ff] okd";
# maybe check if there is an 'index' type file
my $res = fix_rel_unix_path($ff);
if (seek_dir_index_file($res,\$res)) {
$emsg .= " BUT NO 'index' found in [$ff]";
$shwerr = 1;
push(@missed, $emsg );
} else {
$emsg .= " found 'index' [$res]";
}
} else {
$tmp = $ff.".txt";
if (-f $tmp) {
$emsg .= " SITE REF [$ff] ok (as .txt)";
} else {
$emsg .= " SITE REF [$src][$ff] ***MISSING***?[3]";
push(@missed, $emsg );
$shwerr = 1;
}
}
}
prt( "$emsg\n" ) if ($dbg18 || ($shwerr && $dbg32) || VERB9());
}
for (my $i = 0; $i < $imc; $i++) {
$img = ${$im}[$i];
my $sfil = sub_in_folder($fil);
my $simg = sub_in_folder($img);
$emsg = "IF[$sfil] [$simg] ";
if ($img =~ /^http:\/\/.*/i) {
if ($showhreflinks) {
prtw("WARNING: IMG link is HREF $emsg [1]");
} else {
$hrflnkcnt++;
}
} else {
# 25/07/2007 - deal with '%20' in text
$img =~ s/%20/ /g;
$src = $itmdir.$img;
$shwerr = 0;
if (-f $src) {
$emsg .= " IMG ok";
} else {
$tmp = $src.".txt";
if (-f $tmp) {
$emsg .= " IMG ok (as .txt)";
} else {
$emsg .= " IMG ***MISSING***?[5]";
push(@missed, $emsg );
$shwerr = 1;
}
}
}
prt( "$emsg\n" ) if ($dbg18 || ($shwerr && $dbg32) || VERB9());
}
}
}
return 0;
}
sub check_images($$) {
my ($ifile, @srcs) = @_;
my ($nm, $dir) = fileparse($ifile);
my $scnt = scalar @srcs;
my ($tmp);
if ($scnt) {
prt( "Found $scnt imgs in $nm ...\n" ) if ($dbg14);
for (my $i = 0; $i < $scnt; $i++) {
my $src = $srcs[$i][0];
my $lnn = $srcs[$i][1];
if ($src =~ /^http:\/\//i) {
# remote HREF
} else {
# 25/07/2007 - deal with '%20' to space
$src =~ s/%20/ /g;
my $ff = $dir.$src;
if ( -f $ff ) {
prt( "$src - ok\n" ) if ($dbg13);
} else {
$tmp = $ff.".txt";
if (-f $tmp) {
prt( "$src - ok (as .txt)\n" ) if ($dbg13);
} else {
if ($show_all_not_found) {
prtw("WARNING: [$src] $ifile:$lnn NOT FOUND![2]a");
} elsif (!defined $g_images_notfound{$src}) {
prtw("WARNING: [$src] $ifile:$lnn NOT FOUND![2]b");
$g_images_notfound{$src} = "$ifile,$lnn";
}
}
}
}
} # for image count
} else {
prt( "Found NO imgs in [$ifile] ...\n" ) if ($dbg15);
}
return $scnt;
}
sub get_img_srcs($$) {
my ($fil, @lns) = @_;
my $lc = scalar @lns;
my $scnt = 0;
my ($nm,$dir) = fileparse( $fil );
prt( "Processing $lc lines from [$nm] dir=[$dir]...\n" ) if ($dbg12);
my @isrc = ();
my $ln = '';
my $bal = '';
my $inscript = 0;
my $msg = '';
my $bgnln = 0;
my $lnnos = '';
for (my $i = 0; $i < $lc; $i++) {
$ln = $bal;
$bal = '';
$ln .= $lns[$i];
chomp $ln;
prt( "$i [$ln] ...\n" ) if ($dbg10);
if ($inscript) {
if ($ln =~ /<\/script>/i) {
$inscript =0;
prt( "EXIT a SCRIPT ...\n" ) if ($dbg11);
}
next;
}
if ( $ln =~ //i ) {
$msg = "WARNING: Also found SCRIPT in IMG line ...[$ln]";
push(@warnings, $msg);
prt( "$msg\n" ) if ($dbg16);
}
prt( "[dbg10] Found [$iln] ...\n" ) if ($dbg10);
$bgnln = $i;
while ( !($iln =~ />/) && ($i < $lc)) {
$i++;
my $nxln = $lns[$i];
chomp $nxln;
prt( "[dbg10] Adding [$nxln] ...\n" ) if ($dbg10);
$iln .= ' ' if !($iln =~ /=$/);
$iln .= $nxln;
}
$lnnos = "$bgnln:$i";
my $ind = index($iln, '>');
if ($ind != -1) {
$bal = substr($iln, $ind+1);
$iln = substr($iln, 0, $ind+1);
}
$iln = trim_all($iln);
#if ($iln =~ /src=\"(.+)\"/i) {
if ($iln =~ /src=\s*\"(\S+)\"/i) {
prt( "[dbg10] SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10);
push(@isrc, [$1, $lnnos, $fil]);
$scnt++;
} elsif ($iln =~ /src=\s*(\S+)/i) { # without QUOTES
prt( "[dbg10] SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10);
push(@isrc, [$1, $lnnos, $fil]);
$scnt++;
} elsif ($iln =~ /src=\s*\'(\S+)\'/i) { # single QUOTES
prt( "SRC = $1 In line [$iln]$lnnos...\n" ) if ($dbg10);
push(@isrc, [$1, $lnnos, $fil]);
$scnt++;
} else {
$msg = "WARNING: SRC NOT FOUND in [$iln]$fil:$lnnos...";
push(@warnings, $msg);
prt( "$msg\n" ) if ($dbg16);
}
} elsif ( $ln =~ //i ) {
$inscript = 1;
prt( "Entered a SCRIPT ...\n" ) if ($dbg11);
if ($ln =~ /<\/script>/i) {
$inscript =0;
prt( "EXIT a SCRIPT ...\n" ) if ($dbg11);
}
}
}
prt( "Returning $scnt img sources ...\n") if ($dbg10);
#$dbg12 = $sdbg12;
#$dbg11 = $sdbg11;
#$dbg16 = $sdbg16;
#$dbg10 = $sdbg10;
return @isrc;
}
sub get_text_sub($) {
my $ff = shift; # full file name
my @arr = split(/\./,$ff); # split on DOT
$arr[-1] = 'txt'; # make last 'txt'
my $nf = ''; # start a new file name
my ($part);
foreach $part (@arr) {
$nf .= '.' if (length($nf));
$nf .= $part;
}
return $nf;
}
# ================================================================
# sub check_hrefs($$)
# called from primary file processing - sub process_file_array()
# split passed 'anchor' list into 'type' of link
# and store in global @httprefs, @httpsrefs, @ftprefs, @mtrefs
# skipping javascript:, and checking file/dir for file links
# The global @httprefs is later used for an IP check, if requested.
# ================================================================
sub check_hrefs($$) {
my ($fil, $rsrcs) = @_;
my ($fnm,$fdir,$fext) = fileparse( $fil, qr/\.[^.]*/ );
$fdir = $currworkdir.'/' if ($fdir =~ /^\.(\\|\/)/); # 20100401 - if local use current directory
my $scnt = scalar @{$rsrcs};
my $isphp = (lc($fext) eq '.php');
my $sfil = sub_in_folder($fil);
if ($scnt) {
prt( "Found $scnt anchor href= in $fnm$fext ...\n" ) if ($dbg7);
for (my $i = 0; $i < $scnt; $i++) {
my $orgsrc = ${$rsrcs}[$i][0];
my $lnnos = ${$rsrcs}[$i][1];
my $src = $orgsrc;
if ($src =~ /^http:/i) {
# remote HREF
push(@httprefs, [$src, $fil, $lnnos] );
} elsif ($src =~ /^https:/i) {
# remote HREF
push(@httpsrefs, [$src, $fil, $lnnos] );
} elsif ($src =~ /^ftp:/i) {
# remote HREF
push(@ftprefs, [$src, $fil, $lnnos] );
} elsif ($src =~ /^mailto:/i) {
# remote HREF
push(@mtrefs, [$src, $fil, $lnnos] );
} elsif ( $src =~ /^#/ ) {
# local in page HREF
} elsif ( $src =~ /^javascript:/i ) {
# a JAVASCRIPT HREF
} else {
my $ind = index($src,'#');
if ( $ind != -1 ) {
$src = substr($src,0,$ind);
}
$ind = index($src,'?');
if ( $ind != -1 ) {
$src = substr($src,0,$ind);
}
$src =~ s/\/$//;
if (length($src)) {
if ($src eq '.') { # HREF is just a DOT
if (length($homefile)) { # && ($fdir eq $in_folder)
$src = $homefile; # translate a DOT to HOME FILE
}
}
# 25/07/2007 - deal with '%20' back to space
$src =~ s/%20/ /g;
my $ff = $fdir.$src;
if ( -f $ff ) {
prt( "[dbg5] $src - ok\n" ) if ($dbg5);
} elsif (-d $ff) {
prt( "[dbg5] $src - okd\n" ) if ($dbg5);
} else {
# 20100312 - Special case - I have replaced large ZIPS, and others, with a TEXT
# file, so do NOT give this WARNING, if such a .txt file exists.
my $tmpf = $ff.".txt";
if ( -f $tmpf ) {
prt( "[dbg5] $src - ok AS $tmpf REPLACEMENT FILE!\n" ) if ($dbg5);
} else {
$tmpf = get_text_sub($ff);
if ( -f $tmpf ) {
prt( "[dbg5] $src - ok AS $tmpf REPLACEMENT FILE!2\n" ) if ($dbg5);
} else {
if ($orgsrc eq $src) {
prtw("WARNING: href [$orgsrc] file [$ff] NOT FOUND![3]s in [$sfil]$lnnos\n");
} else {
prtw("WARNING: href [$orgsrc] [$src] file [$ff] NOT FOUND![3] in [$sfil]$lnnos\n");
}
}
}
}
} else {
if ($isphp) {
prt( "Found BLANK HREF [$orgsrc] in PHP [$sfil]...\n" ) if ($dbg17);
} else {
prtw("WARNING: Found BLANK HREF [$orgsrc] in [$sfil] ...");
}
}
}
}
} else {
if ($isphp) {
prt( "Found NO HREFs in PHP [$sfil]...\n" ) if ($dbg17);
} else {
prt( "NO HREF FOUND in [$sfil]...\n" ) if ($shownohrefs);
}
}
}
############################################################
# Only used is $chkip = 1;
# Show IP Address
# uses sockets, gethostbyname
# Return 0, if can NOT be resolved.
# else the number of IP addresses resolved.
############################################################
sub checkIPAddress($) {
my ($nm) = shift;
my @addr = gethostbyname($nm);
my $cnt = 0;
if( !@addr ) {
my $err = $!;
$err =~ s/\n/ /g;
$err =~ s/^\s+//;
$err =~ s/\s+$//;
if ($nm =~ /:\d+$/) {
my $nm2 = $nm;
$nm2 =~ s/:\d+$//;
@addr = gethostbyname($nm2);
if (@addr) {
$nm = $nm2;
} else {
my $err2 = $1;
$err2 =~ s/\n/ /g;
$err2 =~ s/^\s+//;
$err2 =~ s/\s+$//;
prt( "Can't resolve $nm: $err!\nNor $nm2: $err2" );
return 0;
}
} else {
prt( "Can't resolve $nm: $err!\n" );
return 0;
}
}
@addr = map { inet_ntoa($_) } @addr[4 .. $#addr];
foreach my $k (@addr) {
$cnt++;
prt( "$cnt: $nm resolves to IP [$k]\n" ) if ($dbg3);
}
return $cnt;
}
sub getIPAddress($) {
my ($uri) = shift;
my $packed_ip = gethostbyname($uri);
my $ip_address = 'UNKNOWN';
if (defined $packed_ip) {
$ip_address = inet_ntoa($packed_ip);
}
return $ip_address;
}
sub show_uri_ip($$) {
my ($u,$l) = @_;
my $msg = "[$u]";
$msg .= ' ' while (length($msg) < $l);
if (checkIPAddress($u)) {
my $ip = getIPAddress($u);
prt("$msg ok, on IP [$ip]\n");
} else {
prt("$msg FAILED\n");
}
}
################################################
# Add to @scripts multidimensional array,
# if NOT already in there, when on the line
# numbers are added.
###############################################
sub add_2_scripts($$) {
my ($fil, $lns) = @_;
my $sc = scalar @scripts;
for (my $i = 0; $i < $sc; $i++) {
my $cf = $scripts[$i][0];
if ($cf eq $fil) {
my $lc = $scripts[$i][1];
$lc .= ":$lns";
$scripts[$i][1] = $lc;
return 0;
}
}
push(@scripts, [$fil, $lns]);
return 1;
}
####################################################
# Get HREF sources
# Given an ARRAY of file lines, check for
# anchor href="something" ...
# Return the "something" in an array
####################################################
sub get_href_srcs($$) {
my ($fil, @lns) = @_;
my $lc = scalar @lns;
my $scnt = 0;
my $slns = 0; # count the SCRIPT lines
my ($nm,$dir) = fileparse( $fil );
prt( "Processing $lc lines from [$nm] dir=[$dir]...\n" ) if ($dbg6);
my @isrc = ();
my $ln = '';
my $bal = '';
my $inscript = 0;
$slns = 0;
my $bgnln = 0;
my $endln = 0;
for (my $i = 0; $i < $lc; $i++) {
$ln = $bal;
$bal = '';
$ln .= $lns[$i];
chomp $ln;
prt( "$i [$ln] ...\n" ) if ($dbg10);
if ($inscript) {
if ($ln =~ /<\/script>/i) {
$inscript =0;
prt( "EXIT a SCRIPT ...\n" ) if ($dbg4);
add_2_scripts( $fil, $slns );
$slns = 0;
next;
}
$slns++;
next;
}
if ( $ln =~ //) && ($i < $lc)) {
$i++;
my $nxln = $lns[$i];
chomp $nxln;
prt( "Adding [$nxln] ...\n" ) if ($dbg10);
$iln .= ' ' if !($iln =~ /=$/);
$iln .= $nxln;
}
$endln = $i;
my $ind = index($iln, '>');
if ($ind != -1) {
$bal = substr($iln, $ind+1);
$iln = substr($iln, 0, $ind+1);
}
#if ($iln =~ /src=\"(.+)\"/i) {
if ($iln =~ /href\s*=\s*\"(\S+)\"/i) {
prt( "HREF = $1\nIn line [$iln]...\n" ) if ($dbg10);
push(@isrc, [$1, "$bgnln:$endln"] );
$scnt++;
} else {
# hmmm... NO 'href' in his line
if (( $iln =~ /name=\s*\"(\S+)\"/i )||( $iln =~ /name=(\S+)/i )) {
# ignore BOOKMARKS
} elsif (( $iln =~ /id=\s*\"(\S+)\"/i )||( $iln =~ /id=(\S+)/i )) {
# ignore BOOKMARKS
} else {
prtw("WARNING:$fil:$bgnln: HREF NOT FOUND in [$iln] [$ln] [$fil]");
}
}
} elsif ( $ln =~ //i ) {
$inscript = 1;
prt( "Entered a SCRIPT ...\n" ) if ($dbg4);
$slns = 0;
$ln = substr($ln, 7);
if ($ln =~ /<\/script>/i) {
$inscript =0;
prt( "EXIT a SCRIPT ...\n" ) if ($dbg4);
add_2_scripts( $fil, 1 );
$slns = 0;
}
}
}
if ($inscript) {
my $msg = "WARNING: EXIT WHILE IN SCRIPT in [$fil]...";
push(@warnings, $msg);
prt( "$msg\n" ) if ($dbg16);
}
prt( "Returning $scnt HREF sources ...\n") if ($dbg10);
return @isrc;
}
#########################################################
# Passed an array REF of extensions,
# check if this is one of them?
#########################################################
sub is_this_extent($$) {
my ($ext, $rex) = @_;
my $lcx = lc($ext);
foreach my $x (@{$rex}) {
return 1 if ($lcx eq lc($x));
}
return 0;
}
############################################
# only looking for HTM, HTML, PHP,
# could be extended to others maybe ...
############################################
# test an EXTENSION, or form '.htm'...
sub is_htm_ext($) {
my ($ext) = shift;
return( is_this_extent($ext,\@html_ext) );
}
sub is_graf_ext($) {
my ($ext) = shift;
return( is_this_extent($ext,\@graf_ext) );
}
sub is_zip_ext($) {
my ($ext) = shift;
my @arr = qw( .zip .gz );
return( is_this_extent($ext,\@arr) );
}
sub is_css_ext($) {
my ($ext) = shift;
return( is_this_extent($ext, \@css_ext) );
}
sub is_txt_ext($) {
my ($ext) = shift;
my @arr = qw( .txt );
return( is_this_extent($ext, \@arr) );
}
sub is_script_ext($) {
my ($fil) = shift;
return( is_this_extent($fil, \@script_ext) );
}
# test a FILE/PATH extension
sub is_htm_file_ext($) {
my ($fil) = shift;
my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
return( is_htm_ext($e) );
}
sub is_graphic_file_ext($) {
my ($fil) = shift;
my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
return( is_graf_ext($e) );
}
sub is_zip_file_ext($) {
my ($fil) = shift;
my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
return( is_zip_ext($e) );
}
sub is_css_file_ext($) {
my ($fil) = shift;
my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
return( is_css_ext($e) );
}
sub is_txt_file_ext($) {
my ($fil) = shift;
my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
return( is_txt_ext($e) );
}
sub is_script_file_ext($) {
my ($fil) = shift;
my ($n,$d,$e) = fileparse($fil,qr/\.[^.]*/);
return( is_script_ext($e) );
}
################################################
# my $ignfpd = 1; # ignore FRONTPAGE folders
################################################
sub is_fp_folder($) {
my ($inf) = shift;
foreach my $fil (@fpfolders) {
if (lc($inf) eq lc($fil)) {
return 1;
}
}
return 0;
}
sub is_xclude_folder($) {
my ($inf) = shift;
foreach my $fil (@g_xclude_dir) {
if (lc($inf) eq lc($fil)) {
return 1;
}
}
return 0;
}
####################################
# Check if FILE is in EXCLUDE list
####################################
sub in_excludes($) {
my ($fil) = shift;
my $lcf = lc($fil);
foreach my $f (@excludes) {
if (lc($f) eq $lcf) {
return 1;
}
}
return 0;
}
sub in_spl_excludes($) {
my ($fldr) = shift;
my $lfldr = lc($fldr);
foreach my $f (@splexcludes) {
if (lc($f) eq $lfldr) {
return 1;
}
}
return 0;
}
####################################################################
# process_folder(folder)
# Main DIRECTORY processing function
#
# Open the FOLDER given, and collect ALL files found,
# iterate into sub-directories, if $recurse is non-zero,
# and it is NOT a special FRONTPAGE (hidden) FOLDER.
#
# Files are collected into multidemensional arrays, and
# %g_dir_files - store per directory, per type array of files
####################################################################
sub process_folder($) {
my ($inf) = shift;
my $fcnt = 0; # start with NO 'html' files
my ($fil,$ff,$fnd,$dhr,$far,$isind);
my $rg_dir_files = \%g_dir_files;
prt( "[dbg1b] Processing [$inf] folder...\n" ) if ($dbg1b);
my @dirs = ();
my $uinf = dos_2_unix($inf); # keep HASH in UNIX form
$fnd = 0; # count of 'index.htm' type files found in folder
if ( opendir( DIR, $inf ) ) {
my @files = readdir(DIR);
closedir DIR;
$fcnt = scalar @files;
#prt("Processing $fcnt files from [$inf]...\n");
$fcnt = 0;
${$rg_dir_files}{$uinf} = {} if (!defined ${$rg_dir_files}{$uinf}); # store files %hash->dir->%type->@files
foreach $fil (@files) {
next if (($fil eq ".")||($fil eq ".."));
$isind = 0;
$ff = $inf."\\".$fil;
if ( -d $ff ) {
if ($recurse) {
next if ($ignfpd && is_fp_folder($fil)); # ignore FRONTPAGE folders
next if (@splexcludes && in_spl_excludes($fil));
next if (is_xclude_folder($fil));
#process_folder( $ff );
push(@dirs,$ff);
}
} else {
# NOTE: multidimensional arrays pushed - offsets into arrays
# my $of_ff = 0; # 1 - full file name
# my $of_hr = 1; # 2 - array ref of href links
# my $of_im = 2; # 3 - array ref of image links
# my $of_lk = 3; # 4 - linked count
# my $of_rh = 4; # 5 - contains 'collecthrefs2' collections
# my $of_to = 5; # links TO
# my $of_fm = 6; # links FROM
# my $of2hr = 7; # more or less duplicate of $of_hr
# my $of2im = 8; # more or less duplicate of $of_im
if ( in_excludes($fil) ) { # NOT in @excludes
prt("[dbg37] File [$fil] EXCLUDED!\n") if ($dbg37);
} else {
$dhr = ${$rg_dir_files}{$uinf}; # extract hash ref for this (unix) path
my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ );
if (defined $ext_hash{$ext}) {
$ext_hash{$ext}++;
} else {
$ext_hash{$ext} = 1;
}
my $arr_ref = \@other_files;
my $type = "Other";
if (is_htm_file_ext($fil)) {
$arr_ref = \@htm_files;
$fcnt++;
$isind = is_def_index_file($fil);
$fnd++ if ($isind);
$type = "HTML";
} elsif (is_graphic_file_ext($fil)) {
$arr_ref = \@img_files;
$type = "IMG";
} elsif (is_zip_file_ext($fil)) {
$arr_ref = \@zip_files;
$type = "ZIP";
} elsif (is_css_file_ext($fil)) {
$arr_ref = \@css_files;
$type = "CSS";
} elsif (is_txt_file_ext($fil)) {
$arr_ref = \@txt_files;
$type = "TEXT";
} elsif (is_script_file_ext($fil)) {
$arr_ref = \@script_files;
$type = "SCRIPT";
}
# ==========================================================
# 0 1 2 3 4 5 6 7 8 9 10
push(@{$arr_ref}, [$ff, '', '', 0, 0, '', '', 0, 0, 0, 0] );
# ==========================================================
${$dhr}{$type} = [] if (!defined ${$dhr}{$type});
$far = ${$dhr}{$type};
push(@{$far},[$fil,$isind,0,[],[],0]);
prt("[dbg36] Got file [$fil], with ext = [$ext] = $type\n") if ($dbg36);
}
}
}
prt( "[dbg1] Processed $inf folder finding $fcnt HTML files ...\n" ) if ($dbg1);
if ($fcnt) {
# got some, but any 'index.htm' type
push(@g_folders_noind, $inf) if (!$fnd);
} else { # NO 'html' files found
push(@g_empty_folders, $inf); # store the EMPTY folder
}
} else {
pgm_exit(1,"ERROR: Failed to open folder [$inf]!\n" );
}
if (@dirs) {
# process the subdirectories, if any to process
foreach $ff (@dirs) {
process_folder( $ff );
}
}
}
##############################################
# Just to show the COUNTS in the ARRAYS
##############################################
sub show_found_counts() {
my ($cnt);
if (VERB1()) {
$cnt = scalar @htm_files;
prt( "show_found_counts() $cnt HTML, ");
$cnt = scalar @img_files;
prt( "$cnt images, " );
$cnt = scalar @css_files;
prt( "$cnt css, " );
$cnt = scalar @zip_files;
prt( "$cnt zip, " );
$cnt = scalar @txt_files;
prt( "$cnt txt, " );
$cnt = scalar @script_files;
prt( "$cnt script, " );
$cnt = scalar @other_files;
prt( "and $cnt others ...\n" );
}
$cnt = scalar keys %ext_hash;
if ($dbg27 || VERB5()) {
prt( "$cnt extensions, and each count ...\n" );
foreach my $key (keys %ext_hash) {
my $val = $ext_hash{$key};
prt( "$val $key ");
}
prt("\n");
}
my $cnt1 = scalar @g_empty_folders;
my $cnt2 = scalar @g_folders_noind;
prtw("WARNING: Got $cnt1 folders with NO HTML type files, and $cnt2 with no 'index.htm'\n") if (($cnt1 || $cnt2) && $show_no_index);
}
# @ipsfound = ;
sub in_ips_found($) {
my ($ip) = shift;
my $lcip = lc($ip);
foreach my $i (@ipsfound) {
chomp $i;
if (lc($i) eq $lcip) {
return 1;
}
}
return 0;
}
sub trim_href($) {
my ($fh) = shift;
my $h = $fh;
my $ind = index($h,'#');
$h = substr($h,0,$ind) if ($ind > 0);
$ind = index($h,'?');
$h = substr($h,0,$ind) if ($ind > 0);
$h =~ s/\/$//; # remove any trailing '/'
if ($h =~ /\.$/) {
$h =~ s/\.$//; # remove any trailing '.' - but wonder what this is
prt("PROBLEM: File [$g_active_file]$g_active_lnn\n");
pgm_exit(1,"Check out this trailing '.', from [$fh]!\n");
}
return $h;
}
sub get_host_link($) {
my ($h) = shift;
if ($h =~ /^(\w+:\/\/[\w\.-]+\.[A-Z]{2,4})/i) {
return "$1";
}
return $h;
}
sub get_host_link13($) {
my ($uri) = shift;
# print "$1,$2, $3,$4, $5, $6,$7, $8, $9\n"
if ($uri =~ m{^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?}) {
return $1.$3;
}
return $uri;
}
sub get_host_link4($) {
my ($uri) = shift;
# print "$1,$2, $3,$4, $5, $6,$7, $8, $9\n"
if ($uri =~ m{^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?}) {
return $4;
}
return $uri;
}
sub get_domain_name($$) {
my ($uri,$rw) = shift;
$uri = get_host_link4($uri);
$uri =~ s/^www(\d*)\.// if ($rw);
return $uri;
}
sub show_remote_links() {
my $rh = \%g_hrefs;
my $rh2 = \%g_hosts;
my $rd = \%g_domains;
my ($len,$key,$val,$href,$cnt,$chk,$dom);
my $min = 0;
my @hosts = ();
my @domains = ();
foreach $key (keys %{$rh}) {
$len = length($key);
$min = $len if ($len > $min);
}
$min += 2;
$min = 65 if ($min > 65);
my $icnt = 0;
foreach $key (sort keys %{$rh}) {
$val = ${$rh}{$key};
# still to decide which is best???
$href = get_host_link($key);
#my $href13 = get_host_link13($key);
#if ($href ne $href13) {
# pgm_exit(1, "CHECK: from [$key], got \n [$href] and \n [$href13] FIX CODE!\n");
#}
$chk = '';
if (defined ${$rh2}{$href}) {
${$rh2}{$href}++;
} else {
${$rh2}{$href} = 1;
push(@hosts,$href);
$dom = lc(get_domain_name($key,0));
if (!defined ${$rd}{$dom}) {
${$rd}{$dom} = 1;
push(@domains,$dom)
}
if ($check_host_ip) {
if (checkIPAddress($href)) {
$chk = 'ok';
} else {
$chk = 'NA';
}
}
}
$key = '['.$key.']';
$key .= ' ' while (length($key) < $min);
$icnt++;
$cnt = ${$rh2}{$href};
prt( "$icnt: $key ($cnt)$chk in $val\n" ) if ($out_remote_links);
}
if (@hosts) {
if ($os =~ /Win/) {
$out_hosts = path_u2d($out_hosts);
}
write2file(join("\n",sort(@hosts))."\n",$out_hosts);
prt("Full list of HOSTS written to [$out_hosts]...\n") if (VERB2());
}
if (@domains) {
if ($os =~ /Win/) {
$out_domains = path_u2d($out_domains);
}
write2file(join("\n",sort(@domains))."\n",$out_domains);
prt("Full list of Domains written [$out_domains]...\n") if (VERB2());
}
$icnt = 0;
foreach $key (sort keys %{$rh2}) {
$val = ${$rh2}{$key};
$icnt++;
prt("$icnt: $key $val\n") if ($out_remote_links);
}
}
#######################################################
# Process the HTTP HREF sources
# if $chkip = 1; then attempt to resolve the IP
# addresses from the host name.
# push(@httprefs, [$src, $fil, $lnnos] );
#######################################################
sub process_host_array() {
my $hcnt = scalar @httprefs;
my $newcnt = 0;
my ($val,$msg,$file,$i,$href,$lnn,$nm,$dir,$key,$icnt,$rawhr);
my @hrefsarr = ();
$icnt = 0;
if ($hcnt) {
prt( "Found total $hcnt remote 'http' HREF entries ...\n" ) if (VERB2());
for ($i = 0; $i < $hcnt; $i++) {
$lnn = $httprefs[$i][2]; # get file LINE NUMBER
$file = $httprefs[$i][1]; # get FULL FILE NAME
$g_active_file = $file;
$g_active_lnn = $lnn;
# $href = $httprefs[$i][0];
# should remove any hash like '#pos', and any query like '?a=b&c=d'
$rawhr = $httprefs[$i][0];
push(@hrefsarr,$rawhr);
$href = trim_href($rawhr);
($nm,$dir) = fileparse($file);
if (defined( $g_hrefs{$href} )) {
$val = $g_hrefs{$href};
$val .= ' '.$file;
} else {
$val = $file;
$newcnt++;
}
$val .= ":$lnn";
$g_hrefs{$href} = $val;
prt( "$href in [$file]$lnn\n" ) if ($dbg2);
}
if (@hrefsarr) {
my %dupes = ();
foreach $href (@hrefsarr) {
$dupes{$href} = 1;
}
@hrefsarr = keys(%dupes);
}
if (@hrefsarr) {
if ($os =~ /Win/) {
$out_hrefs = path_u2d($out_hrefs);
}
write2file(join("\n",sort(@hrefsarr))."\n",$out_hrefs);
prt("Full list of HREFS written to [$out_hrefs] file.\n") if (VERB2());
}
$hcnt = scalar keys(%g_hrefs);
show_remote_links();
prt( "Found $hcnt different remote entries ...($newcnt) " );
if ($chkip) {
prt("and now checking for valid IP\n");
my $inips = 0;
prt( "Checking $hcnt IP addresses ... " );
if ( !$refreships && ( -f $ipfile)) {
if (open INF, "<$ipfile") {
@ipsfound = ;
close INF;
prt( "Have ".scalar @ipsfound." in $ipfile" );
} else {
prt( "Warning: Failed to open $ipfile" );
}
}
prt("\n");
$icnt = 0;
foreach $key (keys %g_hrefs) {
$val = $g_hrefs{$key};
$icnt++;
prt( "$key in $val\n" ) if ($dbg8);
if ($key =~ /^http:\/\//i) {
my $hkey = substr($key, 7); # REMOVE protocol string
my @arr = split( /\//, $hkey ); # and get just the domain name
$hkey = $arr[0]; # but this may not work with '#', or queries '?'
# ***TBF*** TO BE FIXED
if ( !in_ips_found($hkey) ) {
if (checkIPAddress( $hkey ) == 0) {
$msg = "FAILED: NO IP FOR HOST [$hkey][$val]";
push(@warnings, $msg);
prt( "$msg\n" ) if ($dbg16);
} elsif ($writeips) {
push(@ipsfound,"$hkey\n");
}
} else {
$inips++;
}
}
if (($icnt % 100) == 0) {
prt( "Done $icnt IP Addresses ...\n" );
}
}
prt( "Completed $icnt IP Addresses ... " );
if ($writeips) {
$val = join("\n", sort @ipsfound);
$val = trimblanklines($val);
write2file($val, $ipfile);
prt( "$inips in previous. Written ".scalar @ipsfound." to $ipfile" );
}
prt("\n");
} else {
prt(", but IP checking is OFF! (add -checkips)\n");
}
}
}
sub write_missing($$) {
my ($fil,$rh) = @_; # ($not_found_file,\%g_images_notfound) if (length($not_found_file));
my ($key,$list);
$list = '';
foreach $key (keys %{$rh}) {
$list .= "\n" if (length($list));
$list .= $key;
}
if (length($list)) {
$list .= "\n";
write2file($list,$fil);
prt("Written missing list to [$fil]\n");
}
}
sub show_scripts() {
my ($val,$scnt,$file);
$scnt = scalar @scripts;
if ($scnt && ($dbg9 || $showscripts)) {
prt( "Got $scnt files containing SCRIPTS ...\n" );
# push(@scripts, [$fil, $lns]);
for (my $i = 0; $i < $scnt; $i++) {
$file = $scripts[$i][0];
$val = $scripts[$i][1];
prt( "$file $val\n" );
}
}
}
sub show_dir_files($) {
my ($hr) = @_; # = \%g_dir_files
my ($key,$val,$k2,$v2,$v3,$dir,$cnt,$i,$ii,$icnt);
foreach $key (sort keys %{$hr}) {
$dir = sub_in_folder($key);
$dir = '' if (length($dir)==0);
$val = ${$hr}{$key}; # extract hash ref BY TYPE
prt("Directory: [$dir]\n");
$icnt = 0;
foreach $k2 (sort keys %{$val}) {
# for each type
$v2 = ${$val}{$k2};
$cnt = scalar @{$v2};
prt(" Type $k2 = $cnt\n");
for ($i = 0; $i < $cnt; $i++) {
$v3 = ${$v2}[$i][0];
$ii = ${$v2}[$i][1];
prt(" File: [$v3] ");
if ($ii) {
$icnt++;
prt(" ('index' type)");
}
prt("\n");
}
}
if (!$icnt) {
prt("Directory: [$dir] has NO 'index' type file.\n");
} elsif ($icnt > 1) {
prt("Directory: [$dir] has $icnt 'index' type files.\n");
}
}
pgm_exit(1,"Debug run show of directories and files...\n");
}
# ####################################################
### MAIN ###
parse_args(@ARGV);
if (length($in_folder) == 0) {
mydie( "No input folder (or file) given/found in command ...\n" );
}
if (-f $in_folder) {
#$g_user_file = File::Spec->rel2abs($in_folder);
$g_user_file = $in_folder;
($homefile, $in_folder) = fileparse($in_folder);
$in_folder =~ s/[\\\/]$//;
$in_folder = $currworkdir if ($in_folder eq '.'); # 2010-04-02 - use FULL work directory if just '.'
$single_file = 1;
}
# set_debug_on();
# set_htools_dbg_on();
show_startup();
if ( !$recurse && length($g_user_file) && (-f $g_user_file)) {
# 0 1 2 3 4 5 6 7 8 9 10
push(@htm_files, [$g_user_file, '', '', 0, 0, '', '', 0, 0, 0, 0] );
} else {
process_folder($in_folder);
}
###show_dir_files(\%g_dir_files) if ($debug_on);
show_found_counts();
process_file_array(); # main file processing - each HTML file found is processed
process_host_array();
if (length($homefile)) {
###trace_from_htm( $homefile, 0 );
check_linkages( $homefile );
check_local_links( $homefile );
show_link_counts("HTML ", \@htm_files);
show_link_counts("IMG ", \@img_files);
show_link_counts("CSS ", \@css_files);
show_link_counts("ZIP ", \@zip_files);
show_link_counts("TXT ", \@txt_files);
show_link_counts("Script ", \@script_files);
show_link_counts("Other ", \@other_files);
}
show_scripts();
my $mlcnt = scalar @missing_links;
if ($mlcnt && !$dbg33) {
prtw("WARNING: Found $mlcnt files with NO LINKS!\n");
}
##############################################################
prt( "\n###### SHOW RESULTS ########\n" );
prt( "WARNING: $hrflnkcnt images by HREF not shown! (use -showhreflinks)\n" ) if ($hrflnkcnt);
show_warnings();
if (@missed) {
prt( "\nMISSING FOLLOW: ".scalar @missed."\n" );
foreach my $mfile (@missed) {
prt( "$mfile\n");
}
}
prt( "###### END RESULTS ########\n" );
write_missing($not_found_file,\%g_images_notfound) if (length($not_found_file));
##############################################################
pgm_exit(0,"Normal exit(0)\n");
##################################
sub show_help {
prt( "$pgmname [Options] input-folder or home-file-name.\n" );
prt( "Purpose: To take a folder, or home file in a folder, and\n" );
prt( "check the assume local Web Site for internal consistency.\n" );
prt( "Options:\n" );
prt( " -checkips = Will check the IP resolution of REMOTE HREF items.\n" );
prt( " -showhreflinks = Show a WARNING when an IMG, ICO, etc is a REMOTE link\n" );
prt( " -showlinks = Show the links for each file ...\n" );
prt( " -showremote = Show remote links, and file containing link.\n" );
prt( " -showscripts = Show SCRIPT files ...\n" );
prt( " -shownoindex = Show folders with NO 'index' type file, and/or NO html types files.\n");
prt( " -writeips = Write HREF of IP found to a file ...\n" );
prt( " -refreships = If -checkips, and -writeips, re-write NEW check file...\n" );
prt( " -ipfile out-file = Set HREF output file. Default is [$ipfile].\n" );
prt( " -ignore in-file = Ignore this file. Repeat for more. use '.none.' to reset list.\n" );
prt( " -shownohrefs = Show when NO HREF found in a file.\n" );
prt( " -loadlog (-ll) = Load LOG file at end.\n");
prt( " -showallnotfound = Show ALL images not found. Default to show only 'different'.\n");
prt( " -nffile out-file = Write not found files to file name given.\n");
prt( " -v[NN] = Bump verbal, or set to 'NN'.\n" );
prt( " -x = Exclude directory. '.reset.' clears the list.\n" );
prt( " -load-log (-l) = load log with output at end.\n");
prt( " -nofollow = Do not follow page links.\n");
prt( "If an input-folder given, then no trace of internal links will be done.\n" );
prt( "If a home file name is given, the folder used will be of that file.\n" );
prt( "All arguments can also be given with a '--' prefix, if that 'feels' better ;=))\n");
prt( "Following are the current default settings ...\n" );
$verbosity += 5;
show_startup();
mydie(" Happy link checking ;=))\n");
}
# Ensure argument exists, or die.
sub require_arg {
my ($arg, @arglist) = @_;
mydie( "ERROR: no argument given for option '$arg' ...\n" )
if ! @arglist;
}
##########################################################
# Parse USER input
# Largerly still to be done
##########################################################
sub parse_args {
my (@av) = @_;
my ($arg,$narg,$sarg);
while (@av) {
$arg = $av[0];
if ($arg =~ /^-/) {
$sarg = substr($arg,1);
$sarg = substr($sarg,1) while ($sarg =~ /^-/);
if (($sarg eq 'h')||($sarg eq 'help')||($sarg eq '?')) {
prt("Showing help...\n");
show_help();
} elsif ($sarg =~ /^v/) {
if ($sarg =~ /^v.*(\d+)$/) {
$verbosity = $1;
} else {
$sarg = substr($sarg,1);
while ($sarg =~ /^v/) {
$sarg = substr($sarg,1);
$verbosity++;
}
}
prt( "Set verbal to $verbosity\n" ) if (VERB1());
} elsif ($sarg eq 'checkips') {
$chkip = 1;
prt( "Will check IP of REMOTE HREF items.\n" );
} elsif ($sarg eq 'showhreflinks') {
$showhreflinks = 1;
prt( "Show a WARNING when an IMG, ICO, etc is a REMOTE link\n" );
} elsif ($sarg eq 'showlinks') {
$showlinks = 1;
prt( "Show the links for each file ...\n" );
} elsif ($sarg eq 'showremote') {
$out_remote_links = 1;
prt( "Show remote links, and each file...\n" );
} elsif ($sarg eq 'showscripts') {
$showscripts = 1;
prt( "Show SCRIPT files ...\n" );
} elsif ($sarg eq 'writeips') {
$writeips = 1;
prt( "Write HREF of IP found to a file ...\n" );
} elsif ($sarg eq 'refreships') {
$refreships = 1;
prt( "If -checkips, and -writeips, re-write NEW check file...\n" );
} elsif ($sarg eq 'ipfile') {
require_arg(@av);
shift @av;
$ipfile = $av[0];
prt( "HREF output set to $ipfile ...\n" );
} elsif ($sarg eq 'ignore') {
require_arg(@av);
shift @av;
$arg = $av[0];
if ($arg eq '.none.') {
@excludes = ();
prt( "Reset EXCLUDES array ...\n" );
} else {
push(@excludes, $arg);
prt( "Added file [$arg] to EXCLUDES ...\n" );
}
} elsif ($sarg eq 'x') {
require_arg(@av);
shift @av;
$arg = $av[0];
if ($arg eq '.reset.') {
@g_xclude_dir = ();
prt( "Reset EXCLUDE directory array ...\n" );
} else {
push(@g_xclude_dir, $arg);
prt( "Added file [$arg] to EXCLUDE directories ...\n" );
}
} elsif ($sarg eq 'shownoindex') {
$show_no_index = 1;
prt("Set to show folders with NO 'index' type file, and/or NO html types files.\n");
} elsif (($sarg eq 'll')||($sarg eq 'loadlog')) {
$load_log = 1;
prt("Set to load log at end.\n");
} elsif ($sarg eq 'showallnotfound') {
prt( "Set to show ALL images not found.\n");
$show_all_not_found = 1;
} elsif ($sarg eq 'nffile') {
require_arg(@av);
shift @av;
$narg = $av[0];
$not_found_file = $narg; # if file name given, write list to that file...
prt("Will write missing to file [$narg]\n");
} elsif ($sarg eq 'debug') {
prt("Setting BIG debug ON...\n");
set_debug_on();
set_htools_dbg_on();
$load_log = 1;
} elsif ($sarg =~ /^l/) {
$load_log = 1;
prt("Set to load log at end.\n");
} elsif ($sarg eq 'nofollow') {
$recurse = 0;
prt("Set to NOT follow links\n");
} else {
mydie( "ERROR: Unknown argument [$arg] ...\n" );
}
} else {
# no leading '-'
$in_folder = $av[0];
prt( "Input folder set to [$in_folder]...\n" ) if (VERB2());
}
shift @av;
}
# check the INPUT folder
if ( $debug_on ) {
prtw("WARNING: DEBUG is ON!\n");
if (length($in_folder) == 0) {
if (length($def_in_folder) && (-d $def_in_folder)) {
$in_folder = $def_in_folder;
prt("[debug_on] Input folder set to DEFAULT [$in_folder]...\n");
} elsif (length($def_file) && (-f $def_file)) {
$in_folder = $def_file;
prt("[debug_on] Input folder set to DEFAULT [$in_folder]...\n");
}
}
$load_log = 1;
}
if (length($in_folder) == 0) {
mydie( "ERROR: No VALID FOLDER OR FILE NAME found in command\n" );
}
if ( !( (-d $in_folder) || (-f $in_folder ) ) ) {
mydie( "ERROR: [$in_folder] is NOT VALID FOLDER OR FILE NAME\n" );
}
# pgm_exit(1,"Current work directory: [$currworkdir]...\n");
}
# eof - chklinks03.pl