Generated: Tue Feb 2 17:54:31 2010 from fav-05.pl 2007/08/24 21.2 KB.
#!/perl -w # NAME: fav-05.pl # AIM: To read the Internet Favorites, and produce # a HTML document, with links and description # # 23/08/2007 - into new 'format', and some updates using Tidy to 'fix' text ... # added date table, if $chkip, check the IP can be resolved, mark <b>(B)</b> if not. # if $chkpage, download the page, and mark <b> if this fails. # # 2006.07.11 - switch link column, and add (B) broken, from c:\HOMEPAGE\Broken02.htm # update 2006.06.28 - weed out local references # Added a MAXIMUM width, so the table approximately 'fits' a 1024 wide screen # change to using '<base target="_blank">' # # 2005.11.12 - works ok - geoff mclane - geoffair.net/favorites.htm # use strict; use warnings; use File::stat; use Socket; use LWP::Simple; unshift(@INC, 'C:/GTools/perl'); require 'logfile.pl' or die "Unable to load logfile.pl ...\n"; # log file stuff my ($LF); my $pgmname = $0; if ($pgmname =~ /\w{1}:\\.*/) { my @tmpsp = split(/\\/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = "temp.$pgmname.txt"; open_log($outfile); prt( "$0 ... Hello, World ...\n" ); ##my $htmfil = 'favorites.htm'; # OUTPUT HTML FILE my $htmfil = 'tempfav.htm'; # OUTPUT HTML FILE my $chkip = 0; # physically CHECK resolves to an IP address, or 2 my $chkpage = 0; # longer FETCH ACTUAL PAGE - about 1 sec per entry!!! # set a sample maximum title, wrap start at -10 from this - original set at 60 # 12345678901234567890123456789012345678901234567890123456789012345678901234567890 # 1 2 3 4 5 6 7 my $maxtit = 'Domain Name Registration, Domain Transfe'; # rs. Your domain name search starts here.'; my $maxwid = length($maxtit); my $DT = YYYYMMDD(time()); my $hvers = " <!-- GA.$DT - geoffair.net - redo listing using $pgmname -->\n"; $hvers .= " <!-- P26.2006.07.11 - update -->\n"; $hvers .= ' <!-- p26.2005.11.11 - List of favorites in PRO-1 geoffmclane.com/favorites.htm -->'; # parameters to run tidydev to 'fix' tit text. could not get stdin functioning, so # write to a file, and pass file to tidydev.exe (in my PATH). my $params = '-f temptidy.txt --tidy-mark no --doctype omit --show-body-only yes'; my $tablelink = 24; # put a LINK LINE after this many rows ### debug my $dbg3 = 0; # show resolved IP addresses ######################################### my $favfolder = get_favorite_folder(); prt( "Processing folder [$favfolder] ...\n" ); my $basedir = quotemeta($favfolder); # entries to EXCLUDE my @fav_exclude = ( 'https://geoffmclane.com:2083/frontend/x/index.html' ); my @dirs = ($favfolder); # start folder my @fils = (); # collection of files from folder my @tblist = (); # final multidimensional array my @warnings = (); my $wmsg = ''; my @broken = (); # if $chkip, broken links found. my @nopage = (); # if $chkpage, pages that did not load. # MULTI-DIMENSIONAL ARRAY - holding all the information for the TABLES # 0 1 2 3 4 5 6 7 8 9 10 # $tl_ sn url mu tm dir tit dn vip txt ttt dom #push(@tblist, [$sn, $u, $mu, $tms, $fold, $tit, 0, 0, 0, $ttt,$dom]); my $tl_sn = 0; my $tl_url = 1; my $tl_mu = 2; my $tl_tm = 3; my $tl_dir = 4; my $tl_tit = 5; my $tl_dn = 6; my $tl_vip = 7; # valid IP, if checked my $tl_txt = 8; # valid page, if checked my $tl_ttt = 9; # Tidied title text my $tl_dom = 10; # domain NAME ### begin processing while (@dirs) { my @dir2 = @dirs; # copy array @dirs = (); # and kill these foreach my $dn (@dir2) { process_dir($dn); # prcess folder, which may yield more folders } } get_table_arr(); # build up MULTI-DIMENSIONAL array for TABLES post_processing(); # Tidy tit text, and get domain name # show MISSING if ($chkip && @broken) { prt( "Showing list of ".scalar @broken." BROKEN URLS ...\n" ); foreach my $u (@broken) { prt( "$u\n" ); } } if ($chkpage && @nopage) { prt( "Showing list of ".scalar @nopage." URLS where PAGE failed ...\n" ); foreach my $u (@nopage) { prt( "$u\n" ); } } # WRITE HTML FILE if( write_html_file( $htmfil ) ) { system($htmfil); # load HTML file } close_log($outfile,1); # close LOG and LOAD exit(0); # ALL DONE ##################################### ###### subs # at presetn ADDS 2 things to the multi-dimensional array sub post_processing { my $bgntime = time(); my $max = scalar @tblist; my ($i, $f_link, $f_tit, $f_ttt, $f_dom); prt( "Post processing $max entries ...\n" ); for ($i = 0; $i < $max; $i++ ) { $f_link = $tblist[$i][$tl_url]; # column 3 (HREF) $f_tit = $tblist[$i][$tl_tit]; # column 2 $f_ttt = get_tidy_text($f_tit); $tblist[$i][$tl_ttt] = $f_ttt; $f_dom = Get_Domain_Name($f_link); $tblist[$i][$tl_dom] = $f_dom; } prt( "Done post for $max entries ...".secs_2_hhmmss(time() - $bgntime). "...\n" ); } ############################################################## ###### HTML STUFF sub get_link_text { my ( $grp ) = shift; my $lt = ''; $lt .= ' <a target="_self" href="#top">top</a> '."\n" if ($grp != 1); $lt .= ' <a target="_self" href="#folder">folder</a> '."\n" if ($grp != 4); $lt .= ' <a target="_self" href="#dateorder">date</a> '."\n" if ($grp != 3); $lt .= ' <a target="_self" href="#end">end</a> '."\n" if ($grp != 2); $lt .= ' <a target="_self" href="favorite.htm">back</a> '."\n"; $lt .= ' <a target="_self" href="home2.htm">home</a> '."\n"; return $lt; } sub add_link_para { my ( $hf, $grp ) = @_; print $hf " <p class=\"ctr\">\n"; print $hf get_link_text( $grp ); print $hf " </p>\n"; } # out the HEAD sub out_htm_head { my ($hf) = shift; print $hf <<EOF; <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <head> <meta http-equiv="Content-Language" content="en-us"> <meta http-equiv="Content-Type" content="text/html; charset=us-ascii"> <meta name="Author" content="Geoff Mclane"> <title> List of Geoff Favorites </title> <link rel="SHORTCUT ICON" href="http://geoffair.net/icong.ico"> <link rel="stylesheet" type="text/css" href="homeg.css"> <style type="text/css"> <!-- /* Style Definitions */ .smfnt { font-size : small; } --> </style> <base target="_blank"> </head> <body> <h1> <a name="top" id="top"></a>List of Geoff Favorites </h1> EOF add_link_para( $hf, 1 ); # exclude 'top' print $hf <<EOF; <p> This is a simple table, as at $DT, of my ever changing, personal <span class= "bld">Favorites</span>. It is autogenerated periodically, using a Perl script, in an attempt to keep it up to date ;=)) It does contain some broken links, sites that have disappeared, but most are valid and current. The base target has been set to _blank, so when a link is clicked, it should open in a NEW browser page. While the link text is sometimes truncated, the underlying anchor reference contains the full link ... Enjoy ... </p> EOF } sub add_link_item { my ($hf, $cols) = @_; print $hf "\n"; print $hf " <tr>\n"; print $hf " <td colspan=\"$cols\" align=\"center\" class=\"smfnt\">"; add_link_para( $hf, 0 ); # exclude none print $hf " </td>\n"; print $hf " </tr>\n"; } # out FIRST TABLE sub out_htm_table { my ($hf) = shift; my $tcnt = scalar @tblist; my ($f_title, $f_link, $f_tlink, $f_date, $f_fold, $f_tit, $f_bkn, $f_pag, $f_ttt); my $rowcnt = 0; add_link_para( $hf, 4 ); # exclude 'folder' print $hf <<EOF; <a name="folder" id="folder"></a> <table border="1" width="100%" summary="List of favorites - First column is the title, and the 2nd is link"> <tr> <th> Folder </th> <th> Title </th> <th> Link </th> <th> Date </th> </tr> EOF # 0 1 2 3 4 5 6 7 8 9 10 # $tl_ sn url mu tm dir tit dn vip txt ttt dom #push(@tblist, [$sn, $u, $mu, $tms, $fold, $tit, 0, 0, $tx, $ttt, $dom ]); for (my $i = 0; $i < $tcnt; $i++) { $f_title = $tblist[$i][$tl_sn]; # unused $f_link = $tblist[$i][$tl_url]; # column 3 (HREF) $f_tlink = $tblist[$i][$tl_mu]; # column 3 (text) $f_date = YYYYMMDD($tblist[$i][$tl_tm]); # column 4 (as YYYY/MM/DD) $f_fold = $tblist[$i][$tl_dir]; # column 1 $f_tit = $tblist[$i][$tl_tit]; # column 2 $f_bkn = $tblist[$i][$tl_vip]; # broken link $f_pag = $tblist[$i][$tl_txt]; # missing page $f_ttt = $tblist[$i][$tl_ttt]; # tidied text if ( !$f_bkn ) { $f_title = '<b>(B)</b> '.$f_title; $f_tit = '<b>(B)</b> '.$f_tit; } elsif ( !$f_pag ) { $f_title = '(b) '.$f_title; $f_tit = '(b) '.$f_tit; } print $hf "\n"; print $hf " <tr>\n"; print $hf " <td>$f_fold</td>\n"; print $hf " <td>$f_ttt</td>\n"; print $hf " <td><a href=\"$f_link\">$f_tlink</a></td>\n"; print $hf " <td>$f_date</td>\n"; print $hf " </tr>\n"; $rowcnt++; if ($rowcnt > $tablelink) { if (($tcnt - $i) > ($tablelink / 2)) { add_link_item($hf, 4); } $rowcnt = 0; } } print $hf " </table>\n"; } # paragraph between tables sub out_htm_para1 { my ($hf) = shift; print $hf <<"EOF"; <p> <a name="bottom" id="bottom"></a> This table is auto-generated from a Perl script, reading and analysing my 'Favorites' folder, from the USERPROFILE given in the environment. Those marked with a <b>(B)</b> were <b>BROKEN</b> links at the last full verification done by the perl script ... and a small (b) suggests the actual page could not be fetched by the perl script ... sometimes this is due to the fact that they are secure sites (https), and sometimes due to the fact that the site, or at least that page, has since been pulled down, but I have yet to delete this link from my personal 'Favorites' ... and just sometimes the perl script makes a mistake in its verification process, and/or the site has a redirection active! </p> EOF } # out 2nd DATE TABLE sub out_htm_table2 { my ($hf) = shift; my $tcnt = scalar @tblist; my ($i, $f_title, $f_link, $f_tlink, $f_date, $f_fold, $f_tit, $f_bkn, $f_pag, $f_tm, $f_ttt); add_link_para( $hf, 3 ); # exclude 'dateorder' print $hf <<EOF; <p> <a name="dateorder" id="dateorder"></a>The following is a repeat ot the above, IN DATE ORDER ... </p> <table border="1" width="100%" summary="Table in DATE order"> <tr> <th> Date </th> <th> Folder </th> <th> Title </th> <th> Link </th> </tr> EOF # 0 1 2 3 4 5 6 7 8 9 # $tl_ sn url mu tm dir tit dn vip txt ttt #push(@tblist, [$sn, $u, $mu, $tms, $fold, $tit, 0, 0, 0, $ttt]); for ($i = 0; $i < $tcnt; $i++) { $tblist[$i][$tl_dn] = 0; ## clear DONE flag } my $maxtm = 0; my $mxoff = 0; my $alldn = 1; my $rowcnt = 0; my $done = 0; while ($alldn) { $alldn = 0; $maxtm = 0; for ($i = 0; $i < $tcnt; $i++) { if ( ! $tblist[$i][$tl_dn] ) { $f_tm = $tblist[$i][$tl_tm]; if ($f_tm > $maxtm) { $maxtm = $f_tm; $mxoff = $i; $alldn = 1; } } } if ($alldn) { $i = $mxoff; $tblist[$i][$tl_dn] = 1; ## SET DONE flag $f_title = $tblist[$i][$tl_sn]; # unused $f_link = $tblist[$i][$tl_url]; # column 4 (HREF) $f_tlink = $tblist[$i][$tl_mu]; # column 4 (text) $f_date = YYYYMMDD($tblist[$i][$tl_tm]); # column 1 (as YYYY/MM/DD) $f_fold = $tblist[$i][$tl_dir]; # column 2 $f_tit = $tblist[$i][$tl_tit]; # column 3 $f_bkn = $tblist[$i][$tl_vip]; # broken link $f_pag = $tblist[$i][$tl_txt]; # missing page $f_ttt = $tblist[$i][$tl_ttt]; # tidyied text if ( !$f_bkn ) { $f_title = '<b>(B)</b> '.$f_title; $f_tit = '<b>(B)</b> '.$f_tit; } elsif ( !$f_pag ) { $f_title = '(b) '.$f_title; $f_tit = '(b) '.$f_tit; } print $hf "\n"; print $hf " <tr>\n"; print $hf " <td>$f_date</td>\n"; print $hf " <td>$f_fold</td>\n"; print $hf " <td>$f_ttt</td>\n"; print $hf " <td><a href=\"$f_link\">$f_tlink</a></td>\n"; print $hf " </tr>\n"; $rowcnt++; $done++; if ($rowcnt > $tablelink) { if (($tcnt - $done) > ($tablelink / 2)) { add_link_item($hf, 4); } $rowcnt = 0; } } } print $hf " </table>\n"; } # out the HTML tail sub out_htm_tail { my ($hf) = shift; add_link_para( $hf, 2 ); # exclude 'end' print $hf <<EOF; <p> <a name="end" id="end"></a><a target="_blank" href="http://tidy.sourceforge.net/"><img border="0" src="images/checked_by_tidy.gif" alt="checked by tidy" width="32" height="32"></a> <a href="http://validator.w3.org/check?uri=referer" target="_blank"><img src="images/valid-html401.gif" alt="Valid HTML 4.01 Transitional" width="88" height="31"></a> </p> $hvers </body> </html> EOF } sub write_html_file { my ($hfil) = shift; my ($HF, $max); $max = scalar @tblist; if ( !$max ) { prt( "WARNING: No items in LIST!\n" ); return 0; } if ( !open( $HF, ">$hfil" ) ) { prt( "ERROR: Can NOT open HTML file $hfil! ... $! ...\n" ); return 0; } prt( "Writing $max items to $hfil ...\n" ); out_htm_head( $HF ); out_htm_table( $HF ); out_htm_para1( $HF ); out_htm_table2( $HF ); out_htm_tail( $HF ); close $HF; return 1; } sub get_favorite_folder { my $ff = ''; if( !defined( $ENV{'USERPROFILE'} ) ) { mydie( "ERROR: Can NOT locate USERPROFILE in ENVironment!\n" ); } $ff = $ENV{'USERPROFILE'} . '\\Favorites'; if( !( -d $ff ) ) { mydie( "ERROR: Folder $ff is NOT a directory!\n" ); } return $ff; } sub get_file_list { my ($d, @fs) = @_; foreach my $fn (@fs) { next if ($fn eq '.'); next if ($fn eq '..'); my $ffn = $d . '\\' . $fn; if( -d $ffn ) { push(@dirs, $ffn); } else { if ($fn =~ /.+\.url$/i) { push(@fils, $ffn); } else { prt( "Discarding file $ffn ...\n" ); } } } my $fcnt = scalar @fils; my $dcnt = scalar @dirs; prt( "Found $fcnt files, and $dcnt directories ...\n" ); } sub process_dir { my ($d) = shift; prt( "Processing $d ...\n" ); if (opendir(DIRH, $d)) { my @dfs = readdir(DIRH); closedir(DIRH); prt( "Found " . scalar @dfs . " entries ...\n" ); get_file_list($d, @dfs); } else { prt( "WARNING: Failed to open directory [$d]!...\n" ); } } sub remdir { my ($f) = shift; $f =~ s/^$basedir\\//; # remove beginning ... $f =~ s/\.url$//; # and remove tail return $f; } sub in_exclude { my ($h) = shift; foreach my $l (@fav_exclude) { if ($l eq $h) { return 1; } } return 0; } sub max_sub2 { my ($ln, $max) = @_; if (length($ln) > ($max+5)) { $ln = substr($ln,0,$max) . '...'; } return $ln; } sub max_sub { my ($ln, $max) = @_; my $nln = $ln; if (length($ln) > $max) { my @arr = split(/ /,$ln); $nln = ''; my $bit = ''; my $bl = 0; my $sl = 0; my $sc = 0; foreach my $s (@arr) { $sl = length($s); $bl = length($bit); while ($sl > $max) { if ($bl) { $bit .= ' '; } $bit .= substr($s, 0, $max - $bl); $s = substr($s, $max - $bl); if (length($nln)) { $nln .= "<br>\n"; } $nln .= $bit; $bit = ''; $sl = length($s); $bl = length($bit); $sc = 0; } if ($bl) { if (( $bl + $sc + length($s) ) > $max ) { if (length($nln)) { $nln .= "<br>\n"; } $nln .= $bit; $bit = $s; $sc = 0; } else { $bit .= ' '; $sc++; $bit .= $s; } } else { $bit = $s; $sc = 0; } } if (length($bit)) { if (length($nln)) { $nln .= "<br>\n"; } $nln .= $bit; } } return $nln; } sub secs_2_hhmmss { my ($secs) = shift; my $rt = ''; my $mins = int($secs / 60); $secs = $secs - ($mins * 60); $secs = (int(($secs * 10) + 0.5)) / 10; if ($mins > 60) { my $hrs = int($mins / 60); $mins = $mins - ($hrs * 60); $mins = '0'.$mins if ($mins < 10); $secs = '0'.$secs if ($secs < 10); $rt = "$hrs:$mins:$secs"; } else { $mins = '0'.$mins if ($mins < 10); $secs = '0'.$secs if ($secs < 10); $rt = "$mins:$secs"; } return $rt; } sub get_table_arr { my $totcnt = scalar @fils; prt( "Getting array of $totcnt favorite items ...\n" ); my $lncnt = 0; my $bgntime = time(); # seconds my ($currtime, $difftime, $persec, $remains, $remsecs, $tenths, $remtm, $elapsed); foreach my $fn (@fils) { # process each file $lncnt++; if ($chkpage && (($lncnt % 20) == 0)) { $currtime = time(); $difftime = $currtime - $bgntime; $persec = $lncnt / $difftime; $remains = $totcnt - $lncnt; $remsecs = $remains / $persec; $tenths = (int(($persec * 100) + 0.05)) / 100; $remtm = secs_2_hhmmss($remsecs); $elapsed = secs_2_hhmmss($difftime); prt( "$elapsed Done $lncnt ... $tenths/sec ... remains $remains in $remtm mm:ss ...\n" ); } my $sb = stat($fn); my $tms = $sb->mtime; if ( open( FH, $fn ) ) { my @lns = <FH>; # slurp in the lines close FH; my $sn = remdir($fn); # file name is the TITLE of the favorite ... my $line = ''; my $fnd = 1; my $bkn = 1; # assume NOT broken link my $pag = 1; # assume NOT no page # get the FOLDER my $ind = rindex($sn, "\\"); my $fold = '.'; my $tit = $sn; if ($ind != -1) { $fold = substr($sn, 0, $ind); $tit = substr($sn, ($ind + 1)); } foreach $line (@lns) { chomp $line; if( $line =~ /^URL=/ ) { my $u = substr($line,4); ## ~ s/^URL=//; if (in_exclude($u)) { $fnd = 0; # avoid a WARNING ... last; } if ($chkip) { if ( !showIPAddress($u) ) { push(@broken, $u); $bkn = 0; } } if ($chkpage) { # WARNING: this is QUITE SLOW if (!Get_URL_Text_Count($u)) { push(@nopage, $u); $pag = 0; } } my $mu = max_sub2($u,$maxwid); $mu =~ s/&/&/g; $sn = max_sub($sn, $maxwid); # wrap text to max width $sn =~ s/&/&/mg; # possible MULTIPLE lines $u =~ s/&/&/g; $tit =~ s/&/&/g; # 0 1 2 3 4 5 6 7 8 9 10 # $tl_ sn url mu tm dir tit dn vip txt ttt dom #push(@tblist, [$sn, $u, $mu, $tms, $fold, $tit, 0, 0, 0, ttt,dom]); push(@tblist, [$sn, $u, $mu, $tms, $fold, $tit, 0, $bkn,$pag, '', '' ]); $fnd = 0; last; } } if ($fnd) { $wmsg = "WARNING: Did NOT find a URL line in [$fn] ...\n"; prt($wmsg); push(@warnings,$wmsg); } } else { $wmsg = "WARNING: Unable to open file [$fn] ...\n"; prt($wmsg); push(@warnings,$wmsg); } } $currtime = time(); $difftime = $currtime - $bgntime; $remtm = secs_2_hhmmss($difftime); prt( "Got array of $totcnt items (in $remtm) ...\n" ); } ################################################ # My particular time 'translation' - replaced date_string sub YYYYMMDD { # 0 1 2 3 4 5 6 7 8 my ($tm) = shift; my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($tm); $year += 1900; $mon += 1; my $ymd = "$year/"; if ($mon < 10) { $ymd .= '0'.$mon.'/'; } else { $ymd .= "$mon/"; } if ($mday < 10) { $ymd .= '0'.$mday; } else { $ymd .= "$mday"; } return $ymd; } sub get_tidy_text { my ($tx) = shift; my $inf = 'temptidy.htm'; my $ntx = ''; write2file($tx,$inf); if (open (TDY, "tidydev $params $inf|")) { my @arr = <TDY>; close TDY; foreach my $ln (@arr) { chomp $ln; $ntx .= "\n" if length($ntx); $ntx .= $ln; } ###$ntx = join('',@arr); } else { prt( "FAILED! ... $! ...\n" ); $ntx = $tx; } return $ntx; } sub Get_URL_Text_Count { my ($url) = shift; my $txt = get( $url ); my $tcnt = 0; if ($txt) { $tcnt = length($txt); } return $tcnt; } sub Get_Host_Name { my ($nm) = shift; if ($nm =~ /^http:\/\/(.*)/i) { $nm = $1; } if ($nm =~ /^https:\/\/(.*)/i) { $nm = $1; } if ($nm =~ /^ftp:\/\/(.*)/i) { $nm = $1; } my @arr = split('/', $nm); $nm = $arr[0]; return $nm; } sub Get_Domain_Name { my ($nm) = shift; $nm = Get_Host_Name($nm); if ($nm =~ /^www\.(.*)/) { $nm = substr($nm,4); } return $nm; } ############################################################ # Only used is $chkip = 1; # Show IP Address # uses sockets, gethostbyname # Return 0, if can NOT be resolved. # else the number of IP addresses resolved. ############################################################ sub showIPAddress { my ($nm) = shift; my $hnm = Get_Host_Name($nm); my @addr = gethostbyname($hnm); my $cnt = 0; if( !@addr ) { prt( "Can't resolve $nm ($hnm): $!\n" ); return 0; } @addr = map { inet_ntoa($_) } @addr[4 .. $#addr]; foreach my $k (@addr) { $cnt++; prt( "$cnt: $nm($hnm) resolves to IP [$k]\n" ) if ($dbg3); } return $cnt; } # eof - fav-05.pl