striphtml02.pl to HTML.

Generated: Tue Feb 2 17:54:57 2010 from striphtml02.pl 2005/11/30 9.3 KB.
#!/Perl
# AIM: To strip HTML from a file using HTML::Parser
# some regex that is similar -
# FROM : http://www.trilug.org/pipermail/trilug/Week-of-Mon-20040216/024049.html
# Doing some experimentation, I see that perl is normally greedy, but
# if you postpend a quantifier with ? it turns that off.  So, this
# should remove all html tags from a file:
# perl -pi -e 's/<.*?>//g' [filename]
# Unfortunately, the non-greedy operator -- the question mark, is not 
# standard to the C library regexp() call, which I'm using.  However, the 
# following accomplishes something similar (my thanks to 'scalar' on IRC) :
#    s/<[^>]+>//g
use HTML::Parser ();
use Data::Dump ();
use Carp;
# use HiRes timer
use Time::HiRes qw( usleep ualarm gettimeofday tv_interval nanosleep );
use IO::File;
use Fcntl; # for mode constants
my $definp = 'C:/HOMEPAGE/P26/browser1.htm';
###my $definp = 'c:/HOMEPAGE/P26/perl.htm';
###my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/Wednesday.htm";
###my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/My Webs/moon-01.htm";
###my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/Russel/Russ-04.htm";
my $WHITE_PATTERN2 = "^[ \t\r\n]*\$"; # spacey if ($var =~ /$WHITE_PATTERN2/o ) { ...}
my $defout = 'tempstrip.txt';
my $defstrip = 'tempout.txt';
my $defstrip2 = 'tempout2.txt';
my $verb2 = 0;
my ($HO1, $HO2, $H03);
my $dncr = 0;
my @tagarr; ## tag array
my $intable = 0;
my $tnewline = 0;
my $tnewhcol = 0;
my $tnewcol = 0;
my $colcount = 0;
my $inbody = 0;
my $inhead = 0;
my $noheadout = 1; ### avoid HEAD tag text
my $mxt = 60; ### 40
my $intd = 0;
my $nointdcr = 1;
my $t0 = [gettimeofday];
print "$0: Started on " . localtime(time()) . "\n"; ### . " in $cwdir ...\n" if $shwtm;
open $HO1, ">$defout"  or die "No output file ... [$defout]!\n";
open $HO2, ">$defstrip"  or die "No output file ... [$defstrip]!\n";
###open $HO3, ">$defstrip2"  or die "No output file ... [$defstrip2]!\n";
###close $H03;
###my $infile = shift || die "\nERROR: Must give an INPUT FILE ...\n";
my $infile = shift || $definp;
###$infile = $definp;
print "Hello, World ... Strip HTML ... stripping $infile ...";
die "\nERROR: Can not locate the file [$infile]!\n" if (! -f $infile);
prt ("From $infile ...\n");
my $p = HTML::Parser->new(api_version => 3);
$p->handler(default => \&hand, "event, line, column, text, tagname, attr");
$p->parse_file($infile);
close $HO1;
close $HO2;
print "$0: Ended on " . localtime(time()) . "\n"; ### . " in $cwdir ...\n" if $shwtm;
$t1 = [gettimeofday];
$elapsed = tv_interval ( $t0, $t1 );
print "$0 ran for $elapsed seconds ...\n";
print "Check results in $defstrip ...\n";
#system ($defstrip); ### check out the RESULTS
print "Check results in $defout ...\n";
#system ($defout); ### anaysis of data from parser
## now test the regex method
my $text = read_file_f( $infile ) ;
###print $text;
$text =~ s/<.*?>//g;
###$text =~ s/<[^>]+>//g;
my $temp3 = 'temp005.txt';
##open $H03, ">$temp3";
##close $H03;
write_file_f ( $temp3, $text );
###print "<STRIPPED>\n$text\n</STRIPPED>\n";
###my $jstrip = join('', $text);
##print "<JOINED>\n$jstrip\n</JOINED>\n";
###open $HO3, ">$defstrip2"  or carp "No output file ... [$defstrip2]! ... $!\n";
###print $H03, $jstrip;
###print $H03, $text;
###close $H03;
###print "Check results in $defstrip2 ...\n";
$t2 = [gettimeofday];
$elap2 = tv_interval ( $t1, $t2 );
print "$0 ran for $elap2 seconds ...\n";
sub hand {
    my($event, $line, $column, $to, $tagname, $attr) = @_;
   my $typ = uc(substr($event,0,1)); ## get TYPE
   my $ll = length($to);
    my $msg =  "$typ L$line C$column:";
    my @d =  $msg;
    push(@d, $to);
    push(@d, $tagname) if defined $tagname;
   push(@d, $attr) if $attr;
   my $asz = @d; ### get length of array
   my $np = 1;
   my $tag = '*NO_TAG*';
   if (defined $tagname) {
      $tag = uc($tagname);
   }
   my $dtxt = Data::Dump::dump(@d);
   my $text = $to;
   my $txout = ''; ### start the final OUTPUT
   if ($typ eq 'S') {
      ## start of tag
      $msg .= " S-$tag";
      if ($tag eq 'HEAD') {
         $inhead = 1;
      } elsif ($tag eq 'BODY') {
         $inbody = 1;
      } elsif ($tag eq 'TABLE') {
         $intable++;
         $tnewline = 0;
         $tnewhcol = 0;
         $tnewcol = 0;
      } elsif ($tag eq 'TR') {
         if ($colcount) {
            $txout .= "\n"; ## print $HO2 "\n";
            $msg .= " Added <TR>NEW LINE!";
            $colcount = 0;
         } else {
            $msg .= " Skipped <TR>NEW LINE!";
         }
         $tnewline++;
         $tnewhcol = 0;
         $tnewcol = 0;
      } elsif ($tag eq 'TH') {
         $msg .= " Added <TH>SPACE!";
         $txout .= " "; ### print $HO2 ' ';
         $tnewhcol++;
      } elsif ($tag eq 'TD') {
         $msg .= " Added <TD>SPACE! in td";
         $txout .= ' '; ### print $HO2 ' ';
         $tnewcol++;
         $intd = 1;
      } else {
         $msg .= ' B tag with no case';
      }
   } elsif ($typ eq 'E') {
      ## end tag
      $msg .= " E-$tag";
      if ($tag eq 'HEAD') {
         $inhead = 0;
      } elsif ($tag eq 'BODY') {
         $inbody = 0;
      } elsif ($tag eq 'TABLE') {
         if ($intable) {
            $intable--;
         }
      } elsif ($tag eq 'TR') {
         $tnewline-- if $tnewline;
      } elsif ($tag eq 'TH') {
         $tnewhcol--;
      } elsif ($tag eq 'TD') {
         $tnewcol--;
         $intd = 0;
      } else {
         $msg .= ' E tag with no case';
      }
   } elsif ($typ eq 'T') {
      ### text item
      if ($ll) {
         my $addtx = 0;
         $text =~ s/&nbsp;/ /g; ## get back spaces
         $text =~ s/&lt;/</g; ## get back less than
         $text =~ s/&gt;/>/g; ## get back greater than
         $text =~ s/&quot;/"/g; ## get back QUOTES
         ### note this LAST
         $text =~ s/&amp;/&/g; ## get back ampersound
         if ($to ne $text) {
            my $l2 = length($text);
            $msg .= " *CHG* [$to]$ll to [$text]$l2 ! ";
            $ll = $l2;
         }
         my @sptxt = split (' ', $text);
         if ($text =~ $WHITE_PATTERN2) {
            my $iscr = 0;
            if ($text =~ /^[\r\n]*$/ ) {
               $msg .= "[all cr/lf stuff for $ll]";
               $iscr = 1;
            } elsif ($text =~ /^ *$/ ) {
               $msg .= "[space for $ll]";
               $txout .= $text; ### print $HO2 $text;
               $msg .= " Added <real>SPACE!";
               $addtx = 1;
               $np = 0;
            } else {
               my $tt = $text;
               $tt =~ s/[\r\n]//g; ### kill the CR/LF
               $txout .= $tt; ### add this space???
               $msg .= " [*CHK* mixed sp[$tt] + cr/lf for $ll]";
               if ($intd && $nointdcr) {
                  $msg .= 'intd cr avoided';
                  $iscr = 0;
               } else {
                  $iscr = 1;
               }
            }
            if ($iscr) {
               if ($dncr >= 2) {
                  $msg .= " *dup cr*";
                  ###return;
               } else {
                  $msg .= " Added <newline>!";
                  $txout .= "\n"; ### print $HO2 "\n";
                  $np = 0;
                  $dncr++;
                  $colcount = 0;
               }
            }
         } else {
            if (( $ll > 4 ) && 
               (( substr ($text, 0, 4) eq '<!--' )||
                ( substr ($sptxt[0], 0, 4) eq '<!--'       ))) {
               substr($text, $mxt) = "..." if length($text) > $mxt; ### limit, to say 40
               $msg .= "[" . $text . "]";
               $msg .= "html comment $msg *end c*";
            } else {
               $dncr = 0;
               $txout .= $text; ### print $HO2 $text;
               $colcount += $ll;
               substr($text, $mxt) = "..." if length($text) > $mxt;
               $msg .= "[" . $text . "]ADDED";
               $np = 0;
            }
         }
      } else {
         $msg .= " [$text] SKIPPED";
      }
   } else {
      if (length($text)) {
         $msg .= '[' . $text . ']';
      } else {
         $msg .= '[empty]';
      }
   }
   ###if ($np) {
   my $omsg = $msg;
   prt ($dtxt . "\n");
   if (length ($txout)) {
      ###if ($tag eq 'HEAD') { $inhead = 1;
      if ($inhead && $noheadout) {
         ### no output
         $omsg = "-=> $msg [SKIP-IN-HEAD]"
      } else {
         print $HO2 $txout;
         $omsg = "$msg (written)";
      }
   } else {
      $omsg = "--> $msg (no out text length)";
   }
   prt ($omsg . "\n");
}
sub prt {
   my ($t) = @_;
   print $t if ($verb2);
   print $HO1 $t;
}
sub read_file_f {
    my( $file_name, %args ) = @_ ;
    my $buf ;
    my $buf_ref = $args{'buf_ref'} || \$buf ;
    my $mode = O_RDONLY ;
    $mode |= O_BINARY if $args{'binmode'} ;
    local( *FH ) ;
    sysopen( FH, $file_name, $mode ) or
        carp "Can't open $file_name: $!" ;
    my $size_left = -s FH ;
    while( $size_left > 0 ) {
        my $read_cnt = sysread( FH, ${$buf_ref},
            $size_left, length ${$buf_ref} ) ;
        unless( $read_cnt ) {
            carp "read error in file $file_name: $!" ;
            last ;
        }
      $size_left -= $read_cnt ;
    }
# handle void context (return scalar by buffer reference)
    return unless defined wantarray ;
# handle list context
    return split m|?<$/|g, ${$buf_ref} if wantarray ;
# handle scalar context
    return ${$buf_ref} ;
}
sub write_file_f {
    my $file_name = shift ;
    my $args = ( ref $_[0] eq 'HASH' ) ? shift : {} ;
    my $buf = join '', @_ ;
    ###my $mode = O_WRONLY ;
    my $mode = O_WRONLY | O_CREAT;
    $mode |= O_BINARY if $args->{'binmode'} ;
    $mode |= O_APPEND if $args->{'append'} ;
    local( *FH ) ;
    sysopen( FH, $file_name, $mode ) or
        carp "Can't open $file_name: $!" ;
    my $size_left = length( $buf ) ;
    my $offset = 0 ;
    while( $size_left > 0 ) {
        my $write_cnt = syswrite( FH, $buf,
                $size_left, $offset ) ;
        unless( $write_cnt ) {
            carp "write error in file $file_name: $!" ;
            last ;
        }
        $size_left -= $write_cnt ;
        $offset += $write_cnt ;
    }
    return ;
}
# EOF
index -|- top