relinetxt02.pl to HTML.

index -|- end

Generated: Sat Oct 12 17:23:16 2013 from relinetxt02.pl 2013/08/19 17.5 KB. text copy

#!/usr/bin/perl -w
# NAME: relinetxt02.pl
# AIM: General RELINE an input text file...
# 20130501 - Do not kill all paragraphing by default, and no wrap by default...
# 31/12/2012 - Initial cut
# Also see special relinetxt.pl - 03/08/2008
use strict;
use warnings;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use Cwd;
my $os = $^O;
my $perl_dir = '/home/geoff/bin';
my $PATH_SEP = '/';
my $temp_dir = '/tmp';
if ($os =~ /win/i) {
    $perl_dir = 'C:\GTools\perl';
    $temp_dir = $perl_dir;
    $PATH_SEP = "\\";
}
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";
# log file stuff
our ($LF);
my $pgmname = $0;
if ($pgmname =~ /(\\|\/)/) {
    my @tmpsp = split(/(\\|\/)/,$pgmname);
    $pgmname = $tmpsp[-1];
}
my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt";
open_log($outfile);

# user variables
my $VERS = "0.0.2 2013-05-01";
#my $VERS = "0.0.1 2012-12-31";
my $load_log = 0;
my $in_file = '';
my $verbosity = 0;
my $out_file = '';
my $line_len_max = 0;
my $max_over = 20;
my $preserve_paras = 1;
my $name_file = 'C:\GTools\perl\people.txt';
if ( !($os =~ /win/i) ) {
    $name_file = '/home/geoff/Documents/fg/people.txt';
}
my %emails = ();

# ### DEBUG ###
my $debug_on = 0;
my $def_file = 'C:\Users\user\Documents\Yves\email.txt';
my $def_out = $temp_dir.$PATH_SEP."temprel2.txt";

### program variables
my @warnings = ();
my $cwd = cwd();

sub VERB1() { return $verbosity >= 1; }
sub VERB2() { return $verbosity >= 2; }
sub VERB5() { return $verbosity >= 5; }
sub VERB9() { return $verbosity >= 9; }

sub show_warnings($) {
    my ($val) = @_;
    if (@warnings) {
        prt( "\nGot ".scalar @warnings." WARNINGS...\n" );
        foreach my $itm (@warnings) {
           prt("$itm\n");
        }
        prt("\n");
    } else {
        prt( "\nNo warnings issued.\n\n" ) if (VERB9());
    }
}

sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        $msg .= "\n" if (!($msg =~ /\n$/));
        prt($msg);
    }
    show_warnings($val);
    close_log($outfile,$load_log);
    exit($val);
}


sub prtw($) {
   my ($tx) = shift;
   $tx =~ s/\n$//;
   prt("$tx\n");
   push(@warnings,$tx);
}

sub trim_all2($) {
    my $txt = shift;
    $txt = trim_all($txt);
    my $len = length($txt);
    return $txt if ($len == 0);
    my ($i,$ch);
    for ($i = 0; $i < $len; $i++) {
        $ch = substr($txt,$i,1);
        if ( !(($ch =~ /\s/) || ($ch eq '>') || ($ch eq '*')) ) {
            return $txt; # have more than space or '>', for Theo '*'
        }
     }
     return ''; # return a blank
}

# ==============================================
# 10/04/2012 - begin to process files with a BOM
# LOAD without a BOM
my $strip_bom = 1;
my $curr_file_bom = '';

my @BOM_list = (
    [ "UTF-8",       3, [0xEF,0xBB,0xBF     ] ], # 239 187 191   
    [ "UTF-16 (BE)", 2, [0xFE,0xFF          ] ], # 254 255 
    [ "UTF-16 (LE)", 2, [0xFF,0xFE          ] ], # 255 254
    [ "UTF-32 (BE)", 4, [0x00,0x00,0xFE,0xFF] ], # 0 0 254 255
    [ "UTF-32 (LE)", 4, [0xFF,0xFE,0x00,0x00] ], # 255 254 0 0
    [ "UTF-7a"     , 4, [0x2B,0x2F,0x76,0x38] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-7b"     , 4, [0x2B,0x2F,0x76,0x39] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-7c"     , 4, [0x2B,0x2F,0x76,0x2B] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-7d"     , 4, [0x2B,0x2F,0x76,0x2F] ], # 2B 2F 76 39  2B 2F 76 2B  2B 2F 76 2F
    [ "UTF-1"      , 3, [0xF7,0x64,0x4C     ] ], # 247 100 76 
    [ "UTF-EBCDIC" , 4, [0xDD,0x73,0x66,0x73] ], # 221 115 102 115
    [ "SCSU"       , 3, [0x0E,0xFE,0xFF     ] ], # 14 254 255
    [ "BOCU-1"     , 3, [0xFB,0xEE,0x28     ] ], # 251 238 40
    [ "GB-18030"   , 4, [0x84,0x31,0x95,0x33] ]  # 132 49 149 51
);

sub line_has_bom($$) {
    my ($line,$rname) = @_;
    my $max = scalar @BOM_list;
    my $len = length($line);
    my ($i,$j,$name,$cnt,$ra,$ch,$val);
    for ($i = 0; $i < $max; $i++) {
        $name = $BOM_list[$i][0]; # name
        $cnt  = $BOM_list[$i][1]; # length
        $ra   = $BOM_list[$i][2]; # ref array of values
        if ($len > $cnt) {  # make sure line length GT BOM
            for ($j = 0; $j < $cnt; $j++) {
                $ch = substr($line,$j,1);   # extract CHAR
                $val = ord($ch);            # get VALUE
                last if ($val != ${$ra}[$j]); # compare
            }
            if ($j == $cnt) {   # if ALL values found
                ${$rname} = $name# give back 'name'
                return $cnt;    # and return count
            }
        }
    }
    return 0;   # no BOM found
}

sub remove_utf_bom($$) {
    my ($ff,$ra) = @_;
    my $line = ${$ra}[0];  # get first line
    my $name = '';
    my $len = line_has_bom($line,\$name);
    if ($len) {
        $curr_file_bom = substr($line,0,$len);
        $line = substr($line,$len); # truncate line
        ${$ra}[0] = $line# and return minus BOM
        my ($nm,$dr) = fileparse($ff); # just show name
        prt("[v9] NOTE: File [$nm] is $name encoding. BOM($len) removed.\n") if (VERB9());
    }
}

sub load_file_lines($$) {
    my ($ff,$ra) = @_;
    my $lncnt = 0;
    $curr_file_bom = '';
    if (open INF, "<$ff") {
        @{$ra} = <INF>;
        close INF;
        $lncnt = scalar @{$ra};
        remove_utf_bom($ff,$ra) if ($strip_bom);
    } else {
        prtw("WARNING: Unable to open [$ff]!\n");
    }
    return $lncnt;
}


sub load_people_file($) {
    my $rh = shift;
    my @lines = ();
    my $lncnt = load_file_lines($name_file,\@lines);
    if (! $lncnt ) {
        return 0;
    }
    my ($line,$ind,$len,$lnn);
    my ($name,$email,$bal,@arr);
    $lnn = 0;
    foreach $line (@lines) {
        $lnn++;
        chomp $line;
        $line = trim_all($line);
        $len = length($line);
        next if ($len == 0); # skip blanks
        next if ($line =~ /^\#/); # skip comments
        $ind = index($line,':');
        if ($ind < 0) {
            prt("$lnn: Discarded [$line] NO COLON\n");
            next;
        }
        $name = substr($line,0,$ind);
        $bal = trim_all(substr($line,$ind+1));
        @arr = split(/\s+/,$bal);
        $email = $arr[0];
        $email =~ s/^<//;
        $email =~ s/>$//;
        ${$rh}{$email} = $name;
        prt("$name $email\n") if (VERB9());
    }
    @arr = keys %{$rh};
    $lnn = scalar @arr;
    return $lnn;
}


sub process_emails() {
    my (@arr,$cnt,$key,$val,$tmp);
    @arr = keys %emails;
    $cnt = scalar @arr;
    return if (!$cnt);
    prt("Got $cnt email names to check...\n");
    my %list = ();
    my $ncnt = 0;
    my $ccnt = 0;
    my $ocnt = 0;
    if ( !load_people_file(\%list) ) {
        prt("Failed to load $name_file!\n");
        foreach $key (sort @arr) {
            $val = $emails{$key};
            prt("$val $key\n");
        }
    } else {
        # only show NEW emails
        foreach $key (sort @arr) {
            $val = $emails{$key};
            if (defined $list{$key}) {
                $tmp = $list{$key};
                if ($tmp eq $val) {
                    prt("OLD $val: $key\n") if (VERB5());
                } else {
                    if ((length($tmp) > length($val)) && ($tmp =~ /$val/)) {
                        prt("OLD $tmp: $key, maybe part name $val\n") if (VERB5());
                    } else {
                        prt("OLD $tmp: $key, but DIFFERENT name $val - CHECK ME\n");
                        $ccnt++;
                    }
                }
                $ocnt++;
            } else {
                prt("NEW $val: $key\n");
                $ncnt++;
            }
        }
        if ($ccnt) {
            prt("Carefully review the above $ccnt name changes\n");
        }
        if ($ncnt) {
            prt("Note the $ncnt NEW to be added\n");
        } else {
            prt("However $ocnt aleady appear in the $name_file\n");
        }
    }
}

# 20130819 - another try to ensure paragraphs are preserved
sub process_in_file($) {
    my ($inf) = @_;
    if (! open INF, "<$inf") {
        pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); 
    }
    my @lines = <INF>;
    close INF;
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");
    my ($line,$i,$lnn,$tline,$len,$ind,@arr,$tmp);
    my ($email,$name);
    my @nlines = ();
    for ($i = 0; $i < $lncnt; $i++) {
        $line = $lines[$i];
        chomp $line;
        $lnn = $i + 1;
        $tline = trim_all2($line);
        $len = length($tline);
        if ($len == 0) {
            if ($preserve_paras) {
                if ($line =~ /^\s*(>+)/) {
                    $ind = $1;
                    push(@nlines,$ind);
                } else {
                    push(@nlines,"");
                }
            }
            $i++; # eat all blanks
            for (; $i < $lncnt; $i++) {
                $line = $lines[$i];
                chomp $line;
                $lnn = $i + 1;
                $tline = trim_all2($line);
                $len = length($tline);
                next if ($len == 0);
                last;
            }
        }
        $ind = index($tline,'From:');
        if ($ind >= 0) {
            $tmp = trim_all(substr($tline, ($ind + 5)));
            prt("$tmp\n") if (VERB9());
            if ($tmp =~ /<(.+)>/) {
                $email = $1;
                $ind = index($tmp,'<');
                if ($ind > 1) {
                    $name = trim_all(substr($tmp,0,$ind-1));
                } else {
                    $name = '';
                }
            } else {
                $email = $tmp;
                $name = '';
            }
            $email = '' if ($email =~ /\@flightgear.org/);
            if (length($email)) {
                if (defined $emails{$email}) {
                    if (length($name)) {
                        $tmp = $emails{$email};
                        if (length($tmp)) {
                            if ($tmp ne $name) {
                                if (length($name) > length($tmp)) {
                                    prt("Changed email $email name from [$tmp] to [$name] on length\n");
                                    $emails{$email} = $name;
                                } else {
                                    prt("CHECK changed email $email name from [$name] to [$tmp]!\n");
                                }
                            }
                        } else {
                            $emails{$email} = $name;
                        }
                    }
                } else {
                    $emails{$email} = $name;
                }
            }
        }
        # hmmm, really this should effect ALL lines in the 
        # paragraph... that is fully rework the paragraph
        if (($line_len_max > 0) && ($len > $line_len_max)) {
            $ind = '';
            if ($line =~ /^\s*(>+)/) {
                $ind = $1;
                $tline =~ s/^\s*(>+)//;
            }
            # hmmm maybe should NOT preserve 'quotes' here
            #@arr = space_split($tline);
            @arr = split(/\s+/,$tline);
            $tline = $ind;
            foreach $tmp (@arr) {
                $tline .= ' ' if (length($tline));
                $tline .= $tmp;
                if (length($tline) > $line_len_max) {
                    push(@nlines,$tline);
                    $tline = $ind;
                }
            }
            push(@nlines,$tline) if (length($tline) > length($ind));
        } elsif ($len) {
            push(@nlines,$tline);
        }
    }
    my ($cnt,$key,$val);
    if (length($out_file)) {
        my $cnt = scalar @nlines + 1;
        rename_2_old_bak($out_file);
        write2file(join("\n",@nlines)."\n",$out_file);
        prt("New $cnt lines written to [$out_file]\n");
    } else {
        prt(join("\n",@nlines)."\n");
        prt("No -o out-file given, so results only written to stdout..\n");
    }
}

sub process_in_file_ok_but($) {
    my ($inf) = @_;
    if (! open INF, "<$inf") {
        pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); 
    }
    my @lines = <INF>;
    close INF;
    my $lncnt = scalar @lines;
    prt("Processing $lncnt lines, from [$inf]...\n");
    my ($line,$inc,$lnn,$tline,$len,@arr,$ind,$nline,$i);
    my ($tmp,$tmp2,$cnt,$i2,$addp);
    $lnn = 0;
    my @nlines = ();
    $addp = 0;
    for ($i = 0; $i < $lncnt; $i++) {
        $line = $lines[$i];
        chomp $line;
        $lnn = $i + 1;
        $tline = trim_all2($line);
        $len = length($tline);
        if ($len == 0) {
            $i2 = $i + 1;
            if ($preserve_paras && ($i2 < $lncnt) && !$addp) {
                $line = $lines[$i2];
                chomp $line;
                $tline = trim_all2($line);
                $len = length($tline);
                if ($len == 0) {
                    $addp = 1;
                    $i++;
                    push(@nlines,"");
                }
            }
            next;
        }
        $ind = '';
        $addp = 0;
        while ($line =~ /^\s/) {
            $ind .= ' ';
            $line = substr($line,1);
        }
        $nline = $ind.trim_all($line);
        if ($line_len_max > 0) {
            if ($len > $line_len_max) {
                @arr = space_split($line);
                $nline = $ind;
                foreach $tmp (@arr) {
                    $tmp2 = $nline." ".$tmp;
                    if (length($tmp2) > $line_len_max) {
                        push(@nlines,$nline);
                        $nline = '';
                    }
                    $nline .= ' ' if (length($nline));
                    $nline .= $tmp;
                }
                push(@nlines,$nline) if (length($nline));
            } else {
                push(@nlines,$ind.$nline);
            }
        } else {
            push(@nlines,$ind.$nline);
        }
    }
    if (length($out_file)) {
        $cnt = scalar @nlines + 1;
        rename_2_old_bak($out_file);
        write2file(join("\n",@nlines)."\n",$out_file);
        prt("New $cnt lines written to [$out_file]\n");
    } else {
        prt(join("\n",@nlines)."\n");
        prt("No -o out-file given, so results only written to stdout..\n");
    }
}

#########################################
### MAIN ###
parse_args(@ARGV);
process_in_file($in_file);
process_emails();
pgm_exit(0,"");
########################################

sub need_arg {
    my ($arg,@av) = @_;
    pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av);
}

sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg);
    while (@av) {
        $arg = $av[0];
        if ($arg =~ /^-/) {
            $sarg = substr($arg,1);
            $sarg = substr($sarg,1) while ($sarg =~ /^-/);
            if (($sarg =~ /^h/i)||($sarg eq '?')) {
                give_help();
                pgm_exit(0,"Help exit(0)");
            } elsif ($sarg =~ /^v/) {
                if ($sarg =~ /^v.*(\d+)$/) {
                    $verbosity = $1;
                } else {
                    while ($sarg =~ /^v/) {
                        $verbosity++;
                        $sarg = substr($sarg,1);
                    }
                }
                prt("Verbosity = $verbosity\n") if (VERB1());
            } elsif ($sarg =~ /^l/) {
                if ($sarg =~ /^ll/) {
                    $load_log = 2;
                } else {
                    $load_log = 1;
                }
                prt("Set to load log at end. ($load_log)\n") if (VERB1());
            } elsif ($sarg =~ /^o/) {
                need_arg(@av);
                shift @av;
                $sarg = $av[0];
                $out_file = $sarg;
                prt("Set out file to [$out_file].\n") if (VERB1());
            } elsif ($sarg =~ /^p/) {
                $preserve_paras = 0;
                prt("Set to remove paragraphs.\n") if (VERB1());
            } elsif ($sarg =~ /^w/) {
                need_arg(@av);
                shift @av;
                $sarg = $av[0];
                $line_len_max = $sarg;
                prt("Set line wrap to [$line_len_max].\n") if (VERB1());

            } else {
                pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
            }
        } else {
            $in_file = $arg;
            prt("Set input to [$in_file]\n") if (VERB1());
        }
        shift @av;
    }

    if ((length($in_file) ==  0) && $debug_on) {
        $in_file = $def_file;
        prt("Set DEFAULT input to [$in_file]\n");
        if (length($out_file) == 0) {
            $out_file = $def_out;
        }
    }
    if (length($in_file) ==  0) {
        pgm_exit(1,"ERROR: No input files found in command!\n");
    }
    if (! -f $in_file) {
        pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
    }
}

sub give_help {
    prt("$pgmname: version $VERS\n");
    prt("Usage: $pgmname [options] in-file\n");
    prt("Options:\n");
    prt(" --help  (-h or -?) = This help, and exit 0.\n");
    prt(" --verb[n]     (-v) = Bump [or set] verbosity. (def=$verbosity)\n");
    prt(" --load        (-l) = Load LOG at end. ($outfile)\n");
    prt(" --out <file>  (-o) = Write output to this file.\n");
    prt(" --para        (-p) = Kill all paragraphs. That is double linefeeds.\n");
    prt(" --wrap num    (-w) = Set line wrap width. 0 for none. (def=$line_len_max)\n");
}

# eof - relinetxt02.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional