#!/usr/bin/perl -w # NAME: relinetxt02.pl # AIM: General RELINE an input text file... # 20130501 - Do not kill all paragraphing by default, and no wrap by default... # 31/12/2012 - Initial cut # Also see special relinetxt.pl - 03/08/2008 use strict; use warnings; use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] ) use Cwd; my $os = $^O; my $perl_dir = '/home/geoff/bin'; my $PATH_SEP = '/'; my $temp_dir = '/tmp'; if ($os =~ /win/i) { $perl_dir = 'C:\GTools\perl'; $temp_dir = $perl_dir; $PATH_SEP = "\\"; } unshift(@INC, $perl_dir); require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n"; # log file stuff our ($LF); my $pgmname = $0; if ($pgmname =~ /(\\|\/)/) { my @tmpsp = split(/(\\|\/)/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt"; open_log($outfile); # user variables my $VERS = "0.0.2 2013-05-01"; #my $VERS = "0.0.1 2012-12-31"; my $load_log = 0; my $in_file = ''; my $verbosity = 0; my $out_file = ''; my $line_len_max = 0; my $max_over = 20; my $preserve_paras = 1; my $name_file = 'C:\GTools\perl\people.txt'; if ( !($os =~ /win/i) ) { $name_file = '/home/geoff/Documents/fg/people.txt'; } my %emails = (); # ### DEBUG ### my $debug_on = 0; my $def_file = 'C:\Users\user\Documents\Yves\email.txt'; my $def_out = $temp_dir.$PATH_SEP."temprel2.txt"; ### program variables my @warnings = (); my $cwd = cwd(); sub VERB1() { return $verbosity >= 1; } sub VERB2() { return $verbosity >= 2; } sub VERB5() { return $verbosity >= 5; } sub VERB9() { return $verbosity >= 9; } sub show_warnings($) { my ($val) = @_; if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS...\n" ); foreach my $itm (@warnings) { prt("$itm\n"); } prt("\n"); } else { prt( "\nNo warnings issued.\n\n" ) if (VERB9()); } } sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg); } show_warnings($val); close_log($outfile,$load_log); exit($val); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } sub trim_all2($) { my $txt = shift; $txt = trim_all($txt); my $len = length($txt); return $txt if ($len == 0); my ($i,$ch); for ($i = 0; $i < $len; $i++) { $ch = substr($txt,$i,1); if ( !(($ch =~ /\s/) || ($ch eq '>') || ($ch eq '*')) ) { return $txt; # have more than space or '>', for Theo '*' } } return ''; # return a blank } # ============================================== # 10/04/2012 - begin to process files with a BOM # LOAD without a BOM my $strip_bom = 1; my $curr_file_bom = ''; my @BOM_list = ( [ "UTF-8", 3, [0xEF,0xBB,0xBF ] ], # 239 187 191 [ "UTF-16 (BE)", 2, [0xFE,0xFF ] ], # 254 255 [ "UTF-16 (LE)", 2, [0xFF,0xFE ] ], # 255 254 [ "UTF-32 (BE)", 4, [0x00,0x00,0xFE,0xFF] ], # 0 0 254 255 [ "UTF-32 (LE)", 4, [0xFF,0xFE,0x00,0x00] ], # 255 254 0 0 [ "UTF-7a" , 4, [0x2B,0x2F,0x76,0x38] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-7b" , 4, [0x2B,0x2F,0x76,0x39] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-7c" , 4, [0x2B,0x2F,0x76,0x2B] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-7d" , 4, [0x2B,0x2F,0x76,0x2F] ], # 2B 2F 76 39 2B 2F 76 2B 2B 2F 76 2F [ "UTF-1" , 3, [0xF7,0x64,0x4C ] ], # 247 100 76 [ "UTF-EBCDIC" , 4, [0xDD,0x73,0x66,0x73] ], # 221 115 102 115 [ "SCSU" , 3, [0x0E,0xFE,0xFF ] ], # 14 254 255 [ "BOCU-1" , 3, [0xFB,0xEE,0x28 ] ], # 251 238 40 [ "GB-18030" , 4, [0x84,0x31,0x95,0x33] ] # 132 49 149 51 ); sub line_has_bom($$) { my ($line,$rname) = @_; my $max = scalar @BOM_list; my $len = length($line); my ($i,$j,$name,$cnt,$ra,$ch,$val); for ($i = 0; $i < $max; $i++) { $name = $BOM_list[$i][0]; # name $cnt = $BOM_list[$i][1]; # length $ra = $BOM_list[$i][2]; # ref array of values if ($len > $cnt) { # make sure line length GT BOM for ($j = 0; $j < $cnt; $j++) { $ch = substr($line,$j,1); # extract CHAR $val = ord($ch); # get VALUE last if ($val != ${$ra}[$j]); # compare } if ($j == $cnt) { # if ALL values found ${$rname} = $name; # give back 'name' return $cnt; # and return count } } } return 0; # no BOM found } sub remove_utf_bom($$) { my ($ff,$ra) = @_; my $line = ${$ra}[0]; # get first line my $name = ''; my $len = line_has_bom($line,\$name); if ($len) { $curr_file_bom = substr($line,0,$len); $line = substr($line,$len); # truncate line ${$ra}[0] = $line; # and return minus BOM my ($nm,$dr) = fileparse($ff); # just show name prt("[v9] NOTE: File [$nm] is $name encoding. BOM($len) removed.\n") if (VERB9()); } } sub load_file_lines($$) { my ($ff,$ra) = @_; my $lncnt = 0; $curr_file_bom = ''; if (open INF, "<$ff") { @{$ra} = ; close INF; $lncnt = scalar @{$ra}; remove_utf_bom($ff,$ra) if ($strip_bom); } else { prtw("WARNING: Unable to open [$ff]!\n"); } return $lncnt; } sub load_people_file($) { my $rh = shift; my @lines = (); my $lncnt = load_file_lines($name_file,\@lines); if (! $lncnt ) { return 0; } my ($line,$ind,$len,$lnn); my ($name,$email,$bal,@arr); $lnn = 0; foreach $line (@lines) { $lnn++; chomp $line; $line = trim_all($line); $len = length($line); next if ($len == 0); # skip blanks next if ($line =~ /^\#/); # skip comments $ind = index($line,':'); if ($ind < 0) { prt("$lnn: Discarded [$line] NO COLON\n"); next; } $name = substr($line,0,$ind); $bal = trim_all(substr($line,$ind+1)); @arr = split(/\s+/,$bal); $email = $arr[0]; $email =~ s/^$//; ${$rh}{$email} = $name; prt("$name $email\n") if (VERB9()); } @arr = keys %{$rh}; $lnn = scalar @arr; return $lnn; } sub process_emails() { my (@arr,$cnt,$key,$val,$tmp); @arr = keys %emails; $cnt = scalar @arr; return if (!$cnt); prt("Got $cnt email names to check...\n"); my %list = (); my $ncnt = 0; my $ccnt = 0; my $ocnt = 0; if ( !load_people_file(\%list) ) { prt("Failed to load $name_file!\n"); foreach $key (sort @arr) { $val = $emails{$key}; prt("$val $key\n"); } } else { # only show NEW emails foreach $key (sort @arr) { $val = $emails{$key}; if (defined $list{$key}) { $tmp = $list{$key}; if ($tmp eq $val) { prt("OLD $val: $key\n") if (VERB5()); } else { if ((length($tmp) > length($val)) && ($tmp =~ /$val/)) { prt("OLD $tmp: $key, maybe part name $val\n") if (VERB5()); } else { prt("OLD $tmp: $key, but DIFFERENT name $val - CHECK ME\n"); $ccnt++; } } $ocnt++; } else { prt("NEW $val: $key\n"); $ncnt++; } } if ($ccnt) { prt("Carefully review the above $ccnt name changes\n"); } if ($ncnt) { prt("Note the $ncnt NEW to be added\n"); } else { prt("However $ocnt aleady appear in the $name_file\n"); } } } # 20130819 - another try to ensure paragraphs are preserved sub process_in_file($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$i,$lnn,$tline,$len,$ind,@arr,$tmp); my ($email,$name); my @nlines = (); for ($i = 0; $i < $lncnt; $i++) { $line = $lines[$i]; chomp $line; $lnn = $i + 1; $tline = trim_all2($line); $len = length($tline); if ($len == 0) { if ($preserve_paras) { if ($line =~ /^\s*(>+)/) { $ind = $1; push(@nlines,$ind); } else { push(@nlines,""); } } $i++; # eat all blanks for (; $i < $lncnt; $i++) { $line = $lines[$i]; chomp $line; $lnn = $i + 1; $tline = trim_all2($line); $len = length($tline); next if ($len == 0); last; } } $ind = index($tline,'From:'); if ($ind >= 0) { $tmp = trim_all(substr($tline, ($ind + 5))); prt("$tmp\n") if (VERB9()); if ($tmp =~ /<(.+)>/) { $email = $1; $ind = index($tmp,'<'); if ($ind > 1) { $name = trim_all(substr($tmp,0,$ind-1)); } else { $name = ''; } } else { $email = $tmp; $name = ''; } $email = '' if ($email =~ /\@flightgear.org/); if (length($email)) { if (defined $emails{$email}) { if (length($name)) { $tmp = $emails{$email}; if (length($tmp)) { if ($tmp ne $name) { if (length($name) > length($tmp)) { prt("Changed email $email name from [$tmp] to [$name] on length\n"); $emails{$email} = $name; } else { prt("CHECK changed email $email name from [$name] to [$tmp]!\n"); } } } else { $emails{$email} = $name; } } } else { $emails{$email} = $name; } } } # hmmm, really this should effect ALL lines in the # paragraph... that is fully rework the paragraph if (($line_len_max > 0) && ($len > $line_len_max)) { $ind = ''; if ($line =~ /^\s*(>+)/) { $ind = $1; $tline =~ s/^\s*(>+)//; } # hmmm maybe should NOT preserve 'quotes' here #@arr = space_split($tline); @arr = split(/\s+/,$tline); $tline = $ind; foreach $tmp (@arr) { $tline .= ' ' if (length($tline)); $tline .= $tmp; if (length($tline) > $line_len_max) { push(@nlines,$tline); $tline = $ind; } } push(@nlines,$tline) if (length($tline) > length($ind)); } elsif ($len) { push(@nlines,$tline); } } my ($cnt,$key,$val); if (length($out_file)) { my $cnt = scalar @nlines + 1; rename_2_old_bak($out_file); write2file(join("\n",@nlines)."\n",$out_file); prt("New $cnt lines written to [$out_file]\n"); } else { prt(join("\n",@nlines)."\n"); prt("No -o out-file given, so results only written to stdout..\n"); } } sub process_in_file_ok_but($) { my ($inf) = @_; if (! open INF, "<$inf") { pgm_exit(1,"ERROR: Unable to open file [$inf]\n"); } my @lines = ; close INF; my $lncnt = scalar @lines; prt("Processing $lncnt lines, from [$inf]...\n"); my ($line,$inc,$lnn,$tline,$len,@arr,$ind,$nline,$i); my ($tmp,$tmp2,$cnt,$i2,$addp); $lnn = 0; my @nlines = (); $addp = 0; for ($i = 0; $i < $lncnt; $i++) { $line = $lines[$i]; chomp $line; $lnn = $i + 1; $tline = trim_all2($line); $len = length($tline); if ($len == 0) { $i2 = $i + 1; if ($preserve_paras && ($i2 < $lncnt) && !$addp) { $line = $lines[$i2]; chomp $line; $tline = trim_all2($line); $len = length($tline); if ($len == 0) { $addp = 1; $i++; push(@nlines,""); } } next; } $ind = ''; $addp = 0; while ($line =~ /^\s/) { $ind .= ' '; $line = substr($line,1); } $nline = $ind.trim_all($line); if ($line_len_max > 0) { if ($len > $line_len_max) { @arr = space_split($line); $nline = $ind; foreach $tmp (@arr) { $tmp2 = $nline." ".$tmp; if (length($tmp2) > $line_len_max) { push(@nlines,$nline); $nline = ''; } $nline .= ' ' if (length($nline)); $nline .= $tmp; } push(@nlines,$nline) if (length($nline)); } else { push(@nlines,$ind.$nline); } } else { push(@nlines,$ind.$nline); } } if (length($out_file)) { $cnt = scalar @nlines + 1; rename_2_old_bak($out_file); write2file(join("\n",@nlines)."\n",$out_file); prt("New $cnt lines written to [$out_file]\n"); } else { prt(join("\n",@nlines)."\n"); prt("No -o out-file given, so results only written to stdout..\n"); } } ######################################### ### MAIN ### parse_args(@ARGV); process_in_file($in_file); process_emails(); pgm_exit(0,""); ######################################## sub need_arg { my ($arg,@av) = @_; pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av); } sub parse_args { my (@av) = @_; my ($arg,$sarg); while (@av) { $arg = $av[0]; if ($arg =~ /^-/) { $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); if (($sarg =~ /^h/i)||($sarg eq '?')) { give_help(); pgm_exit(0,"Help exit(0)"); } elsif ($sarg =~ /^v/) { if ($sarg =~ /^v.*(\d+)$/) { $verbosity = $1; } else { while ($sarg =~ /^v/) { $verbosity++; $sarg = substr($sarg,1); } } prt("Verbosity = $verbosity\n") if (VERB1()); } elsif ($sarg =~ /^l/) { if ($sarg =~ /^ll/) { $load_log = 2; } else { $load_log = 1; } prt("Set to load log at end. ($load_log)\n") if (VERB1()); } elsif ($sarg =~ /^o/) { need_arg(@av); shift @av; $sarg = $av[0]; $out_file = $sarg; prt("Set out file to [$out_file].\n") if (VERB1()); } elsif ($sarg =~ /^p/) { $preserve_paras = 0; prt("Set to remove paragraphs.\n") if (VERB1()); } elsif ($sarg =~ /^w/) { need_arg(@av); shift @av; $sarg = $av[0]; $line_len_max = $sarg; prt("Set line wrap to [$line_len_max].\n") if (VERB1()); } else { pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n"); } } else { $in_file = $arg; prt("Set input to [$in_file]\n") if (VERB1()); } shift @av; } if ((length($in_file) == 0) && $debug_on) { $in_file = $def_file; prt("Set DEFAULT input to [$in_file]\n"); if (length($out_file) == 0) { $out_file = $def_out; } } if (length($in_file) == 0) { pgm_exit(1,"ERROR: No input files found in command!\n"); } if (! -f $in_file) { pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n"); } } sub give_help { prt("$pgmname: version $VERS\n"); prt("Usage: $pgmname [options] in-file\n"); prt("Options:\n"); prt(" --help (-h or -?) = This help, and exit 0.\n"); prt(" --verb[n] (-v) = Bump [or set] verbosity. (def=$verbosity)\n"); prt(" --load (-l) = Load LOG at end. ($outfile)\n"); prt(" --out (-o) = Write output to this file.\n"); prt(" --para (-p) = Kill all paragraphs. That is double linefeeds.\n"); prt(" --wrap num (-w) = Set line wrap width. 0 for none. (def=$line_len_max)\n"); } # eof - relinetxt02.pl