#!/usr/local/bin/perl -s
# check-punct Version 4.5
#
# Checks punctuation in a Gutenberg file
#
# If no files are given, reads from stdin and writes to stdout.
# If filenames are given, then writes to a .not (notes) file
#  -- If -b option also given, then also writes to stdout.
#  -- Checks for the header automatically if files are on the command line
#
# -h option to skip Gutenberg header (up to "*END*THE SMALL PRINT!")
# -b option only prints bad lines (with line numbers)
#    default is to print everything (with no line numbers).
#    (If you print everything, you can restore the original file by deleting
#    all the lines which start with ##).
# -q option to use alternative method for checking quotes (recommended)
# -e option, allow ". . ." for ellipses!
# -d option runs "showdups" with output in .dup
# -s option runs "gutspell" with output in .spl
# -m option marks the words which appear in .spl
# (if -s and -m are used together, then the .spl is created first
# and then used to mark the spelling errors)
#
# These assignments set "-b -q -e" as default options:
$b = 1; $q = 1; $e = 1;
#
# Work a paragraph at a time
# Check for, and complain about:
#   Mismatched quotes. Should be like this: "...`..."..."...'..."
#     NB: whole para quoted at beginning (no quote at end)
#     NB: apostrophes not quotes: ain't  'em  'Arry  'Talian  yo'  jes'
#   Mismatched parentheses
#   bad characters
#   Space before punctuation (?!;:,.)
#   Punctuation (?!;:,.) before letter
#     (except initials: M.P. Ward, and...ellipses)
#   No space or " or ( before ` or (
#   No letter or " or ( after ` or (
#   Space before )
#   Letter after )
#   Hyphens should be either letter-letter or notspace--notspace
#
# Author: Martin Ward, Martin.Ward@durham.ac.uk
# Comments welcomed!
#
# Copyright 1994 Martin Ward
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# Recent history:
# Version 4.1	Modified to not require the Unix "tee" program
# Version 4.2   Fixed a bug in 4.1 closing the output file to early
# Version 4.3	Fixed a bug where output for later files goes to all
#               previously opened files!
# Version 4.4   Allow nor'-letter (don't complain about the hyphen)
# Version 4.5   Allow four dots (with optional spaces) anywhere
#

$SIG{PIPE} = sig_pipe;

$| = 1 if ($b); # make output unbuffered if only printing bad lines
$prefix = "##";
$quote_mode = 0;
$quote_mode = 1 if ($q);

$punct = '[\?\!\;\:\,\.]';
$bad = '[^a-zA-Z0-9\$\&\s\(\)\'\`\"\-\?\!\;\:\,\.\200-\377\_\*\[\]\%\/\+\@]';
$letter = '[a-zA-Z0-9\$\341-\372\301-\332\260-\271\244\@]';

# Note that the following are negated character classes
# - it is an error if these match.
# error if this matches after ( or open quote:
$before_open = '[^ \200\202\"\(\[\n\-\`]';
$after_open = '[^\200-\377a-zA-Z0-9\$\"\(\[\.\`\-\'\_\*]';
# error if this matches before ) or close quote:
$before_close = '[^\200-\377a-zA-Z0-9\$\"\)\]\-\?\!\;\:\,\.\']';
$after_close = '[^ \200\202\"\(\[\n\212\.\-\?\!\;\:\,\.\']';

# Check for .zip files:
if ($#ARGV < 0) {
  # no files, read stdin, write to stdout:
  *IN = *STDIN;
  open(OUT, ">&STDOUT");
  &process_file;
} else {
  foreach $file (@ARGV) {
    die "File not found: $file\n" unless (-f $file);
    $killme = 0; # delete file after processing?
    if ($file =~ /\.zip$/) {
      $killme = 1;
      # Quietly unzip file, overwriting existing file:
      system 'unzip', '-qo', $file;
      $file =~ s/\.zip$/\.txt/;
      die "File not created from zip file: $file\n" unless (-f $file);
    }
    # check for DOS format
    open(IN, $file) || die "Can't open $file: $!";
    ($base = $file) =~ s/\.txt$//;
    read(IN, $head, 8192);
    close(IN);
    if ($head =~ /\015/) {
      # Convert DOS format file:
      $old = "$base.old";
      rename($file, $old);
      open(IN, $old);
      open(OUT, ">$file") || die "Can't write to $file: $!";
      do {
	$bytes = read(IN, $_, 8192);
	s/[\015\032]//g;
	print OUT $_;
      } until ($bytes < 8192);
      close(IN); close(OUT);
      unlink($old);
    }
    # check for header:
    $h = 0;
    $h = 1 if ($head =~ /\*START\*/); # start of small print
    open(IN, $file) || die "Can't open $file: $!";
    $out = "$base.not";
    $| = 1; # make unbuffered
    open(NOTES, ">$out") or die "Can't write to $out: $!\n";
    $| = 1; # make unbuffered
    message("Checking $file...\n\n") if ($b);
    #system "showdups $file > $base.dup" if ($d);
    #system "gutspell $file > $base.spl" if ($s);
    &process_file;
    close(IN);
    unlink ($file) if ($killme);
    # Close notes:
    close(NOTES);
  } # next file
}
exit(0);


sub process_file {

# Read list of spelling errors to mark
%err = ();
if ($m && (-f "$base.spl")) {
  open (ERR, "$base.spl") || die "Can't open spellings file $base.spl: $!";
  @words = <ERR>;
  chop(@words);
  $wordpat = '\b(' . join("|", @words) . ')\b';
  $m = 0 if ($#words < 0);
}
if ($m && (!-f "$base.spl")) {
  message("Warning: -m option given but $base.spl file not found\n");
  $m = 0;
}

$lineno = 1;
$/ = "\n"; # read input a line at a time.

# skip header:
if ($h) {
  while (<IN>) {
    $lineno++;
    last if (/^\*END\*/ || /\*END\*$/);
  }
  die "End of header not found!\n" if (eof);
}

$/ = "\n\n"; # read input a paragraph at a time.
$quoting = 0; # are we expecting the next paragraph to be quoted?

while ($para = <IN>) {
  &clear_bads;
  # add the paragraph break:
  # split para into lines each of which ends in \n:
  @lines = split(/\n/, $para, 999999);
  pop(@lines);
  grep ($_ .= "\n", @lines);

  # First the simple checks, do a line at a time:
  $l = 0;
  foreach (@lines) {
    # Marked up accents and ligatures and page numbers etc.:
    s/<[^<>]+>/&set_meta("$&")/ge;
    s/-\^-/&set_meta("$&")/ge;
    while (s/$bad/&set_meta("$&")/e) {
      # Only flag each bad character once in the file:
      substr($bad, -1, 0) = "\\$&";
      &bad("Bad character", $l, length($`));
    }
    # Mark words in wordlist:
    if ($m) {
      while (s/$wordpat/&set_meta("$&")/e) {
	&bad("Possible misspelling", $l, length($`));
	# Remove the word from the pattern:
	$word = $&;
	@words = grep (!/^$word$/, @words);
	$wordpat = '\b(' . join("|", @words) . ')\b';
	if ($#words < 0) {
	  $m = 0;
	  last;
	}
      }
    }
    # Avoid false matches on numbers n,nnn and n.nnn and times n:nn
    s/\d[\.\,\:]\d/&set_meta("$&")/ge;
    s/(^|\s)\.(\d)/"$1" . &set_meta(".") . "$2"/ge;
    #
    # Allow four or more dots anywhere:
    s/(\s*\.){4,}/&set_meta("$&")/eg;
    # M.P. Ward, Ph.D, c.v. and...ellipses and.... ellipses are OK:
    s/\b($letter$letter?\.)+$letter/&set_meta("$&")/eg;
    s/((^|$letter|[\`\"\!\?])\.\.\.)+($letter|[\'\"\!\?]|\n)/&set_meta("$&")/eg;
    if ($e) {
      # Allow ". . ." also as ellipses:
      s/(^|$letter|[\`\"\!\?])\s*\.\s*\.\s*(\.\s*)+_?($letter|[\'\"\!\?]|\n)/&set_meta("$&")/eg;
    }
    # (!) and (?) are OK:
    s/\([\!\?]\)/&set_meta("$&")/ge;
    #
    while (s/(\s)($punct)/"$1".&set_meta("$2")/e) {
      &bad("Space before punctuation", $l, length($`));
    }
    while (s/($punct$letter)/&set_meta("$&")/e) {
      &bad("Letter after punctuation", $l, length($`));
    }
    # must have space or " or ( before ` or (
    while (s/($before_open)([\`\(])/"$1".&set_meta("$2")/e) {
      &bad("No space before ` or (", $l, length($`));
    }
    # must have letter or " or ( after ` or (
    while (s/([\`\(])($after_open)/&set_meta("$1")."$2"/e) {
      &bad("No letter or punctuation after ` or (", $l, length($`));
    }
    while (s/(^| )(\))/"$1".&set_meta("$2")/e) {
      &bad("Space before )",  $l, length($`));
    }
    while (s/(\))($letter)/&set_meta("$1")."$2"/e) {
      &bad("Letter after )",  $l, length($`));
    }
    # Hyphens should be either letter-letter or -----... (2 or more)
    # NB all of space--space, notspace--notspace,
    # space--notspace and notspace--space are used, also ----.
    # Also allow letter-o'-letter and NOR'-letter and letter-'and
    s/($letter)-o'-($letter)/&set_meta("$&")/eg;
    s/nor'-($letter)/&set_meta("$&")/eg; #'
    s/($letter)-'and/&set_meta("$&")/eg; #'
    s/($letter-)+($letter)/&set_meta("$&")/eg;
    s/-{2,}/&set_meta("$&")/eg;
    s/tête-à-tête/&set_meta("$&")/eg;
    # Also allow :- and -/- anywhere:
    s/:-/&set_meta("$&")/eg;
    s/-\/-/&set_meta("$&")/eg;
    # Also allow s'-letter
    s/($letter)s'-($letter)/&set_meta("$&")/eg; #'
    # remaining (single) hyphens are bad:
    while (s/-+/&set_meta("$&")/e) {
      &bad("Bad hyphen(s)",  $l, length($`));
    }

    tr/\200-\377/\0-\177/;
    $l++;
  }
  # now check parentheses and quotes.
  $_ = $para;
  # Parentheses must match within the paragraph.
  # First zap balanced parentheses:
  1 while (s/\(([^\(\)]*)\)/\377$1\377/g);
  # remaining parentheses are unbalanced:
  while (s/[\(\)]/\377/) {
    &bad_par("Unmatched parenthesis", $`);
  }
  # Next, check for matching quotes
  $_ = $para;
  s/<p>/   /gi; # Clear HTML paragraph markers.
  # if $quoting, then first non-space must be "
  # 
  if ($quoting) {
    if (m/^\s*"/) { #"
      # quote found. Check if this para closes the quote,
      # if so, then clear $quoting
      # if not, then add a "dummy" close quote at the end
      $quotes = tr/"/"/;
      if ($quotes & 1) {
	# odd no. of quotes, so add a dummy close quote:
	s/(\n)*$/ DUMMY\"$1/;
      } else {
	# even no. of quotes, leave quoting mode:
	$quoting = 0;
      }
    } else {
      # opening quote is missing
      &bad("Unmatched \"s in previous paragraph", 0, 0);
      $quoting = 0;
    }
  } else {
    # if an odd no of quotes, then enter quote mode and add a dummy
    # close quote at the end:
    $quotes = tr/"/"/;
    if ($quotes & 1) {
      $quoting = 1;
      s/(\n)*$/ DUMMY\"$1/;
    }
  }

  # Now all quotes should balance.
  # Decide whether "'s are opening or closing.
  # use \200 for opening quote and \201 for closing quote

  # zap balanced `stuff':
  1 while (s/`[^`'"]*'/&set_meta("$&")/ge);
  # Check for `..."stuff"...'
  1 while (s/(`[^`"]*)"([^`"]*)"/$1\200$2\201/g); #`
  # zap balanced `stuff':
  1 while (s/`[^`"]*'/&set_meta("$&")/ge); #"
  # Check for `..."stuff"...'
  1 while (s/(`[^`"]*)"([^`"]*)"/$1\200$2\201/g); #`

  # Decide on remaining "s:
  s/"([^`"]*)"/\200$1\201/g; #"

  # Any remaining "s or `s must be unbalanced:
  $tmp = $_;
  while (s/[`"]/\377/) { #`
    &bad_par("Unmatched quote", $`);
  }
  $_ = $tmp;

  tr/\202-\377/\002-\177/;

  if ($quote_mode == 0) {
    # Old method

    # Check spacing around \200 and \201
    # Must have space or \200 or " or ( before \200
    # Change \200 to \202 when done:
    while (s/($before_open)\200/$1\202/) {
      &bad_par("No space before open quote", $`);
    }
    tr/\202/\200/;
    # must have letter or \200 " or ( or ... after \200
    while (s/\200($after_open)/\202$1/) {
      &bad_par("No letter or \" or ( after open quote", $`);
    }
    tr/\202/\200/;
    while (s/\201($after_close)/\203$1/) {
      &bad_par("No space after close quote", $`);
    }
    tr/\203/\201/;
    # must have letter or \201 " or ) or ... after \201
    while (s/($before_close)\201/$1\203/) {
      &bad_par("No letter or punctuation before close quote", $`);
    }
    tr/\203/\201/;

  } else {
    # Alternative method of checking quotes.
    tr/\200\201\202\203/""""/;

    # Use context to decide whether each " is an open or close quote.
    # Then check for balanced quotes.

    # Trivially, the first char of a para must be an opening quote!
    s/^"/\200/;

    # Simple cases: space"letter (opening) and letter"space (closing)
    # and `"letter, space`" (opening) and '"space, letter'" (closing)
    s/(\s)"($letter)/$1\200$2/g;
    s/($letter)"(\s)/$1\201$2/g;
    s/(\s)`(\s*)"/$1`$2\200/g;
    s/(\s)"(\s*)`/$1\200$2`/g;
    s/`(\s*)"($letter)/`$1\200$2/g;
    s/"(\s*)`($letter)/\200$1`$2/g;
    s/'(\s*)"(\s)/'$1\201$2/g;
    s/"(\s*)'(\s)/\201$1'$2/g;
    s/($letter)'(\s*)"/$1'$2\201/g;
    s/($letter)"(\s*)'/$1\201$2'/g;
    # )" and ") are closing, while (" and "( are opening:
    s|\)"|\)\201|g;
    s|"\)|\201\)|g;
    s|\("|\(\200|g;
    s|"\(|\200\(|g;

    if ($e) {
      # Special cases with ". . ." ellipses: . . . "punct is a closing quote:
      s/(\.\s*\.\s*\.\s*)"($punct|\s)/$1\201$2/g;
      s/(\s)"(\s*\.\s*\.\s*\.\s*)/$1\200$2/g;
    }

    # more difficult cases (punctuation on one side):
    # punct"space, punct"punct (closing), space"'letter (opening)
    # letter"punct (closing)
    s/($punct|-)"(\s)/$1\201$2/g;
    s/($punct)"($punct)/$1\201$2/g;
    s/(\s)"'($letter)/$1\200$2/g;
    s/($letter)"($punct)/$1\201$2/g;
    # hyphens: -"letter is opening, -"space is closing
    # space"- is opening, (letter or punct)"- is closing
    s/-"($letter)/-\200$1/g;
    s/-"(\s)/-\201$1/g;
    s/(\s)"-/$1\200-/g;
    s/($letter|$punct)"-/$1\201-/g;
    # space"... (opening), ..."(punct or space) (closing):
    s/(\s)"\.\.\./$1\200\.\.\./g;
    s/\.\.\."($punct|\s)/\.\.\.\201$1/g;
    # space"space-- is opening and --space"space is closing:
    s/(\s)"(\s--)/$1\200$2/g;
    s/(--\s)"(\s)/$1\201$2/g;
    # "[\d is closing and \d]" is opening (foot note markers)
    s/"(\[\d)/\201$1/g;
    s/(\d\])"/$1\200/g;
    # "<\d is closing and \d>" is opening (foot note markers)
    s/"(<\d)/\201$1/g;
    s/(\d>)"/$1\200/g;
    # space"_ and _"space are opening and closing:
    s/(^|\s)"_/$1\200_/g;
    s/_"($|\s)/_\201$1/g;
    # --"(punct or space) is closing:
    s/--"($punct|\s)/--\201$1/g;
    # (" is opening and ") is closing:
    s/([\[\(\<])"/$1\200/g;
    s/"([\]\)\>])/\201$1/g;

    # Any remaining quotes are errors. \200...\201 should balance
    while (s/"/\377/) {
      &bad_par("Bad quote", $`);
    }

    # zap balanced quotes:
    1 while (s/\200([^\200\201]*)\201/\377$1\377/g);

    # remaining \200 and \201s are unbalanced quotes:
    while (s/[\200\201]/\377/) {
      &bad_par("Unmatched quote", $`);
    }

  }

  &print_bads;
  $lineno += $l;
}

} # end of process_file subroutine.

# record a problem for this paragraph:
sub bad {
  local($whinge, $l, $pos) = @_;
  $whinges{$l, $pos} = "$prefix $whinge\n";
  $posns[$l] .= "," if ($posns[$l] ne "");
  $posns[$l] .= $pos;
}

# record a problem for this paragraph -- calculate line and pos
sub bad_par {
  local($whinge, $p) = @_;
  local ($l);
  $l = ($p =~ tr/\n\212/\n\212/);
  $p .= " "; # Make sure there is at least one char to match:
  $p =~ m/([^\n\212]+)$/;
  $p = length($1) - 1;
  &bad($whinge, $l, $p);
}

# print either just bad lines, or all lines, plus whinges
# ignore posns beyond length of line
sub print_bads {
  local ($l, $ll, @sortposns);
  $l = 0;
  foreach (@lines) {
    $ll = length;
    if (($posns[$l] eq "") || ($posns[$l] > $ll)) {
      message($_) unless ($b);
    } else {
      message(sprintf("%5d:", $l + $lineno)) if ($b);
      message($_);
      # print whinges for this line
      $marks = " " x 80;
      @sortposns = sort(split(/,/, $posns[$l]));
      foreach $pos (@sortposns) {
	last if (($pos > $ll) || ($pos >= 80));
	substr($marks, $pos, 1) = "^";
      }
      $marks =~ s/\s+$//;
      if ($b) {
	$tmp = $prefix x 3;
	message($tmp, $marks, "\n");
      } else {
	substr($marks, 0, 2) = $prefix;
	message("$marks\n");
      }
      foreach $pos (@sortposns) {
	last if ($pos > $ll);
	message($whinges{$l, $pos});
      }
      message("\n") if ($b);
    }
  $l++;
  }
}

sub clear_bads {
  %whinges = ();
  @posns = ();
}



# return string with meta bits set:
sub set_meta {
  local($_) = (@_);
  tr/\0-\177/\200-\377/;
  $_;
}

sub clear_meta {
  local($_) = (@_);
  tr/\200-\377/\0-\177/;
  $_;
}


sub sig_pipe {
  print STDERR "Caught a SIGPIPE!\n";
}

sub message {
  print @_ if ($b);
  print NOTES @_;
}
