#! /usr/local/bin/perl -s
# Fixup script for gutenberg files files
# Add -h option to remove hyphenated words
# Add -d with -h to use the dictionary as well as words in the file.
# (otherwise, all hyphens remain)
# Add -para to fix paragraphs
# Add -q if the quotes are in a real mess!
# Add -s to add end-of-sentence (and clause) spaces
# Add -head to strip page headings (titles)
#
# Rules for hyphen removal. Suppose we have found an eol hyphen "to-day"
# where "today" is either in the text or in the dictionary:
#
# (1) If the word "today" appears in the text, then remove the hyphen;
# (2) Otherwise, if the word "to-day" appears in the text (non-eol hyphen),
#     then keep the hyphen;
# (3) Otherwise, if the file had lots of eol hyphens (more than 100),
#     then remove the hyphen;
# (4) Otherwise, leave it in.
#

# Set this to the filename of your dictionary
# (a simple text file containing one word per line):

$dictionary = "$ENV{HOME}/dict/text710.words";

# Default options:
$h = 1; $d = 1;
# $para = 0; $q = 0; $s = 0; $head = 0;

($myname = $0) =~ s|(.*/)*||;	# strip path component from name

$hyphens = 0;	# number of eol hyphens in the file
$big = 100;	# cutoff point for aggressive hyphen removal (rule 3)
$header = 0;	# Have we seen a Gutenberg header?
$gutpat = '^\*END\*THE SMALL PRINT\!';
$headpat1 = '^\d+\s+[A-Z][^a-z0-9]+$';
$headpat2 = '^[A-Z][^a-z0-9]+\s+\d+$';

$nums = '(\d+|[MDCLXVImdclxvi_]+)';	# pattern for page nos
@cache = ();

$Usage = "Usage: $myname infile [outfile] \n";
# Check one or two arguments:
die $Usage if (($#ARGV < 0) || ($#ARGV > 1));
$file = $ARGV[0];

open (IN, $file) || die "Can't open $file: $!\n";
if ($#ARGV == 1) {
  open (OUT, ">$ARGV[1]") || die "Can't open $ARGV[1]: $!\n";
} else {
  open (OUT, ">&STDOUT");
}

if ($h) {
  if ($d) {
    # Read a dictionary into %word
    open (WORDS, $dictionary) or die "Can't read dictionary file `$dictionary': $!\n";
    while (<WORDS>) {
      chop;
      $word{$_}=1;
    }
  }
  # Read the file for more words:
  while (<IN>) {
    tr/\015\032//d;
    $header++ if (/$gutpat/);
    $hyphens++ if (/\w-$/);
    tr/A-Z/a-z/;
    tr/a-z/ /cs;
    foreach $w (split(/ /)) {
      $textword{$w} = 1;
    }
  }
  # reopen input:
  close (IN);
  open (IN, $file) || die "Can't open $file: $!\n";
  if ($header) {
    while (<IN>) {
      print OUT;
      last if (/$gutpat/);
    }
  }
}

while ($_ = getline()) {
  # tidy up dashes:
  s/(^|\s+)-(\s+|$)/--/g;
  # new para:
  s/^ {3}(\S)/\n$1/ if ($para);
  # check for hyphenated word:

  while ($h && m/([a-zA-Z]+)-$/) {
    # read next line, skipping page number:
    $next = getline();
    $_ .= $next;
    s/([a-zA-Z]+)-\n([a-zA-Z]+)/\377/;
    $orig = $&;
    ($new = $orig) =~ s/-\n//;
    ($old = $orig) =~ s/\n//;
    # Mark the hyphen in $orig as "done":
    $orig =~ tr/-/\376/;
    ($x = $new) =~ tr/A-Z/a-z/;
    ($y = $old) =~ tr/A-Z/a-z/;

# (1) If the word "today" appears in the text, then remove the hyphen;
# (2) Otherwise, if the word "to-day" appears in the text (non-eol hyphen),
#     then keep the hyphen;
# (3) Otherwise, if the file had lots of eol hyphens (more than 100),
#     then remove the hyphen;
# (4) Otherwise, leave it in.

    if ($word{$x} || $textword{$x}) {
      if ($textword{$x}) {
	s/\377/$new/;	# remove the hyphen
      } elsif ($textword{$y}) {
	s/\377/$orig/;	# leave in the hyphen
      } elsif ($hyphens > $big) {
	s/\377/$new/;	# remove the hyphen
      } else {
	s/\377/$orig/;	# leave in the hyphen
      }
    } else {
      s/\377/$orig/;	# leave in the hyphen
    }

  }
  # restore "done" hyphens:
  tr/\376/-/;
  if ($s) {
    # Add end of sentence (and clause) spaces:
    s/([\:\.\!\?]) ($|[A-Z])/$1  $2/g;
    # Double space after :
    s/: (\S)/:  $1/g;
    # Ditto after <punctuation><quote>
    s/([\:\.\!\?])(["'`]) ($|[A-Z])/$1$2  $3/g;
    # Double space after :
    s/:(["'`]) (\S)/:$1  $2/g;
    # Not all "."s are end of sentences:
    s/(Mr|Mrs|Miss|St)\.  /$1. /g;
  }
  if ($q) {
    # rationalise "s:
    s/(^|\s)['`](\S|$)/$1"$2/g;
    s/(^|\S)['`](\s|$)/$1"$2/g;
    s/(\w)["'`](\w)/$1'$2/g;
    # double quotes:
    s/(^|\s)['`](\S|$)/$1"$2/g;
    s/(^|\S)['`](\s|$)/$1"$2/g;
  }
  # long dashes:
  if ($s) {
    s/ -([ "'`]|$)/--$1/g;
    s/\s*--\s*/--/g;
  }
  print OUT;
}

# Read a line from IN and return it:
sub getline() {
  local ($_);
  if (@cache) {
    return(shift(@cache));
  } else {
    get_one_line();
    # There is no closing "}" on this illus:
    for (;;) {
      # get_one_line() while (/^\s*\{illust\. caption =/);
      if (!defined($_)) {
	if (@cache) {
	  return(shift(@cache));
	} else {
	  return("");
	}
      }
      # Cache blank lines:
      if ((/^\s*$/) || ($head && /^\d+$/)) {
	push(@cache, $_);
	get_one_line();
	redo;
      }
      # search for the end of {illus....}:
      while ((/^\s*\{illus[^\{\}]*$/) && !eof(IN)) {
	chomp($_);
	$_ .= " " . <IN>;
      }
      # If we hit a page number or illustration,
      # clear the cache and skip to the next non-blank line (if any):
      if (($head && (/$headpat1/ || /^[A-Z][^a-z0-9]+\s+\d+$/))
	   || (/^\s*\<p\s+$nums\>\s*$/)
	   || (/^\s*\<p\d+\>\s*$/)
	   || (/^\s*\{illus[^\{\}]*\}\s*$/)
	   || (/^\s*\{\s*\}\s*$/)) {
	@cache = ();
	if (/\{/ || ($head && (/$headpat1/ || /$headpat2/))) {
	  # skip to next non-blank
	  $_ = "";
	  get_one_line() while (($_ =~ /^\s*$/) && !eof(IN));
	} else {
	  # read next line:
	  get_one_line();
	}
	redo;
      } else {
	# Process the line:

	tr/\015\032//d;
	s/\s*\<p\s+$nums\>\s*//g;
	s/\s*\<p\d+\>\s*//g;
	s/\s*\{illus[^\{\}]*\}\s*//g;
	# Fix "Larsen" encodes:
	s/<Pd>/L/g;		# pound sign
	s/<([a-zA-Z][a-zA-Z])>/$1/g;
	s/<([a-zA-Z\?])([\'\`\^\:\;\!\,])>/$1/g;
	s/\s*<(\d\/\d)>/ $1/g;	# fractions
	s/<(\d+)s>/$1/g;	# subscripts
	s/<n\?>/n/g;		# ~ over n (?)
	s/<'0>/'0/g;		# degrees symbol
	# Greek words:
	s/<gr ([a-zA-Z_ \?\.-]+)>/$1/g;
	# Emphasis italics:
	s/_\*([^_]*)_/\U$1\E/g;
	# Other italics are left as _italics_
	
	# Fix up HTML codes:
	s/\&(..)lig;/$1/g;
	s/\&(.)(circ|grave|acute|uml);/$1/g;
	s/\&frac(.)(.);/$1\/$2/g;
	s/\&pound;/pounds /g;
	s/\s*\&deg;/ degrees/g;
	s/<\/?i>/_/gi;
	s/_([^ _]{2,})_/\U$1\E/g;
	s/<\/?b>//gi;

	# tidy up dashes:
	s/(^|\s+)-(\s+|$)/--/g;
	# tidy up double quotes:
	s/\`\`/"/g;
	s/\'\'/"/g;

	# If the result is blank, cache it and continue:
	if (/^\s*$/) {
	  push(@cache, $_);
	  get_one_line();
	  redo;
	} else {
	  push(@cache, $_);
	  return(shift(@cache));
	}
      }
    }
  }
}


sub get_one_line () {
  $_ = <IN>;
  s/\cM//g if (defined($_));
}