#!/usr/local/bin/perl -w
    eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}'
	if (0);
=pod

=head1 NAME

cdiff - compare and/or merge two text files

=head1 SYNOPSIS

    B<cdiff> [B<-skipto> I<pattern>] [B<-merge[1|2]>|B<-trim>] F<file1> F<file2>

=head1 DESCRIPTION

B<cdiff> will compare two text files, ignoring differences in layout,
and produce an output file which shows the differences.
It can also be used to I<merge> the two files into a single version,
choosing between variant readings in the files based on a dictionary
of words. (See the B<-merge> option below).

The layout of the output is taken from F<file2>.

Characters deleted from F<file1> are shown in I<[square brackets]>,
while inserted characters are shown in I<{curly brackets}>.

Suppose F<file1> contains the following text:

    CHAPTER I

    Call me Ishmael.  Some years ago--never mind how long precisely--
    having little or no money in my purse and nothing particular
    to interest me on shore, I thought I would sail about a little
    and see the watery part of the wor1d.  It is a way I has
    of driving of the spleen, and regulating the circulation.

while F<file2> contains this:

    Chapter 1

      Call me ishmael. Some years ago- never mind how long
    precisely- having little or no money in my purse, and
    nothing particular to interest me on shore, I thought
    I would sail about a little see
    the vatery part of the world.
    It is a way I have of driving off the
    spleen and regulating the circulation.

The result of C<cdiff file1 file2> is:

    C[HAPTER I]{hapter 1}

      Call me [I]{i}shmael. Some years ago-[-]{ }never mind how long
    precisely-[-] having little or no money in my purse{,} and
    nothing particular to interest me on shore, I thought
    I would sail about a little [and] see
    the [w]{v}atery part of the wor[1]{l}d.
    It is a way I ha[s]{ve} of driving of{f} the 
    spleen[,] and regulating the circulation.

Adding the C<-trim> option shows only the lines with differences,
which are within 50 lines of a chapter heading (this is designed
for Project Gutenberg's legal team):

    1:C[HAPTER I]{hapter 1}
    3:      Call me [I]{i}shmael. Some years ago-[-]{ }never mind how long 
    4:    precisely-[-] having little or no money in my purse{,} and
    6:    I would sail about a little [and] see
    7:    the [w]{v}atery part of the wor[1]{l}d.
    8:    It is a way I ha[s]{ve} of driving of{f} the 
    9:    spleen[,] and regulating the circulation.

The -merge option merges the files by taking the "obvious" choice
where possible, but leaving the choices in otherwise.
The "obvious" choices are:

=over 4

=item 1.

One variant is the same as the other but with some letters in upper case.
In this case, take the upper case version.

=item 2.

Punctuation has been inserted or deleted.  In this case we include
the punctuation in the output, on the assumption that punctuation
is more likely to be missing.

=item 3.

One or more words have been inserted or deleted.  As for punctuation,
the words are included in the output.

=item 4.

If the choice is between two sets of words, where one variant's
words are all in the dictionary but the other variant has words not in
the dictionary, then choose the dictionary words.

=item 5.

Otherwise, if one variant has more words in it, return that one. 

=item 6.

Otherwise, if one variant has fewer digits, return that one.
(This is mostly for chapter headings: if there is a choice between
roman numerals and arabic numbers, return the latter).

=item 7.

Otherwise, if the B<-merge1> option was given, choose the variant from F<file1>,
if the B<-merge2> option was given, choose the variant from F<file2>,
otherwise, leave in the choice with I<[...]{...}> brackets.

=back


The result of C<cdiff -merge file1 file2> is:

    CHAPTER I

      Call me Ishmael. Some years ago- never mind how long
    precisely-- having little or no money in my purse, and
    nothing particular to interest me on shore, I thought
    I would sail about a little and see
    the watery part of the world.
    It is a way I [has]{have} of driving [of]{off} the 
    spleen and regulating the circulation.

Note that there are only two variants left, and these are expanded
to whole words: I<[has]{have}> and I<[of]{off}>.

If you want to see which choices are made automatically, do

C<cdiff -merge2 file1 file2 E<gt> file3>

and then compare F<file3> against F<file2>, perhaps with:

C<cdiff file2 file3>


=head1 OPTIONS

cdiff takes the following options:

=over 4

=item -skipto I<pattern>

Skip to the first line matching the given pattern in both files
before starting the comparison. Lines in F<file2> up to the matching
line will be included in the output unchanged.

=item -trim

Add line numbers to the output and include only those lines which
have changes on them and which are within 50 lines from a chapter break.

=item -merge[1|2]

Merge the two files into one, automatically choosing which variant
to include in each case. If the choice cannot be made automatically,
then B<-merge1> chooses the text from F<file1>, B<-merge2> chooses
from F<file2> and B<-merge> (or B<-merge3>) leaves in the brackets
for a human being (or some other program) to make the choice.

=back


=head1 REQUIREMENTS

B<cdiff> requires B<perl> and B<wdiff> to be available.
B<wdiff> also requires the B<diff> utility.


=head1 AUTHOR

Martin Ward, E<lt>martin@gkc.org.ukE<gt>.

=head1 SEE ALSO

B<wdiff>, B<diff>

=head1 COPYRIGHT

This program is distributed under the Artistic License.

=cut


#
# Condense wdiff output to show only the different characters
# (rather than the whole words). Eg replace [-foo,-] {+foo+}
# by foo[,]
#
# Usage: cdiff [-skipto patt] [-merge[1|2]|-trim] file1 file2
#
# -merge1 means choose the first version when there are no clues
# -merge2 means choose the second version when there are no clues
# -merge lets the user choose (leave in []{} markers for these cases)
# -trim shows only changed lines within 50 lines of a chapter break
# -skipto patt skips to a line starting with patt.
# 

use strict;

# Find the user's home directory (the merge dictionary lives under it):
# try $HOME, then $LOGDIR, then the password file entry for this user.
my $HOME = $ENV{'HOME'};
$HOME ||= $ENV{'LOGDIR'};
$HOME ||= (getpwuid($<))[7];
$HOME or die "You're homeless!\n";

# The name we were invoked under, without any leading directories:
my $myname = $0;
$myname =~ s|(.*/)*||;
my $Usage = "Usage: $myname [-skipto patt] [-merge[1-3]|-trim] file1 file2\n";

# Forward declarations, so the prototypes are known before the calls below:
sub quit();
sub skipto($$);
sub fix_diffs($$$);
sub merge_diffs($$$);
sub ok($);

# Pattern given with -skipto ("" means start at the top of each file):
my $skipto = "";
# Temporary files to delete on exit:
my @goners;

# Number of lines either side of a chapter heading to include (-trim):
my $span = 50;

# Tidy up the temporary files if we are interrupted:
$SIG{$_} = \&quit foreach ('INT', 'PIPE');

# Control characters used as wdiff's markers, so that they cannot clash
# with brackets in the text itself:
#   $w ... $x  enclose text deleted from file1   (shown as [...])
#   $y ... $z  enclose text inserted in file2    (shown as {...})
my ($w, $x, $y, $z) = ("\001", "\002", "\003", "\004");

# Option settings: $merge is 0 (off) or the -merge level, $trim is a flag.
my ($merge, $trim) = (0, 0);

# Process the leading options.  Anything we do not recognise is reported
# rather than silently ignored, so that a mistyped option (eg "-merg")
# cannot quietly produce a plain comparison instead of a merge.
while (@ARGV && $ARGV[0] =~ /^-/) {
  my $opt = shift;
  if ($opt eq "-skipto") {
    # -skipto needs a pattern to follow it:
    die $Usage unless (@ARGV);
    $skipto = shift;
  } elsif ($opt eq "-merge") {
    # Plain -merge leaves undecidable choices for the user (level 3):
    $merge = 3;
  } elsif ($opt =~ /^-merge([1-3])$/) {
    # Only the documented levels are accepted; in particular "-merge0"
    # would otherwise switch merging off without saying so.
    $merge = $1;
  } elsif ($opt eq "-trim") {
    $trim = 1;
  } else {
    die "$myname: unknown option `$opt'\n$Usage";
  }
}

# Check for two arguments:
die $Usage if ($#ARGV != 1);

my $file1 = $ARGV[0];
my $file2 = $ARGV[1];

# Both arguments must be plain files.  $! is only meaningful when the
# file does not exist at all, so give a fixed reason otherwise.
foreach my $file ($file1, $file2) {
  next if (-f $file);
  my $why = (-e $file) ? "not a plain file" : "$!";
  die "Can't read `$file': $why\n";
}

# Read a dictionary into %word (one word per line; only needed when
# merging).  The keys are the words exactly as they appear in the file.
# A missing dictionary is not fatal: %word just stays empty, but we must
# not go on to read from a handle which was never opened.
my %word = ();
if ($merge) {
  if (open(my $words_fh, "<", "$HOME/dict/text710.words")) {
    while (my $entry = <$words_fh>) {
      # Strip the line ending (also copes with DOS-style CR LF):
      $entry =~ s/[\r\n]+\z//;
      $word{$entry}++;
    }
    close($words_fh);
  } else {
    warn "Can't open default words file.\n";
  }
}


# Apply -skipto to both files.  Only the text skipped in file2 is kept,
# since the layout of the output is taken from file2.
my $skipped = "";
($file1, $skipped) = skipto($skipto, $file1);
($file2, $skipped) = skipto($skipto, $file2);
$skipped = "" unless (defined($skipped));

# Run wdiff with our control characters as the start/end markers for
# deleted and inserted text, and read all of its output into $_.
# The command is given as a list so that no shell is involved and the
# file names need no quoting (list-form pipe open needs perl 5.8).
my @wdiff_cmd = ("wdiff", "-w", $w, "-x", $x, "-y", $y, "-z", $z,
		 $file1, $file2);
open(my $wdiff_fh, "-|", @wdiff_cmd) or die "Can't run wdiff: $!\n";
$_ = do { local $/; <$wdiff_fh> };
$_ = "" unless (defined($_));
# close() sets $? to wdiff's wait status.  It returns false for any
# non-zero exit, which includes the normal "files differ" case, so its
# return value is not checked here.
close($wdiff_fh);
# wdiff exits with 0 (no differences) or 1 (differences found);
# anything else, or death by a signal, means trouble.
if (($? & 127) || ($? >> 8) > 1) {
  die "wdiff failed (wait status $?)\n";
}

# Remove extra newlines before and inside [...] brackets
# (deleted text comes from file1, but the layout is taken from file2):
s/\n+$w/ $w/g;
1 while s/($w[^$x]*)\n/$1 /g;

# Remove common initial/final letters from pairs of differences:
# each [deleted]{inserted} pair (with optional white space between the
# two halves) is handed to fix_diffs and replaced by its result.

s/$w([^$x]*)$x(\s*)$y([^$z]*)$z/fix_diffs(qq[$1],qq[$2],qq[$3])/ges;

if ($merge) {
  # Factor back in the letters taken out of pairs of differences, so
  # that every difference covers whole words which merge_diffs can look
  # up in the dictionary.  The order of these substitutions matters:
  # the first four turn a lone deletion or insertion into a pair, the
  # next two extend existing pairs to the word boundaries.
  # aaa[bbb]c --> [aaabbb]{aaa}c
  s/(\w+)$w([^$x]*)$x([^$y])/$w$1$2$x$y$1$z$3/gs;
  # c{aaa}bbb --> c[bbb]{aaabbb}
  # (the deleted side is the rest of the word, $3, not the inserted
  # letters: file1 had "bbb" where file2 has "aaabbb")
  s/([^$x])$y([^$z]*)$z(\w+)/$1$w$3$x$y$2$3$z/gs;
  # [aaa]bbb --> [aaabbb]{bbb}
  s/$w([^$x]*)$x(\w+)/$w$1$2$x$y$2$z/gs;
  # aaa{bbb} --> [aaa]{aaabbb}
  s/(\w+)$y([^$z]*)$z/$w$1$x$y$1$2$z/gs;
  # aaa[bbb]{ccc} --> [aaabbb]{aaaccc}
  s/(\w+)$w([^$x]*$x\s*$y)/$w$1$2$1/gs;
  # [aaa]{bbb}ccc --> [aaaccc]{bbbccc}
  s/($x\s*$y[^$z]*)$z(\w+)/$2$1$2$z/gs;
  # Merge pairs of differences
  s/$w([^$x]*)$x(\s*)$y([^$z]*)$z/merge_diffs(qq[$1],qq[$2],qq[$3])/ges;
  # Delete all remaining markers (unpaired deletions and insertions
  # are therefore both kept in the merged text):
  s/$w|$x|$y|$z//g;
} else {
  # Restore any remaining markers to [...] and {...}:
  s/$w/\[/g;
  s/$x/\]/g;
  s/$y/\{/g;
  s/$z/\}/g;
}

# Break the result (preceded by any text skipped in file2) into lines:
my @lines = split(/\n/, $skipped . $_);

if ($trim) {
  # Add line numbers:
  my $line_no = 0;
  foreach my $line (@lines) {
    $line = sprintf("%5i:", ++$line_no) . $line;
  }

  # Note the numbers of the lines which look like chapter headings
  # (allowing for brackets and punctuation between the letters)...
  my %wanted = ();
  foreach my $line (@lines) {
    if ($line =~ /^(\s*)(\d+):\W*c\W*h\W*a\W*p\W*t\W*e\W*r/i) {
      $wanted{$2}++;
    }
  }
  # ...and of every line within $span lines of one of them:
  foreach my $heading (keys %wanted) {
    foreach my $nearby ($heading - $span .. $heading + $span) {
      $wanted{$nearby} ||= 1;
    }
  }

  # Keep a line only if it is inside a span, and is either a chapter
  # heading or has a difference marked on it:
  @lines = grep {
    /^(\s*)(\d+):/ && $wanted{$2}
      && (/^\s*(\d+):\s*chapter/i || /[\}\{\[\]]/)
  } @lines;
}

# Add a blank line around (numbered) chapter headings:

foreach my $line (@lines) {
  $line =~ s/^(\s*\d+:\s*chapter.*)$/\n$1\n/i;
}

print join("\n", @lines), "\n";

quit();


# Delete every temporary file recorded in @goners, then exit with a
# zero status.  Also installed as the INT and PIPE signal handler.
sub quit() {
  foreach my $tmp (@goners) {
    unlink($tmp);
  }
  exit(0);
}


# Skip the start of a file, up to the first line beginning with a pattern.
#
#   ($newfile, $skipped) = skipto($pattern, $file);
#
# $pattern is a regular expression, $file the name of the file to read.
# Returns the name of the file to compare and the text which was skipped:
#  - empty pattern, or pattern not found: the original $file and "";
#  - otherwise: a new temporary file holding the text from the matching
#    line onwards, plus everything which came before that line.
# The temporary file is recorded in @goners for deletion on exit.
# Note that the pattern must follow a newline, so it can never match on
# the first line of the file.
# Dies if the file cannot be read or the temporary file cannot be written.
sub skipto($$) {
  my ($skipto, $file) = @_;
  return($file, "") if ($skipto eq "");

  # Slurp the whole file:
  open(my $in, "<", $file) or die "Can't read `$file': $!\n";
  my $text = do { local $/; <$in> };
  close($in);
  $text = "" unless (defined($text));

  # Cut off everything before the first matching line.  If nothing
  # matches there is nothing to skip: $1 must not be used then, since it
  # would still hold the result of some earlier match in the caller.
  unless ($text =~ s/^(.*?\n)($skipto)/$2/s) {
    return($file, "");
  }
  my $skipped = $1;

  # Write the remainder to a fresh temporary file.  File::Temp picks a
  # safe, unique name, which also works when $file includes a directory.
  require File::Temp;
  my ($out, $tmp) = File::Temp::tempfile("cdiff.XXXXXX", TMPDIR => 1);
  push(@goners, $tmp);
  print {$out} $text or die "Can't write to `$tmp': $!\n";
  close($out) or die "Can't write to `$tmp': $!\n";
  return($tmp, $skipped);
}


# Condense one [deleted]{inserted} pair so that it only covers the
# characters which actually differ.
#
#   fix_diffs($deleted, $separator, $inserted)
#
# $deleted is the text found only in file1, $inserted the text found
# only in file2 and $separator the white space found between the two
# halves of the pair.  Returns the replacement text, which still uses
# the marker characters $w..$x and $y..$z around whatever remains.
sub fix_diffs($$$) {
  my ($deleted, $sep, $inserted) = @_;
  my $text = "$w$deleted$x$y$inserted$z";
  # Trim initial spaces in the separator:
  $sep =~ s/^ +//;
  do {
    # Move a shared first character out in front of the pair...
    1 while ($text =~ s/$w(.)(.*?)$y\1/$1$w$2$y/s);
    # ...and a shared last character out behind it:
    1 while ($text =~ s/(.)$x(.*?)\1$z/$x$2$z$1/s);
    # White space at the start of the deleted text matches a newline at
    # the start of the inserted text: keep the newline (the layout comes
    # from file2) and go round again.
  } while ($text =~ s/$w\s(.*?)$y\n/\n$w$1$y/s);
  # Ignore empty pairs of brackets: if nothing but white space was
  # deleted, drop that half, and then unwrap an inserted half which is
  # itself only white space.
  if ($text =~ s/$w\s*$x//) {
    $text =~ s/$y(\s*)$z/$1/s;
  }
  $text =~ s/$y$z//s;
  # Prepend the separator to the result:
  return("$sep$text");
}


# Choose between the two variants of a [first]{second} pair.
#
#   merge_diffs($first, $separator, $second)
#
# $first is the reading from file1 and $second the reading from file2.
# The separator (white space between the two halves) is not used.
# Returns the chosen variant, or "[first]{second}" when none of the
# rules decides and $merge is neither 1 nor 2.
sub merge_diffs($$$) {
  my ($first, $sep, $second) = @_;

  # If the variants differ only in case and punctuation, take the one
  # with more capital letters (the second one on a tie):
  (my $first_letters  = $first)  =~ s/\W+//g;
  (my $second_letters = $second) =~ s/\W+//g;
  if (lc($first_letters) eq lc($second_letters)) {
    my $first_caps  = ($first_letters  =~ tr/A-Z//);
    my $second_caps = ($second_letters =~ tr/A-Z//);
    return(($first_caps > $second_caps) ? $first : $second);
  }

  # If only one variant consists of dictionary words, take that one:
  my $first_valid  = ok($first);
  my $second_valid = ok($second);
  return($first)  if ($first_valid && !$second_valid);
  return($second) if ($second_valid && !$first_valid);

  # Take the variant with more words (hyphens and apostrophes do not
  # split a word):
  my $count_words = sub {
    (my $text = $_[0]) =~ s/-|'//g;
    return(scalar(grep { $_ ne "" } split(/\W+/, $text)));
  };
  my $more_words = $count_words->($first) <=> $count_words->($second);
  return($first)  if ($more_words > 0);
  return($second) if ($more_words < 0);

  # Take the variant with more punctuation:
  my $count_punct = sub {
    (my $text = $_[0]) =~ s/\w+|\s+//g;
    return(length($text));
  };
  my $more_punct = $count_punct->($first) <=> $count_punct->($second);
  return($first)  if ($more_punct > 0);
  return($second) if ($more_punct < 0);

  # Take the variant with fewer digits (in case a digit replaces a letter):
  my $more_digits = ($first =~ tr/0-9//) <=> ($second =~ tr/0-9//);
  return($first)  if ($more_digits < 0);
  return($second) if ($more_digits > 0);

  # No clues left, so the -merge level decides:
  return($first)  if ($merge == 1);
  return($second) if ($merge == 2);
  return("[" . $first . "]{" . $second . "}");
}


# Check if a piece of text is a list of valid words: returns 1 if every
# run of letters in it (ignoring case) is a key of %word, and 0 if some
# run is not, or if the text contains no letters at all.

sub ok($) {
  my ($text) = @_;
  $text =~ tr/A-Z/a-z/;
  return(0) if ($text !~ /[a-z]/);
  foreach my $candidate ($text =~ /([a-z]+)/g) {
    return(0) if (!$word{$candidate});
  }
  return(1);
}


# Remove the temporary files however the program ends.  This must not
# call quit(): an exit(0) from inside an END block replaces the exit
# status already pending, so a die (usage error, unreadable file, ...)
# would be reported to the caller as success.
END { unlink(@goners) if (@goners); }

