#!/usr/local/bin/perl -w eval 'exec /usr/local/bin/perl -S $0 ${1+"$@"}' if (0); =pod =head1 NAME cdiff - compare and/or merge two text files =head1 SYNOPSIS B [B<-skipto> I] [B<-merge[1|2]>|B<-trim>] F F =head1 DESCRIPTION B will compare two text files, ignoring differences in layout, and produce an output file which shows the differences. It can also be used to I the two files into a single version, choosing between variant readings in the files based on a dictionary of words. (See the B<-merge> option below). The layout of the output is taken from F. Characters deleted from F are shown in I<[square brackets]>, while inserted characters are shown in I<{curly brackets}>. Suppose F contains the following text: CHAPTER I Call me Ishmael. Some years ago--never mind how long precisely-- having little or no money in my purse and nothing particular to interest me on shore,I thought I would sail about a little and see the watery port of the wor1d. It is a way I have of driving of the spleen, and regulating the circulation. while F contains this: Chapter 1 Call me ishmael. Some years ago- never mind how long precisely- having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little see the vatery part of the world. It is a way I have of driving off the spleen and regulating the circulation. The result of C is: C[HAPTER I]{hapter 1} Call me [I]{i}shmael. Some years ago-[-]{ }never mind how long precisely-[-] having little or no money in my purse{,} and nothing particular to interest me on shore, I thought I would sail about a little [and] see the [w]{v}atery part of the wor[1]{l}d. It is a way I ha[s]{ve} of driving of{f} the spleen[,] and regulating the circulation. Adding the C<-trim> option shows only the lines with differences, which are within 50 lines of a chapter heading (this is designed for Project Gutenberg's legal team): 1:C[HAPTER I]{hapter 1} 3: Call me [I]{i}shmael. 
Some years ago-[-]{ }never mind how long 4: precisely-[-] having little or no money in my purse{,} and 6: I would sail about a little [and] see 7: the [w]{v}atery part of the wor[1]{l}d. 8: It is a way I ha[s]{ve} of driving of{f} the 9: spleen[,] and regulating the circulation. The -merge option merges the files by taking the "obvious" choice where possible, but leaving the choices in otherwise. The "obvious" choices are: =over 4 =item 1. One variant is the same as the other but with some letters in upper case. In this case, take the upper case version. =item 2. Punctuation has been inserted or deleted. In this case we include the punctuation in the output, on the assumption that punctuation is more likely to be missing. =item 3. One or more words have been inserted or deleted. As for punctuation, the words are included in the output. =item 4. If the choice is between two sets of words, where one variant's words are all in the dictionary but the other variant has words not in the dictionary, then choose the dictionary words. =item 5. Otherwise, if one variant has more words in it, return that one. =item 6. Otherwise, if one variant has fewer digits, return that one. (This is mostly for chapter headings: if there is a choice between roman numerals and arabic numbers, return the latter). =item 7. Otherwise, if the B<-merge1> option was given, choose the variant from F, if the B<-merge2> option was given, choose the variant from F, otherwise, leave in the choice with I<[...]{...}> brackets. =back The result of C is: CHAPTER I Call me Ishmael. Some years ago- never mind how long precisely-- having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little and see the watery part of the world. It is a way I [has]{have} of driving [of]{off} the spleen and regulating the circulation. Note that there are only two variants left, and these are expanded to whole words: I<[has]{have}> and I<[of]{off}>. 
If you want to see which choices are made automatically, do C file3> and then compare F against F, perhaps with: C =head1 OPTIONS cdiff takes the following options: =over 4 =item -skipto I Skip to the first line matching the given pattern in both files before starting the comparison. Lines in F up to the matching line will be included in the output unchanged. =item -trim Add line numbers to the output and include only those lines which have changes on them and which are within 50 lines from a chapter break. =item -merge[1|2] Merge the two files into one, automatically choosing which variant to include in each case. If the choice cannot be made automatically, then B<-merge1> chooses the text from F, B<-merge2> chooses from F and B<-merge> (or B<-merge3>) leaves in the brackets for a human being (or some other program) to make the choice. =back =head1 REQUIREMENTS B requires B and B to be available. B also requires the B utility. =head1 AUTHOR Martin Ward, Emartin@gkc.org.ukE. =head1 SEE ALSO B, B =head1 COPYRIGHT This program is distributed under the Artistic License. =cut # # Condense wdiff output to show only the different characters # (rather than the whole words). Eg replace [-foo,-] {+foo+} # by foo[,] # # Usage: cdiff [-skipto patt] [-merge[1|2]|-trim] file1 file2 # # -merge1 means choose the first version when there are no clues # -merge2 means choose the second version when there are no clues # -merge lets the user choose (leave in []{} markers for these cases) # -trim shows only changed lines within 50 lines of a chapter break # -skipto patt skips to a line starting with patt. 
#

use strict;
use warnings;
use File::Temp qw(tempfile);

# Home directory; used to locate the dictionary read for -merge.
my $HOME = $ENV{'HOME'} || $ENV{'LOGDIR'} || (getpwuid($<))[7]
    || die "You're homeless!\n";

(my $myname = $0) =~ s|(.*/)*||;    # strip path component from name
my $Usage = "Usage: $myname [-skipto patt] [-merge[1-3]|-trim] file1 file2\n";

my $skipto = "";    # Pattern given with -skipto ("" means no skipping)
my @goners;         # Temporary files to delete on exit
my $span = 50;      # Number of lines either side of a chapter heading to include

$SIG{INT}  = \&quit;
$SIG{PIPE} = \&quit;

# Control characters handed to wdiff as its delimiters, so that they cannot
# clash with the text being compared:
my $w = "\001";     # start of deleted text   (shown as "[")
my $x = "\002";     # end of deleted text     (shown as "]")
my $y = "\003";     # start of inserted text  (shown as "{")
my $z = "\004";     # end of inserted text    (shown as "}")

my $merge = 0;      # 0 = plain diff; 1/2 = prefer file1/file2; 3 = leave choices
my $trim  = 0;      # true if -trim was given

while (@ARGV && $ARGV[0] =~ /^-/) {
    my $opt = shift;
    if ($opt eq "-skipto") {
        $skipto = shift if (@ARGV);
    } elsif ($opt eq "-merge") {
        $merge = 3;
    } elsif ($opt =~ /^-merge(\d)$/) {
        $merge = $1;
    } elsif ($opt eq "-trim") {
        $trim = 1;
    }
}

# Check for two arguments:
die $Usage if (@ARGV != 2);

my ($file1, $file2) = @ARGV;
for my $file ($file1, $file2) {
    die "Can't read `$file': $!\n" unless (-f $file);
}

# Read a dictionary into %word (only needed when merging):
my %word = ();
if ($merge) {
    if (open(my $words_fh, '<', "$HOME/dict/text710.words")) {
        while (my $entry = <$words_fh>) {
            chomp($entry);
            $word{$entry}++;
        }
        close($words_fh);
    } else {
        # Carry on without a dictionary: ok() then rejects every word.
        warn "Can't open default words file.\n";
    }
}

# Skip the preamble of both files; the text skipped in file2 is the one
# that is prepended to the output (the second call overwrites $skipped).
my $skipped = "";
($file1, $skipped) = skipto($skipto, $file1);
($file2, $skipped) = skipto($skipto, $file2);

my $diff = run_wdiff($file1, $file2);

# Remove extra newlines before and inside [...] brackets:
$diff =~ s/\n+$w/ $w/g;
1 while $diff =~ s/($w[^$x]*)\n/$1 /g;

# Remove common initial/final letters from pairs of differences:
$diff =~ s/$w([^$x]*)$x(\s*)$y([^$z]*)$z/fix_diffs("$1", "$2", "$3")/ges;

if ($merge) {
    # Factor back in the letters taken out of pairs of differences:
    # aaa[bbb]c --> [aaabbb]{aaa}c
    $diff =~ s/(\w+)$w([^$x]*)$x([^$y])/$w$1$2$x$y$1$z$3/gs;
    # c{aaa}bbb --> c[bbb]{aaabbb}
    # (the old text is the trailing word alone, i.e. $3, not the insertion $2)
    $diff =~ s/([^$x])$y([^$z]*)$z(\w+)/$1$w$3$x$y$2$3$z/gs;
    # [aaa]bbb --> [aaabbb]{bbb}
    $diff =~ s/$w([^$x]*)$x(\w+)/$w$1$2$x$y$2$z/gs;
    # aaa{bbb} --> [aaa]{aaabbb}
    $diff =~ s/(\w+)$y([^$z]*)$z/$w$1$x$y$1$2$z/gs;
    # aaa[bbb]{ccc} --> [aaabbb]{aaaccc}
    $diff =~ s/(\w+)$w([^$x]*$x\s*$y)/$w$1$2$1/gs;
    # [aaa]{bbb}ccc --> [aaaccc]{bbbccc}
    $diff =~ s/($x\s*$y[^$z]*)$z(\w+)/$2$1$2$z/gs;
    # Merge pairs of differences
    $diff =~ s/$w([^$x]*)$x(\s*)$y([^$z]*)$z/merge_diffs("$1", "$2", "$3")/ges;
    # Delete all remaining markers:
    $diff =~ s/$w|$x|$y|$z//g;
} else {
    # Restore any remaining markers to [...] and {...}:
    $diff =~ s/$w/\[/g;
    $diff =~ s/$x/\]/g;
    $diff =~ s/$y/\{/g;
    $diff =~ s/$z/\}/g;
}

my @lines = split(/\n/, $skipped . $diff);

if ($trim) {
    # Add line numbers:
    my $n = 1;
    $_ = sprintf("%5i:", $n++) . $_ for @lines;

    # Find the chapter headings (letters may be separated by non-word
    # characters, e.g. because of difference markers):
    my %wanted = ();
    for my $line (@lines) {
        if ($line =~ /^(\s*)(\d+):\W*c\W*h\W*a\W*p\W*t\W*e\W*r/i) {
            $wanted{$2}++;
        }
    }
    # Mark every line within $span lines of a heading:
    for my $heading (keys %wanted) {
        for my $i ($heading - $span .. $heading + $span) {
            $wanted{$i} ||= 1;
        }
    }
    # Grep out the wanted lines:
    @lines = grep { /^(\s*)(\d+):/ && $wanted{$2} } @lines;
    # ...and of those keep only chapter headings and lines with differences:
    @lines = grep { /^\s*(\d+):\s*chapter/i || /[\}\{\[\]]/ } @lines;
}

# Add a blank line around chapter headings:
s/^(\s*\d+:\s*chapter.*)$/\n$1\n/i for @lines;

print join("\n", @lines), "\n";

quit();


# Remove the temporary files and exit successfully.
# Also installed as the SIGINT/SIGPIPE handler.
sub quit {
    unlink(@goners);
    @goners = ();
    exit(0);
}


# Run wdiff on the two files, using the control characters $w/$x/$y/$z as
# the delete/insert delimiters, and return its whole output as one string.
# The command is run without a shell, so file names need no quoting.
sub run_wdiff {
    my ($old_file, $new_file) = @_;
    my @cmd = ('wdiff', '-w', $w, '-x', $x, '-y', $y, '-z', $z,
               $old_file, $new_file);
    open(my $pipe, '-|', @cmd) or die "Can't run wdiff: $!\n";
    my $output = do { local $/; <$pipe> };
    close($pipe);
    my $status = $?;
    # wdiff exits with 1 when the files differ; only a signal or a larger
    # exit status indicates a real failure.
    if (($status & 127) || ($status >> 8) > 1) {
        die "wdiff failed (wait status $status)\n";
    }
    return defined($output) ? $output : "";
}


# skipto(pattern, file)
# Drop everything in the file before the first line that starts with the
# pattern (a regular expression; it cannot match on the very first line,
# since a preceding newline is required).
# Returns (file to compare, skipped text).  When something is skipped the
# remainder is written to a temporary file, which is registered in @goners.
# If the pattern is empty or does not match, the original file is returned
# with an empty skipped text.
sub skipto {
    my ($pattern, $file) = @_;
    return ($file, "") if ($pattern eq "");

    open(my $in, '<', $file) or die "Can't read `$file': $!\n";
    my $text = do { local $/; <$in> };
    close($in);
    $text = "" unless (defined $text);

    # No match: nothing to skip, so compare the file as it is.
    return ($file, "") unless ($text =~ s/^(.*?\n)($pattern)/$2/s);
    my $prefix = $1;

    # tempfile() picks a fresh, unpredictable name and dies on failure.
    my ($out, $tmp) = tempfile("cdiff.XXXXXX", TMPDIR => 1);
    push(@goners, $tmp);
    print {$out} $text or die "Can't write to `$tmp': $!\n";
    # Close explicitly so the data is on disk before wdiff reads it.
    close($out) or die "Can't write to `$tmp': $!\n";
    return ($tmp, $prefix);
}


# fix_diffs(deleted, separator, inserted)
# Shrink a [deleted]{inserted} pair by moving the characters the two texts
# share at the start and at the end outside the markers.  Returns the
# replacement text (still using the control-character markers), preceded by
# the separator that stood between the two brackets, minus leading spaces.
sub fix_diffs {
    my ($old, $sep, $new) = @_;
    my $pair = "$w$old$x$y$new$z";
    # Trim initial spaces in the separator:
    $sep =~ s/^ +//;
    for (;;) {
        # Move a common first character in front of the pair:
        1 while ($pair =~ s/$w(.)(.*?)$y\1/$1$w$2$y/s);
        # Move a common last character behind the pair:
        1 while ($pair =~ s/(.)$x(.*?)\1$z/$x$2$z$1/s);
        # Check for a newline in the second part: a whitespace character
        # opening the deleted text is dropped against a newline opening the
        # inserted text (the newline is kept), then the trimming is retried.
        last unless ($pair =~ s/$w\s(.*?)$y\n/\n$w$1$y/s);
    }
    # Ignore empty pairs of brackets
    if ($pair =~ s/$w\s*$x//) {
        $pair =~ s/$y(\s*)$z/$1/s;
    }
    $pair =~ s/$y$z//s;
    # Prepend the separator to the result:
    return "$sep$pair";
}


# merge_diffs(variant1, separator, variant2)
# Choose between the two variants of a difference, trying in order:
# capitalisation, dictionary words, number of words, amount of punctuation,
# number of digits and finally the -merge1/-merge2 option.  Returns the
# chosen text, or "[variant1]{variant2}" when no rule decides.
# The separator is accepted for symmetry with fix_diffs but is not used.
sub merge_diffs {
    my ($first, $sep, $second) = @_;

    # Check if one version has capitals not in the other:
    # (ignore punctuation in this comparison)
    (my $letters1 = $first)  =~ s/\W+//g;
    (my $letters2 = $second) =~ s/\W+//g;
    if (lc($letters1) eq lc($letters2)) {
        return $first if (($letters1 =~ tr/A-Z//) > ($letters2 =~ tr/A-Z//));
        return $second;
    }

    # Check if only one version has valid words:
    my $first_ok  = ok($first);
    my $second_ok = ok($second);
    return $first  if ($first_ok  && !$second_ok);
    return $second if ($second_ok && !$first_ok);

    # Return the version with more words
    # (hyphens and apostrophes do not split a word):
    (my $plain1 = $first)  =~ s/-|'//g;
    (my $plain2 = $second) =~ s/-|'//g;
    my @words1 = grep { $_ ne "" } split(/\W+/, $plain1);
    my @words2 = grep { $_ ne "" } split(/\W+/, $plain2);
    return $first  if (@words1 > @words2);
    return $second if (@words1 < @words2);

    # Return the version with more punctuation:
    (my $punct1 = $first)  =~ s/\w+|\s+//g;
    (my $punct2 = $second) =~ s/\w+|\s+//g;
    return $first  if (length($punct1) > length($punct2));
    return $second if (length($punct1) < length($punct2));

    # Return the version with fewer digits (in case a digit replaces a letter):
    my $digits1 = ($first  =~ tr/0-9//);
    my $digits2 = ($second =~ tr/0-9//);
    return $first  if ($digits1 < $digits2);
    return $second if ($digits1 > $digits2);

    # Otherwise, look at $merge to see which to choose:
    return $first  if ($merge == 1);
    return $second if ($merge == 2);
    return "[" . $first . "]{" . $second . "}";
}


# Check if section is a list of valid words
# (Return 0 if there are no words):
# The text is lower-cased and every run of letters a-z must be a key of %word.
sub ok {
    my ($text) = @_;
    $text =~ tr/A-Z/a-z/;
    return 0 unless ($text =~ /[a-z]/);
    while ($text =~ s/^[^a-z]*([a-z]+)//) {
        return 0 unless ($word{$1});
    }
    return 1;
}


# Remove temporary files however the program ends.  This must not call
# exit(): doing so from END would replace the non-zero status of a die()
# with 0.
END { unlink(@goners) if (@goners); }