#!/usr/bin/perl
# Name: newsgrep [-s] [-c config_file]
# Version 0.5
# Author: Martin Ward
# Email: martin@gkc.org.uk

# Changelog
# 0.1: Initial version
# 0.2: Various bug fixes and enhancements
# 0.3: Check for article's own group:number not appearing in Xref header
# 0.4: Allow four letters in the timezone (eg CEST or MESZ)
# 0.5: Allow USER and PASS parameters for authentication

use strict;
use warnings;
use Net::NNTP;
use Date::Parse;
use Fcntl ':flock'; # import LOCK_* constants

sub parse_range($);
sub sort_merge($);
sub in_range($$);
sub save_article($$);
sub wanted($$$$);
sub update_ranges($$$$);
sub insert_range($$);
sub update_config($$);
sub range_to_string($);
sub trim_range($$);

$| = 1;
my $VERSION = "0.5";
my $AUTHOR = 'martin@gkc.org.uk';
my $HOME = $ENV{'HOME'} || $ENV{'LOGDIR'} ||
		(getpwuid($<))[7] || die "You're homeless!\n";

(my $myname = $0) =~ s|(.*/)*||;        # strip path component from name
my $Usage = "Usage: $myname [-s] [-c config_file]\n";

my %params = ();	# Group-independent parameters
my %groups = ();	# Parameters for each selected group
my %ranges = ();	# List of article number sequences for each group

# news.group: n1-m1,n2-m2,...
# is stored as:
# $ranges{"news.group"} = [[n1, m1], [n2, m2] ...]
# where ni <= mi

# Default values:

my $config = "$HOME/.newsgreprc";

$params{NNTP} = "localhost";
$params{MBOX} = "$HOME/NEWS";
$params{TIMEOUT} = 120;
$params{DEBUG} = 0;
$params{MBOXFORMAT} = "unix";
$params{USER} = "";
$params{PASS} = "";


# Options parsing:

while (@ARGV && ($ARGV[0] =~ /^-/)) {
  my $opt = shift;
  if ($opt eq "-s") {
    # silent
    close(STDOUT);
    open(STDOUT, ">/dev/null");
  } elsif ($opt eq "-c") {
    die $Usage unless (@ARGV);
    $config = shift;
  } else {
    die $Usage;
  }
}

die $Usage if (@ARGV);


### Scan the config file ###

my @groups = ();		# Currently selected groups

open(IN, $config) or die "Can't read the config file `$config': $!\n";
while (<IN>) {
  # Skip comments:
  next if (/^\s*#/);
  chomp;
  s/\s+$//;
  s/^\s+//;
  next if ($_ eq "");
  # Handle continuation lines:
  while (s/\\$//) {
    $_ .= <IN>;
    chomp;
    s/^\s+//;
  }
  if (s/^(NNTP|MBOX|TIMEOUT|DEBUG|MBOXFORMAT|USER|PASS)\s+//) {
    $params{$1} = $_;
  } elsif (s/^SELECT\s+//) {
    @groups = split(/\s+/, $_);
    for (@groups) { $ranges{$_} ||= [] };
  } elsif (s/^(WHERE|REQUIRE|UNLESS)\s+//) {
    my $type = $1;
    die "No selected newsgroup(s) on line $. of $config!\n"
      unless (@groups);
    my $re = eval "qr$_";
    die "Badly formatted $type statement on line $. of $config!\n" unless ($re);
    foreach my $group (@groups) {
      push(@{$groups{$group}{$type}}, $re);
    }
  } elsif (/^(\S+):\s*([0-9,-]*)$/) {
    $ranges{$1} = parse_range($2);
  } else {
    die "Unknown command on line $. of $config!\n";
  }
}
close(IN);

# Environment variable overrides config file if present:
$params{NNTP} = $ENV{NNTPSERVER} if defined($ENV{NNTPSERVER});
$params{MBOX} =~ s/~/$HOME/;
if (!-f $params{MBOX}) {
  open(MBOX, ">$params{MBOX}") or die "Cannot write to mailbox $params{MBOX}\n";
  close(MBOX);
}
die "Cannot write to mailbox $params{MBOX}\n" unless (-w $params{MBOX});

print "Running $myname newsreader version $VERSION by $AUTHOR\n";


### Open the link to the NNTP server ###

print "Connecting to NNTP server $params{NNTP}\n";
my $nntp = Net::NNTP->new($params{NNTP});
die "Failed to connect to NNTP server $params{NNTP}\n" unless defined($nntp);

$nntp->debug($params{DEBUG}) if ($params{DEBUG});

$nntp->authinfo($params{USER}, $params{PASS}) if $params{USER};


### Process the groups ###

my ($count, $first, $last, $name);
my ($list, $n, $end, $head, $article, $message, $id);
my $total = 0;
my $saved = 0;

GROUP:
foreach my $group (sort keys %ranges) {
  ($count, $first, $last, $name) = $nntp->group($group);
  if (!defined($count)) {
    warn $nntp->message;
    next GROUP;
  }
  next GROUP unless ($count);

  die "Sanity check: $name ne $group" unless ($name eq $group);
  print "$name:";

  $list = $nntp->listgroup();
  use Data::Dumper;
  if (defined($list)) {
    # Check for empty article list:
    if (!@$list) {
      print "\n";
      next GROUP;
    }
    trim_range($$list[0], $ranges{$name});
    # If $end > 0 then it is the end of the current "seen" range we are in.
    $end = 0;
    foreach $n (@$list) {
      next if ($end && ($n <= $end));
      $end = in_range($n, $ranges{$name});
      next if ($end);
      # Read this article:
      $article = $nntp->article($n);
      # Note: not all numbers returned by listgroup are valid:
      next unless defined($article);
      $total++;
      if (wanted($article, $groups{$name}{WHERE},
		 $groups{$name}{REQUIRE}, $groups{$name}{UNLESS})) {
	print "F";
	save_article($article, $params{MBOX});
      } else {
	print ".";
      }
      update_ranges(\%ranges, $article, $name, $n) if defined($article);
    }

  } else {

    # The LISTGROUP command didn't work.
    # Find the real first article in the group by looking at the header:
    $head = $nntp->head;
    if (!$head) {
      warn "HEAD command failed: " . $nntp->message;
      next GROUP;
    }
    $message = $nntp->message;
    if ($message =~ /^(\d+)\s+(\S+)\s+head$/) {
      $n = $1;
    } else {
      warn "HEAD command returned unexpected result: " . $nntp->message;
      next GROUP;
    }
    trim_range($n, $ranges{$name});
  ARTICLE:
    for (;;) {
      $end = in_range($n, $ranges{$name});
      if ($end > 0) {
	# Skip this article
	print "s";
	if ($end > $n) {
	  # Try to skip to the end of this part of the range:
	  $nntp->nntpstat($end);
	}
      } else {
	$article = $nntp->article($n);
	$total++;
	if (!defined($article)) {
	  warn "ARTICLE $n command failed: " . $nntp->message;
	} elsif (wanted($article, $groups{$name}{WHERE},
			$groups{$name}{REQUIRE}, $groups{$name}{UNLESS})) {
	  print "F";
	  save_article($article, $params{MBOX});
	} else {
	  print ".";
	}
	update_ranges(\%ranges, $article, $name, $n) if defined($article);
      }
      $id = $nntp->next;
      last ARTICLE unless ($id);
      # Get current article number from message:
      $message = $nntp->message;
      if ($message =~ /^(\d+)\s+(\S+)/) {
	$n = $1;
      } else {
	warn "NEXT command returned unexpected result: " . $nntp->message;
	next GROUP;
      }
    } # next ARTICLE
  }

  print "\n";
  update_config($config, \%ranges);
} # next GROUP

my $s;
$s = ($total == 1) ? "" : "s";
print "$myname: Search completed! $total article$s scanned and ";
$s = ($saved == 1) ? "" : "s";
print "$saved article$s saved in\n$params{MBOX}\n";

exit(0);


# Convert a range string to an array ref of pairs:

sub parse_range($) {
  my ($str) = @_;
  $str =~ s/\s+//g;
  my @res;
  local $_;
  for (split(/,/, $str)) {
    if (/^(\d+)-(\d+)$/) {
      push(@res, [$1, $2]);
    } elsif (/^-(\d+)$/) {
      push(@res, [1, $1]);
    } elsif (/^\d+$/) {
      push(@res, [$_, $_]);
    } else {
      die "Syntax error in range `$_' on line $. of $config!\n";
    }
  }
  return(\@res);
}


# Sort a range and merge adjacent pairs:

sub sort_merge($) {
  my ($range) = @_;
  my @range = sort { $$a[0] <=> $$b[0] } @$range;
  return([]) unless @range;
  my @new = (shift(@range));
  while (@range) {
    if ($new[$#new][1] + 1 >= $range[0][0]) {
      $new[$#new][1] = $range[0][1] if ($new[$#new][1] < $range[0][1]);
      shift(@range);
    } else {
      push(@new, shift(@range));
    }
  }
  return(\@new);
}


# Check if given number is in the range,
# if so, then return the upper limit of the pair:

sub in_range($$) {
  my ($n, $range) = @_;
  foreach my $pair (@$range) {
    return($$pair[1]) if (($$pair[0] <= $n) && ($$pair[1] >= $n));
  }
  return(0);
}


# Save an article in a mailbox

sub save_article($$) {
  my ($article, $file) = @_;
  $saved++;
  local *MBOX;
  local $_;
  my ($from, $date);
  my $zone = "GMT";
  for (@$article) {
    if (/^[Ff]rom\s*:.*<([^> ]+)>/) {
      $from = $1;
    } elsif (/^[Ff]rom\s*:\s*(\S+)/) {
      $from = $1;
    } elsif (/^[Dd]ate\s*:\s+(.*)$/) {
      $date = $1;
    } elsif (/^$/) {
      last;
    }
  }
  if (!$from) {
    warn "Could not find Path line in article\n";
    $from = "nobody";
  }
  if (!$date) {
    warn "Could not find Date line in article\n";
    $date = "Mon  1 Jan 00:00:00 1900 GMT";
  } else {
    # Extract the zone, if we can recognise it:
    $zone = $1 if ($date =~ s/([+-]\d{4}( \(?[A-Z]{3,4}\)?)?|\(?[A-Z]{3,4}\)?)$//);
    $zone =~ s/ \(?[A-Z]{3}\)?//;
    if (!defined(str2time($date))) {
      warn "Could not convert <$date> to a time via str2time!\n";
      $date = "Mon  1 Jan 00:00:00 1900 GMT";
    } else {
      $date = localtime(str2time($date)) . " $zone";
    }
  }
  open(MBOX, ">>$file") or die "Can't append to `$file': $!\n";
  flock(MBOX, LOCK_EX);
  seek(MBOX, 0, 2); # in case someone appended while we were waiting...
  if ($params{MBOXFORMAT} =~ /^unix$/i) {
    print MBOX "From $from  $date\n";
    print MBOX @$article;
    print MBOX "\n";
  } elsif ($params{MBOXFORMAT} =~ /^elm$/i) {
    print MBOX "From $from  $date\n";
    print MBOX @$article;
  } elsif ($params{MBOXFORMAT} =~ /^mmdf$/i) {
    print MBOX "\001\001\001\001\n";
    print MBOX @$article;
    print MBOX "\001\001\001\001\n";
  } else {
    die "Unknown mailbox format: $params{MBOXFORMAT}\n";
  }
  flock(MBOX, LOCK_UN);
  close(MBOX);
}


# Decide if we want this article:

sub wanted($$$$) {
  my ($article, $where, $require, $unless) = @_;
  # Any UNLESS match means we don't want this article:
  foreach my $code (@$unless) {
    return(0) if grep { /$code/ } @$article;
  }
  # All the REQUIRE patterns must match:
  foreach my $code (@$require) {
    return(0) unless grep { /$code/ } @$article;
  }
  # If there are no WHERE patterns at all, then assume we want everything else:
  return(1) unless defined($where) && (@$where);
  # Any WHERE pattern means that we want this article:
  foreach my $code (@$where) {
    return(1) if grep { /$code/ } @$article;
  }
  # If no WHERE patterns match, then we don't want it:
  return(0);
}


sub update_ranges($$$$) {
  my ($ranges, $article, $name, $n) = @_;
  my ($xref) = grep { /^Xref: / } @$article;
  my ($group, $num);
  my $found = 0;
  warn "Article $name:$n has no Xref header!\n" unless ($xref);
  $xref =~ s/^Xref: \S+\s+//;
  foreach my $pair (split(/\s+/, $xref)) {
    ($group, $num) = ($pair =~ m/^(\S+):(\d+)$/) or next;
    $found++ if (($group eq $name) && ($num == $n));
    next unless ($$ranges{$group});
    $$ranges{$group} = insert_range($$ranges{$group}, $num);
  }
  if (!$found) {
    #warn "Article's own number $name:$n not in its Xref header!\n";
    $$ranges{$name} = insert_range($$ranges{$name}, $n);
  }
}


sub insert_range($$) {
  my ($range, $n) = @_;
  push(@$range, [$n, $n]);
  return(sort_merge($range));
}


# Update the config file with the new ranges:

sub update_config($$) {
  my ($config, $ranges) = @_;
  local ($_, *IN, *OUT);
  my $save = "$config.save";
  my %done = ();
  rename($config, $save) or die "rename $config -> $save failed: $!\n";
  open(IN, $save) or die "Can't read `$save': $!\n";
  open(OUT, ">$config") or die "Can't write to `$config': $!\n";
  while (<IN>) {
    # Handle continuation lines:
    $_ .= <IN> while (!eof(IN) && /\\$/);
    if (/^(\S+):/ && $$ranges{$1}) {
      print OUT "$1:" . range_to_string($$ranges{$1}) . "\n";
      $done{$1}++;
    } else {
      print OUT;
    }
  }
  foreach my $name (sort keys %$ranges) {
    next if $done{$name};
    print OUT "$name:" . range_to_string($$ranges{$name}) . "\n";
  }
  close(OUT);
  close(IN);
}


# Convert an array ref of pairs to a range string:

sub range_to_string($) {
  my ($range) = @_;
  my $str = "";
  foreach my $pair (@$range) {
    if ($$pair[0] == $$pair[1]) {
      $str .= "$$pair[0],";
    } else {
      $str .= "$$pair[0]-$$pair[1],";
    }
  }
  $str =~ s/,$//;
  return($str);
}

# Delete any pairs below $min in the range:

sub trim_range($$) {
  my ($min, $range) = @_;
  shift(@$range) while ($range && @$range && ($$range[0][1] < $min));
}
