#!/usr/bin/perl # Name: newsgrep [-s] [-c config_file] # Version 0.5 # Author: Martin Ward # Email: martin@gkc.org.uk # Changelog # 0.1: Initial version # 0.2: Various bug fixes and enhancements # 0.3: Check for article's own group:number not appearing in Xref header # 0.4: Allow four letters in the timezone (eg CEST or MESZ) # 0.5: Allow USER and PASS parameters for authentication use strict; use warnings; use Net::NNTP; use Date::Parse; use Fcntl ':flock'; # import LOCK_* constants sub parse_range($); sub sort_merge($); sub in_range($$); sub save_article($$); sub wanted($$$$); sub update_ranges($$$$); sub insert_range($$); sub update_config($$); sub range_to_string($); sub trim_range($$); $| = 1; my $VERSION = "0.5"; my $AUTHOR = 'martin@gkc.org.uk'; my $HOME = $ENV{'HOME'} || $ENV{'LOGDIR'} || (getpwuid($<))[7] || die "You're homeless!\n"; (my $myname = $0) =~ s|(.*/)*||; # strip path component from name my $Usage = "Usage: $myname [-s] [-c config_file]\n"; my %params = (); # Group-independent parameters my %groups = (); # Parameters for each selected group my %ranges = (); # List of article number sequences for each group # news.group: n1-m1,n2-m2,... # is stored as: # $ranges{"news.group"} = [[n1, m1], [n2, m2] ...] # where ni <= mi # Default values: my $config = "$HOME/.newsgreprc"; $params{NNTP} = "localhost"; $params{MBOX} = "$HOME/NEWS"; $params{TIMEOUT} = 120; $params{DEBUG} = 0; $params{MBOXFORMAT} = "unix"; $params{USER} = ""; $params{PASS} = ""; # Options parsing: while (@ARGV && ($ARGV[0] =~ /^-/)) { my $opt = shift; if ($opt eq "-s") { # silent close(STDOUT); open(STDOUT, ">/dev/null"); } elsif ($opt eq "-c") { die $Usage unless (@ARGV); $config = shift; } else { die $Usage; } } die $Usage if (@ARGV); ### Scan the config file ### my @groups = (); # Currently selected groups open(IN, $config) or die "Can't read the config file `$config': $!\n"; while () { # Skip comments: next if (/^\s*#/); chomp; s/\s+$//; s/^\s+//; next if ($_ eq ""); # Handle continuation lines: while (s/\\$//) { $_ .= ; chomp; s/^\s+//; } if (s/^(NNTP|MBOX|TIMEOUT|DEBUG|MBOXFORMAT|USER|PASS)\s+//) { $params{$1} = $_; } elsif (s/^SELECT\s+//) { @groups = split(/\s+/, $_); for (@groups) { $ranges{$_} ||= [] }; } elsif (s/^(WHERE|REQUIRE|UNLESS)\s+//) { my $type = $1; die "No selected newsgroup(s) on line $. of $config!\n" unless (@groups); my $re = eval "qr$_"; die "Badly formatted $type statement on line $. of $config!\n" unless ($re); foreach my $group (@groups) { push(@{$groups{$group}{$type}}, $re); } } elsif (/^(\S+):\s*([0-9,-]*)$/) { $ranges{$1} = parse_range($2); } else { die "Unknown command on line $. of $config!\n"; } } close(IN); # Environment variable overrides config file if present: $params{NNTP} = $ENV{NNTPSERVER} if defined($ENV{NNTPSERVER}); $params{MBOX} =~ s/~/$HOME/; if (!-f $params{MBOX}) { open(MBOX, ">$params{MBOX}") or die "Cannot write to mailbox $params{MBOX}\n"; close(MBOX); } die "Cannot write to mailbox $params{MBOX}\n" unless (-w $params{MBOX}); print "Running $myname newsreader version $VERSION by $AUTHOR\n"; ### Open the link to the NNTP server ### print "Connecting to NNTP server $params{NNTP}\n"; my $nntp = Net::NNTP->new($params{NNTP}); die "Failed to connect to NNTP server $params{NNTP}\n" unless defined($nntp); $nntp->debug($params{DEBUG}) if ($params{DEBUG}); $nntp->authinfo($params{USER}, $params{PASS}) if $params{USER}; ### Process the groups ### my ($count, $first, $last, $name); my ($list, $n, $end, $head, $article, $message, $id); my $total = 0; my $saved = 0; GROUP: foreach my $group (sort keys %ranges) { ($count, $first, $last, $name) = $nntp->group($group); if (!defined($count)) { warn $nntp->message; next GROUP; } next GROUP unless ($count); die "Sanity check: $name ne $group" unless ($name eq $group); print "$name:"; $list = $nntp->listgroup(); use Data::Dumper; if (defined($list)) { # Check for empty article list: if (!@$list) { print "\n"; next GROUP; } trim_range($$list[0], $ranges{$name}); # If $end > 0 then it is the end of the current "seen" range we are in. $end = 0; foreach $n (@$list) { next if ($end && ($n <= $end)); $end = in_range($n, $ranges{$name}); next if ($end); # Read this article: $article = $nntp->article($n); # Note: not all numbers returned by listgroup are valid: next unless defined($article); $total++; if (wanted($article, $groups{$name}{WHERE}, $groups{$name}{REQUIRE}, $groups{$name}{UNLESS})) { print "F"; save_article($article, $params{MBOX}); } else { print "."; } update_ranges(\%ranges, $article, $name, $n) if defined($article); } } else { # The LISTGROUP command didn't work. # Find the real first article in the group by looking at the header: $head = $nntp->head; if (!$head) { warn "HEAD command failed: " . $nntp->message; next GROUP; } $message = $nntp->message; if ($message =~ /^(\d+)\s+(\S+)\s+head$/) { $n = $1; } else { warn "HEAD command returned unexpected result: " . $nntp->message; next GROUP; } trim_range($n, $ranges{$name}); ARTICLE: for (;;) { $end = in_range($n, $ranges{$name}); if ($end > 0) { # Skip this article print "s"; if ($end > $n) { # Try to skip to the end of this part of the range: $nntp->nntpstat($end); } } else { $article = $nntp->article($n); $total++; if (!defined($article)) { warn "ARTICLE $n command failed: " . $nntp->message; } elsif (wanted($article, $groups{$name}{WHERE}, $groups{$name}{REQUIRE}, $groups{$name}{UNLESS})) { print "F"; save_article($article, $params{MBOX}); } else { print "."; } update_ranges(\%ranges, $article, $name, $n) if defined($article); } $id = $nntp->next; last ARTICLE unless ($id); # Get current article number from message: $message = $nntp->message; if ($message =~ /^(\d+)\s+(\S+)/) { $n = $1; } else { warn "NEXT command returned unexpected result: " . $nntp->message; next GROUP; } } # next ARTICLE } print "\n"; update_config($config, \%ranges); } # next GROUP my $s; $s = ($total == 1) ? "" : "s"; print "$myname: Search completed! $total article$s scanned and "; $s = ($saved == 1) ? "" : "s"; print "$saved article$s saved in\n$params{MBOX}\n"; exit(0); # Convert a range string to an array ref of pairs: sub parse_range($) { my ($str) = @_; $str =~ s/\s+//g; my @res; local $_; for (split(/,/, $str)) { if (/^(\d+)-(\d+)$/) { push(@res, [$1, $2]); } elsif (/^-(\d+)$/) { push(@res, [1, $1]); } elsif (/^\d+$/) { push(@res, [$_, $_]); } else { die "Syntax error in range `$_' on line $. of $config!\n"; } } return(\@res); } # Sort a range and merge adjacent pairs: sub sort_merge($) { my ($range) = @_; my @range = sort { $$a[0] <=> $$b[0] } @$range; return([]) unless @range; my @new = (shift(@range)); while (@range) { if ($new[$#new][1] + 1 >= $range[0][0]) { $new[$#new][1] = $range[0][1] if ($new[$#new][1] < $range[0][1]); shift(@range); } else { push(@new, shift(@range)); } } return(\@new); } # Check if given number is in the range, # if so, then return the upper limit of the pair: sub in_range($$) { my ($n, $range) = @_; foreach my $pair (@$range) { return($$pair[1]) if (($$pair[0] <= $n) && ($$pair[1] >= $n)); } return(0); } # Save an article in a mailbox sub save_article($$) { my ($article, $file) = @_; $saved++; local *MBOX; local $_; my ($from, $date); my $zone = "GMT"; for (@$article) { if (/^[Ff]rom\s*:.*<([^> ]+)>/) { $from = $1; } elsif (/^[Ff]rom\s*:\s*(\S+)/) { $from = $1; } elsif (/^[Dd]ate\s*:\s+(.*)$/) { $date = $1; } elsif (/^$/) { last; } } if (!$from) { warn "Could not find Path line in article\n"; $from = "nobody"; } if (!$date) { warn "Could not find Date line in article\n"; $date = "Mon 1 Jan 00:00:00 1900 GMT"; } else { # Extract the zone, if we can recognise it: $zone = $1 if ($date =~ s/([+-]\d{4}( \(?[A-Z]{3,4}\)?)?|\(?[A-Z]{3,4}\)?)$//); $zone =~ s/ \(?[A-Z]{3}\)?//; if (!defined(str2time($date))) { warn "Could not convert <$date> to a time via str2time!\n"; $date = "Mon 1 Jan 00:00:00 1900 GMT"; } else { $date = localtime(str2time($date)) . " $zone"; } } open(MBOX, ">>$file") or die "Can't append to `$file': $!\n"; flock(MBOX, LOCK_EX); seek(MBOX, 0, 2); # in case someone appended while we were waiting... if ($params{MBOXFORMAT} =~ /^unix$/i) { print MBOX "From $from $date\n"; print MBOX @$article; print MBOX "\n"; } elsif ($params{MBOXFORMAT} =~ /^elm$/i) { print MBOX "From $from $date\n"; print MBOX @$article; } elsif ($params{MBOXFORMAT} =~ /^mmdf$/i) { print MBOX "\001\001\001\001\n"; print MBOX @$article; print MBOX "\001\001\001\001\n"; } else { die "Unknown mailbox format: $params{MBOXFORMAT}\n"; } flock(MBOX, LOCK_UN); close(MBOX); } # Decide if we want this article: sub wanted($$$$) { my ($article, $where, $require, $unless) = @_; # Any UNLESS match means we don't want this article: foreach my $code (@$unless) { return(0) if grep { /$code/ } @$article; } # All the REQUIRE patterns must match: foreach my $code (@$require) { return(0) unless grep { /$code/ } @$article; } # If there are no WHERE patterns at all, then assume we want everything else: return(1) unless defined($where) && (@$where); # Any WHERE pattern means that we want this article: foreach my $code (@$where) { return(1) if grep { /$code/ } @$article; } # If no WHERE patterns match, then we don't want it: return(0); } sub update_ranges($$$$) { my ($ranges, $article, $name, $n) = @_; my ($xref) = grep { /^Xref: / } @$article; my ($group, $num); my $found = 0; warn "Article $name:$n has no Xref header!\n" unless ($xref); $xref =~ s/^Xref: \S+\s+//; foreach my $pair (split(/\s+/, $xref)) { ($group, $num) = ($pair =~ m/^(\S+):(\d+)$/) or next; $found++ if (($group eq $name) && ($num == $n)); next unless ($$ranges{$group}); $$ranges{$group} = insert_range($$ranges{$group}, $num); } if (!$found) { #warn "Article's own number $name:$n not in its Xref header!\n"; $$ranges{$name} = insert_range($$ranges{$name}, $n); } } sub insert_range($$) { my ($range, $n) = @_; push(@$range, [$n, $n]); return(sort_merge($range)); } # Update the config file with the new ranges: sub update_config($$) { my ($config, $ranges) = @_; local ($_, *IN, *OUT); my $save = "$config.save"; my %done = (); rename($config, $save) or die "rename $config -> $save failed: $!\n"; open(IN, $save) or die "Can't read `$save': $!\n"; open(OUT, ">$config") or die "Can't write to `$config': $!\n"; while () { # Handle continuation lines: $_ .= while (!eof(IN) && /\\$/); if (/^(\S+):/ && $$ranges{$1}) { print OUT "$1:" . range_to_string($$ranges{$1}) . "\n"; $done{$1}++; } else { print OUT; } } foreach my $name (sort keys %$ranges) { next if $done{$name}; print OUT "$name:" . range_to_string($$ranges{$name}) . "\n"; } close(OUT); close(IN); } # Convert an array ref of pairs to a range string: sub range_to_string($) { my ($range) = @_; my $str = ""; foreach my $pair (@$range) { if ($$pair[0] == $$pair[1]) { $str .= "$$pair[0],"; } else { $str .= "$$pair[0]-$$pair[1],"; } } $str =~ s/,$//; return($str); } # Delete any pairs below $min in the range: sub trim_range($$) { my ($min, $range) = @_; shift(@$range) while ($range && @$range && ($$range[0][1] < $min)); }