use strict;
-use File::stat;
-use Getopt::Long;
-use HTTP::Status;
-use LWP::UserAgent;
-use Time::Local;
+use Date::Calc::Object; # For date/time math
+use File::Copy; # For moving files
+use File::stat; # For getting mtimes
+use Getopt::Long; # For parsing command-line options
+use HTTP::Status; # For HTTP status codes
+use LWP::UserAgent; # For HTTP client functionality
+use HTTP::Date; # For HTTP-compatible str2time and time2str
-use Date::Parse;
-use Date::Format;
-
-# This has name conflicts with Date::Parse and
-# Date::Format; don't import its names.
-use HTTP::Date ();
##################
# Global variables
##################
-# by default, this is not a test
+# Verbose? (Debugging messages)
my $verbose = 0;
# by default, this is the working directory
# The root URL to fetch files from
my $remoteroot;
+# The time we believe we were last up to date
+my $timeoflastupdate;
+
+# The most recent update we've just processed, for updating the above.
+# We use this rather than $timenow, in case of clock discrepancies
+# between the local and remote systems.
+my $mostrecentupdateprocessed;
+
################################
# process command line arguments
################################
print STDERR <<EOF
Usage: update.pl [options]
- --remoteroot=url Set the remote root URL to mirror from (mandatory)
- --workingdir=path Set the root of the local mirror (default ".")
- --now=timestring Pretend it's currently the specified time
- --list-change-files Just list the change files we would fetch
- --verbose Run in verbose mode
- --help Output this help
+ --remoteroot=url Set the remote root URL to mirror from (mandatory)
+ --workingdir=path Set the root of the local mirror (default ".")
+ --now=timestring Pretend it's currently the specified time
+ --lastupdate=timestring Pretend we were last up-to-date at the specified time
+ --list-change-files Just list the change files we would fetch
+ --verbose Run in verbose mode
+ --help Output this help
EOF
;
exit 1;
}
-GetOptions( "verbose!" => \$verbose,
- "workingdir=s" => \$workingdir,
- "now=s" => sub($) { $timenow = str2time($_[0]); },
- "remoteroot=s" => \$remoteroot,
+GetOptions( "verbose!" => \$verbose,
+ "workingdir=s" => \$workingdir,
+ "now=s" => sub($$) { $timenow = str2time($_[1]) or die "Can't parse argument to --".join('=',@_); }, # handler gets (option name, value); dies if the value is not a parsable time
+ "lastupdate=s" => sub($$) { $timeoflastupdate = str2time($_[1]) or die "Can't parse argument to --".join('=',@_); }, # same handling as --now, but overrides the stored last-update time
+ "remoteroot=s" => \$remoteroot,
"list-change-files!" => \$listchangefiles,
- "help" => sub() { usage(); } );
+ "help" => sub() { usage(); } );
usage() if not defined $remoteroot; # --remoteroot is mandatory: show the usage text when it is missing
-# if the file with the update time has disappeared, alert the admin
-# and use the datestamp on the startpage file ( /en/index.html )
+# Read the time we were last up to date from $lastupdatefile.
+# If that file cannot be opened, fall back to the value of
+# Date::Calc->today()->date2time(). Dies if the stored string
+# cannot be parsed by str2time.
+# TODO: an earlier note here said to alert the admin and use the
+# datestamp on the startpage file ( /en/index.html ) when the file
+# has disappeared; neither is implemented.
sub findLastUpdateTime() {
- open (UPDATETIME, "<", $lastupdatefile) or return $timenow;
+ open (UPDATETIME, "<", $lastupdatefile) or return Date::Calc->today()->date2time();
my $lastupdatetimestr = <UPDATETIME>;
close (UPDATETIME);
chomp ($lastupdatetimestr);
+ my $lastupdatetime = str2time ($lastupdatetimestr);
+ die "Can't parse last update time" if not $lastupdatetime;
-return str2time ($lastupdatetimestr);
+ return $lastupdatetime; # reuse the value validated above instead of parsing a second time
}
-# convert the date into a correctly formatted string
-sub date2ISOstr($) {
- return time2str ("%Y:%m:%dT%T", $_[0]);
-}
+# write the given time into the last update file; when no time is given, the current $timeoflastupdate is written back
+sub saveLastUpdateTime($) {
+ my $time = $_[0];
+ $time = $timeoflastupdate if !defined $time; # nothing newer was passed in: keep the previous stamp
-# convert the date into RFC2616 format
-sub date2HTTPstr($) {
- return time2str ("%Y:%m:%dT%T", $_[0]);
-}
-
-# write the time now into the last update file
-sub saveLastUpdateTime() {
+ print STDERR "Updating timestamp to ".time2str($time)."\n" if $verbose;
open (UPDATETIME, ">", $lastupdatefile) or die "Can't open $lastupdatefile for writing ($!)";
- print UPDATETIME date2ISOstr($timenow);
+ print UPDATETIME time2str($time); # stored as a time2str() string, which findLastUpdateTime reads back with str2time()
close (UPDATETIME);
}
{
my ($fromtime, $totime) = @_;
my @files;
- for (my $time = str2time(time2str("%Y:%m:%dT00:00:00", $fromtime));
- $time < $totime;
- $time += 86400) {
- push @files, time2str("changes%Y%m%d.txt", $time);
+ my $time = Date::Calc->time2date($fromtime);
+ my $maxtime = Date::Calc->time2date($totime);
+ for (; $time <= $maxtime; ++$time) {
+ my ($y,$m,$d) = $time->date;
+ push @files, sprintf("changes%04d%02d%02d.txt",$y,$m,$d);
}
return @files;
}
}
}
-# get a file, optionally saving it locally
+# Return the modification time of the named file, as reported by
+# the File::stat object; dies if the file cannot be stat()ed.
+sub getmtime($) {
+ my ($path) = @_;
+ my $info = stat($path) or die "Can't stat $path ($!)";
+ return $info->mtime;
+}
+
+# get a file, optionally saving it locally.
+# if a local filename is given, the download is written to
+# "<localfile>.part" and moved into place on success; return:
+# undef if not found on the server
+# 1 if found but not updated since local version
+# 2 if found and more recent than local version
+#
+# if no local filename is given, return the content of the
+# file, or undef if it was not found on the server
+#
+# on all other errors, die
sub fetchFile($;$) {
my ($remotefile, $localfile) = @_;
my $req = new HTTP::Request(GET => "$remotefile");
- if ($localfile and -e $localfile) {
- # Don't fetch unless more recent than local copy
- my $stat = stat($localfile);
- $req->header("If-Modified-Since" => HTTP::Date::time2str($stat->mtime));
+ if ($localfile) {
+ if (-e $localfile) {
+ # Don't fetch unless more recent than local copy
+ $req->header("If-Modified-Since" => time2str(getmtime($localfile)));
+ }
+ else {
+ ensureDir(dirPart($localfile));
+ }
}
- my $resp = $ua->request($req);
+ # Only stream the body to a ".part" file when a local filename was
+ # given. Without one, keep the body in memory so that $resp->content
+ # is usable below (otherwise it would be written to a file named
+ # ".part" and the returned content would be empty).
+ my $resp = $localfile ? $ua->request($req, $localfile.".part") : $ua->request($req);
if ($resp->is_success) { # 2xx codes
- my $mtime = HTTP::Date::str2time($resp->header("Last-Modified"));
- if ($localfile) {
- if ($verbose) {
- print STDERR " -> success";
- print STDERR "; mtime ".time2str("%c", $mtime) if $mtime;
- print STDERR "\n";
- }
- ensureDir(dirPart($localfile));
-
- open (LOCAL, ">", "$localfile") or die "Can't open $localfile for writing ($!)";
- print LOCAL $resp->content or die "Error writing $localfile ($!)";
- close LOCAL or die "Error writing $localfile ($!)";
+ my $mtime = str2time($resp->header("Last-Modified"));
+ if ($verbose) {
+ print STDERR " -> success";
+ print STDERR "; mtime ".time2str($mtime) if $mtime;
+ print STDERR "\n";
+ }
+ if ($localfile) {
if ($mtime) {
- utime $mtime, $mtime, $localfile;
+ utime $mtime, $mtime, $localfile.".part";
}
+ move($localfile.".part", $localfile) or die "Can't move $localfile into place";
}
- return $resp->content;
+ return $localfile ? 2 : $resp->content;
}
elsif ($resp->is_redirect) { # 3xx codes
if ($resp->code == RC_NOT_MODIFIED) { # 304
print STDERR " -> not modified\n" if $verbose;
- open (LOCAL, "<", "$localfile") or die "Can't open $localfile ($!)";
- local $/; # slurp whole file
- my $content = <LOCAL>;
- close LOCAL;
- return $content;
+ die "Got 304 with no local file" if not $localfile;
+ return 1;
}
print STDERR " -> redirect (".$resp->code.")\n" if $verbose;
die "Can't fetch $remotefile (got redirect, not yet handled)";
}
else {
+ if ($resp->code == RC_NOT_FOUND) { # 404
+ print STDERR " -> not found\n" if $verbose;
+ return undef;
+ }
+
print STDERR " -> failed (".$resp->code.")\n" if $verbose;
die "Can't fetch $remotefile (".$resp->status_line.")";
}
}
+# Raise the referenced timestamp to the new time when it is
+# still unset or older than the new time; otherwise leave it alone.
+sub updatestamp(\$$) {
+ my ($slot, $candidate) = @_;
+ if (!defined $$slot) { $$slot = $candidate; }
+ elsif ($candidate > $$slot) { $$slot = $candidate; }
+}
+
##################
# the program flow
##################
-# first work out when the last time we were up to date is and
-# find present time.
-my $timeoflastupdate = findLastUpdateTime();
+# first work out when the last time we were up to date is, if
+# it wasn't overridden on the command line
+$timeoflastupdate = findLastUpdateTime() if !defined $timeoflastupdate; # already defined when --lastupdate was given
if ($verbose) {
- print STDERR "timenow is ".time2str("%c",$timenow)." \n";
- print STDERR "timeoflastupdate is ".time2str("%c",$timeoflastupdate)." \n\n";
+ print STDERR "timenow is ".time2str($timenow)." \n";
+ print STDERR "timeoflastupdate is ".time2str($timeoflastupdate)." \n\n";
}
# Now we know which days' changes we need to get from the server
my @changesfiles = getChangesFileList($timeoflastupdate, $timenow);
-if ($verbose or $listchangefiles) {
+if ($verbose) { # debugging output goes to STDERR
foreach my $file (@changesfiles) { print STDERR "using changes file $file\n"; }
- exit 0 if $listchangefiles;
}
-# get the changes files
-my %changesfilecontent;
-foreach my $file (@changesfiles) { $changesfilecontent{$file} = getChangesFile($file); }
-
-# if the file has not changed (response code 304) then ignore it
+if ($listchangefiles) { # --list-change-files: print the names on STDOUT, one per line, and stop here
+ foreach my $file (@changesfiles) { print "$file\n"; }
+ exit 0;
+}
-# iterate over all the fetched files, building up a list of files
+# fetch each changes file in turn, building up a list of files
# to fetch/delete
my %files; # maps path => operation; a later entry for the same path overwrites an earlier one
-foreach my $changes (@changesfiles) {
- my $date = $changes;
- $date =~ s{^(?:.*/)?changes([0-9]+)\.txt$}{$1} or die "Can't extract date from changes filename $changes";
+foreach my $changesfile (@changesfiles)
+{
+ my $rv = getChangesFile($changesfile); # NOTE(review): getChangesFile is defined outside this view; $rv is presumably fetchFile's undef/1/2 result - confirm
+
+ # If the file isn't there, ignore it
+ if (! -e "$changesdir/$changesfile") {
+ print STDERR "Skipping changes file $changesfile; not present\n" if $verbose;
+ next;
+ }
+
+ my $date = $changesfile;
+ $date =~ s{^(?:.*/)?changes([0-9]+)\.txt$}{$1} or die "Can't extract date from changes filename $changesfile";
+
+ # The file exists, set most recent update to at least "YYYY-MM-DD 00:00:00"
+ updatestamp($mostrecentupdateprocessed, str2time("$date 00:00:00"));
- my @changes = split /[\r\n]+/, $changesfilecontent{$changes};
- foreach my $change (@changes) {
+ # If the file has not changed (response code 304) then ignore it
+ # Also check mtime against "last update" time so that we won't ignore
+ # it if previous runs have been failing
+ if ((!$rv or $rv < 2) and getmtime("$changesdir/$changesfile") < $timeoflastupdate) {
+ print STDERR "Skipping changes file $changesfile; not changed since last run\n" if $verbose;
+ next;
+ }
+
+ print STDERR "Processing changes file $changesfile\n" if $verbose;
+
+ open (CHANGES, "<", "$changesdir/$changesfile") or die "Can't open $changesfile";
+ while (my $change = <CHANGES>) {
my ($time, $op, $path) = split ' ', $change; # whitespace-separated fields: time, operation, path
# Ignore malformed lines, especially wacky paths that could be malicious
if ($time =~ /[^0-9:]/) {
- die "Invalid time $time in $changes";
+ die "Invalid time $time in $changesfile";
}
if ($path =~ m{(?:^|/)\.\.(?:/|$)}) {
- die "Invalid path $path (contains ..) in $changes";
+ die "Invalid path $path (contains ..) in $changesfile";
}
# Strip scheme and host from absolute URLs
# Ignore changes prior to $timeoflastupdate
next if $time < $timeoflastupdate;
+ # Update timestamp
+ updatestamp($mostrecentupdateprocessed, $time);
+
$files{$path} = $op;
print STDERR "Marked $path as '$op'\n" if $verbose;
}
}
elsif ($op eq "add" or $op eq "change" or $op eq "Modification") {
# add/change: re-fetch the file
- my $content = fetchFile("$remoteroot/$file","$workingdir/$file");
+ fetchFile("$remoteroot/$file","$workingdir/$file") or die "File $remoteroot/$file not found";
}
else {
die "Unknown operation '$op'";
# update the last "up-to-date" time
-saveLastUpdateTime();
+saveLastUpdateTime($mostrecentupdateprocessed);
# finish
exit 0;