3 # This script is to get the changes needed to update the
4 # files on a mir mirror.
6 # Full details (among other stuff) at
7 # http://lists.indymedia.org/pipermail/imc-uk-tech/2006-May/0531-ib.html
11 # Mir will simply publish a list of file changes. It will produce one
12 # file per day. The file will be publicly viewable on the web at
14 # /changes/changesYYYYMMDD.txt
16 # So on May 28th, 2006, the file name would be
18 # /changes/changes20060528.txt
20 # The format of the file will also be simple. There will be three
21 # columns. First will be the time, then (add/change/delete) depending
22 # whether the file is new, has been updated or deleted, and then the full
23 # relative path of the file. So an example line would be
25 # 2006:01:24T09:08:17 add /en/2006/05/341547.html
27 # This will have to be done for HTML files, media files and the include
32 use Date::Calc::Object; # For date/time math
33 use File::Copy; # For moving files
34 use File::stat; # For getting mtimes
35 use Getopt::Long; # For parsing command-line options
36 use HTTP::Status; # For HTTP status codes
37 use LWP::UserAgent; # For HTTP client functionality
38 use HTTP::Date; # For HTTP-compatible str2time and time2str
45 # Verbose? (Debugging messages)
48 # by default, this is the working directory
51 # get the time and date - using UTC
54 # if this is 1, we only list the change files that are needed and stop
55 # otherwise we do our normal processing
56 my $listchangefiles = 0;
58 # The root URL to fetch files from
61 # The time we believe we were last up to date
64 # The most recent update we've just processed, for updating the above.
65 # We use this rather than $timenow, in case of clock discrepancies
66 # between the local and remote systems.
67 my $mostrecentupdateprocessed;
69 ################################
70 # process command line arguments
71 ################################
75 Usage: update.pl [options]
77 --remoteroot=url Set the remote root URL to mirror from (mandatory)
78 --workingdir=path Set the root of the local mirror (default ".")
79 --now=timestring Pretend it's currently the specified time
80 --lastupdate=timestring Pretend we were last up-to-date at the specified time
81 --list-change-files Just list the change files we would fetch
82 --verbose Run in verbose mode
83 --help Output this help
90 GetOptions( "verbose!" => \$verbose,
91 "workingdir=s" => \$workingdir,
92 "now=s" => sub($$) { $timenow = str2time($_[1]) or die "Can't parse argument to --".join('=',@_); },
93 "lastupdate=s" => sub($$) { $timeoflastupdate = str2time($_[1]) or die "Can't parse argument to --".join('=',@_); },
94 "remoteroot=s" => \$remoteroot,
95 "list-change-files!" => \$listchangefiles,
96 "help" => sub() { usage(); } );
98 usage() if not defined $remoteroot;
100 # the directory where all files with state are kept
101 my $statedir = "$workingdir/state";
102 if (! -e $statedir) {
103 mkdir $statedir or die "Can't create $statedir";
106 # the directory where changes files are kept
107 my $changesdir = "$workingdir/changes";
109 # the file we want with the last time in it is
110 my $lastupdatefile = "$statedir/lastupdate.txt";
112 # where we get changes files from
113 my $changesroot = "$remoteroot/changes";
115 # LWP user agent for fetching files
116 # keep_alive is important, to avoid the overhead of
117 # establishing a new connection for each file we fetch
118 my $ua = new LWP::UserAgent( agent => "MirMirror/0.1",
126 # find the last update time
127 # if the file with the update time has disappeared, alert the admin
128 # and use the datestamp on the startpage file ( /en/index.html )
129 sub findLastUpdateTime() {
130 open (UPDATETIME, "<", $lastupdatefile) or return Date::Calc->today()->date2time();
131 my $lastupdatetimestr = <UPDATETIME>;
134 chomp ($lastupdatetimestr);
135 my $lastupdatetime = str2time ($lastupdatetimestr);
136 die "Can't parse last update time" if not $lastupdatetime;
137 return str2time ($lastupdatetimestr);
140 # write the given time into the last update file
141 sub saveLastUpdateTime($) {
143 $time = $timeoflastupdate if !defined $time;
145 print STDERR "Updating timestamp to ".time2str($time)."\n" if $verbose;
146 open (UPDATETIME, ">", $lastupdatefile) or die "Can't open $lastupdatefile for writing ($!)";
147 print UPDATETIME time2str($time);
151 # return an array of filename
152 sub getChangesFileList($$)
154 my ($fromtime, $totime) = @_;
156 my $time = Date::Calc->time2date($fromtime);
157 my $maxtime = Date::Calc->time2date($totime);
158 for (; $time <= $maxtime; ++$time) {
159 my ($y,$m,$d) = $time->date;
160 push @files, sprintf("changes%04d%02d%02d.txt",$y,$m,$d);
165 # get the directory part of a filename
168 $dir =~ s{/+[^/]*$}{};
172 # ensure the given directory exists (like mkdir -p)
176 my $parent = dirPart($dir);
177 &ensureDir($parent) if ($parent);
178 mkdir $dir or die "Can't create directory $dir ($!)";
182 # get the mtime of a file
185 my $stat = stat($file);
186 die "Can't stat $file ($!)" if !$stat;
190 # get a file, optionally saving it locally.
191 # if a local filename is given, return:
192 # undef if not found on the server
193 # 1 if found but not updated since local version
194 # 2 if found and more recent than local version
196 # if no local filename is given, return the content of the
197 # file, or undef if it was not found on the server
199 # on all other errors, die
201 my ($remotefile, $localfile) = @_;
204 print STDERR "fetching $remotefile";
205 print STDERR " as $localfile" if $localfile;
208 my $req = new HTTP::Request(GET => "$remotefile");
212 # Don't fetch unless more recent than local copy
213 $req->header("If-Modified-Since" => time2str(getmtime($localfile)));
216 ensureDir(dirPart($localfile));
219 my $resp = $ua->request($req, $localfile.".part");
220 if ($resp->is_success) { # 2xx codes
221 my $mtime = str2time($resp->header("Last-Modified"));
223 print STDERR " -> success";
224 print STDERR "; mtime ".time2str($mtime) if $mtime;
230 utime $mtime, $mtime, $localfile.".part";
232 move($localfile.".part", $localfile) or die "Can't move $localfile into place";
234 return $localfile ? 2 : $resp->content;
236 elsif ($resp->is_redirect) { # 3xx codes
237 if ($resp->code == RC_NOT_MODIFIED) { # 304
238 print STDERR " -> not modified\n" if $verbose;
239 die "Got 304 with no local file" if not $localfile;
242 print STDERR " -> redirect (".$resp->code.")\n" if $verbose;
243 die "Can't fetch $remotefile (got redirect, not yet handled)";
246 if ($resp->code == RC_NOT_FOUND) { # 404
247 print STDERR " -> not found\n" if $verbose;
251 print STDERR " -> failed (".$resp->code.")\n" if $verbose;
252 die "Can't fetch $remotefile (".$resp->status_line.")";
257 sub getChangesFile($) {
258 return fetchFile($changesroot."/".$_[0], $changesdir."/".$_[0]);
262 # update the timestamp if the new one is more recent
263 sub updatestamp(\$$) {
264 my ($stampref, $newtime) = @_;
265 $$stampref = $newtime if (!defined $$stampref or $newtime > $$stampref);
273 # first work out when the last time we were up to date is, if
274 # it wasn't overridden on the command line
275 $timeoflastupdate = findLastUpdateTime() if !defined $timeoflastupdate;
278 print STDERR "timenow is ".time2str($timenow)." \n";
279 print STDERR "timeoflastupdate is ".time2str($timeoflastupdate)." \n\n";
282 # Now we know which days' changes we need to get from the server
283 my @changesfiles = getChangesFileList($timeoflastupdate, $timenow);
285 foreach my $file (@changesfiles) { print STDERR "using changes file $file\n"; }
288 if ($listchangefiles) {
289 foreach my $file (@changesfiles) { print "$file\n"; }
293 # fetch each changes file in turn, building up a list of files
296 foreach my $changesfile (@changesfiles)
298 my $rv = getChangesFile($changesfile);
300 # If the file isn't there, ignore it
301 if (! -e "$changesdir/$changesfile") {
302 print STDERR "Skipping changes file $changesfile; not present\n" if $verbose;
306 my $date = $changesfile;
307 $date =~ s{^(?:.*/)?changes([0-9]+)\.txt$}{$1} or die "Can't extract date from changes filename $changesfile";
309 # The file exists, set most recent update to at lease "YYYY-MM-DD 00:00:00"
310 updatestamp($mostrecentupdateprocessed, str2time("$date 00:00:00"));
312 # If the file has not changed (response code 304) then ignore it
313 # Also check mtime against "last update" time so that we won't ignore
314 # it if previous runs have been failing
315 if ((!$rv or $rv < 2) and getmtime("$changesdir/$changesfile") < $timeoflastupdate) {
316 print STDERR "Skipping changes file $changesfile; not changed since last run\n" if $verbose;
320 print STDERR "Processing changes file $changesfile\n" if $verbose;
322 open (CHANGES, "<", "$changesdir/$changesfile") or die "Can't open $changesfile";
323 while (my $change = <CHANGES>) {
324 my ($time, $op, $path) = split ' ', $change;
326 # Ignore malformed lines, especially wacky paths that could be malicious
327 if ($time =~ /[^0-9:]/) {
328 die "Invalid time $time in $changesfile";
331 if ($path =~ m{(?:^|/)\.\.(?:/|$)}) {
332 die "Invalid path $path (contains ..) in $changesfile";
335 # Strip scheme and host from absolute URLs
336 $path =~ s{^[a-z]+://[a-z0-9\-\.]+/}{/};
338 # Combine time with date and parse
339 $time = str2time("$date $time");
340 if (not defined $time) {
341 die "Failed to parse datetime '$date $time'";
344 # Ignore changes prior to $timeoflastupdate
345 next if $time < $timeoflastupdate;
348 updatestamp($mostrecentupdateprocessed, $time);
351 print STDERR "Marked $path as '$op'\n" if $verbose;
355 # Fetch all files whose last operation was "add" or "change"
356 # Delete all files whose last operation was "delete"
357 while (my ($file, $op) = each %files) {
358 if ($op eq "delete") {
359 if (-e "$workingdir/$file") {
360 # delete: if the file exists, remove it
361 print STDERR "deleting $workingdir/$file\n" if $verbose;
362 unlink "$workingdir/$file" or die "Can't delete $workingdir/$file ($!)";
365 print STDERR "not deleting $workingdir/$file beacuse it doesn't exist\n" if $verbose;
368 elsif ($op eq "add" or $op eq "change" or $op eq "Modification") {
369 # add/change: re-fetch the file
370 fetchFile("$remoteroot/$file","$workingdir/$file") or die "File $remoteroot/$file not found";
373 die "Unknown operation '$op'";
378 # update the last "up-to-date" time
379 saveLastUpdateTime($mostrecentupdateprocessed);