3 # This script is to get the changes needed to update the
4 # files on a mir mirror.
6 # Full details (among other stuff) at
7 # http://lists.indymedia.org/pipermail/imc-uk-tech/2006-May/0531-ib.html
11 # Mir will simply publish a list of file changes. It will produce one
12 # file per day. The file will be publicly viewable on the web at
14 # /changes/changesYYYYMMDD.txt
16 # So on May 28th, 2006, the file name would be
18 # /changes/changes20060528.txt
20 # The format of the file will also be simple. There will be three
21 # columns. First will be the time, then (add/change/delete) depending
22 # whether the file is new, has been updated or deleted, and then the full
23 # relative path of the file. So an example line would be
25 # 2006:01:24T09:08:17 add /en/2006/05/341547.html
27 # This will have to be done for HTML files, media files and the include
32 use Date::Calc::Object; # For date/time math
33 use File::stat; # For getting mtimes
34 use Getopt::Long; # For parsing command-line options
35 use HTTP::Status; # For HTTP status codes
36 use LWP::UserAgent; # For HTTP client functionality
37 use HTTP::Date; # For HTTP-compatible str2time and time2str
44 # Verbose? (Debugging messages)
47 # by default, this is the working directory
50 # get the time and date - using UTC
53 # if this is 1, we only list the change files that are needed and stop
54 # otherwise we do our normal processing
55 my $listchangefiles = 0;
57 # The root URL to fetch files from
60 ################################
61 # process command line arguments
62 ################################
66 Usage: update.pl [options]
68 --remoteroot=url Set the remote root URL to mirror from (mandatory)
69 --workingdir=path Set the root of the local mirror (default ".")
70 --now=timestring Pretend it's currently the specified time
71 --list-change-files Just list the change files we would fetch
72 --verbose Run in verbose mode
73 --help Output this help
80 GetOptions( "verbose!" => \$verbose,
81 "workingdir=s" => \$workingdir,
82 "now=s" => sub($) { $timenow = str2time($_[0]); },
83 "remoteroot=s" => \$remoteroot,
84 "list-change-files!" => \$listchangefiles,
85 "help" => sub() { usage(); } );
87 usage() if not defined $remoteroot;
89 # the directory where all files with state are kept
90 my $statedir = "$workingdir/state";
92 mkdir $statedir or die "Can't create $statedir";
95 # the directory where changes files are kept
96 my $changesdir = "$workingdir/changes";
98 # the file we want with the last time in it is
99 my $lastupdatefile = "$statedir/lastupdate.txt";
101 # where we get changes files from
102 my $changesroot = "$remoteroot/changes";
104 # LWP user agent for fetching files
105 # keep_alive is important, to avoid the overhead of
106 # establishing a new connection for each file we fetch
107 my $ua = new LWP::UserAgent( agent => "MirMirror/0.1",
115 # find the last update time
116 # if the file with the update time has disappeared, alert the admin
117 # and use the datestamp on the startpage file ( /en/index.html )
118 sub findLastUpdateTime() {
119 open (UPDATETIME, "<", $lastupdatefile) or return Date::Calc->today()->date2time();
120 my $lastupdatetimestr = <UPDATETIME>;
123 chomp ($lastupdatetimestr);
124 my $lastupdatetime = str2time ($lastupdatetimestr);
125 die "Can't parse last update time" if not $lastupdatetime;
126 return str2time ($lastupdatetimestr);
129 # write the time now into the last update file
130 sub saveLastUpdateTime() {
131 open (UPDATETIME, ">", $lastupdatefile) or die "Can't open $lastupdatefile for writing ($!)";
132 print UPDATETIME time2str($timenow);
136 # return an array of filename
137 sub getChangesFileList($$)
139 my ($fromtime, $totime) = @_;
141 my $time = Date::Calc->time2date($fromtime);
142 my $maxtime = Date::Calc->time2date($totime);
143 for (; $time <= $maxtime; ++$time) {
144 my ($y,$m,$d) = $time->date;
145 push @files, sprintf("changes%04d%02d%02d.txt",$y,$m,$d);
150 # get the directory part of a filename
153 $dir =~ s{/+[^/]*$}{};
157 # ensure the given directory exists (like mkdir -p)
161 my $parent = dirPart($dir);
162 &ensureDir($parent) if ($parent);
163 mkdir $dir or die "Can't create directory $dir ($!)";
167 # get a file, optionally saving it locally.
168 # if a local filename is given, return true if the file was
169 # found on the server and undef otherwise
171 # if no local filename is given, return the content of the
172 # file, or undef if it was not found on the server
174 # on all other errors, die
176 my ($remotefile, $localfile) = @_;
179 print STDERR "fetching $remotefile";
180 print STDERR " as $localfile" if $localfile;
183 my $req = new HTTP::Request(GET => "$remotefile");
187 # Don't fetch unless more recent than local copy
188 my $stat = stat($localfile);
189 $req->header("If-Modified-Since" => time2str($stat->mtime));
192 ensureDir(dirPart($localfile));
195 my $resp = $ua->request($req, $localfile);
196 if ($resp->is_success) { # 2xx codes
197 my $mtime = str2time($resp->header("Last-Modified"));
199 print STDERR " -> success";
200 print STDERR "; mtime ".time2str($mtime) if $mtime;
204 if ($localfile and $mtime) {
205 utime $mtime, $mtime, $localfile;
207 return $localfile ? 1 : $resp->content;
209 elsif ($resp->is_redirect) { # 3xx codes
210 if ($resp->code == RC_NOT_MODIFIED) { # 304
211 print STDERR " -> not modified\n" if $verbose;
212 die "Got 304 with no local file" if not $localfile;
215 print STDERR " -> redirect (".$resp->code.")\n" if $verbose;
216 die "Can't fetch $remotefile (got redirect, not yet handled)";
219 if ($resp->code == RC_NOT_FOUND) { # 404
220 print STDERR " -> not found\n" if $verbose;
224 print STDERR " -> failed (".$resp->code.")\n" if $verbose;
225 die "Can't fetch $remotefile (".$resp->status_line.")";
230 sub getChangesFile($) {
231 return fetchFile($changesroot."/".$_[0], $changesdir."/".$_[0]);
240 # first work out when the last time we were up to date is and
242 my $timeoflastupdate = findLastUpdateTime();
245 print STDERR "timenow is ".time2str($timenow)." \n";
246 print STDERR "timeoflastupdate is ".time2str($timeoflastupdate)." \n\n";
249 # Now we know which days' changes we need to get from the server
250 my @changesfiles = getChangesFileList($timeoflastupdate, $timenow);
252 foreach my $file (@changesfiles) { print STDERR "using changes file $file\n"; }
255 if ($listchangefiles) {
256 foreach my $file (@changesfiles) { print "$file\n"; }
260 # get the changes files
261 foreach my $file (@changesfiles) { getChangesFile($file); }
263 # TODO: if the file has not changed (response code 304) then ignore it
265 # iterate over all the fetched files, building up a list of files
268 foreach my $changesfile (@changesfiles) {
269 if (! -e "$changesdir/$changesfile") {
270 print STDERR "Skipping changes file $changesfile; not present\n" if $verbose;
274 my $date = $changesfile;
275 $date =~ s{^(?:.*/)?changes([0-9]+)\.txt$}{$1} or die "Can't extract date from changes filename $changesfile";
277 print STDERR "Processing changes file $changesfile\n" if $verbose;
279 open (CHANGES, "<", "$changesdir/$changesfile") or die "Can't open $changesfile";
280 while (my $change = <CHANGES>) {
281 my ($time, $op, $path) = split ' ', $change;
283 # Ignore malformed lines, especially wacky paths that could be malicious
284 if ($time =~ /[^0-9:]/) {
285 die "Invalid time $time in $changesfile";
288 if ($path =~ m{(?:^|/)\.\.(?:/|$)}) {
289 die "Invalid path $path (contains ..) in $changesfile";
292 # Strip scheme and host from absolute URLs
293 $path =~ s{^[a-z]+://[a-z0-9\-\.]+/}{/};
295 # Combine time with date and parse
296 $time = str2time("$date $time");
297 if (not defined $time) {
298 die "Failed to parse datetime '$date $time'";
301 # Ignore changes prior to $timeoflastupdate
302 next if $time < $timeoflastupdate;
305 print STDERR "Marked $path as '$op'\n" if $verbose;
309 # Fetch all files whose last operation was "add" or "change"
310 # Delete all files whose last operation was "delete"
311 while (my ($file, $op) = each %files) {
312 if ($op eq "delete") {
313 if (-e "$workingdir/$file") {
314 # delete: if the file exists, remove it
315 print STDERR "deleting $workingdir/$file\n" if $verbose;
316 unlink "$workingdir/$file" or die "Can't delete $workingdir/$file ($!)";
319 print STDERR "not deleting $workingdir/$file beacuse it doesn't exist\n" if $verbose;
322 elsif ($op eq "add" or $op eq "change" or $op eq "Modification") {
323 # add/change: re-fetch the file
324 fetchFile("$remoteroot/$file","$workingdir/$file") or die "File $remoteroot/$file not found";
327 die "Unknown operation '$op'";
332 # update the last "up-to-date" time
333 saveLastUpdateTime();