Commit mostly-finished clscan, a tool to scan file copyrights and licenses
[gnulib.git] / debian / clscan / clscan
1 #!/usr/bin/perl -w
2 # Ian Beckwith <ianb@erislabs.net>
3 #
4
5 use strict;
6 use vars qw($me);
7 $me=($0=~/(?:.*\/)?(.*)/)[0];
8
9 use Getopt::Long;
10 use YAML::Any;
11 use Digest::SHA qw(sha256_hex);
12 use File::Find;
13 use File::Copy;
14
15 our $CLSCANDIR="debian/clscan";
16 our $FILESCACHE="$CLSCANDIR/files.yaml";
17 our $NEWFILES="$CLSCANDIR/new.txt";
18 our $COPYRIGHTSTUB="$CLSCANDIR/copyright.in";
19
20 our @filenames=();
21 our $files={};
22 our $new={};
23
24 my $scan=0;
25 my $merge=0;
26 my $writecopyright=0;
27
28 usage() unless(@ARGV);
29 usage() unless GetOptions("scan|s" => \$scan,
30                           "merge|m" => \$merge,
31                           "write|w" => \$writecopyright,
32                           "help|h" => sub { usage(); });
33
34 load_cache();
35 scan() if($scan);
36 merge() if($merge);
37 write_copyright() if ($merge || $writecopyright);
38
39 sub scan
40 {
41     get_filenames();
42     for my $file (@filenames)
43     {
44         scan_file($file);
45     }
46     write_new();
47     find_deleted();
48 }
49
50 sub merge
51 {
52     merge_new();
53     save_cache();
54 }
55
56 sub write_copyright
57 {
58     unless(keys(%$files))
59     {
60         die("$me: no files known, run $0 --scan\n");
61     }
62     unless(copy($COPYRIGHTSTUB, "debian/copyright.new"))
63     {
64         die("$me: cannot copy $COPYRIGHTSTUB to debian/copyright: $!\n");
65     }
66     unless(open(COPYRIGHT, ">>debian/copyright.new"))
67     {
68         die("$me: cannot append to debian/copyright: $!\n");
69     }
70
71     # group files by license/license_text/copyright
72     my $licenses={};
73     for my $file (sort keys(%$files))
74     {
75         my $license=$files->{$file}->{license};
76         my $copyright=$files->{$file}->{copyright};
77         my $license_text=$files->{$file}->{license_text};
78         push(@{$licenses->{$license}->{$license_text}->{$copyright}}, $file);
79     }
80     for my $license (sort keys(%$licenses))
81     {
82         for my $license_text (sort keys(%{$licenses->{$license}}))
83         {
84             for my $copyright (sort keys(%{$licenses->{$license}->{$license_text}}))
85             {
86                 next if(!length($license) && !length($copyright) && !length($license_text));
87                 my @filelist=sort @{$licenses->{$license}->{$license_text}->{$copyright}};
88                 print COPYRIGHT "Files: ", join(', ', @filelist), "\n";
89                 print COPYRIGHT "Copyright: $copyright\n" if length($copyright);
90                 print COPYRIGHT "License: $license\n" if length($license);
91                 if(length($license_text))
92                 {
93                     my @text=split(/\n/, $license_text);
94                     print COPYRIGHT map { "    " . $_ . "\n" } @text;
95                 }
96                 print COPYRIGHT "\n";
97             }
98         }
99     }
100     print COPYRIGHT license_trailer(sort keys(%$licenses));
101     close(COPYRIGHT);
102 }
103
104 sub license_trailer
105 {
106     my @licenses_used=@_;
107     my $license_data = {
108         "Apache-2.0" => "Apache License Version 2.0",
109         "Artistic" => "Artistic License",
110         "GFDL-1.2" => "GNU Free Documentation License Version 1.2",
111         "GFDL-1.3" => "GNU Free Documentation License Version 1.3",
112         "GFDL" => "GNU Free Documentation License",
113         "GPL-2" => "GNU General Public License Version 2",
114         "GPL-3" => "GNU General Public License Version 3",
115         "GPL" => "GNU General Public License",
116         "LGPL-2" => "GNU Library General Public License Version 2",
117         "LGPL-2.1" => "GNU Lesser General Public License Version 2.1",
118         "LGPL-3" => "GNU Lesser General Public License Version 3",
119         "LGPL" => "GNU Lesser General Public License",
120     };
121
122     my %types_found=();
123 TYPE: for my $type (keys(%$license_data))
124     {
125         for my $license (@licenses_used)
126         {
127             if($license =~ /$type(\+|\b)/i)
128             {
129                 $types_found{$type}=1;
130                 # avoid matching eg GPL-2 *and* GPL
131                 next TYPE;
132             }
133         }
134     }
135     my $text="\n";
136     for my $type (sort keys(%types_found))
137     {
138         $text .= "The complete text of the " . $license_data->{$type} ." can be\n";
139         $text .= "found in /usr/share/common-licenses/$type\n";
140     }
141     return $text;
142 }
143
144
145 sub merge_new
146 {
147     unless(open(NEW, $NEWFILES))
148     {
149         die("$me: $NEWFILES: cannot open: $!\n");
150     }
151     my $license='';
152     my $copyright='';
153     my $hash='';
154     my $file='';
155     my $license_text='';
156     my $in_license_text=0;
157     my $line=0;
158     while(<NEW>)
159     {
160         $line++;
161         chomp;
162         next if(/^\s*$/);
163         next if(/^\s*\#/);
164         if($in_license_text && /^\s+(.*)/)
165         {
166             $license_text .= "\n" . $1;
167         }
168         elsif(/^File:\s*(.*)/)
169         {
170             my $newfile=$1;
171             # save previous entry
172             if(length($file) && length($hash))
173             {
174                 $files->{$file}={ hash=>$hash,
175                                   copyright=>$copyright,
176                                   license=>$license,
177                                   license_text=>$license_text };
178             }
179             $file=$newfile;
180             $license='';
181             $copyright='';
182             $hash='';
183             $license_text='';
184         }
185         elsif(/^Hash:\s*(.*)/)
186         {
187             $hash=$1;
188         }
189         elsif(/^Copyright:\s*(.*)/)
190         {
191             $copyright=$1;
192         }
193         elsif(/^License:\s*(.*)/)
194         {
195             $license=$1;
196         }
197         elsif(/^License_text:\s*(.*)/)
198         {
199             $in_license_text=1;
200             $license_text=$1;
201         }
202         elsif($in_license_text && /^\s+(.*)/)
203         {
204             $license_text .= "\n" . $1;
205         }
206         else
207         {
208             warn("$me: $file: line $line not recognized\n");
209         }
210     }
211     close(NEW);
212     # save last entry
213     if(length($file) && length($hash))
214     {
215         $files->{$file}={ hash=>$hash,
216                           copyright=>$copyright,
217                           license=>$license,
218                           license_text=>$license_text };
219     }
220 }
221
222 sub guess_license
223 {
224     my $file=shift;
225     my $license='';
226     my $copyright='';
227     if(open(LICENSECHECK, "licensecheck --copyright \"$file\"|"))
228     {
229         while(<LICENSECHECK>)
230         {
231             chomp;
232             if(/^\s+\[(.*)\]$/)
233             {
234                 $copyright=$1;
235             }
236             elsif(/.*:\s+(.*)/)
237             {
238                 $license=$1;
239             }
240         }
241         close(LICENSECHECK);
242         $copyright =~ s/^\s*Copyright\s*:\s*//;
243         $license =~ s/.*UNKNOWN.*//;
244         $license =~ s/(L?GPL) \(v([\.\d]+) or later\)/$1-$2+/i;
245         $license =~ s/(L?GPL) \(v([\.\d]+)\)/$1-$2/i;
246         $license =~ s/G?FDL \(v([\.\d]+) or later\)/GFDL-$1+/i;
247         $license =~ s/G?FDL \(v([\.\d]+)\)/GFDL-$1/i;
248         $license =~ s/Apache \(v([\.\d]+) or later\)/Apache-$1+/i;
249         $license =~ s/Apache \(v([\.\d]+)\)/Apache-$1+/i;
250     }
251     return($license, $copyright);
252 }
253
254 sub scan_file
255 {
256     my $filename=shift;
257     unless(open(FILE, $filename))
258     {
259         warn("$me: $filename: cannot open: $!\n");
260         return;
261     }
262     my $header='';
263     for(my $i=0; $i < 15; $i++)
264     {
265         my $line=<FILE>;
266         last unless($line);
267         $header .= $line;
268     }
269     close(FILE);
270     my $hash=sha256_hex($header);
271     if( (!exists($files->{$filename})) ||
272         ($files->{$filename}->{hash} ne $hash))
273     {
274         filechanged($filename, $hash, $header);
275     }
276 }
277
278
279 sub filechanged
280 {
281     my($filename, $hash, $header)=@_;
282     my($license_guess, $copyright_guess)=guess_license($filename);
283     $new->{$filename}={
284         hash=>$hash,
285         license=>$license_guess,
286         copyright=>$copyright_guess,
287         header=>$header,
288     };
289     if(exists($files->{$filename}))
290     {
291         if(exists($files->{$filename}->{copyright}))
292         {
293             $new->{$filename}->{copyright_old}=$files->{$filename}->{copyright};
294         }
295         if(exists($files->{$filename}->{license}))
296         {
297             $new->{$filename}->{license_old}=$files->{$filename}->{license};
298         }
299         if(exists($files->{$filename}->{license_text}))
300         {
301             $new->{$filename}->{license_text_old}=$files->{$filename}->{license_text};
302         }
303     }
304 }
305
306 sub wanted
307 {
308     if(/^\.git/ || /^\.cvs/ || /^debian/)
309     {
310         $File::Find::prune=1;
311     }
312     elsif(-f)
313     {
314         push(@filenames, $File::Find::name);
315     }
316 }
317
318 sub get_filenames
319 {
320     find(\&wanted, ".");
321 #    find(\&wanted, "lib/uniname");
322 #    find(\&wanted, "lib/uniconv");
323 }
324
325 sub load_cache
326 {
327     unless(open(YAML,$FILESCACHE))
328     {
329         warn("$me: cannot load cache $FILESCACHE: $!\n");
330         return;
331     }
332     my $yaml;
333     {
334         local $/=undef;
335         $yaml=<YAML>;
336     }
337     close(YAML);
338     $files=Load($yaml);
339 }
340
341 sub save_cache
342 {
343     backup($FILESCACHE);
344     unless(open(YAML,">$FILESCACHE"))
345     {
346         warn("$me: cannot save cache $FILESCACHE: $!\n");
347         return;
348     }
349     print YAML Dump($files);
350     close(YAML);
351 }
352
353 sub write_new
354 {
355     backup($NEWFILES);
356     unless(keys(%$new))
357     {
358         warn("$me: no new/changed files found\n");
359         return;
360     }
361     unless(open(NEW,">$NEWFILES"))
362     {
363         die("$me: cannot write to $NEWFILES: $!\n");
364     }
365     for my $file (sort keys %$new)
366     {
367         print NEW "File: $file\n";
368         print NEW "Hash: ", $new->{$file}->{hash}, "\n";
369         print NEW "License: ", $new->{$file}->{license}, "\n";
370         print NEW "Copyright: ", $new->{$file}->{copyright}, "\n";
371         print NEW "Licence-Text: \n";
372         if($new->{$file}->{license_old})
373         {
374             print NEW "#License_old: ", $new->{$file}->{license_old}, "\n";
375         }
376         if($new->{$file}->{copyright_old})
377         {
378             print NEW "#Copyright_old: ", $new->{$file}->{copyright_old}, "\n";
379         }
380         if($new->{$file}->{licence_text_old})
381         {
382             print NEW "#License_text_old: ", $new->{$file}->{licence_text_old}, "\n";
383         }
384         print NEW "#Header: \n";
385         my @headerlines=split(/\n/, $new->{$file}->{header});
386         @headerlines=map { "#" . $_ } grep { defined $_; } @headerlines;
387         print NEW join("\n", @headerlines);
388         print NEW "\n\n";
389     }
390     close NEW;
391 }
392
393 sub find_deleted
394 {
395     my @deleted=();
396     my %newnames = map { $_ => 1 } @filenames;
397     for my $file (sort keys(%$files))
398     {
399         unless(exists($newnames{$file}))
400         {
401             push(@deleted, $file);
402         }
403     }
404     if(@deleted)
405     {
406         print "Removed files:\n";
407         print join("\n", @deleted),"\n";
408     }
409 }
410
411 sub backup
412 {
413     my $base=shift;
414     my $old="$base.old";
415     if(-f $base)
416     {
417         unlink($base);
418         move($base, $old);
419     }
420 }
421
422 sub usage
423 {
424     die("usage: $me [--scan] [--merge]\n",
425         "  --scan      Scan for new files & files with changed copyright headers\n",
426         "              Writes to debian/clscan/new.txt for manual review.\n",
427         "  --merge     Merges new data from debian/clscan/new.txt\n",
428         "              Writes updated debian/copyright.\n");
429 }