1 #!/usr/local/bin/clisp -C
3 ;;; Creation of gnulib's uninames.h from the UnicodeData.txt table.
4 ;;; Bruno Haible 2000-12-28
6 (defparameter add-comments nil)
8 (defstruct unicode-char
9 (code nil :type integer)
10 (name nil :type string)
16 (hashed nil :type hash-table)
17 (sorted nil :type list)
18 size ; number of characters total
19 length ; number of words
22 (defun main (inputfile outputfile)
23 (declare (type string inputfile outputfile))
24 #+UNICODE (setq *default-file-encoding* charset:utf-8)
25 (let ((all-chars '()))
26 ;; Read all characters and names from the input file.
27 (with-open-file (istream inputfile :direction :input)
29 (let ((line (read-line istream nil nil)))
30 (unless line (return))
31 (let* ((i1 (position #\; line))
32 (i2 (position #\; line :start (1+ i1)))
33 (code-string (subseq line 0 i1))
34 (code (parse-integer code-string :radix 16))
35 (name-string (subseq line (1+ i1) i2)))
36 ; Ignore characters whose name starts with "<".
37 (unless (eql (char name-string 0) #\<)
38 ; Also ignore Hangul syllables; they are treated specially.
39 (unless (<= #xAC00 code #xD7A3)
40 ; Also ignore CJK compatibility ideographs; they are treated
42 (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
43 (<= #xFA70 code #xFAD9) (<= #x2F800 code #x2FA1D))
44 ; Transform the code so that it fits in 16 bits. In
45 ; Unicode 5.1 the following ranges are used.
46 ; 0x00000..0x04DFF >>12= 0x00..0x04 -> 0x0..0x4
47 ; 0x0A000..0x0AAFF >>12= 0x0A -> 0x5
48 ; 0x0F900..0x0FFFF >>12= 0x0F -> 0x6
49 ; 0x10000..0x10A58 >>12= 0x10 -> 0x7
50 ; 0x12000..0x12473 >>12= 0x12 -> 0x8
51 ; 0x1D000..0x1D7FF >>12= 0x1D -> 0x9
52 ; 0x1F000..0x1F093 >>12= 0x1F -> 0xA
53 ; 0x2F800..0x2FAFF >>12= 0x2F -> 0xB
54 ; 0xE0000..0xE00FF >>12= 0xE0 -> 0xC
58 ((#x00 #x01 #x02 #x03 #x04) (ash x -12))
67 (t (error "Update the transform function for 0x~5,'0X" x))
72 (push (make-unicode-char :code (transform code)
77 (setq all-chars (nreverse all-chars))
79 (let ((words-by-length (make-array 0 :adjustable t)))
80 (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars)))
83 (when (>= i1 (length name)) (return))
84 (let ((i2 (or (position #\Space name :start i1) (length name))))
85 (let* ((word (subseq name i1 i2))
87 (when (>= len (length words-by-length))
88 (adjust-array words-by-length (1+ len))
90 (unless (aref words-by-length len)
91 (setf (aref words-by-length len)
93 :hashed (make-hash-table :test #'equal)
96 (let ((word-list (aref words-by-length len)))
97 (unless (gethash word (word-list-hashed word-list))
98 (setf (gethash word (word-list-hashed word-list)) t)
99 (push word (word-list-sorted word-list))
104 ;; Sort the word lists.
105 (dotimes (len (length words-by-length))
106 (unless (aref words-by-length len)
107 (setf (aref words-by-length len)
109 :hashed (make-hash-table :test #'equal)
112 (let ((word-list (aref words-by-length len)))
113 (setf (word-list-sorted word-list)
114 (sort (word-list-sorted word-list) #'string<)
116 (setf (word-list-size word-list)
117 (reduce #'+ (mapcar #'length (word-list-sorted word-list)))
119 (setf (word-list-length word-list)
120 (length (word-list-sorted word-list))
122 ;; Output the tables.
123 (with-open-file (ostream outputfile :direction :output
124 #+UNICODE :external-format #+UNICODE charset:ascii)
125 (format ostream "/* DO NOT EDIT! GENERATED AUTOMATICALLY! */~%")
126 (format ostream "/*~%")
127 (format ostream " * ~A~%" (file-namestring outputfile))
128 (format ostream " *~%")
129 (format ostream " * Unicode character name table.~%")
130 (format ostream " * Generated automatically by the gen-uninames utility.~%")
131 (format ostream " */~%")
132 (format ostream "~%")
133 (format ostream "static const char unicode_name_words[~D] = {~%"
135 (dotimes (len (length words-by-length))
136 (let ((word-list (aref words-by-length len)))
137 (incf sum (word-list-size word-list))
141 (dotimes (len (length words-by-length))
142 (let ((word-list (aref words-by-length len)))
143 (dolist (word (word-list-sorted word-list))
144 (format ostream " ~{ '~C',~}~%" (coerce word 'list))
146 (format ostream "};~%")
147 (format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
149 (dotimes (len (length words-by-length))
150 (let ((word-list (aref words-by-length len)))
151 (incf sum (word-list-length word-list))
156 (format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%"
158 (dotimes (len (length words-by-length))
159 (let ((word-list (aref words-by-length len)))
160 (incf sum (word-list-length word-list))
164 (dotimes (len (length words-by-length))
165 (let ((word-list (aref words-by-length len)))
166 (when (word-list-sorted word-list)
168 (do ((l (word-list-sorted word-list) (cdr l))
169 (offset 0 (+ offset (length (car l)))))
171 (format ostream "~<~% ~0,79:; ~D,~>" offset)
173 (format ostream "~%")
175 (format ostream "};~%")
177 (format ostream "static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
178 (1+ (length words-by-length))
180 (let ((extra-offset 0)
182 (dotimes (len (length words-by-length))
183 (let ((word-list (aref words-by-length len)))
184 (format ostream " { ~D, ~D },~%" extra-offset ind-offset)
185 (incf extra-offset (word-list-size word-list))
186 (incf ind-offset (word-list-length word-list))
188 (format ostream " { ~D, ~D }~%" extra-offset ind-offset)
190 (format ostream "};~%")
191 (let ((ind-offset 0))
192 (dotimes (len (length words-by-length))
193 (let ((word-list (aref words-by-length len)))
194 (dolist (word (word-list-sorted word-list))
195 (setf (gethash word (word-list-hashed word-list)) ind-offset)
198 (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY"))
199 (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
200 (gethash word (word-list-hashed (aref words-by-length (length word))))
202 ;; Compute the word-indices for every unicode-char.
203 (dolist (uc all-chars)
204 (let ((name (unicode-char-name uc))
208 (when (>= i1 (length name)) (return))
209 (let ((i2 (or (position #\Space name :start i1) (length name))))
210 (let* ((word (subseq name i1 i2))
212 (push (gethash word (word-list-hashed (aref words-by-length len)))
218 (setf (unicode-char-word-indices uc)
219 (coerce (nreverse indices) 'vector)
222 ;; Sort the list of unicode-chars by word-indices.
226 (let ((len1 (length vec1))
227 (len2 (length vec2)))
232 (cond ((< (aref vec1 i) (aref vec2 i)) (return t))
233 ((> (aref vec1 i) (aref vec2 i)) (return nil))
239 :key #'unicode-char-word-indices
241 ;; Output the word-indices.
242 (format ostream "static const uint16_t unicode_names[~D] = {~%"
243 (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
246 (dolist (uc all-chars)
247 (format ostream " ~{ ~D,~}"
248 (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
249 (coerce (unicode-char-word-indices uc) 'list)
253 (format ostream "~40T/* ~A */" (unicode-char-name uc))
255 (format ostream "~%")
256 (setf (unicode-char-word-indices-index uc) i)
257 (incf i (length (unicode-char-word-indices uc)))
259 (format ostream "};~%")
260 (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%")
261 (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
262 (format ostream "unicode_name_to_code[~D] = {~%"
265 (dolist (uc all-chars)
266 (format ostream " { 0x~4,'0X, ~D },"
267 (unicode-char-code uc)
268 (unicode-char-word-indices-index uc)
271 (format ostream "~21T/* ~A */" (unicode-char-name uc))
273 (format ostream "~%")
275 (format ostream "};~%")
276 (format ostream "static const struct { uint16_t code; uint32_t name:24; }~%")
277 (format ostream "#if __GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 7)~%__attribute__((__packed__))~%#endif~%")
278 (format ostream "unicode_code_to_name[~D] = {~%"
281 (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code))
282 (format ostream " { 0x~4,'0X, ~D },"
283 (unicode-char-code uc)
284 (unicode-char-word-indices-index uc)
287 (format ostream "~21T/* ~A */" (unicode-char-name uc))
289 (format ostream "~%")
291 (format ostream "};~%")
292 (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
293 (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars))
295 (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
296 (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
301 (main (first *args*) (second *args*))