1 | #! /usr/bin/perl -w |
---|
2 | |
---|
3 | # Copyright (C) 1998, 1999 Tom Tromey |
---|
4 | # Copyright (C) 2001 Red Hat Software |
---|
5 | |
---|
6 | # This program is free software; you can redistribute it and/or modify |
---|
7 | # it under the terms of the GNU General Public License as published by |
---|
8 | # the Free Software Foundation; either version 2, or (at your option) |
---|
9 | # any later version. |
---|
10 | |
---|
11 | # This program is distributed in the hope that it will be useful, |
---|
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
14 | # GNU General Public License for more details. |
---|
15 | |
---|
16 | # You should have received a copy of the GNU General Public License |
---|
17 | # along with this program; if not, write to the Free Software |
---|
18 | # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA |
---|
19 | # 02111-1307, USA. |
---|
20 | |
---|
21 | # Contributer(s): |
---|
22 | # Andrew Taylor <andrew.taylor@montage.ca> |
---|
23 | |
---|
24 | # gen-unicode-tables.pl - Generate tables for libunicode from Unicode data. |
---|
25 | # See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html |
---|
26 | # I consider the output of this program to be unrestricted. Use it as |
---|
27 | # you will. |
---|
28 | |
---|
29 | # FIXME: |
---|
30 | # * For decomp table it might make sense to use a shift count other |
---|
31 | # than 8. We could easily compute the perfect shift count. |
---|
32 | |
---|
33 | # we use some perl unicode features |
---|
34 | require 5.006; |
---|
35 | |
---|
36 | use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION); |
---|
37 | |
---|
38 | |
---|
39 | # Names of fields in Unicode data table. |
---|
40 | $CODE = 0; |
---|
41 | $NAME = 1; |
---|
42 | $CATEGORY = 2; |
---|
43 | $COMBINING_CLASSES = 3; |
---|
44 | $BIDI_CATEGORY = 4; |
---|
45 | $DECOMPOSITION = 5; |
---|
46 | $DECIMAL_VALUE = 6; |
---|
47 | $DIGIT_VALUE = 7; |
---|
48 | $NUMERIC_VALUE = 8; |
---|
49 | $MIRRORED = 9; |
---|
50 | $OLD_NAME = 10; |
---|
51 | $COMMENT = 11; |
---|
52 | $UPPER = 12; |
---|
53 | $LOWER = 13; |
---|
54 | $TITLE = 14; |
---|
55 | |
---|
56 | # Names of fields in the line break table |
---|
57 | $BREAK_CODE = 0; |
---|
58 | $BREAK_PROPERTY = 1; |
---|
59 | |
---|
60 | # Names of fields in the SpecialCasing table |
---|
61 | $CASE_CODE = 0; |
---|
62 | $CASE_LOWER = 1; |
---|
63 | $CASE_TITLE = 2; |
---|
64 | $CASE_UPPER = 3; |
---|
65 | $CASE_CONDITION = 4; |
---|
66 | |
---|
67 | # Names of fields in the CaseFolding table |
---|
68 | $FOLDING_CODE = 0; |
---|
69 | $FOLDING_STATUS = 1; |
---|
70 | $FOLDING_MAPPING = 2; |
---|
71 | |
---|
72 | # Map general category code onto symbolic name. |
---|
73 | %mappings = |
---|
74 | ( |
---|
75 | # Normative. |
---|
76 | 'Lu' => "G_UNICODE_UPPERCASE_LETTER", |
---|
77 | 'Ll' => "G_UNICODE_LOWERCASE_LETTER", |
---|
78 | 'Lt' => "G_UNICODE_TITLECASE_LETTER", |
---|
79 | 'Mn' => "G_UNICODE_NON_SPACING_MARK", |
---|
80 | 'Mc' => "G_UNICODE_COMBINING_MARK", |
---|
81 | 'Me' => "G_UNICODE_ENCLOSING_MARK", |
---|
82 | 'Nd' => "G_UNICODE_DECIMAL_NUMBER", |
---|
83 | 'Nl' => "G_UNICODE_LETTER_NUMBER", |
---|
84 | 'No' => "G_UNICODE_OTHER_NUMBER", |
---|
85 | 'Zs' => "G_UNICODE_SPACE_SEPARATOR", |
---|
86 | 'Zl' => "G_UNICODE_LINE_SEPARATOR", |
---|
87 | 'Zp' => "G_UNICODE_PARAGRAPH_SEPARATOR", |
---|
88 | 'Cc' => "G_UNICODE_CONTROL", |
---|
89 | 'Cf' => "G_UNICODE_FORMAT", |
---|
90 | 'Cs' => "G_UNICODE_SURROGATE", |
---|
91 | 'Co' => "G_UNICODE_PRIVATE_USE", |
---|
92 | 'Cn' => "G_UNICODE_UNASSIGNED", |
---|
93 | |
---|
94 | # Informative. |
---|
95 | 'Lm' => "G_UNICODE_MODIFIER_LETTER", |
---|
96 | 'Lo' => "G_UNICODE_OTHER_LETTER", |
---|
97 | 'Pc' => "G_UNICODE_CONNECT_PUNCTUATION", |
---|
98 | 'Pd' => "G_UNICODE_DASH_PUNCTUATION", |
---|
99 | 'Ps' => "G_UNICODE_OPEN_PUNCTUATION", |
---|
100 | 'Pe' => "G_UNICODE_CLOSE_PUNCTUATION", |
---|
101 | 'Pi' => "G_UNICODE_INITIAL_PUNCTUATION", |
---|
102 | 'Pf' => "G_UNICODE_FINAL_PUNCTUATION", |
---|
103 | 'Po' => "G_UNICODE_OTHER_PUNCTUATION", |
---|
104 | 'Sm' => "G_UNICODE_MATH_SYMBOL", |
---|
105 | 'Sc' => "G_UNICODE_CURRENCY_SYMBOL", |
---|
106 | 'Sk' => "G_UNICODE_MODIFIER_SYMBOL", |
---|
107 | 'So' => "G_UNICODE_OTHER_SYMBOL" |
---|
108 | ); |
---|
109 | |
---|
110 | %break_mappings = |
---|
111 | ( |
---|
112 | 'BK' => "G_UNICODE_BREAK_MANDATORY", |
---|
113 | 'CR' => "G_UNICODE_BREAK_CARRIAGE_RETURN", |
---|
114 | 'LF' => "G_UNICODE_BREAK_LINE_FEED", |
---|
115 | 'CM' => "G_UNICODE_BREAK_COMBINING_MARK", |
---|
116 | 'SG' => "G_UNICODE_BREAK_SURROGATE", |
---|
117 | 'ZW' => "G_UNICODE_BREAK_ZERO_WIDTH_SPACE", |
---|
118 | 'IN' => "G_UNICODE_BREAK_INSEPARABLE", |
---|
119 | 'GL' => "G_UNICODE_BREAK_NON_BREAKING_GLUE", |
---|
120 | 'CB' => "G_UNICODE_BREAK_CONTINGENT", |
---|
121 | 'SP' => "G_UNICODE_BREAK_SPACE", |
---|
122 | 'BA' => "G_UNICODE_BREAK_AFTER", |
---|
123 | 'BB' => "G_UNICODE_BREAK_BEFORE", |
---|
124 | 'B2' => "G_UNICODE_BREAK_BEFORE_AND_AFTER", |
---|
125 | 'HY' => "G_UNICODE_BREAK_HYPHEN", |
---|
126 | 'NS' => "G_UNICODE_BREAK_NON_STARTER", |
---|
127 | 'OP' => "G_UNICODE_BREAK_OPEN_PUNCTUATION", |
---|
128 | 'CL' => "G_UNICODE_BREAK_CLOSE_PUNCTUATION", |
---|
129 | 'QU' => "G_UNICODE_BREAK_QUOTATION", |
---|
130 | 'EX' => "G_UNICODE_BREAK_EXCLAMATION", |
---|
131 | 'ID' => "G_UNICODE_BREAK_IDEOGRAPHIC", |
---|
132 | 'NU' => "G_UNICODE_BREAK_NUMERIC", |
---|
133 | 'IS' => "G_UNICODE_BREAK_INFIX_SEPARATOR", |
---|
134 | 'SY' => "G_UNICODE_BREAK_SYMBOL", |
---|
135 | 'AL' => "G_UNICODE_BREAK_ALPHABETIC", |
---|
136 | 'PR' => "G_UNICODE_BREAK_PREFIX", |
---|
137 | 'PO' => "G_UNICODE_BREAK_POSTFIX", |
---|
138 | 'SA' => "G_UNICODE_BREAK_COMPLEX_CONTEXT", |
---|
139 | 'AI' => "G_UNICODE_BREAK_AMBIGUOUS", |
---|
140 | 'NL' => "G_UNICODE_BREAK_NEXT_LINE", |
---|
141 | 'WJ' => "G_UNICODE_BREAK_WORD_JOINER", |
---|
142 | 'XX' => "G_UNICODE_BREAK_UNKNOWN" |
---|
143 | ); |
---|
144 | |
---|
145 | # Title case mappings. |
---|
146 | %title_to_lower = (); |
---|
147 | %title_to_upper = (); |
---|
148 | |
---|
149 | # Maximum length of special-case strings |
---|
150 | |
---|
151 | my @special_cases; |
---|
152 | my @special_case_offsets; |
---|
153 | my $special_case_offset = 0; |
---|
154 | |
---|
155 | $do_decomp = 0; |
---|
156 | $do_props = 1; |
---|
157 | if (@ARGV && $ARGV[0] eq '-decomp') |
---|
158 | { |
---|
159 | $do_decomp = 1; |
---|
160 | $do_props = 0; |
---|
161 | shift @ARGV; |
---|
162 | } |
---|
163 | elsif (@ARGV && $ARGV[0] eq '-both') |
---|
164 | { |
---|
165 | $do_decomp = 1; |
---|
166 | shift @ARGV; |
---|
167 | } |
---|
168 | |
---|
169 | if (@ARGV != 2) { |
---|
170 | $0 =~ s@.*/@@; |
---|
171 | die "\nUsage: $0 [-decomp | -both] UNICODE-VERSION DIRECTORY\n\n DIRECTORY should contain the following Unicode data files:\n UnicodeData.txt, LineBreak.txt, SpecialCasing.txt, CaseFolding.txt,\n CompositionExclusions.txt, BidiMirroring.txt\n\n"; |
---|
172 | } |
---|
173 | |
---|
174 | my ($unicodedatatxt, $linebreaktxt, $specialcasingtxt, $casefoldingtxt, $compositionexclusionstxt, $bidimirroringtxt); |
---|
175 | |
---|
176 | my $d = $ARGV[1]; |
---|
177 | opendir (my $dir, $d) or die "Cannot open Unicode data dir $d: $!\n"; |
---|
178 | for my $f (readdir ($dir)) |
---|
179 | { |
---|
180 | $unicodedatatxt = "$d/$f" if ($f =~ /UnicodeData.*\.txt/); |
---|
181 | $linebreaktxt = "$d/$f" if ($f =~ /LineBreak.*\.txt/); |
---|
182 | $specialcasingtxt = "$d/$f" if ($f =~ /SpecialCasing.*\.txt/); |
---|
183 | $casefoldingtxt = "$d/$f" if ($f =~ /CaseFolding.*\.txt/); |
---|
184 | $compositionexclusionstxt = "$d/$f" if ($f =~ /CompositionExclusions.*\.txt/); |
---|
185 | $bidimirroringtxt = "$d/$f" if ($f =~ /BidiMirroring.*\.txt/); |
---|
186 | } |
---|
187 | |
---|
188 | defined $unicodedatatxt or die "Did not find UnicodeData file"; |
---|
189 | defined $linebreaktxt or die "Did not find LineBreak file"; |
---|
190 | defined $specialcasingtxt or die "Did not find SpecialCasing file"; |
---|
191 | defined $casefoldingtxt or die "Did not find CaseFolding file"; |
---|
192 | defined $compositionexclusionstxt or die "Did not find CompositionExclusions file"; |
---|
193 | defined $bidimirroringtxt or die "Did not find BidiMirroring file"; |
---|
194 | |
---|
195 | print "Creating decomp table\n" if ($do_decomp); |
---|
196 | print "Creating property table\n" if ($do_props); |
---|
197 | |
---|
198 | print "Composition exlusions from $compositionexclusionstxt\n"; |
---|
199 | |
---|
200 | open (INPUT, "< $compositionexclusionstxt") || exit 1; |
---|
201 | |
---|
202 | while (<INPUT>) { |
---|
203 | |
---|
204 | chop; |
---|
205 | |
---|
206 | next if /^#/; |
---|
207 | next if /^\s*$/; |
---|
208 | |
---|
209 | s/\s*#.*//; |
---|
210 | |
---|
211 | s/^\s*//; |
---|
212 | s/\s*$//; |
---|
213 | |
---|
214 | $composition_exclusions{hex($_)} = 1; |
---|
215 | } |
---|
216 | |
---|
217 | close INPUT; |
---|
218 | |
---|
219 | print "Unicode data from $unicodedatatxt\n"; |
---|
220 | |
---|
221 | open (INPUT, "< $unicodedatatxt") || exit 1; |
---|
222 | |
---|
223 | # we save memory by skipping the huge empty area before U+E0000 |
---|
224 | my $pages_before_e0000; |
---|
225 | |
---|
226 | $last_code = -1; |
---|
227 | while (<INPUT>) |
---|
228 | { |
---|
229 | chop; |
---|
230 | @fields = split (';', $_, 30); |
---|
231 | if ($#fields != 14) |
---|
232 | { |
---|
233 | printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields); |
---|
234 | } |
---|
235 | |
---|
236 | $code = hex ($fields[$CODE]); |
---|
237 | |
---|
238 | if ($code >= 0xE0000 and $last_code < 0xE0000) |
---|
239 | { |
---|
240 | $pages_before_e0000 = ($last_code >> 8) + 1; |
---|
241 | } |
---|
242 | |
---|
243 | if ($code > $last_code + 1) |
---|
244 | { |
---|
245 | # Found a gap. |
---|
246 | if ($fields[$NAME] =~ /Last>/) |
---|
247 | { |
---|
248 | # Fill the gap with the last character read, |
---|
249 | # since this was a range specified in the char database |
---|
250 | @gfields = @fields; |
---|
251 | } |
---|
252 | else |
---|
253 | { |
---|
254 | # The gap represents undefined characters. Only the type |
---|
255 | # matters. |
---|
256 | @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '', |
---|
257 | '', '', '', ''); |
---|
258 | } |
---|
259 | for (++$last_code; $last_code < $code; ++$last_code) |
---|
260 | { |
---|
261 | $gfields{$CODE} = sprintf ("%04x", $last_code); |
---|
262 | &process_one ($last_code, @gfields); |
---|
263 | } |
---|
264 | } |
---|
265 | &process_one ($code, @fields); |
---|
266 | $last_code = $code; |
---|
267 | } |
---|
268 | |
---|
269 | close INPUT; |
---|
270 | |
---|
271 | @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '', |
---|
272 | '', '', '', ''); |
---|
273 | for (++$last_code; $last_code <= 0x10FFFF; ++$last_code) |
---|
274 | { |
---|
275 | $gfields{$CODE} = sprintf ("%04x", $last_code); |
---|
276 | &process_one ($last_code, @gfields); |
---|
277 | } |
---|
278 | --$last_code; # Want last to be 0x10FFFF. |
---|
279 | |
---|
280 | print "Creating line break table\n"; |
---|
281 | |
---|
282 | print "Line break data from $linebreaktxt\n"; |
---|
283 | |
---|
284 | open (INPUT, "< $linebreaktxt") || exit 1; |
---|
285 | |
---|
286 | $last_code = -1; |
---|
287 | while (<INPUT>) |
---|
288 | { |
---|
289 | my ($start_code, $end_code); |
---|
290 | |
---|
291 | chop; |
---|
292 | |
---|
293 | next if /^#/; |
---|
294 | |
---|
295 | s/\s*#.*//; |
---|
296 | |
---|
297 | @fields = split (';', $_, 30); |
---|
298 | if ($#fields != 1) |
---|
299 | { |
---|
300 | printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields); |
---|
301 | next; |
---|
302 | } |
---|
303 | |
---|
304 | if ($fields[$CODE] =~ /([A-F0-9]{4,6})\.\.([A-F0-9]{4,6})/) |
---|
305 | { |
---|
306 | $start_code = hex ($1); |
---|
307 | $end_code = hex ($2); |
---|
308 | } else { |
---|
309 | $start_code = $end_code = hex ($fields[$CODE]); |
---|
310 | |
---|
311 | } |
---|
312 | |
---|
313 | if ($start_code > $last_code + 1) |
---|
314 | { |
---|
315 | # The gap represents undefined characters. If assigned, |
---|
316 | # they are AL, if not assigned, XX |
---|
317 | for (++$last_code; $last_code < $start_code; ++$last_code) |
---|
318 | { |
---|
319 | if ($type[$last_code] eq 'Cn') |
---|
320 | { |
---|
321 | $break_props[$last_code] = 'XX'; |
---|
322 | } |
---|
323 | else |
---|
324 | { |
---|
325 | $break_props[$last_code] = 'AL'; |
---|
326 | } |
---|
327 | } |
---|
328 | } |
---|
329 | |
---|
330 | for ($last_code = $start_code; $last_code <= $end_code; $last_code++) |
---|
331 | { |
---|
332 | $break_props[$last_code] = $fields[$BREAK_PROPERTY]; |
---|
333 | } |
---|
334 | |
---|
335 | $last_code = $end_code; |
---|
336 | } |
---|
337 | |
---|
338 | close INPUT; |
---|
339 | |
---|
340 | for (++$last_code; $last_code <= 0x10FFFF; ++$last_code) |
---|
341 | { |
---|
342 | if ($type[$last_code] eq 'Cn') |
---|
343 | { |
---|
344 | $break_props[$last_code] = 'XX'; |
---|
345 | } |
---|
346 | else |
---|
347 | { |
---|
348 | $break_props[$last_code] = 'AL'; |
---|
349 | } |
---|
350 | } |
---|
351 | --$last_code; # Want last to be 0x10FFFF. |
---|
352 | |
---|
353 | print STDERR "Last code is not 0x10FFFF" if ($last_code != 0x10FFFF); |
---|
354 | |
---|
355 | print "Reading special-casing table for case conversion\n"; |
---|
356 | |
---|
357 | open (INPUT, "< $specialcasingtxt") || exit 1; |
---|
358 | |
---|
359 | while (<INPUT>) |
---|
360 | { |
---|
361 | my $code; |
---|
362 | |
---|
363 | chop; |
---|
364 | |
---|
365 | next if /^#/; |
---|
366 | next if /^\s*$/; |
---|
367 | |
---|
368 | s/\s*#.*//; |
---|
369 | |
---|
370 | @fields = split ('\s*;\s*', $_, 30); |
---|
371 | |
---|
372 | $raw_code = $fields[$CASE_CODE]; |
---|
373 | $code = hex ($raw_code); |
---|
374 | |
---|
375 | if ($#fields != 4 && $#fields != 5) |
---|
376 | { |
---|
377 | printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields); |
---|
378 | next; |
---|
379 | } |
---|
380 | |
---|
381 | if (!defined $type[$code]) |
---|
382 | { |
---|
383 | printf STDERR "Special case for code point: $code, which has no defined type\n"; |
---|
384 | next; |
---|
385 | } |
---|
386 | |
---|
387 | if (defined $fields[5]) { |
---|
388 | # Ignore conditional special cases - we'll handle them in code |
---|
389 | next; |
---|
390 | } |
---|
391 | |
---|
392 | if ($type[$code] eq 'Lu') |
---|
393 | { |
---|
394 | (hex $fields[$CASE_UPPER] == $code) || die "$raw_code is Lu and UCD_Upper($raw_code) != $raw_code"; |
---|
395 | |
---|
396 | &add_special_case ($code, $value[$code], $fields[$CASE_LOWER], $fields[$CASE_TITLE]); |
---|
397 | |
---|
398 | } elsif ($type[$code] eq 'Lt') |
---|
399 | { |
---|
400 | (hex $fields[$CASE_TITLE] == $code) || die "$raw_code is Lt and UCD_Title($raw_code) != $raw_code"; |
---|
401 | |
---|
402 | &add_special_case ($code, undef, $fields[$CASE_LOWER], $fields[$CASE_UPPER]); |
---|
403 | } elsif ($type[$code] eq 'Ll') |
---|
404 | { |
---|
405 | (hex $fields[$CASE_LOWER] == $code) || die "$raw_code is Ll and UCD_Lower($raw_code) != $raw_code"; |
---|
406 | |
---|
407 | &add_special_case ($code, $value[$code], $fields[$CASE_UPPER], $fields[$CASE_TITLE]); |
---|
408 | } else { |
---|
409 | printf STDERR "Special case for non-alphabetic code point: $raw_code\n"; |
---|
410 | next; |
---|
411 | } |
---|
412 | } |
---|
413 | |
---|
414 | close INPUT; |
---|
415 | |
---|
416 | open (INPUT, "< $casefoldingtxt") || exit 1; |
---|
417 | |
---|
418 | my $casefoldlen = 0; |
---|
419 | my @casefold; |
---|
420 | |
---|
421 | while (<INPUT>) |
---|
422 | { |
---|
423 | my $code; |
---|
424 | |
---|
425 | chop; |
---|
426 | |
---|
427 | next if /^#/; |
---|
428 | next if /^\s*$/; |
---|
429 | |
---|
430 | s/\s*#.*//; |
---|
431 | |
---|
432 | @fields = split ('\s*;\s*', $_, 30); |
---|
433 | |
---|
434 | $raw_code = $fields[$FOLDING_CODE]; |
---|
435 | $code = hex ($raw_code); |
---|
436 | |
---|
437 | if ($#fields != 3) |
---|
438 | { |
---|
439 | printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields); |
---|
440 | next; |
---|
441 | } |
---|
442 | |
---|
443 | # we don't use Simple or Turkic rules here |
---|
444 | next if ($fields[$FOLDING_STATUS] =~ /^[ST]$/); |
---|
445 | |
---|
446 | @values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING]; |
---|
447 | |
---|
448 | # Check simple case |
---|
449 | |
---|
450 | if (@values == 1 && |
---|
451 | !(defined $value[$code] && $value[$code] >= 0x1000000) && |
---|
452 | defined $type[$code]) { |
---|
453 | |
---|
454 | my $lower; |
---|
455 | if ($type[$code] eq 'Ll') |
---|
456 | { |
---|
457 | $lower = $code; |
---|
458 | } elsif ($type[$code] eq 'Lt') |
---|
459 | { |
---|
460 | $lower = $title_to_lower{$code}; |
---|
461 | } elsif ($type[$code] eq 'Lu') |
---|
462 | { |
---|
463 | $lower = $value[$code]; |
---|
464 | } else { |
---|
465 | $lower = $code; |
---|
466 | } |
---|
467 | |
---|
468 | if ($lower == $values[0]) { |
---|
469 | next; |
---|
470 | } |
---|
471 | } |
---|
472 | |
---|
473 | my $string = pack ("U*", @values); |
---|
474 | |
---|
475 | if (1 + &length_in_bytes ($string) > $casefoldlen) { |
---|
476 | $casefoldlen = 1 + &length_in_bytes ($string); |
---|
477 | } |
---|
478 | |
---|
479 | push @casefold, [ $code, &escape ($string) ]; |
---|
480 | } |
---|
481 | |
---|
482 | close INPUT; |
---|
483 | |
---|
484 | open (INPUT, "< $bidimirroringtxt") || exit 1; |
---|
485 | |
---|
486 | my @bidimirror; |
---|
487 | while (<INPUT>) |
---|
488 | { |
---|
489 | chomp; |
---|
490 | |
---|
491 | next if /^#/; |
---|
492 | next if /^\s*$/; |
---|
493 | |
---|
494 | s/\s*#.*//; |
---|
495 | |
---|
496 | @fields = split ('\s*;\s*', $_, 30); |
---|
497 | |
---|
498 | push @bidimirror, [hex ($fields[0]), hex ($fields[1])]; |
---|
499 | } |
---|
500 | |
---|
501 | if ($do_props) { |
---|
502 | &print_tables ($last_code) |
---|
503 | } |
---|
504 | if ($do_decomp) { |
---|
505 | &print_decomp ($last_code); |
---|
506 | &output_composition_table; |
---|
507 | } |
---|
508 | |
---|
509 | &print_line_break ($last_code); |
---|
510 | |
---|
511 | exit 0; |
---|
512 | |
---|
513 | |
---|
514 | # perl "length" returns the length in characters |
---|
515 | sub length_in_bytes |
---|
516 | { |
---|
517 | my ($string) = @_; |
---|
518 | |
---|
519 | use bytes; |
---|
520 | return length $string; |
---|
521 | } |
---|
522 | |
---|
523 | # Process a single character. |
---|
524 | sub process_one |
---|
525 | { |
---|
526 | my ($code, @fields) = @_; |
---|
527 | |
---|
528 | $type[$code] = $fields[$CATEGORY]; |
---|
529 | if ($type[$code] eq 'Nd') |
---|
530 | { |
---|
531 | $value[$code] = int ($fields[$DECIMAL_VALUE]); |
---|
532 | } |
---|
533 | elsif ($type[$code] eq 'Ll') |
---|
534 | { |
---|
535 | $value[$code] = hex ($fields[$UPPER]); |
---|
536 | } |
---|
537 | elsif ($type[$code] eq 'Lu') |
---|
538 | { |
---|
539 | $value[$code] = hex ($fields[$LOWER]); |
---|
540 | } |
---|
541 | |
---|
542 | if ($type[$code] eq 'Lt') |
---|
543 | { |
---|
544 | $title_to_lower{$code} = hex ($fields[$LOWER]); |
---|
545 | $title_to_upper{$code} = hex ($fields[$UPPER]); |
---|
546 | } |
---|
547 | |
---|
548 | $cclass[$code] = $fields[$COMBINING_CLASSES]; |
---|
549 | |
---|
550 | # Handle decompositions. |
---|
551 | if ($fields[$DECOMPOSITION] ne '') |
---|
552 | { |
---|
553 | if ($fields[$DECOMPOSITION] =~ s/\<.*\>\s*//) { |
---|
554 | $decompose_compat[$code] = 1; |
---|
555 | } else { |
---|
556 | $decompose_compat[$code] = 0; |
---|
557 | |
---|
558 | if (!exists $composition_exclusions{$code}) { |
---|
559 | $compositions{$code} = $fields[$DECOMPOSITION]; |
---|
560 | } |
---|
561 | } |
---|
562 | $decompositions[$code] = $fields[$DECOMPOSITION]; |
---|
563 | } |
---|
564 | } |
---|
565 | |
---|
566 | sub print_tables |
---|
567 | { |
---|
568 | my ($last) = @_; |
---|
569 | my ($outfile) = "gunichartables.h"; |
---|
570 | |
---|
571 | local ($bytes_out) = 0; |
---|
572 | |
---|
573 | print "Writing $outfile...\n"; |
---|
574 | |
---|
575 | open (OUT, "> $outfile"); |
---|
576 | |
---|
577 | print OUT "/* This file is automatically generated. DO NOT EDIT!\n"; |
---|
578 | print OUT " Instead, edit gen-unicode-tables.pl and re-run. */\n\n"; |
---|
579 | |
---|
580 | print OUT "#ifndef CHARTABLES_H\n"; |
---|
581 | print OUT "#define CHARTABLES_H\n\n"; |
---|
582 | |
---|
583 | print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n"; |
---|
584 | |
---|
585 | printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last; |
---|
586 | |
---|
587 | printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n"; |
---|
588 | |
---|
589 | my $last_part1 = ($pages_before_e0000 * 256) - 1; |
---|
590 | printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1; |
---|
591 | printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1; |
---|
592 | |
---|
593 | $table_index = 0; |
---|
594 | printf OUT "static const char type_data[][256] = {\n"; |
---|
595 | for ($count = 0; $count <= $last; $count += 256) |
---|
596 | { |
---|
597 | $row[$count / 256] = &print_row ($count, 1, \&fetch_type); |
---|
598 | } |
---|
599 | printf OUT "\n};\n\n"; |
---|
600 | |
---|
601 | printf OUT "/* U+0000 through U+%04X */\n", $last_part1; |
---|
602 | print OUT "static const gint16 type_table_part1[$pages_before_e0000] = {\n"; |
---|
603 | for ($count = 0; $count <= $last_part1; $count += 256) |
---|
604 | { |
---|
605 | print OUT ",\n" if $count > 0; |
---|
606 | print OUT " ", $row[$count / 256]; |
---|
607 | $bytes_out += 2; |
---|
608 | } |
---|
609 | print OUT "\n};\n\n"; |
---|
610 | |
---|
611 | printf OUT "/* U+E0000 through U+%04X */\n", $last; |
---|
612 | print OUT "static const gint16 type_table_part2[768] = {\n"; |
---|
613 | for ($count = 0xE0000; $count <= $last; $count += 256) |
---|
614 | { |
---|
615 | print OUT ",\n" if $count > 0xE0000; |
---|
616 | print OUT " ", $row[$count / 256]; |
---|
617 | $bytes_out += 2; |
---|
618 | } |
---|
619 | print OUT "\n};\n\n"; |
---|
620 | |
---|
621 | |
---|
622 | # |
---|
623 | # Now print attribute table. |
---|
624 | # |
---|
625 | |
---|
626 | $table_index = 0; |
---|
627 | printf OUT "static const gunichar attr_data[][256] = {\n"; |
---|
628 | for ($count = 0; $count <= $last; $count += 256) |
---|
629 | { |
---|
630 | $row[$count / 256] = &print_row ($count, 4, \&fetch_attr); |
---|
631 | } |
---|
632 | printf OUT "\n};\n\n"; |
---|
633 | |
---|
634 | printf OUT "/* U+0000 through U+%04X */\n", $last_part1; |
---|
635 | print OUT "static const gint16 attr_table_part1[$pages_before_e0000] = {\n"; |
---|
636 | for ($count = 0; $count <= $last_part1; $count += 256) |
---|
637 | { |
---|
638 | print OUT ",\n" if $count > 0; |
---|
639 | print OUT " ", $row[$count / 256]; |
---|
640 | $bytes_out += 2; |
---|
641 | } |
---|
642 | print OUT "\n};\n\n"; |
---|
643 | |
---|
644 | printf OUT "/* U+E0000 through U+%04X */\n", $last; |
---|
645 | print OUT "static const gint16 attr_table_part2[768] = {\n"; |
---|
646 | for ($count = 0xE0000; $count <= $last; $count += 256) |
---|
647 | { |
---|
648 | print OUT ",\n" if $count > 0xE0000; |
---|
649 | print OUT " ", $row[$count / 256]; |
---|
650 | $bytes_out += 2; |
---|
651 | } |
---|
652 | print OUT "\n};\n\n"; |
---|
653 | |
---|
654 | # |
---|
655 | # print title case table |
---|
656 | # |
---|
657 | |
---|
658 | print OUT "static const gunichar title_table[][3] = {\n"; |
---|
659 | my ($item); |
---|
660 | my ($first) = 1; |
---|
661 | foreach $item (sort keys %title_to_lower) |
---|
662 | { |
---|
663 | print OUT ",\n" |
---|
664 | unless $first; |
---|
665 | $first = 0; |
---|
666 | printf OUT " { 0x%04x, 0x%04x, 0x%04x }", $item, $title_to_upper{$item}, $title_to_lower{$item}; |
---|
667 | $bytes_out += 12; |
---|
668 | } |
---|
669 | print OUT "\n};\n\n"; |
---|
670 | |
---|
671 | # |
---|
672 | # And special case conversion table -- conversions that change length |
---|
673 | # |
---|
674 | &output_special_case_table (\*OUT); |
---|
675 | &output_casefold_table (\*OUT); |
---|
676 | |
---|
677 | print OUT "static const struct {\n"; |
---|
678 | print OUT " gunichar ch;\n"; |
---|
679 | print OUT " gunichar mirrored_ch;\n"; |
---|
680 | print OUT "} bidi_mirroring_table[] =\n"; |
---|
681 | print OUT "{\n"; |
---|
682 | $first = 1; |
---|
683 | foreach $item (@bidimirror) |
---|
684 | { |
---|
685 | print OUT ",\n" unless $first; |
---|
686 | $first = 0; |
---|
687 | printf OUT " { 0x%04x, 0x%04x }", $item->[0], $item->[1]; |
---|
688 | $bytes_out += 8; |
---|
689 | } |
---|
690 | print OUT "\n};\n\n"; |
---|
691 | |
---|
692 | print OUT "#endif /* CHARTABLES_H */\n"; |
---|
693 | |
---|
694 | close (OUT); |
---|
695 | |
---|
696 | printf STDERR "Generated %d bytes in tables\n", $bytes_out; |
---|
697 | } |
---|
698 | |
---|
699 | # A fetch function for the type table. |
---|
700 | sub fetch_type |
---|
701 | { |
---|
702 | my ($index) = @_; |
---|
703 | return $mappings{$type[$index]}; |
---|
704 | } |
---|
705 | |
---|
706 | # A fetch function for the attribute table. |
---|
707 | sub fetch_attr |
---|
708 | { |
---|
709 | my ($index) = @_; |
---|
710 | if (defined $value[$index]) |
---|
711 | { |
---|
712 | return sprintf ("0x%04x", $value[$index]); |
---|
713 | } |
---|
714 | else |
---|
715 | { |
---|
716 | return "0x0000"; |
---|
717 | } |
---|
718 | } |
---|
719 | |
---|
720 | sub print_row |
---|
721 | { |
---|
722 | my ($start, $typsize, $fetcher) = @_; |
---|
723 | |
---|
724 | my ($i); |
---|
725 | my (@values); |
---|
726 | my ($flag) = 1; |
---|
727 | my ($off); |
---|
728 | |
---|
729 | for ($off = 0; $off < 256; ++$off) |
---|
730 | { |
---|
731 | $values[$off] = $fetcher->($off + $start); |
---|
732 | if ($values[$off] ne $values[0]) |
---|
733 | { |
---|
734 | $flag = 0; |
---|
735 | } |
---|
736 | } |
---|
737 | if ($flag) |
---|
738 | { |
---|
739 | return $values[0] . " + G_UNICODE_MAX_TABLE_INDEX"; |
---|
740 | } |
---|
741 | |
---|
742 | printf OUT ",\n" if ($table_index != 0); |
---|
743 | printf OUT " { /* page %d, index %d */\n ", $start / 256, $table_index; |
---|
744 | my ($column) = 4; |
---|
745 | for ($i = $start; $i < $start + 256; ++$i) |
---|
746 | { |
---|
747 | print OUT ", " |
---|
748 | if $i > $start; |
---|
749 | my ($text) = $values[$i - $start]; |
---|
750 | if (length ($text) + $column + 2 > 78) |
---|
751 | { |
---|
752 | print OUT "\n "; |
---|
753 | $column = 4; |
---|
754 | } |
---|
755 | print OUT $text; |
---|
756 | $column += length ($text) + 2; |
---|
757 | } |
---|
758 | print OUT "\n }"; |
---|
759 | |
---|
760 | $bytes_out += 256 * $typsize; |
---|
761 | |
---|
762 | return sprintf "%d /* page %d */", $table_index++, $start / 256; |
---|
763 | } |
---|
764 | |
---|
765 | sub escape |
---|
766 | { |
---|
767 | my ($string) = @_; |
---|
768 | |
---|
769 | my $escaped = unpack("H*", $string); |
---|
770 | $escaped =~ s/(.{2})/\\x$1/g; |
---|
771 | |
---|
772 | return $escaped; |
---|
773 | } |
---|
774 | |
---|
775 | # Returns the offset of $decomp in the offset string. Updates the |
---|
776 | # referenced variables as appropriate. |
---|
777 | sub handle_decomp ($$$$) |
---|
778 | { |
---|
779 | my ($decomp, $decomp_offsets_ref, $decomp_string_ref, $decomp_string_offset_ref) = @_; |
---|
780 | my $offset = "G_UNICODE_NOT_PRESENT_OFFSET"; |
---|
781 | |
---|
782 | if (defined $decomp) |
---|
783 | { |
---|
784 | if (defined $decomp_offsets_ref->{$decomp}) |
---|
785 | { |
---|
786 | $offset = $decomp_offsets_ref->{$decomp}; |
---|
787 | } |
---|
788 | else |
---|
789 | { |
---|
790 | $offset = ${$decomp_string_offset_ref}; |
---|
791 | $decomp_offsets_ref->{$decomp} = $offset; |
---|
792 | ${$decomp_string_ref} .= "\n \"" . &escape ($decomp) . "\\0\" /* offset ${$decomp_string_offset_ref} */"; |
---|
793 | ${$decomp_string_offset_ref} += &length_in_bytes ($decomp) + 1; |
---|
794 | } |
---|
795 | } |
---|
796 | |
---|
797 | return $offset; |
---|
798 | } |
---|
799 | |
---|
800 | # Generate the character decomposition header. |
---|
801 | sub print_decomp |
---|
802 | { |
---|
803 | my ($last) = @_; |
---|
804 | my ($outfile) = "gunidecomp.h"; |
---|
805 | |
---|
806 | local ($bytes_out) = 0; |
---|
807 | |
---|
808 | print "Writing $outfile...\n"; |
---|
809 | |
---|
810 | open (OUT, "> $outfile") || exit 1; |
---|
811 | |
---|
812 | print OUT "/* This file is automatically generated. DO NOT EDIT! */\n\n"; |
---|
813 | print OUT "#ifndef DECOMP_H\n"; |
---|
814 | print OUT "#define DECOMP_H\n\n"; |
---|
815 | |
---|
816 | printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last; |
---|
817 | |
---|
818 | printf OUT "#define G_UNICODE_MAX_TABLE_INDEX (0x110000 / 256)\n\n"; |
---|
819 | |
---|
820 | my $last_part1 = ($pages_before_e0000 * 256) - 1; |
---|
821 | printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1; |
---|
822 | printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1; |
---|
823 | |
---|
824 | $NOT_PRESENT_OFFSET = 65535; |
---|
825 | print OUT "#define G_UNICODE_NOT_PRESENT_OFFSET $NOT_PRESENT_OFFSET\n\n"; |
---|
826 | |
---|
827 | my ($count, @row); |
---|
828 | $table_index = 0; |
---|
829 | printf OUT "static const guchar cclass_data[][256] = {\n"; |
---|
830 | for ($count = 0; $count <= $last; $count += 256) |
---|
831 | { |
---|
832 | $row[$count / 256] = &print_row ($count, 1, \&fetch_cclass); |
---|
833 | } |
---|
834 | printf OUT "\n};\n\n"; |
---|
835 | |
---|
836 | print OUT "static const gint16 combining_class_table_part1[$pages_before_e0000] = {\n"; |
---|
837 | for ($count = 0; $count <= $last_part1; $count += 256) |
---|
838 | { |
---|
839 | print OUT ",\n" if $count > 0; |
---|
840 | print OUT " ", $row[$count / 256]; |
---|
841 | $bytes_out += 2; |
---|
842 | } |
---|
843 | print OUT "\n};\n\n"; |
---|
844 | |
---|
845 | print OUT "static const gint16 combining_class_table_part2[768] = {\n"; |
---|
846 | for ($count = 0xE0000; $count <= $last; $count += 256) |
---|
847 | { |
---|
848 | print OUT ",\n" if $count > 0xE0000; |
---|
849 | print OUT " ", $row[$count / 256]; |
---|
850 | $bytes_out += 2; |
---|
851 | } |
---|
852 | print OUT "\n};\n\n"; |
---|
853 | |
---|
854 | print OUT "typedef struct\n{\n"; |
---|
855 | print OUT " gunichar ch;\n"; |
---|
856 | print OUT " guint16 canon_offset;\n"; |
---|
857 | print OUT " guint16 compat_offset;\n"; |
---|
858 | print OUT "} decomposition;\n\n"; |
---|
859 | |
---|
860 | print OUT "static const decomposition decomp_table[] =\n{\n"; |
---|
861 | my ($iter); |
---|
862 | my ($first) = 1; |
---|
863 | my ($decomp_string) = ""; |
---|
864 | my ($decomp_string_offset) = 0; |
---|
865 | for ($count = 0; $count <= $last; ++$count) |
---|
866 | { |
---|
867 | if (defined $decompositions[$count]) |
---|
868 | { |
---|
869 | print OUT ",\n" |
---|
870 | if ! $first; |
---|
871 | $first = 0; |
---|
872 | |
---|
873 | my $canon_decomp; |
---|
874 | my $compat_decomp; |
---|
875 | |
---|
876 | if (!$decompose_compat[$count]) { |
---|
877 | $canon_decomp = make_decomp ($count, 0); |
---|
878 | } |
---|
879 | $compat_decomp = make_decomp ($count, 1); |
---|
880 | |
---|
881 | if (defined $canon_decomp && $compat_decomp eq $canon_decomp) { |
---|
882 | undef $compat_decomp; |
---|
883 | } |
---|
884 | |
---|
885 | my $canon_offset = handle_decomp ($canon_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset); |
---|
886 | my $compat_offset = handle_decomp ($compat_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset); |
---|
887 | |
---|
888 | die if $decomp_string_offset > $NOT_PRESENT_OFFSET; |
---|
889 | |
---|
890 | printf OUT qq( { 0x%04x, $canon_offset, $compat_offset }), $count; |
---|
891 | $bytes_out += 8; |
---|
892 | } |
---|
893 | } |
---|
894 | print OUT "\n};\n\n"; |
---|
895 | $bytes_out += $decomp_string_offset + 1; |
---|
896 | |
---|
897 | printf OUT "static const gchar decomp_expansion_string[] = %s;\n\n", $decomp_string; |
---|
898 | |
---|
899 | print OUT "#endif /* DECOMP_H */\n"; |
---|
900 | |
---|
901 | printf STDERR "Generated %d bytes in decomp tables\n", $bytes_out; |
---|
902 | } |
---|
903 | |
---|
904 | sub print_line_break |
---|
905 | { |
---|
906 | my ($last) = @_; |
---|
907 | my ($outfile) = "gunibreak.h"; |
---|
908 | |
---|
909 | local ($bytes_out) = 0; |
---|
910 | |
---|
911 | print "Writing $outfile...\n"; |
---|
912 | |
---|
913 | open (OUT, "> $outfile"); |
---|
914 | |
---|
915 | print OUT "/* This file is automatically generated. DO NOT EDIT!\n"; |
---|
916 | print OUT " Instead, edit gen-unicode-tables.pl and re-run. */\n\n"; |
---|
917 | |
---|
918 | print OUT "#ifndef BREAKTABLES_H\n"; |
---|
919 | print OUT "#define BREAKTABLES_H\n\n"; |
---|
920 | |
---|
921 | print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n"; |
---|
922 | |
---|
923 | printf OUT "#define G_UNICODE_LAST_CHAR 0x%04X\n\n", $last; |
---|
924 | |
---|
925 | printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n"; |
---|
926 | |
---|
927 | my $last_part1 = ($pages_before_e0000 * 256) - 1; |
---|
928 | printf OUT "/* the last code point that should be looked up in break_property_table_part1 */\n"; |
---|
929 | printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1; |
---|
930 | |
---|
931 | $table_index = 0; |
---|
932 | printf OUT "static const gint8 break_property_data[][256] = {\n"; |
---|
933 | for ($count = 0; $count <= $last; $count += 256) |
---|
934 | { |
---|
935 | $row[$count / 256] = &print_row ($count, 1, \&fetch_break_type); |
---|
936 | } |
---|
937 | printf OUT "\n};\n\n"; |
---|
938 | |
---|
939 | printf OUT "/* U+0000 through U+%04X */\n", $last_part1; |
---|
940 | print OUT "static const gint16 break_property_table_part1[$pages_before_e0000] = {\n"; |
---|
941 | for ($count = 0; $count <= $last_part1; $count += 256) |
---|
942 | { |
---|
943 | print OUT ",\n" if $count > 0; |
---|
944 | print OUT " ", $row[$count / 256]; |
---|
945 | $bytes_out += 2; |
---|
946 | } |
---|
947 | print OUT "\n};\n\n"; |
---|
948 | |
---|
949 | printf OUT "/* U+E0000 through U+%04X */\n", $last; |
---|
950 | print OUT "static const gint16 break_property_table_part2[768] = {\n"; |
---|
951 | for ($count = 0xE0000; $count <= $last; $count += 256) |
---|
952 | { |
---|
953 | print OUT ",\n" if $count > 0xE0000; |
---|
954 | print OUT " ", $row[$count / 256]; |
---|
955 | $bytes_out += 2; |
---|
956 | } |
---|
957 | print OUT "\n};\n\n"; |
---|
958 | |
---|
959 | |
---|
960 | print OUT "#endif /* BREAKTABLES_H */\n"; |
---|
961 | |
---|
962 | close (OUT); |
---|
963 | |
---|
964 | printf STDERR "Generated %d bytes in break tables\n", $bytes_out; |
---|
965 | } |
---|
966 | |
---|
967 | |
---|
968 | # A fetch function for the break properties table. |
---|
969 | sub fetch_break_type |
---|
970 | { |
---|
971 | my ($index) = @_; |
---|
972 | return $break_mappings{$break_props[$index]}; |
---|
973 | } |
---|
974 | |
---|
975 | # Fetcher for combining class. |
---|
976 | sub fetch_cclass |
---|
977 | { |
---|
978 | my ($i) = @_; |
---|
979 | return $cclass[$i]; |
---|
980 | } |
---|
981 | |
---|
982 | # Expand a character decomposition recursively. |
---|
983 | sub expand_decomp |
---|
984 | { |
---|
985 | my ($code, $compat) = @_; |
---|
986 | |
---|
987 | my ($iter, $val); |
---|
988 | my (@result) = (); |
---|
989 | foreach $iter (split (' ', $decompositions[$code])) |
---|
990 | { |
---|
991 | $val = hex ($iter); |
---|
992 | if (defined $decompositions[$val] && |
---|
993 | ($compat || !$decompose_compat[$val])) |
---|
994 | { |
---|
995 | push (@result, &expand_decomp ($val, $compat)); |
---|
996 | } |
---|
997 | else |
---|
998 | { |
---|
999 | push (@result, $val); |
---|
1000 | } |
---|
1001 | } |
---|
1002 | |
---|
1003 | return @result; |
---|
1004 | } |
---|
1005 | |
---|
1006 | sub make_decomp |
---|
1007 | { |
---|
1008 | my ($code, $compat) = @_; |
---|
1009 | |
---|
1010 | my $result = ""; |
---|
1011 | foreach $iter (&expand_decomp ($code, $compat)) |
---|
1012 | { |
---|
1013 | $result .= pack ("U", $iter); # to utf-8 |
---|
1014 | } |
---|
1015 | |
---|
1016 | $result; |
---|
1017 | } |
---|
1018 | # Generate special case data string from two fields |
---|
1019 | sub add_special_case |
---|
1020 | { |
---|
1021 | my ($code, $single, $field1, $field2) = @_; |
---|
1022 | |
---|
1023 | @values = (defined $single ? $single : (), |
---|
1024 | (map { hex ($_) } split /\s+/, $field1), |
---|
1025 | 0, |
---|
1026 | (map { hex ($_) } split /\s+/, $field2)); |
---|
1027 | $result = ""; |
---|
1028 | |
---|
1029 | |
---|
1030 | for $value (@values) { |
---|
1031 | $result .= pack ("U", $value); # to utf-8 |
---|
1032 | } |
---|
1033 | |
---|
1034 | push @special_case_offsets, $special_case_offset; |
---|
1035 | |
---|
1036 | # We encode special cases up in the 0x1000000 space |
---|
1037 | $value[$code] = 0x1000000 + $special_case_offset; |
---|
1038 | |
---|
1039 | $special_case_offset += 1 + &length_in_bytes ($result); |
---|
1040 | |
---|
1041 | push @special_cases, &escape ($result); |
---|
1042 | } |
---|
1043 | |
---|
1044 | sub output_special_case_table |
---|
1045 | { |
---|
1046 | my $out = shift; |
---|
1047 | |
---|
1048 | print $out <<EOT; |
---|
1049 | |
---|
1050 | /* Table of special cases for case conversion; each record contains |
---|
1051 | * First, the best single character mapping to lowercase if Lu, |
---|
1052 | * and to uppercase if Ll, followed by the output mapping for the two cases |
---|
1053 | * other than the case of the codepoint, in the order [Ll],[Lu],[Lt], |
---|
1054 | * encoded in UTF-8, separated and terminated by a null character. |
---|
1055 | */ |
---|
1056 | static const gchar special_case_table[] = { |
---|
1057 | EOT |
---|
1058 | |
---|
1059 | my $i = 0; |
---|
1060 | for $case (@special_cases) { |
---|
1061 | print $out qq( "$case\\0" /* offset ${special_case_offsets[$i]} */\n); |
---|
1062 | $i++; |
---|
1063 | } |
---|
1064 | |
---|
1065 | print $out <<EOT; |
---|
1066 | }; |
---|
1067 | |
---|
1068 | EOT |
---|
1069 | |
---|
1070 | print STDERR "Generated " . ($special_case_offset + 1) . " bytes in special case table\n"; |
---|
1071 | } |
---|
1072 | |
---|
1073 | sub enumerate_ordered |
---|
1074 | { |
---|
1075 | my ($array) = @_; |
---|
1076 | |
---|
1077 | my $n = 0; |
---|
1078 | for my $code (sort { $a <=> $b } keys %$array) { |
---|
1079 | if ($array->{$code} == 1) { |
---|
1080 | delete $array->{$code}; |
---|
1081 | next; |
---|
1082 | } |
---|
1083 | $array->{$code} = $n++; |
---|
1084 | } |
---|
1085 | |
---|
1086 | return $n; |
---|
1087 | } |
---|
1088 | |
---|
1089 | sub output_composition_table |
---|
1090 | { |
---|
1091 | print STDERR "Generating composition table\n"; |
---|
1092 | |
---|
1093 | local ($bytes_out) = 0; |
---|
1094 | |
---|
1095 | my %first; |
---|
1096 | my %second; |
---|
1097 | |
---|
1098 | # First we need to go through and remove decompositions |
---|
1099 | # starting with a non-starter, and single-character |
---|
1100 | # decompositions. At the same time, record |
---|
1101 | # the first and second character of each decomposition |
---|
1102 | |
---|
1103 | for $code (keys %compositions) |
---|
1104 | { |
---|
1105 | @values = map { hex ($_) } split /\s+/, $compositions{$code}; |
---|
1106 | |
---|
1107 | # non-starters |
---|
1108 | if ($cclass[$values[0]]) { |
---|
1109 | delete $compositions{$code}; |
---|
1110 | next; |
---|
1111 | } |
---|
1112 | |
---|
1113 | # single-character decompositions |
---|
1114 | if (@values == 1) { |
---|
1115 | delete $compositions{$code}; |
---|
1116 | next; |
---|
1117 | } |
---|
1118 | |
---|
1119 | if (@values != 2) { |
---|
1120 | die "$code has more than two elements in its decomposition!\n"; |
---|
1121 | } |
---|
1122 | |
---|
1123 | if (exists $first{$values[0]}) { |
---|
1124 | $first{$values[0]}++; |
---|
1125 | } else { |
---|
1126 | $first{$values[0]} = 1; |
---|
1127 | } |
---|
1128 | } |
---|
1129 | |
---|
1130 | # Assign integer indices, removing singletons |
---|
1131 | my $n_first = enumerate_ordered (\%first); |
---|
1132 | |
---|
1133 | # Now record the second character of each (non-singleton) decomposition |
---|
1134 | for $code (keys %compositions) { |
---|
1135 | @values = map { hex ($_) } split /\s+/, $compositions{$code}; |
---|
1136 | |
---|
1137 | if (exists $first{$values[0]}) { |
---|
1138 | if (exists $second{$values[1]}) { |
---|
1139 | $second{$values[1]}++; |
---|
1140 | } else { |
---|
1141 | $second{$values[1]} = 1; |
---|
1142 | } |
---|
1143 | } |
---|
1144 | } |
---|
1145 | |
---|
1146 | # Assign integer indices, removing duplicate |
---|
1147 | my $n_second = enumerate_ordered (\%second); |
---|
1148 | |
---|
1149 | # Build reverse table |
---|
1150 | |
---|
1151 | my @first_singletons; |
---|
1152 | my @second_singletons; |
---|
1153 | my %reverse; |
---|
1154 | for $code (keys %compositions) { |
---|
1155 | @values = map { hex ($_) } split /\s+/, $compositions{$code}; |
---|
1156 | |
---|
1157 | my $first = $first{$values[0]}; |
---|
1158 | my $second = $second{$values[1]}; |
---|
1159 | |
---|
1160 | if (defined $first && defined $second) { |
---|
1161 | $reverse{"$first|$second"} = $code; |
---|
1162 | } elsif (!defined $first) { |
---|
1163 | push @first_singletons, [ $values[0], $values[1], $code ]; |
---|
1164 | } else { |
---|
1165 | push @second_singletons, [ $values[1], $values[0], $code ]; |
---|
1166 | } |
---|
1167 | } |
---|
1168 | |
---|
1169 | @first_singletons = sort { $a->[0] <=> $b->[0] } @first_singletons; |
---|
1170 | @second_singletons = sort { $a->[0] <=> $b->[0] } @second_singletons; |
---|
1171 | |
---|
1172 | my %vals; |
---|
1173 | |
---|
1174 | open OUT, ">gunicomp.h" or die "Cannot open gunicomp.h: $!\n"; |
---|
1175 | |
---|
1176 | # Assign values in lookup table for all code points involved |
---|
1177 | |
---|
1178 | my $total = 1; |
---|
1179 | my $last = 0; |
---|
1180 | printf OUT "#define COMPOSE_FIRST_START %d\n", $total; |
---|
1181 | for $code (keys %first) { |
---|
1182 | $vals{$code} = $first{$code} + $total; |
---|
1183 | $last = $code if $code > $last; |
---|
1184 | } |
---|
1185 | $total += $n_first; |
---|
1186 | $i = 0; |
---|
1187 | printf OUT "#define COMPOSE_FIRST_SINGLE_START %d\n", $total; |
---|
1188 | for $record (@first_singletons) { |
---|
1189 | my $code = $record->[0]; |
---|
1190 | $vals{$code} = $i++ + $total; |
---|
1191 | $last = $code if $code > $last; |
---|
1192 | } |
---|
1193 | $total += @first_singletons; |
---|
1194 | printf OUT "#define COMPOSE_SECOND_START %d\n", $total; |
---|
1195 | for $code (keys %second) { |
---|
1196 | $vals{$code} = $second{$code} + $total; |
---|
1197 | $last = $code if $code > $last; |
---|
1198 | } |
---|
1199 | $total += $n_second; |
---|
1200 | $i = 0; |
---|
1201 | printf OUT "#define COMPOSE_SECOND_SINGLE_START %d\n\n", $total; |
---|
1202 | for $record (@second_singletons) { |
---|
1203 | my $code = $record->[0]; |
---|
1204 | $vals{$code} = $i++ + $total; |
---|
1205 | $last = $code if $code > $last; |
---|
1206 | } |
---|
1207 | |
---|
1208 | printf OUT "#define COMPOSE_TABLE_LAST %d\n\n", $last / 256; |
---|
1209 | |
---|
1210 | # Output lookup table |
---|
1211 | |
---|
1212 | my @row; |
---|
1213 | $table_index = 0; |
---|
1214 | printf OUT "static const guint16 compose_data[][256] = {\n"; |
---|
1215 | for (my $count = 0; $count <= $last; $count += 256) |
---|
1216 | { |
---|
1217 | $row[$count / 256] = &print_row ($count, 2, sub { exists $vals{$_[0]} ? $vals{$_[0]} : 0; }); |
---|
1218 | } |
---|
1219 | printf OUT "\n};\n\n"; |
---|
1220 | |
---|
1221 | print OUT "static const gint16 compose_table[COMPOSE_TABLE_LAST + 1] = {\n"; |
---|
1222 | for (my $count = 0; $count <= $last; $count += 256) |
---|
1223 | { |
---|
1224 | print OUT ",\n" if $count > 0; |
---|
1225 | print OUT " ", $row[$count / 256]; |
---|
1226 | $bytes_out += 2; |
---|
1227 | } |
---|
1228 | print OUT "\n};\n\n"; |
---|
1229 | |
---|
1230 | # Output first singletons |
---|
1231 | |
---|
1232 | print OUT "static const guint16 compose_first_single[][2] = {\n"; |
---|
1233 | $i = 0; |
---|
1234 | for $record (@first_singletons) { |
---|
1235 | if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) { |
---|
1236 | die "time to switch compose_first_single to gunichar" ; |
---|
1237 | } |
---|
1238 | print OUT ",\n" if $i++ > 0; |
---|
1239 | printf OUT " { %#06x, %#06x }", $record->[1], $record->[2]; |
---|
1240 | } |
---|
1241 | print OUT "\n};\n"; |
---|
1242 | |
---|
1243 | $bytes_out += @first_singletons * 4; |
---|
1244 | |
---|
1245 | # Output second singletons |
---|
1246 | |
---|
1247 | print OUT "static const guint16 compose_second_single[][2] = {\n"; |
---|
1248 | $i = 0; |
---|
1249 | for $record (@second_singletons) { |
---|
1250 | if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) { |
---|
1251 | die "time to switch compose_second_single to gunichar"; |
---|
1252 | } |
---|
1253 | print OUT ",\n" if $i++ > 0; |
---|
1254 | printf OUT " { %#06x, %#06x }", $record->[1], $record->[2]; |
---|
1255 | } |
---|
1256 | print OUT "\n};\n"; |
---|
1257 | |
---|
1258 | $bytes_out += @second_singletons * 4; |
---|
1259 | |
---|
1260 | # Output array of composition pairs |
---|
1261 | |
---|
1262 | print OUT <<EOT; |
---|
1263 | static const guint16 compose_array[$n_first][$n_second] = { |
---|
1264 | EOT |
---|
1265 | |
---|
1266 | for (my $i = 0; $i < $n_first; $i++) { |
---|
1267 | print OUT ",\n" if $i; |
---|
1268 | print OUT " { "; |
---|
1269 | for (my $j = 0; $j < $n_second; $j++) { |
---|
1270 | print OUT ", " if $j; |
---|
1271 | if (exists $reverse{"$i|$j"}) { |
---|
1272 | if ($reverse{"$i|$j"} > 0xFFFF) { |
---|
1273 | die "time to switch compose_array to gunichar" ; |
---|
1274 | } |
---|
1275 | printf OUT "0x%04x", $reverse{"$i|$j"}; |
---|
1276 | } else { |
---|
1277 | print OUT " 0"; |
---|
1278 | } |
---|
1279 | } |
---|
1280 | print OUT " }"; |
---|
1281 | } |
---|
1282 | print OUT "\n"; |
---|
1283 | |
---|
1284 | print OUT <<EOT; |
---|
1285 | }; |
---|
1286 | EOT |
---|
1287 | |
---|
1288 | $bytes_out += $n_first * $n_second * 2; |
---|
1289 | |
---|
1290 | printf STDERR "Generated %d bytes in compose tables\n", $bytes_out; |
---|
1291 | } |
---|
1292 | |
---|
1293 | sub output_casefold_table |
---|
1294 | { |
---|
1295 | my $out = shift; |
---|
1296 | |
---|
1297 | print $out <<EOT; |
---|
1298 | |
---|
1299 | /* Table of casefolding cases that can't be derived by lowercasing |
---|
1300 | */ |
---|
1301 | static const struct { |
---|
1302 | guint16 ch; |
---|
1303 | gchar data[$casefoldlen]; |
---|
1304 | } casefold_table[] = { |
---|
1305 | EOT |
---|
1306 | |
---|
1307 | @casefold = sort { $a->[0] <=> $b->[0] } @casefold; |
---|
1308 | |
---|
1309 | for $case (@casefold) |
---|
1310 | { |
---|
1311 | $code = $case->[0]; |
---|
1312 | $string = $case->[1]; |
---|
1313 | |
---|
1314 | if ($code > 0xFFFF) { |
---|
1315 | die "time to switch casefold_table to gunichar" ; |
---|
1316 | } |
---|
1317 | |
---|
1318 | print $out sprintf(qq( { 0x%04x, "$string" },\n), $code); |
---|
1319 | |
---|
1320 | } |
---|
1321 | |
---|
1322 | print $out <<EOT; |
---|
1323 | }; |
---|
1324 | |
---|
1325 | EOT |
---|
1326 | |
---|
1327 | my $recordlen = (2+$casefoldlen+1) & ~1; |
---|
1328 | printf "Generated %d bytes for casefold table\n", $recordlen * @casefold; |
---|
1329 | } |
---|
1330 | |
---|
1331 | |
---|
1332 | |
---|