1 | #! /usr/bin/perl -w |
---|
2 | |
---|
3 | # Copyright (C) 1998, 1999 Tom Tromey |
---|
4 | # Copyright (C) 2001 Red Hat Software |
---|
5 | |
---|
6 | # This program is free software; you can redistribute it and/or modify |
---|
7 | # it under the terms of the GNU General Public License as published by |
---|
8 | # the Free Software Foundation; either version 2, or (at your option) |
---|
9 | # any later version. |
---|
10 | |
---|
11 | # This program is distributed in the hope that it will be useful, |
---|
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
14 | # GNU General Public License for more details. |
---|
15 | |
---|
16 | # You should have received a copy of the GNU General Public License |
---|
17 | # along with this program; if not, write to the Free Software |
---|
18 | # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA |
---|
19 | # 02111-1307, USA. |
---|
20 | |
---|
# Contributor(s):
---|
22 | # Andrew Taylor <andrew.taylor@montage.ca> |
---|
23 | |
---|
24 | # gen-unicode-tables.pl - Generate tables for libunicode from Unicode data. |
---|
25 | # See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html |
---|
# Usage: gen-unicode-tables.pl [-decomp | -both] UNICODE-VERSION UnicodeData.txt LineBreak.txt SpecialCasing.txt CaseFolding.txt CompositionExclusions.txt
---|
27 | # I consider the output of this program to be unrestricted. Use it as |
---|
28 | # you will. |
---|
29 | |
---|
30 | # FIXME: |
---|
31 | # * For decomp table it might make sense to use a shift count other |
---|
32 | # than 8. We could easily compute the perfect shift count. |
---|
33 | |
---|
# Package globals holding the column index of each field in the input
# tables.  Every name assigned in this section is listed here, so the
# declaration and the assignments stay in sync.
use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_PROPERTY $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION $FOLDING_CODE $FOLDING_STATUS $FOLDING_MAPPING);

# Names of fields in Unicode data table.
$CODE = 0;
$NAME = 1;
$CATEGORY = 2;
$COMBINING_CLASSES = 3;
$BIDI_CATEGORY = 4;
$DECOMPOSITION = 5;
$DECIMAL_VALUE = 6;
$DIGIT_VALUE = 7;
$NUMERIC_VALUE = 8;
$MIRRORED = 9;
$OLD_NAME = 10;
$COMMENT = 11;
$UPPER = 12;
$LOWER = 13;
$TITLE = 14;

# Names of fields in the line break table
$BREAK_CODE = 0;
$BREAK_PROPERTY = 1;

# Names of fields in the SpecialCasing table
$CASE_CODE = 0;
$CASE_LOWER = 1;
$CASE_TITLE = 2;
$CASE_UPPER = 3;
$CASE_CONDITION = 4;

# Names of fields in the CaseFolding table
$FOLDING_CODE = 0;
$FOLDING_STATUS = 1;
$FOLDING_MAPPING = 2;
---|
68 | |
---|
# Translation from the two-letter general category found in the category
# column of the Unicode data table to the symbolic name written into the
# generated type table (see fetch_type).
%mappings = (
    # Normative.
    Lu => 'G_UNICODE_UPPERCASE_LETTER',
    Ll => 'G_UNICODE_LOWERCASE_LETTER',
    Lt => 'G_UNICODE_TITLECASE_LETTER',
    Mn => 'G_UNICODE_NON_SPACING_MARK',
    Mc => 'G_UNICODE_COMBINING_MARK',
    Me => 'G_UNICODE_ENCLOSING_MARK',
    Nd => 'G_UNICODE_DECIMAL_NUMBER',
    Nl => 'G_UNICODE_LETTER_NUMBER',
    No => 'G_UNICODE_OTHER_NUMBER',
    Zs => 'G_UNICODE_SPACE_SEPARATOR',
    Zl => 'G_UNICODE_LINE_SEPARATOR',
    Zp => 'G_UNICODE_PARAGRAPH_SEPARATOR',
    Cc => 'G_UNICODE_CONTROL',
    Cf => 'G_UNICODE_FORMAT',
    Cs => 'G_UNICODE_SURROGATE',
    Co => 'G_UNICODE_PRIVATE_USE',
    Cn => 'G_UNICODE_UNASSIGNED',

    # Informative.
    Lm => 'G_UNICODE_MODIFIER_LETTER',
    Lo => 'G_UNICODE_OTHER_LETTER',
    Pc => 'G_UNICODE_CONNECT_PUNCTUATION',
    Pd => 'G_UNICODE_DASH_PUNCTUATION',
    Ps => 'G_UNICODE_OPEN_PUNCTUATION',
    Pe => 'G_UNICODE_CLOSE_PUNCTUATION',
    Pi => 'G_UNICODE_INITIAL_PUNCTUATION',
    Pf => 'G_UNICODE_FINAL_PUNCTUATION',
    Po => 'G_UNICODE_OTHER_PUNCTUATION',
    Sm => 'G_UNICODE_MATH_SYMBOL',
    Sc => 'G_UNICODE_CURRENCY_SYMBOL',
    Sk => 'G_UNICODE_MODIFIER_SYMBOL',
    So => 'G_UNICODE_OTHER_SYMBOL',
);
---|
106 | |
---|
# Translation from the two-letter line-break class stored in @break_props
# to the symbolic name written into the generated break property table
# (see fetch_break_type).
%break_mappings = (
    BK => 'G_UNICODE_BREAK_MANDATORY',
    CR => 'G_UNICODE_BREAK_CARRIAGE_RETURN',
    LF => 'G_UNICODE_BREAK_LINE_FEED',
    CM => 'G_UNICODE_BREAK_COMBINING_MARK',
    SG => 'G_UNICODE_BREAK_SURROGATE',
    ZW => 'G_UNICODE_BREAK_ZERO_WIDTH_SPACE',
    IN => 'G_UNICODE_BREAK_INSEPARABLE',
    GL => 'G_UNICODE_BREAK_NON_BREAKING_GLUE',
    CB => 'G_UNICODE_BREAK_CONTINGENT',
    SP => 'G_UNICODE_BREAK_SPACE',
    BA => 'G_UNICODE_BREAK_AFTER',
    BB => 'G_UNICODE_BREAK_BEFORE',
    B2 => 'G_UNICODE_BREAK_BEFORE_AND_AFTER',
    HY => 'G_UNICODE_BREAK_HYPHEN',
    NS => 'G_UNICODE_BREAK_NON_STARTER',
    OP => 'G_UNICODE_BREAK_OPEN_PUNCTUATION',
    CL => 'G_UNICODE_BREAK_CLOSE_PUNCTUATION',
    QU => 'G_UNICODE_BREAK_QUOTATION',
    EX => 'G_UNICODE_BREAK_EXCLAMATION',
    ID => 'G_UNICODE_BREAK_IDEOGRAPHIC',
    NU => 'G_UNICODE_BREAK_NUMERIC',
    IS => 'G_UNICODE_BREAK_INFIX_SEPARATOR',
    SY => 'G_UNICODE_BREAK_SYMBOL',
    AL => 'G_UNICODE_BREAK_ALPHABETIC',
    PR => 'G_UNICODE_BREAK_PREFIX',
    PO => 'G_UNICODE_BREAK_POSTFIX',
    SA => 'G_UNICODE_BREAK_COMPLEX_CONTEXT',
    AI => 'G_UNICODE_BREAK_AMBIGUOUS',
    XX => 'G_UNICODE_BREAK_UNKNOWN',
);
---|
139 | |
---|
# Title case mappings.
%title_to_lower = ();
%title_to_upper = ();

# Row size used for the generated special_case_table, and the encoded
# special-case strings themselves.  Both are maintained by add_special_case.

my $special_case_len = 0;
my @special_cases;

# Which headers to generate: the property tables by default, only the
# decomposition tables with -decomp, or both with -both.
$do_decomp = 0;
$do_props = 1;
if (@ARGV && $ARGV[0] eq '-decomp') {
    $do_decomp = 1;
    $do_props = 0;
    shift @ARGV;
}
elsif (@ARGV && $ARGV[0] eq '-both') {
    $do_decomp = 1;
    shift @ARGV;
}

if (@ARGV != 6) {
    $0 =~ s@.*/@@;
    die "Usage: $0 [-decomp | -both] UNICODE-VERSION UnicodeData.txt LineBreak.txt SpecialCasing.txt CaseFolding.txt CompositionExclusions.txt\n";
}

print "Creating decomp table\n" if ($do_decomp);
print "Creating property table\n" if ($do_props);

#
# Pass 1: composition exclusions.  Must be read first because process_one
# consults %composition_exclusions while reading the Unicode data.
#

print "Composition exlusions from $ARGV[5]\n";

open (INPUT, '<', $ARGV[5])
    or do { warn "Cannot open $ARGV[5]: $!\n"; exit 1; };

while (<INPUT>) {
    # chomp, not chop: a final line without a newline must keep its last
    # character.
    chomp;

    next if /^#/;
    next if /^\s*$/;

    s/\s*#.*//;

    s/^\s*//;
    s/\s*$//;

    $composition_exclusions{hex($_)} = 1;
}

close INPUT;

#
# Pass 2: the main Unicode data table.
#

print "Unicode data from $ARGV[1]\n";

open (INPUT, '<', $ARGV[1])
    or do { warn "Cannot open $ARGV[1]: $!\n"; exit 1; };

$last_code = -1;
while (<INPUT>) {
    chomp;
    @fields = split (';', $_, 30);
    if ($#fields != 14) {
        printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields);
    }

    $code = hex ($fields[$CODE]);

    last if ($code > 0xFFFF); # ignore characters out of the basic plane

    if ($code > $last_code + 1) {
        # Found a gap.
        if ($fields[$NAME] =~ /Last>/) {
            # Fill the gap with the last character read,
            # since this was a range specified in the char database
            @gfields = @fields;
        }
        else {
            # The gap represents undefined characters.  Only the type
            # matters.
            @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
                        '', '', '', '');
        }
        for (++$last_code; $last_code < $code; ++$last_code) {
            # Array element, not a hash entry: @gfields is what gets passed on.
            $gfields[$CODE] = sprintf ("%04x", $last_code);
            process_one ($last_code, @gfields);
        }
    }
    process_one ($code, @fields);
    $last_code = $code;
}

close INPUT;

# Everything after the last entry read, up to the end of the basic plane,
# is unassigned.
@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
            '', '', '', '');
for (++$last_code; $last_code < 0x10000; ++$last_code) {
    $gfields[$CODE] = sprintf ("%04x", $last_code);
    process_one ($last_code, @gfields);
}
--$last_code;                   # Want last to be 0xFFFF.

#
# Pass 3: line break properties.
#

print "Creating line break table\n";

print "Line break data from $ARGV[2]\n";

open (INPUT, '<', $ARGV[2])
    or do { warn "Cannot open $ARGV[2]: $!\n"; exit 1; };

$last_code = -1;
while (<INPUT>) {
    my ($start_code, $end_code);

    chomp;

    next if /^#/;

    s/\s*#.*//;

    # Nothing left once the comment is gone: not a data line.
    next if /^\s*$/;

    @fields = split (';', $_, 30);
    if ($#fields != 1) {
        printf STDERR ("Entry for $fields[$BREAK_CODE] has wrong number of fields (%d)\n", $#fields);
        next;
    }

    # A code field is either a single code point or "START..END".  The dots
    # are literal and the pattern is anchored, so that a five- or six-digit
    # range is never mistaken for a four-digit one.
    if ($fields[$BREAK_CODE] =~ /^\s*([A-Fa-f0-9]{4,6})\.\.([A-Fa-f0-9]{4,6})\s*$/) {
        $start_code = hex ($1);
        $end_code = hex ($2);
    }
    else {
        $start_code = $end_code = hex ($fields[$BREAK_CODE]);
    }

    last if ($start_code > 0xFFFF); # FIXME ignore characters out of the basic plane

    # A range that starts inside the basic plane must not run past it.
    $end_code = 0xFFFF if ($end_code > 0xFFFF);

    if ($start_code > $last_code + 1) {
        # The gap represents undefined characters. If assigned,
        # they are AL, if not assigned, XX
        for (++$last_code; $last_code < $start_code; ++$last_code) {
            if ($type[$last_code] eq 'Cn') {
                $break_props[$last_code] = 'XX';
            }
            else {
                $break_props[$last_code] = 'AL';
            }
        }
    }

    for ($last_code = $start_code; $last_code <= $end_code; $last_code++) {
        $break_props[$last_code] = $fields[$BREAK_PROPERTY];
    }

    $last_code = $end_code;
}

close INPUT;

for (++$last_code; $last_code < 0x10000; ++$last_code) {
    if ($type[$last_code] eq 'Cn') {
        $break_props[$last_code] = 'XX';
    }
    else {
        $break_props[$last_code] = 'AL';
    }
}
--$last_code;                   # Want last to be 0xFFFF.

print STDERR "Last code is not 0xFFFF\n" if ($last_code != 0xFFFF);

#
# Pass 4: special casing.
#

print "Reading special-casing table for case conversion\n";

open (INPUT, '<', $ARGV[3])
    or do { warn "Cannot open $ARGV[3]: $!\n"; exit 1; };

while (<INPUT>) {
    my $code;

    chomp;

    next if /^#/;
    next if /^\s*$/;

    s/\s*#.*//;

    @fields = split ('\s*;\s*', $_, 30);

    $raw_code = $fields[$CASE_CODE];
    $code = hex ($raw_code);

    if ($#fields != 4 && $#fields != 5) {
        printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
        next;
    }

    if (!defined $type[$code]) {
        print STDERR "Special case for code point: $code, which has no defined type\n";
        next;
    }

    if (defined $fields[5]) {
        # Ignore conditional special cases - we'll handle them in code
        next;
    }

    if ($type[$code] eq 'Lu') {
        (hex $fields[$CASE_UPPER] == $code) || die "$raw_code is Lu and UCD_Upper($raw_code) != $raw_code";

        add_special_case ($code, $value[$code], $fields[$CASE_LOWER], $fields[$CASE_TITLE]);
    }
    elsif ($type[$code] eq 'Lt') {
        (hex $fields[$CASE_TITLE] == $code) || die "$raw_code is Lt and UCD_Title($raw_code) != $raw_code";

        add_special_case ($code, undef, $fields[$CASE_LOWER], $fields[$CASE_UPPER]);
    }
    elsif ($type[$code] eq 'Ll') {
        (hex $fields[$CASE_LOWER] == $code) || die "$raw_code is Ll and UCD_Lower($raw_code) != $raw_code";

        add_special_case ($code, $value[$code], $fields[$CASE_UPPER], $fields[$CASE_TITLE]);
    }
    else {
        print STDERR "Special case for non-alphabetic code point: $raw_code\n";
        next;
    }
}

close INPUT;

#
# Pass 5: case folding.
#

open (INPUT, '<', $ARGV[4])
    or do { warn "Cannot open $ARGV[4]: $!\n"; exit 1; };

my $casefoldlen = 0;
my @casefold;

while (<INPUT>) {
    my $code;

    chomp;

    next if /^#/;
    next if /^\s*$/;

    s/\s*#.*//;

    @fields = split ('\s*;\s*', $_, 30);

    $raw_code = $fields[$FOLDING_CODE];
    $code = hex ($raw_code);

    next if $code > 0xffff;     # FIXME!

    if ($#fields != 3) {
        printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
        next;
    }

    next if ($fields[$FOLDING_STATUS] eq 'S');

    @values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];

    # Check simple case: a one-character folding that equals the lowercase
    # mapping already recorded needs no entry of its own.

    if (@values == 1 &&
        !(defined $value[$code] && $value[$code] >= 0xd800 && $value[$code] < 0xdc00) &&
        defined $type[$code]) {

        my $lower;
        if ($type[$code] eq 'Ll') {
            $lower = $code;
        }
        elsif ($type[$code] eq 'Lt') {
            $lower = $title_to_lower{$code};
        }
        elsif ($type[$code] eq 'Lu') {
            $lower = $value[$code];
        }
        else {
            $lower = $code;
        }

        if ($lower == $values[0]) {
            next;
        }
    }

    my $string = pack ("U*", @values);
    # On perls that have utf8::encode, pack "U*" returns a character string;
    # convert it to UTF-8 octets so that the substitution below escapes every
    # non-ASCII byte.
    utf8::encode ($string) if defined &utf8::encode;
    $string =~ s/([\x80-\xff])/sprintf "\\x%02x",ord($1)/eg;

    if (1 + length $string > $casefoldlen) {
        $casefoldlen = 1 + length $string;
    }

    push @casefold, [ $code, $string ];
}

close INPUT;

#
# Output.
#

if ($do_props) {
    print_tables ($last_code);
}
if ($do_decomp) {
    print_decomp ($last_code);
    output_composition_table ();
}

print_line_break ($last_code);

exit 0;
---|
466 | |
---|
# Record everything the generator needs about one code point.
#   $code   - numeric code point
#   @fields - the fields of its Unicode data entry, indexed by the
#             $CATEGORY, $UPPER, ... constants
# Fills in the package tables @type, @value, @cclass, %title_to_lower,
# %title_to_upper, @decompose_compat, @decompositions and %compositions.
sub process_one
{
    my ($code, @fields) = @_;

    my $category = $fields[$CATEGORY];
    $type[$code] = $category;

    # The attribute stored in @value depends on the category: the decimal
    # value for digits, the opposite-case mapping for cased letters.
    if ($category eq 'Nd') {
        $value[$code] = int ($fields[$DECIMAL_VALUE]);
    }
    elsif ($category eq 'Ll') {
        $value[$code] = hex ($fields[$UPPER]);
    }
    elsif ($category eq 'Lu') {
        $value[$code] = hex ($fields[$LOWER]);
    }
    elsif ($category eq 'Lt') {
        $title_to_lower{$code} = hex ($fields[$LOWER]);
        $title_to_upper{$code} = hex ($fields[$UPPER]);
    }

    $cclass[$code] = $fields[$COMBINING_CLASSES];

    # Handle decompositions.
    my $decomp = $fields[$DECOMPOSITION];
    return if $decomp eq '';

    # A leading <tag> marks a compatibility decomposition; strip it.
    my $is_compat = ($decomp =~ s/\<.*\>\s*//) ? 1 : 0;
    $decompose_compat[$code] = $is_compat;

    # Only canonical decompositions that are not excluded can be recomposed.
    if (!$is_compat && !exists $composition_exclusions{$code}) {
        $compositions{$code} = $decomp;
    }

    $decompositions[$code] = $decomp;
}
---|
509 | |
---|
# Write gunichartables.h: the type and attribute page tables, the title
# case table, and (through output_special_case_table and
# output_casefold_table) the special-case and case-folding tables.
#   $last - highest code point to cover
# Exits with status 1 if the output file cannot be written.
sub print_tables
{
    my ($last) = @_;
    my ($outfile) = "gunichartables.h";

    # Dynamically scoped so that print_row can add to this run's total.
    local ($bytes_out) = 0;

    print "Writing $outfile...\n";

    open (OUT, '>', $outfile)
        or do { warn "Cannot open $outfile for writing: $!\n"; exit 1; };

    print OUT "/* This file is automatically generated. DO NOT EDIT!\n";
    print OUT " Instead, edit gen-unicode-tables.pl and re-run. */\n\n";

    print OUT "#ifndef CHARTABLES_H\n";
    print OUT "#define CHARTABLES_H\n\n";

    print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";

    # Type table: print_row emits each non-uniform page and returns the
    # text that goes into the page index for it.
    $table_index = 0;
    printf OUT "static const char type_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256) {
        $row[$count / 256] = print_row ($count, 1, \&fetch_type);
    }
    printf OUT "\n};\n\n";

    print OUT "static const short type_table[256] = {\n";
    for ($count = 0; $count <= $last; $count += 256) {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";


    #
    # Now print attribute table.
    #

    $table_index = 0;
    printf OUT "static const unsigned short attr_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256) {
        $row[$count / 256] = print_row ($count, 2, \&fetch_attr);
    }
    printf OUT "\n};\n\n";

    print OUT "static const short attr_table[256] = {\n";
    for ($count = 0; $count <= $last; $count += 256) {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    #
    # print title case table
    #

    # FIXME: type.
    print OUT "static const unsigned short title_table[][3] = {\n";
    my ($item);
    my ($first) = 1;
    # The keys are numeric code points: sort them as numbers, so the table
    # is in code point order whatever the number of digits.
    foreach $item (sort { $a <=> $b } keys %title_to_lower) {
        print OUT ",\n"
            unless $first;
        $first = 0;
        printf OUT " { 0x%04x, 0x%04x, 0x%04x }", $item, $title_to_upper{$item}, $title_to_lower{$item};
        $bytes_out += 6;
    }
    print OUT "\n};\n\n";

    #
    # And special case conversion table -- conversions that change length
    #
    output_special_case_table (\*OUT);
    output_casefold_table (\*OUT);

    print OUT "#endif /* CHARTABLES_H */\n";

    # Buffered write errors only show up when the handle is closed.
    close (OUT)
        or do { warn "Error writing $outfile: $!\n"; exit 1; };

    printf STDERR "Generated %d bytes in tables\n", $bytes_out;
}
---|
602 | |
---|
# Fetcher handed to print_row for the type table: the symbolic name (from
# %mappings) of the general category recorded for a code point.
sub fetch_type
{
    my $code_point = shift;
    my $category = $type[$code_point];
    return $mappings{$category};
}
---|
609 | |
---|
# Fetcher handed to print_row for the attribute table: the value recorded
# in @value for a code point as a four-digit hex literal, or "0x0000" when
# nothing was recorded.
sub fetch_attr
{
    my $code_point = shift;
    my $attr = $value[$code_point];

    return "0x0000" unless defined $attr;
    return sprintf ("0x%04x", $attr);
}
---|
623 | |
---|
# Emit one 256-entry page of a table to OUT.
#   $start   - first code point of the page
#   $typsize - size in bytes of one entry, used only for the byte count
#   $fetcher - code ref returning the text of the entry for a code point
# Returns the text to store in the page index.  When all 256 entries are
# the same, nothing is written and the shared value plus
# G_UNICODE_MAX_TABLE_INDEX is returned; otherwise the page is written and
# its index in the data array (taken from $table_index) is returned.
sub print_row
{
    my ($start, $typsize, $fetcher) = @_;

    my @cells = map { $fetcher->($start + $_) } 0 .. 255;

    # Uniform page: no data row needed.
    unless (grep { $_ ne $cells[0] } @cells) {
        return $cells[0] . " + G_UNICODE_MAX_TABLE_INDEX";
    }

    printf OUT ",\n" if ($table_index != 0);
    printf OUT " { /* page %d, index %d */\n ", $start / 256, $table_index;

    # Track the output column so that lines are wrapped before column 78.
    my $column = 4;
    my $need_separator = 0;
    foreach my $text (@cells) {
        print OUT ", " if $need_separator;
        $need_separator = 1;

        if (length ($text) + $column + 2 > 78) {
            print OUT "\n ";
            $column = 4;
        }
        print OUT $text;
        $column += length ($text) + 2;
    }
    print OUT "\n }";

    $bytes_out += 256 * $typsize;

    return sprintf "%d /* page %d */", $table_index++, $start / 256;
}
---|
668 | |
---|
# Generate the character decomposition header, gunidecomp.h: the combining
# class page tables, the decomposition index (decomp_table) and the shared
# string holding the expanded decompositions.
#   $last - highest code point to cover
# Exits with status 1 if the output file cannot be opened.
# NOTE(review): OUT is not closed here, exactly as before; confirm whether
# the code that runs afterwards reopens it before adding a close.
sub print_decomp
{
    my ($last) = @_;
    my ($outfile) = "gunidecomp.h";

    # Dynamically scoped so that print_row can add to this run's total.
    local ($bytes_out) = 0;

    print "Writing $outfile...\n";

    open (OUT, '>', $outfile)
        or do { warn "Cannot open $outfile for writing: $!\n"; exit 1; };

    print OUT "/* This file is automatically generated. DO NOT EDIT! */\n\n";
    print OUT "#ifndef DECOMP_H\n";
    print OUT "#define DECOMP_H\n\n";

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";

    my ($count, @row);
    $table_index = 0;
    printf OUT "static const unsigned char cclass_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256) {
        $row[$count / 256] = print_row ($count, 1, \&fetch_cclass);
    }
    printf OUT "\n};\n\n";

    print OUT "static const short combining_class_table[256] = {\n";
    for ($count = 0; $count <= $last; $count += 256) {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    print OUT "typedef struct\n{\n";
    # FIXME: type.
    print OUT " unsigned short ch;\n";
    print OUT " unsigned char canon_offset;\n";
    print OUT " unsigned char compat_offset;\n";
    print OUT " unsigned short expansion_offset;\n";
    print OUT "} decomposition;\n\n";

    print OUT "static const decomposition decomp_table[] =\n{\n";
    my ($first) = 1;
    my ($decomp_string) = "";
    my ($decomp_string_offset) = 0;
    for ($count = 0; $count <= $last; ++$count) {
        if (defined $decompositions[$count]) {
            print OUT ",\n"
                if ! $first;
            $first = 0;

            my $canon_decomp;
            my $compat_decomp;

            if (!$decompose_compat[$count]) {
                $canon_decomp = make_decomp ($count, 0);
            }
            $compat_decomp = make_decomp ($count, 1);

            # Store the compatibility form only when it differs from the
            # canonical one.
            if (defined $canon_decomp && $compat_decomp eq $canon_decomp) {
                undef $compat_decomp;
            }

            # 0xff marks "no decomposition of this kind".  Offsets count
            # 16-bit units; every unit is four characters ("\xNN" twice
            # would be eight, one "\xNN" is four) of escaped text, hence
            # the division by 4.
            my $string = "";
            my $canon_offset = 0xff;
            my $compat_offset = 0xff;

            if (defined $canon_decomp) {
                $canon_offset = 0;
                $string .= $canon_decomp;
            }
            if (defined $compat_decomp) {
                if (defined $canon_decomp) {
                    $string .= "\\x00\\x00";
                }
                $compat_offset = (length $string) / 4;
                $string .= $compat_decomp;
            }

            # Identical expansions are stored once and shared.
            if (!defined($decomp_offsets{$string})) {
                $decomp_offsets{$string} = $decomp_string_offset;
                $decomp_string .= "\n \"".$string."\\0\\0\" /* offset ".
                    $decomp_string_offset." */";
                $decomp_string_offset += ((length $string) / 4) + 2;

                $bytes_out += (length $string) / 4 + 2; # "\x20"
            }

            printf OUT qq( { 0x%04x, %u, %u, %d }),
                $count, $canon_offset, $compat_offset, $decomp_offsets{$string};
            $bytes_out += 6;
        }
    }
    print OUT "\n};\n\n";

    printf OUT "static const guchar decomp_expansion_string[] = %s;\n\n", $decomp_string;

    print OUT "#endif /* DECOMP_H */\n";

    printf STDERR "Generated %d bytes in decomp tables\n", $bytes_out;
}
---|
779 | |
---|
# Write gunibreak.h: the line break property page tables built from
# @break_props.
#   $last - highest code point to cover
# Exits with status 1 if the output file cannot be written.
sub print_line_break
{
    my ($last) = @_;
    my ($outfile) = "gunibreak.h";

    # Dynamically scoped so that print_row can add to this run's total.
    local ($bytes_out) = 0;

    print "Writing $outfile...\n";

    open (OUT, '>', $outfile)
        or do { warn "Cannot open $outfile for writing: $!\n"; exit 1; };

    print OUT "/* This file is automatically generated. DO NOT EDIT!\n";
    print OUT " Instead, edit gen-unicode-tables.pl and re-run. */\n\n";

    print OUT "#ifndef BREAKTABLES_H\n";
    print OUT "#define BREAKTABLES_H\n\n";

    print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";

    # print_row emits each non-uniform page and returns the text that goes
    # into the page index for it.
    $table_index = 0;
    printf OUT "static const char break_property_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256) {
        $row[$count / 256] = print_row ($count, 1, \&fetch_break_type);
    }
    printf OUT "\n};\n\n";

    print OUT "static const short break_property_table[256] = {\n";
    for ($count = 0; $count <= $last; $count += 256) {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    print OUT "#endif /* BREAKTABLES_H */\n";

    # Buffered write errors only show up when the handle is closed.
    close (OUT)
        or do { warn "Error writing $outfile: $!\n"; exit 1; };

    printf STDERR "Generated %d bytes in break tables\n", $bytes_out;
}
---|
826 | |
---|
827 | |
---|
# Fetcher handed to print_row for the break property table: the symbolic
# name (from %break_mappings) of the line-break class recorded for a code
# point.
sub fetch_break_type
{
    my $code_point = shift;
    my $break_class = $break_props[$code_point];
    return $break_mappings{$break_class};
}
---|
834 | |
---|
# Fetcher handed to print_row for the combining class table: the class
# recorded in @cclass for a code point.
sub fetch_cclass
{
    my $code_point = shift;
    return $cclass[$code_point];
}
---|
841 | |
---|
# Expand a character decomposition recursively.
#   $code   - code point whose entry in @decompositions is expanded
#   $compat - true to follow compatibility decompositions as well
# Returns the list of code points of the full expansion.
sub expand_decomp
{
    my ($code, $compat) = @_;

    return map {
        my $part = hex ($_);

        # A part is expanded further when it has a decomposition of its own
        # that is allowed in the requested mode; otherwise it is kept as is.
        (defined $decompositions[$part]
         && ($compat || !$decompose_compat[$part]))
            ? expand_decomp ($part, $compat)
            : $part;
    } split (' ', $decompositions[$code]);
}
---|
865 | |
---|
# Build the C string literal text for the full decomposition of $code:
# every code point returned by expand_decomp becomes two escaped bytes,
# high byte first.  $compat is passed through to expand_decomp.
sub make_decomp
{
    my ($code, $compat) = @_;

    my @escaped = map { sprintf "\\x%02x\\x%02x", $_ / 256, $_ & 0xff }
                      expand_decomp ($code, $compat);

    return join ('', @escaped);
}
---|
# Generate special case data string from two fields.
#   $code    - code point the special case belongs to
#   $single  - single-character mapping to put first, or undef for none
#   $field1, $field2 - whitespace-separated hex code points of the two
#                      mappings
# Appends the encoded string to @special_cases, grows $special_case_len if
# needed, and points $value[$code] at the new entry.
sub add_special_case
{
    my ($code, $single, $field1, $field2) = @_;

    # Layout: [single] field1... 0 field2...
    @values = ();
    push @values, $single if defined $single;
    push @values, map { hex ($_) } split (/\s+/, $field1);
    push @values, 0;
    push @values, map { hex ($_) } split (/\s+/, $field2);

    # Two escaped bytes per value, high byte first, then a closing NUL.
    $result = join ('', map { sprintf ("\\x%02x\\x%02x", $_ / 256, $_ & 0xff) } @values);
    $result .= "\\0";

    my $needed = 2 * @values + 2;
    $special_case_len = $needed if $needed > $special_case_len;

    push @special_cases, $result;

    #
    # We encode special cases in the surrogate pair space
    #
    $value[$code] = 0xD800 + $#special_cases;
}
---|
907 | |
---|
# Write the special_case_table array, one string per entry of
# @special_cases, to the file handle passed as the only argument, and
# report the table size on STDERR.
sub output_special_case_table
{
    my ($out) = @_;

    print $out <<EOT;

/* Table of special cases for case conversion; each record contains
* First, the best single character mapping to lowercase if Lu,
* and to uppercase if Ll, followed by the output mapping for the two cases
* other than the case of the codepoint, in the order [Ll],[Lu],[Lt],
* separated and terminated by a double NUL.
*/
static const guchar special_case_table[][$special_case_len] = {
EOT

    print $out map { qq( "$_",\n) } @special_cases;

    print $out "};\n\n";

    my $table_bytes = $special_case_len * scalar @special_cases;
    print STDERR "Generated ", $table_bytes, " bytes in special case table\n";
}
---|
934 | |
---|
# Replace the counts in a hash with consecutive indices.
#
# Arguments:
#   $array - reference to a hash mapping numeric keys to counts
#
# Keys whose count is exactly 1 are deleted.  The remaining keys are
# numbered 0, 1, 2, ... in ascending numeric key order, and that number
# replaces the count.  Returns how many keys were numbered.
sub enumerate_ordered
{
    my ($array) = @_;

    # Decide every key's fate from the original counts before any value
    # is overwritten.
    my (@singletons, @kept);
    for my $key (keys %$array) {
        if ($array->{$key} == 1) {
            push @singletons, $key;
        } else {
            push @kept, $key;
        }
    }

    delete @{$array}{@singletons};

    @kept = sort { $a <=> $b } @kept;
    @{$array}{@kept} = (0 .. $#kept);

    return scalar @kept;
}
---|
950 | |
---|
# Write the composition lookup tables to gunicomp.h.
#
# Reads the globals %compositions (code point => decomposition given as
# whitespace-separated hex code points) and @cclass (indexed by code
# point; a true entry marks a non-starter).  Entries of %compositions
# whose decomposition starts with a non-starter or has a single element
# are deleted; any other length than two is fatal.
#
# Output written through the OUT filehandle, in this order:
#   - four COMPOSE_*_START #defines giving the first lookup value of each
#     class of code point (first, first singleton, second, second
#     singleton);
#   - compose_data / compose_table, a two-level lookup from code point to
#     that value, produced with print_row;
#   - compose_first_single and compose_second_single, one pair per
#     composition whose first (resp. second) character occurs only once;
#   - compose_array, indexed by [first index][second index], holding the
#     composed code point or 0.
#
# NOTE(review): OUT is left open on return -- confirm the caller closes it.
sub output_composition_table
{
    print STDERR "Generating composition table\n";

    # Dynamically scoped byte counter; presumably print_row adds to it.
    local ($bytes_out) = 0;

    my %first;
    my %second;

    # Parsed [ first, second ] pair of every composition that is kept, so
    # each decomposition string is split only once.
    my %pair;

    # First we need to go through and remove decompositions
    # starting with a non-starter, and single-character
    # decompositions.  At the same time, count how often each first
    # character occurs.

    for my $code (keys %compositions) {
        my @values = map { hex ($_) } split /\s+/, $compositions{$code};
        if ($cclass[$values[0]] || @values == 1) {
            delete $compositions{$code};
            next;
        }
        if (@values != 2) {
            die "$code has more than two elements in its decomposition!\n";
        }

        $pair{$code} = [ @values ];
        $first{$values[0]}++;
    }

    # Assign integer indices, removing singletons
    my $n_first = enumerate_ordered (\%first);

    # Now count the second character of each decomposition whose first
    # character is not a singleton
    for my $code (keys %compositions) {
        my ($head, $tail) = @{ $pair{$code} };
        $second{$tail}++ if exists $first{$head};
    }

    # Assign integer indices, removing singletons
    my $n_second = enumerate_ordered (\%second);

    # Build reverse table

    my @first_singletons;
    my @second_singletons;
    my %reverse;
    for my $code (keys %compositions) {
        my ($head, $tail) = @{ $pair{$code} };

        my $first = $first{$head};
        my $second = $second{$tail};

        if (defined $first && defined $second) {
            $reverse{"$first|$second"} = $code;
        } elsif (!defined $first) {
            push @first_singletons, [ $head, $tail, $code ];
        } else {
            push @second_singletons, [ $tail, $head, $code ];
        }
    }

    @first_singletons = sort { $a->[0] <=> $b->[0] } @first_singletons;
    @second_singletons = sort { $a->[0] <=> $b->[0] } @second_singletons;

    my %vals;

    open OUT, ">gunicomp.h" or die "Cannot open gunicomp.h: $!\n";

    # Assign values in lookup table for all code points involved.  The
    # four classes are numbered consecutively starting at 1, since 0 is
    # what the lookup yields for code points with no entry.  A code point
    # that belongs to several classes keeps the value assigned last.

    my $total = 1;
    my $last = 0;    # highest code point that received a value
    printf OUT "#define COMPOSE_FIRST_START %d\n", $total;
    for my $code (keys %first) {
        $vals{$code} = $first{$code} + $total;
        $last = $code if $code > $last;
    }
    $total += $n_first;
    printf OUT "#define COMPOSE_FIRST_SINGLE_START %d\n", $total;
    for my $index (0 .. $#first_singletons) {
        my $code = $first_singletons[$index][0];
        $vals{$code} = $index + $total;
        $last = $code if $code > $last;
    }
    $total += @first_singletons;
    printf OUT "#define COMPOSE_SECOND_START %d\n", $total;
    for my $code (keys %second) {
        $vals{$code} = $second{$code} + $total;
        $last = $code if $code > $last;
    }
    $total += $n_second;
    printf OUT "#define COMPOSE_SECOND_SINGLE_START %d\n\n", $total;
    for my $index (0 .. $#second_singletons) {
        my $code = $second_singletons[$index][0];
        $vals{$code} = $index + $total;
        $last = $code if $code > $last;
    }

    # Output lookup table

    my @row;
    # Global reset before the rows are printed; presumably print_row
    # numbers the rows it emits with it.
    $table_index = 0;
    printf OUT "static const gushort compose_data[][256] = {\n";
    for (my $count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = &print_row ($count, 2, sub { exists $vals{$_[0]} ? $vals{$_[0]} : 0; });
    }
    printf OUT "\n};\n\n";

    print OUT "static const short compose_table[256] = {\n";
    for (my $count = 0; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT " ", $row[$count / 256];
        $bytes_out += 4;
    }
    print OUT "\n};\n\n";

    # Output first singletons

    print OUT "static const gushort compose_first_single[][2] = {\n";
    my $emitted = 0;
    for my $record (@first_singletons) {
        print OUT ",\n" if $emitted++ > 0;
        printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
    }
    print OUT "\n};\n";

    $bytes_out += @first_singletons * 4;

    # Output second singletons

    print OUT "static const gushort compose_second_single[][2] = {\n";
    $emitted = 0;
    for my $record (@second_singletons) {
        print OUT ",\n" if $emitted++ > 0;
        printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
    }
    print OUT "\n};\n";

    $bytes_out += @second_singletons * 4;

    # Output array of composition pairs

    print OUT <<EOT;
static const gushort compose_array[$n_first][$n_second] = {
EOT

    for (my $i = 0; $i < $n_first; $i++) {
        print OUT ",\n" if $i;
        print OUT " { ";
        for (my $j = 0; $j < $n_second; $j++) {
            print OUT ", " if $j;
            if (exists $reverse{"$i|$j"}) {
                printf OUT "%#06x", $reverse{"$i|$j"};
            } else {
                print OUT " 0";
            }
        }
        print OUT " }";
    }
    print OUT "\n";

    print OUT <<EOT;
};
EOT

    $bytes_out += $n_first * $n_second * 2;

    printf STDERR "Generated %d bytes in compose tables\n", $bytes_out;
}
---|
1137 | |
---|
# Write the C definition of casefold_table to a filehandle.
#
# Arguments:
#   $out - filehandle the table is printed to
#
# Reads the globals @casefold (array of [ code point, string-literal
# body ] pairs) and $casefoldlen (size of the data member).  @casefold is
# sorted in place by code point before it is written.  A one-line size
# summary is printed on STDERR, like the other table writers do.
sub output_casefold_table
{
    my $out = shift;

    print $out <<EOT;

/* Table of casefolding cases that can't be derived by lowercasing
*/
static const struct {
guint16 ch;
gchar data[$casefoldlen];
} casefold_table[] = {
EOT

    @casefold = sort { $a->[0] <=> $b->[0] } @casefold;

    for my $case (@casefold) {
        my ($code, $string) = @{$case}[0, 1];

        # The string is passed as an argument, not interpolated into the
        # format, so a '%' in the data is emitted literally.
        printf {$out} qq({ %#04x, "%s" },\n), $code, $string;
    }

    print $out <<EOT;
};

EOT

    # Two bytes for ch plus the data bytes, rounded up to an even size.
    my $recordlen = (2+$casefoldlen+1) & ~1;
    printf STDERR "Generated %d bytes for casefold table\n", $recordlen * @casefold;
}
---|
1169 | |
---|
1170 | |
---|
1171 | |
---|