source: trunk/third/glib2/glib/gen-unicode-tables.pl @ 20721

Revision 20721, 33.3 KB checked in by ghudson, 20 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r20720, which included commits to RCS files with non-trunk default branches.
  • Property svn:executable set to *
Line 
1#! /usr/bin/perl -w
2
3#    Copyright (C) 1998, 1999 Tom Tromey
4#    Copyright (C) 2001 Red Hat Software
5
6#    This program is free software; you can redistribute it and/or modify
7#    it under the terms of the GNU General Public License as published by
8#    the Free Software Foundation; either version 2, or (at your option)
9#    any later version.
10
11#    This program is distributed in the hope that it will be useful,
12#    but WITHOUT ANY WARRANTY; without even the implied warranty of
13#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14#    GNU General Public License for more details.
15
16#    You should have received a copy of the GNU General Public License
17#    along with this program; if not, write to the Free Software
18#    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
19#    02111-1307, USA.
20
21# Contributer(s):
22#   Andrew Taylor <andrew.taylor@montage.ca>
23
24# gen-unicode-tables.pl - Generate tables for libunicode from Unicode data.
25# See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
26# I consider the output of this program to be unrestricted.  Use it as
27# you will.
28
29# FIXME:
30# * For decomp table it might make sense to use a shift count other
31#   than 8.  We could easily compute the perfect shift count.
32
33# we use some perl unicode features
34require 5.006;
35
36use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION);
37
38
39# Names of fields in Unicode data table.
40$CODE = 0;
41$NAME = 1;
42$CATEGORY = 2;
43$COMBINING_CLASSES = 3;
44$BIDI_CATEGORY = 4;
45$DECOMPOSITION = 5;
46$DECIMAL_VALUE = 6;
47$DIGIT_VALUE = 7;
48$NUMERIC_VALUE = 8;
49$MIRRORED = 9;
50$OLD_NAME = 10;
51$COMMENT = 11;
52$UPPER = 12;
53$LOWER = 13;
54$TITLE = 14;
55
56# Names of fields in the line break table
57$BREAK_CODE = 0;
58$BREAK_PROPERTY = 1;
59
60# Names of fields in the SpecialCasing table
61$CASE_CODE = 0;
62$CASE_LOWER = 1;
63$CASE_TITLE = 2;
64$CASE_UPPER = 3;
65$CASE_CONDITION = 4;
66
67# Names of fields in the CaseFolding table
68$FOLDING_CODE = 0;
69$FOLDING_STATUS = 1;
70$FOLDING_MAPPING = 2;
71
72# Map general category code onto symbolic name.
73%mappings =
74    (
75     # Normative.
76     'Lu' => "G_UNICODE_UPPERCASE_LETTER",
77     'Ll' => "G_UNICODE_LOWERCASE_LETTER",
78     'Lt' => "G_UNICODE_TITLECASE_LETTER",
79     'Mn' => "G_UNICODE_NON_SPACING_MARK",
80     'Mc' => "G_UNICODE_COMBINING_MARK",
81     'Me' => "G_UNICODE_ENCLOSING_MARK",
82     'Nd' => "G_UNICODE_DECIMAL_NUMBER",
83     'Nl' => "G_UNICODE_LETTER_NUMBER",
84     'No' => "G_UNICODE_OTHER_NUMBER",
85     'Zs' => "G_UNICODE_SPACE_SEPARATOR",
86     'Zl' => "G_UNICODE_LINE_SEPARATOR",
87     'Zp' => "G_UNICODE_PARAGRAPH_SEPARATOR",
88     'Cc' => "G_UNICODE_CONTROL",
89     'Cf' => "G_UNICODE_FORMAT",
90     'Cs' => "G_UNICODE_SURROGATE",
91     'Co' => "G_UNICODE_PRIVATE_USE",
92     'Cn' => "G_UNICODE_UNASSIGNED",
93
94     # Informative.
95     'Lm' => "G_UNICODE_MODIFIER_LETTER",
96     'Lo' => "G_UNICODE_OTHER_LETTER",
97     'Pc' => "G_UNICODE_CONNECT_PUNCTUATION",
98     'Pd' => "G_UNICODE_DASH_PUNCTUATION",
99     'Ps' => "G_UNICODE_OPEN_PUNCTUATION",
100     'Pe' => "G_UNICODE_CLOSE_PUNCTUATION",
101     'Pi' => "G_UNICODE_INITIAL_PUNCTUATION",
102     'Pf' => "G_UNICODE_FINAL_PUNCTUATION",
103     'Po' => "G_UNICODE_OTHER_PUNCTUATION",
104     'Sm' => "G_UNICODE_MATH_SYMBOL",
105     'Sc' => "G_UNICODE_CURRENCY_SYMBOL",
106     'Sk' => "G_UNICODE_MODIFIER_SYMBOL",
107     'So' => "G_UNICODE_OTHER_SYMBOL"
108     );
109
110%break_mappings =
111    (
112     'BK' => "G_UNICODE_BREAK_MANDATORY",
113     'CR' => "G_UNICODE_BREAK_CARRIAGE_RETURN",
114     'LF' => "G_UNICODE_BREAK_LINE_FEED",
115     'CM' => "G_UNICODE_BREAK_COMBINING_MARK",
116     'SG' => "G_UNICODE_BREAK_SURROGATE",
117     'ZW' => "G_UNICODE_BREAK_ZERO_WIDTH_SPACE",
118     'IN' => "G_UNICODE_BREAK_INSEPARABLE",
119     'GL' => "G_UNICODE_BREAK_NON_BREAKING_GLUE",
120     'CB' => "G_UNICODE_BREAK_CONTINGENT",
121     'SP' => "G_UNICODE_BREAK_SPACE",
122     'BA' => "G_UNICODE_BREAK_AFTER",
123     'BB' => "G_UNICODE_BREAK_BEFORE",
124     'B2' => "G_UNICODE_BREAK_BEFORE_AND_AFTER",
125     'HY' => "G_UNICODE_BREAK_HYPHEN",
126     'NS' => "G_UNICODE_BREAK_NON_STARTER",
127     'OP' => "G_UNICODE_BREAK_OPEN_PUNCTUATION",
128     'CL' => "G_UNICODE_BREAK_CLOSE_PUNCTUATION",
129     'QU' => "G_UNICODE_BREAK_QUOTATION",
130     'EX' => "G_UNICODE_BREAK_EXCLAMATION",
131     'ID' => "G_UNICODE_BREAK_IDEOGRAPHIC",
132     'NU' => "G_UNICODE_BREAK_NUMERIC",
133     'IS' => "G_UNICODE_BREAK_INFIX_SEPARATOR",
134     'SY' => "G_UNICODE_BREAK_SYMBOL",
135     'AL' => "G_UNICODE_BREAK_ALPHABETIC",
136     'PR' => "G_UNICODE_BREAK_PREFIX",
137     'PO' => "G_UNICODE_BREAK_POSTFIX",
138     'SA' => "G_UNICODE_BREAK_COMPLEX_CONTEXT",
139     'AI' => "G_UNICODE_BREAK_AMBIGUOUS",
140     'NL' => "G_UNICODE_BREAK_NEXT_LINE",
141     'WJ' => "G_UNICODE_BREAK_WORD_JOINER",
142     'XX' => "G_UNICODE_BREAK_UNKNOWN"
143     );
144
145# Title case mappings.
146%title_to_lower = ();
147%title_to_upper = ();
148
149# Maximum length of special-case strings
150
151my @special_cases;
152my @special_case_offsets;
153my $special_case_offset = 0;
154
155$do_decomp = 0;
156$do_props = 1;
157if (@ARGV && $ARGV[0] eq '-decomp')
158{
159    $do_decomp = 1;
160    $do_props = 0;
161    shift @ARGV;
162}
163elsif (@ARGV && $ARGV[0] eq '-both')
164{
165    $do_decomp = 1;
166    shift @ARGV;
167}
168
169if (@ARGV != 2) {
170    $0 =~ s@.*/@@;
171    die "\nUsage: $0 [-decomp | -both] UNICODE-VERSION DIRECTORY\n\n       DIRECTORY should contain the following Unicode data files:\n       UnicodeData.txt, LineBreak.txt, SpecialCasing.txt, CaseFolding.txt,\n       CompositionExclusions.txt, BidiMirroring.txt\n\n";
172}
173
174my ($unicodedatatxt, $linebreaktxt, $specialcasingtxt, $casefoldingtxt, $compositionexclusionstxt, $bidimirroringtxt);
175
176my $d = $ARGV[1];
177opendir (my $dir, $d) or die "Cannot open Unicode data dir $d: $!\n";
178for my $f (readdir ($dir))
179{
180    $unicodedatatxt = "$d/$f" if ($f =~ /UnicodeData.*\.txt/);
181    $linebreaktxt = "$d/$f" if ($f =~ /LineBreak.*\.txt/);
182    $specialcasingtxt = "$d/$f" if ($f =~ /SpecialCasing.*\.txt/);
183    $casefoldingtxt = "$d/$f" if ($f =~ /CaseFolding.*\.txt/);
184    $compositionexclusionstxt = "$d/$f" if ($f =~ /CompositionExclusions.*\.txt/);
185    $bidimirroringtxt = "$d/$f" if ($f =~ /BidiMirroring.*\.txt/);
186}
187
188defined $unicodedatatxt or die "Did not find UnicodeData file";
189defined $linebreaktxt or die "Did not find LineBreak file";
190defined $specialcasingtxt or die "Did not find SpecialCasing file";
191defined $casefoldingtxt or die "Did not find CaseFolding file";
192defined $compositionexclusionstxt or die "Did not find CompositionExclusions file";
193defined $bidimirroringtxt or die "Did not find BidiMirroring file";
194
195print "Creating decomp table\n" if ($do_decomp);
196print "Creating property table\n" if ($do_props);
197
198print "Composition exlusions from $compositionexclusionstxt\n";
199
200open (INPUT, "< $compositionexclusionstxt") || exit 1;
201
202while (<INPUT>) {
203
204    chop;
205
206    next if /^#/;
207    next if /^\s*$/;
208
209    s/\s*#.*//;
210
211    s/^\s*//;
212    s/\s*$//;
213
214    $composition_exclusions{hex($_)} = 1;
215}
216
217close INPUT;
218
219print "Unicode data from $unicodedatatxt\n";
220
221open (INPUT, "< $unicodedatatxt") || exit 1;
222
223# we save memory by skipping the huge empty area before U+E0000
224my $pages_before_e0000;
225
226$last_code = -1;
227while (<INPUT>)
228{
229    chop;
230    @fields = split (';', $_, 30);
231    if ($#fields != 14)
232    {
233        printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields);
234    }
235
236    $code = hex ($fields[$CODE]);
237
238    if ($code >= 0xE0000 and $last_code < 0xE0000)
239    {
240        $pages_before_e0000 = ($last_code >> 8) + 1;
241    }
242
243    if ($code > $last_code + 1)
244    {
245        # Found a gap.
246        if ($fields[$NAME] =~ /Last>/)
247        {
248            # Fill the gap with the last character read,
249            # since this was a range specified in the char database
250            @gfields = @fields;
251        }
252        else
253        {
254            # The gap represents undefined characters.  Only the type
255            # matters.
256            @gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
257                        '', '', '', '');
258        }
259        for (++$last_code; $last_code < $code; ++$last_code)
260        {
261            $gfields{$CODE} = sprintf ("%04x", $last_code);
262            &process_one ($last_code, @gfields);
263        }
264    }
265    &process_one ($code, @fields);
266    $last_code = $code;
267}
268
269close INPUT;
270
271@gfields = ('', '', 'Cn', '0', '', '', '', '', '', '', '',
272            '', '', '', '');
273for (++$last_code; $last_code <= 0x10FFFF; ++$last_code)
274{
275    $gfields{$CODE} = sprintf ("%04x", $last_code);
276    &process_one ($last_code, @gfields);
277}
278--$last_code;                   # Want last to be 0x10FFFF.
279
280print "Creating line break table\n";
281
282print "Line break data from $linebreaktxt\n";
283
284open (INPUT, "< $linebreaktxt") || exit 1;
285
286$last_code = -1;
287while (<INPUT>)
288{
289    my ($start_code, $end_code);
290   
291    chop;
292
293    next if /^#/;
294
295    s/\s*#.*//;
296   
297    @fields = split (';', $_, 30);
298    if ($#fields != 1)
299    {
300        printf STDERR ("Entry for $fields[$CODE] has wrong number of fields (%d)\n", $#fields);
301        next;
302    }
303
304    if ($fields[$CODE] =~ /([A-F0-9]{4,6})\.\.([A-F0-9]{4,6})/)
305    {
306        $start_code = hex ($1);
307        $end_code = hex ($2);
308    } else {
309        $start_code = $end_code = hex ($fields[$CODE]);
310       
311    }
312
313    if ($start_code > $last_code + 1)
314    {
315        # The gap represents undefined characters. If assigned,
316        # they are AL, if not assigned, XX
317        for (++$last_code; $last_code < $start_code; ++$last_code)
318        {
319            if ($type[$last_code] eq 'Cn')
320            {
321                $break_props[$last_code] = 'XX';
322            }
323            else
324            {
325                $break_props[$last_code] = 'AL';
326            }
327        }
328    }
329
330    for ($last_code = $start_code; $last_code <= $end_code; $last_code++)
331    {
332        $break_props[$last_code] = $fields[$BREAK_PROPERTY];
333    }
334   
335    $last_code = $end_code;
336}
337
338close INPUT;
339
340for (++$last_code; $last_code <= 0x10FFFF; ++$last_code)
341{
342  if ($type[$last_code] eq 'Cn')
343    {
344      $break_props[$last_code] = 'XX';
345    }
346  else
347    {
348      $break_props[$last_code] = 'AL';
349    }
350}
351--$last_code;                   # Want last to be 0x10FFFF.
352
353print STDERR "Last code is not 0x10FFFF" if ($last_code != 0x10FFFF);
354
355print "Reading special-casing table for case conversion\n";
356
357open (INPUT, "< $specialcasingtxt") || exit 1;
358
359while (<INPUT>)
360{
361    my $code;
362   
363    chop;
364
365    next if /^#/;
366    next if /^\s*$/;
367
368    s/\s*#.*//;
369
370    @fields = split ('\s*;\s*', $_, 30);
371
372    $raw_code = $fields[$CASE_CODE];
373    $code = hex ($raw_code);
374
375    if ($#fields != 4 && $#fields != 5)
376    {
377        printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
378        next;
379    }
380
381    if (!defined $type[$code])
382    {
383        printf STDERR "Special case for code point: $code, which has no defined type\n";
384        next;
385    }
386
387    if (defined $fields[5]) {
388        # Ignore conditional special cases - we'll handle them in code
389        next;
390    }
391   
392    if ($type[$code] eq 'Lu')
393    {
394        (hex $fields[$CASE_UPPER] == $code) || die "$raw_code is Lu and UCD_Upper($raw_code) != $raw_code";
395
396        &add_special_case ($code, $value[$code], $fields[$CASE_LOWER], $fields[$CASE_TITLE]);
397       
398    } elsif ($type[$code] eq 'Lt')
399    {
400        (hex $fields[$CASE_TITLE] == $code) || die "$raw_code is Lt and UCD_Title($raw_code) != $raw_code";
401       
402        &add_special_case ($code, undef, $fields[$CASE_LOWER], $fields[$CASE_UPPER]);
403    } elsif ($type[$code] eq 'Ll')
404    {
405        (hex $fields[$CASE_LOWER] == $code) || die "$raw_code is Ll and UCD_Lower($raw_code) != $raw_code";
406       
407        &add_special_case ($code, $value[$code], $fields[$CASE_UPPER], $fields[$CASE_TITLE]);
408    } else {
409        printf STDERR "Special case for non-alphabetic code point: $raw_code\n";
410        next;
411    }
412}
413
414close INPUT;
415
416open (INPUT, "< $casefoldingtxt") || exit 1;
417
418my $casefoldlen = 0;
419my @casefold;
420 
421while (<INPUT>)
422{
423    my $code;
424   
425    chop;
426
427    next if /^#/;
428    next if /^\s*$/;
429
430    s/\s*#.*//;
431
432    @fields = split ('\s*;\s*', $_, 30);
433
434    $raw_code = $fields[$FOLDING_CODE];
435    $code = hex ($raw_code);
436
437    if ($#fields != 3)
438    {
439        printf STDERR ("Entry for $raw_code has wrong number of fields (%d)\n", $#fields);
440        next;
441    }
442
443    # we don't use Simple or Turkic rules here
444    next if ($fields[$FOLDING_STATUS] =~ /^[ST]$/);
445
446    @values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];
447
448    # Check simple case
449
450    if (@values == 1 &&
451        !(defined $value[$code] && $value[$code] >= 0x1000000) &&
452        defined $type[$code]) {
453
454        my $lower;
455        if ($type[$code] eq 'Ll')
456        {
457            $lower = $code;
458        } elsif ($type[$code] eq 'Lt')
459        {
460            $lower = $title_to_lower{$code};
461        } elsif ($type[$code] eq 'Lu')
462        {
463            $lower = $value[$code];
464        } else {
465            $lower = $code;
466        }
467       
468        if ($lower == $values[0]) {
469            next;
470        }
471    }
472
473    my $string = pack ("U*", @values);
474
475    if (1 + &length_in_bytes ($string) > $casefoldlen) {
476        $casefoldlen = 1 + &length_in_bytes ($string);
477    }
478
479    push @casefold, [ $code, &escape ($string) ];
480}
481
482close INPUT;
483
484open (INPUT, "< $bidimirroringtxt") || exit 1;
485
486my @bidimirror;
487while (<INPUT>)
488{
489    chomp;
490
491    next if /^#/;
492    next if /^\s*$/;
493
494    s/\s*#.*//;
495
496    @fields = split ('\s*;\s*', $_, 30);
497
498    push @bidimirror, [hex ($fields[0]), hex ($fields[1])];
499}
500 
501if ($do_props) {
502    &print_tables ($last_code)
503}
504if ($do_decomp) {
505    &print_decomp ($last_code);
506    &output_composition_table;
507}
508
509&print_line_break ($last_code);
510
511exit 0;
512
513
514# perl "length" returns the length in characters
515sub length_in_bytes
516{
517    my ($string) = @_;
518
519    use bytes;
520    return length $string;
521}
522
523# Process a single character.
524sub process_one
525{
526    my ($code, @fields) = @_;
527
528    $type[$code] = $fields[$CATEGORY];
529    if ($type[$code] eq 'Nd')
530    {
531        $value[$code] = int ($fields[$DECIMAL_VALUE]);
532    }
533    elsif ($type[$code] eq 'Ll')
534    {
535        $value[$code] = hex ($fields[$UPPER]);
536    }
537    elsif ($type[$code] eq 'Lu')
538    {
539        $value[$code] = hex ($fields[$LOWER]);
540    }
541
542    if ($type[$code] eq 'Lt')
543    {
544        $title_to_lower{$code} = hex ($fields[$LOWER]);
545        $title_to_upper{$code} = hex ($fields[$UPPER]);
546    }
547
548    $cclass[$code] = $fields[$COMBINING_CLASSES];
549
550    # Handle decompositions.
551    if ($fields[$DECOMPOSITION] ne '')
552    {
553        if ($fields[$DECOMPOSITION] =~ s/\<.*\>\s*//) {
554           $decompose_compat[$code] = 1;
555        } else {
556           $decompose_compat[$code] = 0;
557
558           if (!exists $composition_exclusions{$code}) {
559               $compositions{$code} = $fields[$DECOMPOSITION];
560           }
561        }
562        $decompositions[$code] = $fields[$DECOMPOSITION];
563    }
564}
565
566sub print_tables
567{
568    my ($last) = @_;
569    my ($outfile) = "gunichartables.h";
570
571    local ($bytes_out) = 0;
572
573    print "Writing $outfile...\n";
574
575    open (OUT, "> $outfile");
576
577    print OUT "/* This file is automatically generated.  DO NOT EDIT!\n";
578    print OUT "   Instead, edit gen-unicode-tables.pl and re-run.  */\n\n";
579
580    print OUT "#ifndef CHARTABLES_H\n";
581    print OUT "#define CHARTABLES_H\n\n";
582
583    print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";
584
585    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;
586
587    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n";
588
589    my $last_part1 = ($pages_before_e0000 * 256) - 1;
590    printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
591    printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1;
592
593    $table_index = 0;
594    printf OUT "static const char type_data[][256] = {\n";
595    for ($count = 0; $count <= $last; $count += 256)
596    {
597        $row[$count / 256] = &print_row ($count, 1, \&fetch_type);
598    }
599    printf OUT "\n};\n\n";
600
601    printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
602    print OUT "static const gint16 type_table_part1[$pages_before_e0000] = {\n";
603    for ($count = 0; $count <= $last_part1; $count += 256)
604    {
605        print OUT ",\n" if $count > 0;
606        print OUT "  ", $row[$count / 256];
607        $bytes_out += 2;
608    }
609    print OUT "\n};\n\n";
610
611    printf OUT "/* U+E0000 through U+%04X */\n", $last;
612    print OUT "static const gint16 type_table_part2[768] = {\n";
613    for ($count = 0xE0000; $count <= $last; $count += 256)
614    {
615        print OUT ",\n" if $count > 0xE0000;
616        print OUT "  ", $row[$count / 256];
617        $bytes_out += 2;
618    }
619    print OUT "\n};\n\n";
620
621
622    #
623    # Now print attribute table.
624    #
625
626    $table_index = 0;
627    printf OUT "static const gunichar attr_data[][256] = {\n";
628    for ($count = 0; $count <= $last; $count += 256)
629    {
630        $row[$count / 256] = &print_row ($count, 4, \&fetch_attr);
631    }
632    printf OUT "\n};\n\n";
633
634    printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
635    print OUT "static const gint16 attr_table_part1[$pages_before_e0000] = {\n";
636    for ($count = 0; $count <= $last_part1; $count += 256)
637    {
638        print OUT ",\n" if $count > 0;
639        print OUT "  ", $row[$count / 256];
640        $bytes_out += 2;
641    }
642    print OUT "\n};\n\n";
643
644    printf OUT "/* U+E0000 through U+%04X */\n", $last;
645    print OUT "static const gint16 attr_table_part2[768] = {\n";
646    for ($count = 0xE0000; $count <= $last; $count += 256)
647    {
648        print OUT ",\n" if $count > 0xE0000;
649        print OUT "  ", $row[$count / 256];
650        $bytes_out += 2;
651    }
652    print OUT "\n};\n\n";
653
654    #
655    # print title case table
656    #
657
658    print OUT "static const gunichar title_table[][3] = {\n";
659    my ($item);
660    my ($first) = 1;
661    foreach $item (sort keys %title_to_lower)
662    {
663        print OUT ",\n"
664            unless $first;
665        $first = 0;
666        printf OUT "  { 0x%04x, 0x%04x, 0x%04x }", $item, $title_to_upper{$item}, $title_to_lower{$item};
667        $bytes_out += 12;
668    }
669    print OUT "\n};\n\n";
670
671    #
672    # And special case conversion table -- conversions that change length
673    #
674    &output_special_case_table (\*OUT);
675    &output_casefold_table (\*OUT);
676
677    print OUT "static const struct {\n";
678    print OUT "    gunichar ch;\n";
679    print OUT "    gunichar mirrored_ch;\n";
680    print OUT "} bidi_mirroring_table[] =\n";
681    print OUT "{\n";
682    $first = 1;
683    foreach $item (@bidimirror)
684    {
685        print OUT ",\n" unless $first;
686        $first = 0;
687        printf OUT "  { 0x%04x, 0x%04x }", $item->[0], $item->[1];
688        $bytes_out += 8;
689    }
690    print OUT "\n};\n\n";
691
692    print OUT "#endif /* CHARTABLES_H */\n";
693
694    close (OUT);
695
696    printf STDERR "Generated %d bytes in tables\n", $bytes_out;
697}
698
699# A fetch function for the type table.
700sub fetch_type
701{
702    my ($index) = @_;
703    return $mappings{$type[$index]};
704}
705
706# A fetch function for the attribute table.
707sub fetch_attr
708{
709    my ($index) = @_;
710    if (defined $value[$index])
711      {
712        return sprintf ("0x%04x", $value[$index]);
713      }
714    else
715      {
716        return "0x0000";
717      }
718}
719
720sub print_row
721{
722    my ($start, $typsize, $fetcher) = @_;
723
724    my ($i);
725    my (@values);
726    my ($flag) = 1;
727    my ($off);
728
729    for ($off = 0; $off < 256; ++$off)
730    {
731        $values[$off] = $fetcher->($off + $start);
732        if ($values[$off] ne $values[0])
733        {
734            $flag = 0;
735        }
736    }
737    if ($flag)
738    {
739        return $values[0] . " + G_UNICODE_MAX_TABLE_INDEX";
740    }
741
742    printf OUT ",\n" if ($table_index != 0);
743    printf OUT "  { /* page %d, index %d */\n    ", $start / 256, $table_index;
744    my ($column) = 4;
745    for ($i = $start; $i < $start + 256; ++$i)
746    {
747        print OUT ", "
748            if $i > $start;
749        my ($text) = $values[$i - $start];
750        if (length ($text) + $column + 2 > 78)
751        {
752            print OUT "\n    ";
753            $column = 4;
754        }
755        print OUT $text;
756        $column += length ($text) + 2;
757    }
758    print OUT "\n  }";
759
760    $bytes_out += 256 * $typsize;
761
762    return sprintf "%d /* page %d */", $table_index++, $start / 256;
763}
764
765sub escape
766{
767    my ($string) = @_;
768
769    my $escaped = unpack("H*", $string);
770    $escaped =~ s/(.{2})/\\x$1/g;
771
772    return $escaped;
773}
774
775# Returns the offset of $decomp in the offset string. Updates the
776# referenced variables as appropriate.
777sub handle_decomp ($$$$)
778{
779    my ($decomp, $decomp_offsets_ref, $decomp_string_ref, $decomp_string_offset_ref) = @_;
780    my $offset = "G_UNICODE_NOT_PRESENT_OFFSET";
781
782    if (defined $decomp)
783    {
784        if (defined $decomp_offsets_ref->{$decomp})
785        {
786            $offset = $decomp_offsets_ref->{$decomp};
787        }
788        else
789        {
790            $offset = ${$decomp_string_offset_ref};
791            $decomp_offsets_ref->{$decomp} = $offset;
792            ${$decomp_string_ref} .= "\n  \"" . &escape ($decomp) . "\\0\" /* offset ${$decomp_string_offset_ref} */";
793            ${$decomp_string_offset_ref} += &length_in_bytes ($decomp) + 1;
794        }
795    }
796
797    return $offset;
798}
799
800# Generate the character decomposition header.
801sub print_decomp
802{
803    my ($last) = @_;
804    my ($outfile) = "gunidecomp.h";
805
806    local ($bytes_out) = 0;
807
808    print "Writing $outfile...\n";
809
810    open (OUT, "> $outfile") || exit 1;
811
812    print OUT "/* This file is automatically generated.  DO NOT EDIT! */\n\n";
813    print OUT "#ifndef DECOMP_H\n";
814    print OUT "#define DECOMP_H\n\n";
815
816    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;
817
818    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX (0x110000 / 256)\n\n";
819
820    my $last_part1 = ($pages_before_e0000 * 256) - 1;
821    printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
822    printf OUT "#define G_UNICODE_LAST_PAGE_PART1 %d\n\n", $pages_before_e0000 - 1;
823
824    $NOT_PRESENT_OFFSET = 65535;
825    print OUT "#define G_UNICODE_NOT_PRESENT_OFFSET $NOT_PRESENT_OFFSET\n\n";
826
827    my ($count, @row);
828    $table_index = 0;
829    printf OUT "static const guchar cclass_data[][256] = {\n";
830    for ($count = 0; $count <= $last; $count += 256)
831    {
832        $row[$count / 256] = &print_row ($count, 1, \&fetch_cclass);
833    }
834    printf OUT "\n};\n\n";
835
836    print OUT "static const gint16 combining_class_table_part1[$pages_before_e0000] = {\n";
837    for ($count = 0; $count <= $last_part1; $count += 256)
838    {
839        print OUT ",\n" if $count > 0;
840        print OUT "  ", $row[$count / 256];
841        $bytes_out += 2;
842    }
843    print OUT "\n};\n\n";
844
845    print OUT "static const gint16 combining_class_table_part2[768] = {\n";
846    for ($count = 0xE0000; $count <= $last; $count += 256)
847    {
848        print OUT ",\n" if $count > 0xE0000;
849        print OUT "  ", $row[$count / 256];
850        $bytes_out += 2;
851    }
852    print OUT "\n};\n\n";
853
854    print OUT "typedef struct\n{\n";
855    print OUT "  gunichar ch;\n";
856    print OUT "  guint16 canon_offset;\n";
857    print OUT "  guint16 compat_offset;\n";
858    print OUT "} decomposition;\n\n";
859
860    print OUT "static const decomposition decomp_table[] =\n{\n";
861    my ($iter);
862    my ($first) = 1;
863    my ($decomp_string) = "";
864    my ($decomp_string_offset) = 0;
865    for ($count = 0; $count <= $last; ++$count)
866    {
867        if (defined $decompositions[$count])
868        {
869            print OUT ",\n"
870                if ! $first;
871            $first = 0;
872
873            my $canon_decomp;
874            my $compat_decomp;
875
876            if (!$decompose_compat[$count]) {
877                $canon_decomp = make_decomp ($count, 0);
878            }
879            $compat_decomp = make_decomp ($count, 1);
880
881            if (defined $canon_decomp && $compat_decomp eq $canon_decomp) {
882                undef $compat_decomp;
883            }
884
885            my $canon_offset = handle_decomp ($canon_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset);
886            my $compat_offset = handle_decomp ($compat_decomp, \%decomp_offsets, \$decomp_string, \$decomp_string_offset);
887
888            die if $decomp_string_offset > $NOT_PRESENT_OFFSET;
889
890            printf OUT qq(  { 0x%04x, $canon_offset, $compat_offset }), $count;
891            $bytes_out += 8;
892        }
893    }
894    print OUT "\n};\n\n";
895    $bytes_out += $decomp_string_offset + 1;
896
897    printf OUT "static const gchar decomp_expansion_string[] = %s;\n\n", $decomp_string;
898
899    print OUT "#endif /* DECOMP_H */\n";
900
901    printf STDERR "Generated %d bytes in decomp tables\n", $bytes_out;
902}
903
904sub print_line_break
905{
906    my ($last) = @_;
907    my ($outfile) = "gunibreak.h";
908
909    local ($bytes_out) = 0;
910
911    print "Writing $outfile...\n";
912
913    open (OUT, "> $outfile");
914
915    print OUT "/* This file is automatically generated.  DO NOT EDIT!\n";
916    print OUT "   Instead, edit gen-unicode-tables.pl and re-run.  */\n\n";
917
918    print OUT "#ifndef BREAKTABLES_H\n";
919    print OUT "#define BREAKTABLES_H\n\n";
920
921    print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";
922
923    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04X\n\n", $last;
924
925    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 10000\n\n";
926
927    my $last_part1 = ($pages_before_e0000 * 256) - 1;
928    printf OUT "/* the last code point that should be looked up in break_property_table_part1 */\n";
929    printf OUT "#define G_UNICODE_LAST_CHAR_PART1 0x%04X\n\n", $last_part1;
930
931    $table_index = 0;
932    printf OUT "static const gint8 break_property_data[][256] = {\n";
933    for ($count = 0; $count <= $last; $count += 256)
934    {
935        $row[$count / 256] = &print_row ($count, 1, \&fetch_break_type);
936    }
937    printf OUT "\n};\n\n";
938
939    printf OUT "/* U+0000 through U+%04X */\n", $last_part1;
940    print OUT "static const gint16 break_property_table_part1[$pages_before_e0000] = {\n";
941    for ($count = 0; $count <= $last_part1; $count += 256)
942    {
943        print OUT ",\n" if $count > 0;
944        print OUT "  ", $row[$count / 256];
945        $bytes_out += 2;
946    }
947    print OUT "\n};\n\n";
948
949    printf OUT "/* U+E0000 through U+%04X */\n", $last;
950    print OUT "static const gint16 break_property_table_part2[768] = {\n";
951    for ($count = 0xE0000; $count <= $last; $count += 256)
952    {
953        print OUT ",\n" if $count > 0xE0000;
954        print OUT "  ", $row[$count / 256];
955        $bytes_out += 2;
956    }
957    print OUT "\n};\n\n";
958
959
960    print OUT "#endif /* BREAKTABLES_H */\n";
961
962    close (OUT);
963
964    printf STDERR "Generated %d bytes in break tables\n", $bytes_out;
965}
966
967
968# A fetch function for the break properties table.
969sub fetch_break_type
970{
971    my ($index) = @_;
972    return $break_mappings{$break_props[$index]};
973}
974
975# Fetcher for combining class.
976sub fetch_cclass
977{
978    my ($i) = @_;
979    return $cclass[$i];
980}
981
982# Expand a character decomposition recursively.
983sub expand_decomp
984{
985    my ($code, $compat) = @_;
986
987    my ($iter, $val);
988    my (@result) = ();
989    foreach $iter (split (' ', $decompositions[$code]))
990    {
991        $val = hex ($iter);
992        if (defined $decompositions[$val] &&
993            ($compat || !$decompose_compat[$val]))
994        {
995            push (@result, &expand_decomp ($val, $compat));
996        }
997        else
998        {
999            push (@result, $val);
1000        }
1001    }
1002
1003    return @result;
1004}
1005
1006sub make_decomp
1007{
1008    my ($code, $compat) = @_;
1009
1010    my $result = "";
1011    foreach $iter (&expand_decomp ($code, $compat))
1012    {
1013        $result .= pack ("U", $iter);  # to utf-8
1014    }
1015
1016    $result;
1017}
1018# Generate special case data string from two fields
1019sub add_special_case
1020{
1021    my ($code, $single, $field1, $field2) = @_;
1022
1023    @values = (defined $single ? $single : (),
1024               (map { hex ($_) } split /\s+/, $field1),
1025               0,
1026               (map { hex ($_) } split /\s+/, $field2));
1027    $result = "";
1028
1029
1030    for $value (@values) {
1031        $result .= pack ("U", $value);  # to utf-8
1032    }
1033   
1034    push @special_case_offsets, $special_case_offset;
1035
1036    # We encode special cases up in the 0x1000000 space
1037    $value[$code] = 0x1000000 + $special_case_offset;
1038
1039    $special_case_offset += 1 + &length_in_bytes ($result);
1040
1041    push @special_cases, &escape ($result);
1042}
1043
1044sub output_special_case_table
1045{
1046    my $out = shift;
1047
1048    print $out <<EOT;
1049
1050/* Table of special cases for case conversion; each record contains
1051 * First, the best single character mapping to lowercase if Lu,
1052 * and to uppercase if Ll, followed by the output mapping for the two cases
1053 * other than the case of the codepoint, in the order [Ll],[Lu],[Lt],
1054 * encoded in UTF-8, separated and terminated by a null character.
1055 */
1056static const gchar special_case_table[] = {
1057EOT
1058
1059    my $i = 0;
1060    for $case (@special_cases) {
1061        print $out qq( "$case\\0" /* offset ${special_case_offsets[$i]} */\n);
1062        $i++;
1063    }
1064
1065    print $out <<EOT;
1066};
1067
1068EOT
1069
1070    print STDERR "Generated " . ($special_case_offset + 1) . " bytes in special case table\n";
1071}
1072
1073sub enumerate_ordered
1074{
1075    my ($array) = @_;
1076
1077    my $n = 0;
1078    for my $code (sort { $a <=> $b } keys %$array) {
1079        if ($array->{$code} == 1) {
1080            delete $array->{$code};
1081            next;
1082        }
1083        $array->{$code} = $n++;
1084    }
1085
1086    return $n;
1087}
1088
1089sub output_composition_table
1090{
1091    print STDERR "Generating composition table\n";
1092   
1093    local ($bytes_out) = 0;
1094
1095    my %first;
1096    my %second;
1097
1098    # First we need to go through and remove decompositions
1099    # starting with a non-starter, and single-character
1100    # decompositions. At the same time, record
1101    # the first and second character of each decomposition
1102   
1103    for $code (keys %compositions)
1104    {
1105        @values = map { hex ($_) } split /\s+/, $compositions{$code};
1106
1107        # non-starters
1108        if ($cclass[$values[0]]) {
1109            delete $compositions{$code};
1110            next;
1111        }
1112
1113        # single-character decompositions
1114        if (@values == 1) {
1115            delete $compositions{$code};
1116            next;
1117        }
1118
1119        if (@values != 2) {
1120            die "$code has more than two elements in its decomposition!\n";
1121        }
1122
1123        if (exists $first{$values[0]}) {
1124            $first{$values[0]}++;
1125        } else {
1126            $first{$values[0]} = 1;
1127        }
1128    }
1129
1130    # Assign integer indices, removing singletons
1131    my $n_first = enumerate_ordered (\%first);
1132
1133    # Now record the second character of each (non-singleton) decomposition
1134    for $code (keys %compositions) {
1135        @values = map { hex ($_) } split /\s+/, $compositions{$code};
1136
1137        if (exists $first{$values[0]}) {
1138            if (exists $second{$values[1]}) {
1139                $second{$values[1]}++;
1140            } else {
1141                $second{$values[1]} = 1;
1142            }
1143        }
1144    }
1145
1146    # Assign integer indices, removing duplicate
1147    my $n_second = enumerate_ordered (\%second);
1148
1149    # Build reverse table
1150
1151    my @first_singletons;
1152    my @second_singletons;
1153    my %reverse;
1154    for $code (keys %compositions) {
1155        @values = map { hex ($_) } split /\s+/, $compositions{$code};
1156
1157        my $first = $first{$values[0]};
1158        my $second = $second{$values[1]};
1159
1160        if (defined $first && defined $second) {
1161            $reverse{"$first|$second"} = $code;
1162        } elsif (!defined $first) {
1163            push @first_singletons, [ $values[0], $values[1], $code ];
1164        } else {
1165            push @second_singletons, [ $values[1], $values[0], $code ];
1166        }
1167    }
1168
1169    @first_singletons = sort { $a->[0] <=> $b->[0] } @first_singletons;
1170    @second_singletons = sort { $a->[0] <=> $b->[0] } @second_singletons;
1171
1172    my %vals;
1173   
1174    open OUT, ">gunicomp.h" or die "Cannot open gunicomp.h: $!\n";
1175   
1176    # Assign values in lookup table for all code points involved
1177   
1178    my $total = 1;
1179    my $last = 0;
1180    printf OUT "#define COMPOSE_FIRST_START %d\n", $total;
1181    for $code (keys %first) {
1182        $vals{$code} = $first{$code} + $total;
1183        $last = $code if $code > $last;
1184    }
1185    $total += $n_first;
1186    $i = 0;
1187    printf OUT "#define COMPOSE_FIRST_SINGLE_START %d\n", $total;
1188    for $record (@first_singletons) {
1189        my $code = $record->[0];
1190        $vals{$code} = $i++ + $total;
1191        $last = $code if $code > $last;
1192    }
1193    $total += @first_singletons;
1194    printf OUT "#define COMPOSE_SECOND_START %d\n", $total;
1195    for $code (keys %second) {
1196        $vals{$code} = $second{$code} + $total;
1197        $last = $code if $code > $last;
1198    }
1199    $total += $n_second;
1200    $i = 0;
1201    printf OUT "#define COMPOSE_SECOND_SINGLE_START %d\n\n", $total;
1202    for $record (@second_singletons) {
1203        my $code = $record->[0];
1204        $vals{$code} = $i++ + $total;
1205        $last = $code if $code > $last;
1206    }
1207
1208    printf OUT "#define COMPOSE_TABLE_LAST %d\n\n", $last / 256;
1209
1210    # Output lookup table
1211
1212    my @row;                                             
1213    $table_index = 0;
1214    printf OUT "static const guint16 compose_data[][256] = {\n";
1215    for (my $count = 0; $count <= $last; $count += 256)
1216    {
1217        $row[$count / 256] = &print_row ($count, 2, sub { exists $vals{$_[0]} ? $vals{$_[0]} : 0; });
1218    }
1219    printf OUT "\n};\n\n";
1220
1221    print OUT "static const gint16 compose_table[COMPOSE_TABLE_LAST + 1] = {\n";
1222    for (my $count = 0; $count <= $last; $count += 256)
1223    {
1224        print OUT ",\n" if $count > 0;
1225        print OUT "  ", $row[$count / 256];
1226        $bytes_out += 2;
1227    }
1228    print OUT "\n};\n\n";
1229
1230    # Output first singletons
1231
1232    print OUT "static const guint16 compose_first_single[][2] = {\n";
1233    $i = 0;                                 
1234    for $record (@first_singletons) {
1235        if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) {
1236            die "time to switch compose_first_single to gunichar" ;
1237        }
1238        print OUT ",\n" if $i++ > 0;
1239        printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
1240    }
1241    print OUT "\n};\n";
1242                                     
1243    $bytes_out += @first_singletons * 4;
1244                 
1245    # Output second singletons
1246
1247    print OUT "static const guint16 compose_second_single[][2] = {\n";
1248    $i = 0;                                 
1249    for $record (@second_singletons) {
1250        if ($record->[1] > 0xFFFF or $record->[2] > 0xFFFF) {
1251            die "time to switch compose_second_single to gunichar";
1252        }
1253        print OUT ",\n" if $i++ > 0;
1254        printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
1255    }
1256    print OUT "\n};\n";
1257                                     
1258    $bytes_out += @second_singletons * 4;                                   
1259                 
1260    # Output array of composition pairs
1261
1262    print OUT <<EOT;
1263static const guint16 compose_array[$n_first][$n_second] = {
1264EOT
1265                       
1266    for (my $i = 0; $i < $n_first; $i++) {
1267        print OUT ",\n" if $i;
1268        print OUT " { ";
1269        for (my $j = 0; $j < $n_second; $j++) {
1270            print OUT ", " if $j;
1271            if (exists $reverse{"$i|$j"}) {
1272                if ($reverse{"$i|$j"} > 0xFFFF) {
1273                    die "time to switch compose_array to gunichar" ;
1274                }
1275                printf OUT "0x%04x", $reverse{"$i|$j"};
1276            } else {
1277                print OUT "     0";
1278            }
1279        }
1280        print OUT " }";
1281    }
1282    print OUT "\n";
1283
1284    print OUT <<EOT;
1285};
1286EOT
1287
1288    $bytes_out += $n_first * $n_second * 2;
1289   
1290    printf STDERR "Generated %d bytes in compose tables\n", $bytes_out;
1291}
1292
1293sub output_casefold_table
1294{
1295    my $out = shift;
1296
1297    print $out <<EOT;
1298
1299/* Table of casefolding cases that can't be derived by lowercasing
1300 */
1301static const struct {
1302  guint16 ch;
1303  gchar data[$casefoldlen];
1304} casefold_table[] = {
1305EOT
1306
1307   @casefold = sort { $a->[0] <=> $b->[0] } @casefold;
1308   
1309   for $case (@casefold)
1310   {
1311       $code = $case->[0];
1312       $string = $case->[1];
1313
1314       if ($code > 0xFFFF) {
1315           die "time to switch casefold_table to gunichar" ;
1316       }
1317
1318       print $out sprintf(qq(  { 0x%04x, "$string" },\n), $code);
1319   
1320   }
1321
1322    print $out <<EOT;
1323};
1324
1325EOT
1326
1327   my $recordlen = (2+$casefoldlen+1) & ~1;
1328   printf "Generated %d bytes for casefold table\n", $recordlen * @casefold;
1329}
1330
1331                             
1332
Note: See TracBrowser for help on using the repository browser.