source: trunk/third/glib2/glib/gen-unicode-tables.pl @ 18159

Revision 18159, 27.9 KB checked in by ghudson, 22 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r18158, which included commits to RCS files with non-trunk default branches.
  • Property svn:executable set to *
Line 
1#! /usr/bin/perl -w
2
3#    Copyright (C) 1998, 1999 Tom Tromey
4#    Copyright (C) 2001 Red Hat Software
5
6#    This program is free software; you can redistribute it and/or modify
7#    it under the terms of the GNU General Public License as published by
8#    the Free Software Foundation; either version 2, or (at your option)
9#    any later version.
10
11#    This program is distributed in the hope that it will be useful,
12#    but WITHOUT ANY WARRANTY; without even the implied warranty of
13#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14#    GNU General Public License for more details.
15
16#    You should have received a copy of the GNU General Public License
17#    along with this program; if not, write to the Free Software
18#    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
19#    02111-1307, USA.
20
21# Contributor(s):
22#   Andrew Taylor <andrew.taylor@montage.ca>
23
24# gen-unicode-tables.pl - Generate tables for libunicode from Unicode data.
25# See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
26# Usage: gen-unicode-tables.pl [-decomp | -both] UNICODE-VERSION UnicodeData.txt LineBreak.txt SpecialCasing.txt CaseFolding.txt CompositionExclusions.txt
27# I consider the output of this program to be unrestricted.  Use it as
28# you will.
29
30# FIXME:
31# * For decomp table it might make sense to use a shift count other
32#   than 8.  We could easily compute the perfect shift count.
33
# Column indices into the semicolon-separated input files are kept in
# package globals.  Every index assigned below is declared here so the
# declaration list and the assignments stay in agreement.
use vars qw($CODE $NAME $CATEGORY $COMBINING_CLASSES $BIDI_CATEGORY
            $DECOMPOSITION $DECIMAL_VALUE $DIGIT_VALUE $NUMERIC_VALUE
            $MIRRORED $OLD_NAME $COMMENT $UPPER $LOWER $TITLE
            $BREAK_CODE $BREAK_CATEGORY $BREAK_NAME $BREAK_PROPERTY
            $CASE_CODE $CASE_LOWER $CASE_TITLE $CASE_UPPER $CASE_CONDITION
            $FOLDING_CODE $FOLDING_STATUS $FOLDING_MAPPING);

# Names of fields in Unicode data table.
$CODE = 0;
$NAME = 1;
$CATEGORY = 2;
$COMBINING_CLASSES = 3;
$BIDI_CATEGORY = 4;
$DECOMPOSITION = 5;
$DECIMAL_VALUE = 6;
$DIGIT_VALUE = 7;
$NUMERIC_VALUE = 8;
$MIRRORED = 9;
$OLD_NAME = 10;
$COMMENT = 11;
$UPPER = 12;
$LOWER = 13;
$TITLE = 14;

# Names of fields in the line break table
$BREAK_CODE = 0;
$BREAK_PROPERTY = 1;

# Names of fields in the SpecialCasing table
$CASE_CODE = 0;
$CASE_LOWER = 1;
$CASE_TITLE = 2;
$CASE_UPPER = 3;
$CASE_CONDITION = 4;

# Names of fields in the CaseFolding table
$FOLDING_CODE = 0;
$FOLDING_STATUS = 1;
$FOLDING_MAPPING = 2;
68
# Translate a two-letter general category code into the symbolic
# G_UNICODE_* name that is written into the generated type table.
%mappings = (
    # Letters.
    'Lu' => 'G_UNICODE_UPPERCASE_LETTER',
    'Ll' => 'G_UNICODE_LOWERCASE_LETTER',
    'Lt' => 'G_UNICODE_TITLECASE_LETTER',
    'Lm' => 'G_UNICODE_MODIFIER_LETTER',
    'Lo' => 'G_UNICODE_OTHER_LETTER',

    # Marks.
    'Mn' => 'G_UNICODE_NON_SPACING_MARK',
    'Mc' => 'G_UNICODE_COMBINING_MARK',
    'Me' => 'G_UNICODE_ENCLOSING_MARK',

    # Numbers.
    'Nd' => 'G_UNICODE_DECIMAL_NUMBER',
    'Nl' => 'G_UNICODE_LETTER_NUMBER',
    'No' => 'G_UNICODE_OTHER_NUMBER',

    # Separators.
    'Zs' => 'G_UNICODE_SPACE_SEPARATOR',
    'Zl' => 'G_UNICODE_LINE_SEPARATOR',
    'Zp' => 'G_UNICODE_PARAGRAPH_SEPARATOR',

    # Other.
    'Cc' => 'G_UNICODE_CONTROL',
    'Cf' => 'G_UNICODE_FORMAT',
    'Cs' => 'G_UNICODE_SURROGATE',
    'Co' => 'G_UNICODE_PRIVATE_USE',
    'Cn' => 'G_UNICODE_UNASSIGNED',

    # Punctuation.
    'Pc' => 'G_UNICODE_CONNECT_PUNCTUATION',
    'Pd' => 'G_UNICODE_DASH_PUNCTUATION',
    'Ps' => 'G_UNICODE_OPEN_PUNCTUATION',
    'Pe' => 'G_UNICODE_CLOSE_PUNCTUATION',
    'Pi' => 'G_UNICODE_INITIAL_PUNCTUATION',
    'Pf' => 'G_UNICODE_FINAL_PUNCTUATION',
    'Po' => 'G_UNICODE_OTHER_PUNCTUATION',

    # Symbols.
    'Sm' => 'G_UNICODE_MATH_SYMBOL',
    'Sc' => 'G_UNICODE_CURRENCY_SYMBOL',
    'Sk' => 'G_UNICODE_MODIFIER_SYMBOL',
    'So' => 'G_UNICODE_OTHER_SYMBOL',
);
106
# Translate a two-letter line break class from LineBreak.txt into the
# symbolic G_UNICODE_BREAK_* name written into the generated break table.
%break_mappings = (
    'BK' => 'G_UNICODE_BREAK_MANDATORY',
    'CR' => 'G_UNICODE_BREAK_CARRIAGE_RETURN',
    'LF' => 'G_UNICODE_BREAK_LINE_FEED',
    'CM' => 'G_UNICODE_BREAK_COMBINING_MARK',
    'SG' => 'G_UNICODE_BREAK_SURROGATE',
    'ZW' => 'G_UNICODE_BREAK_ZERO_WIDTH_SPACE',
    'IN' => 'G_UNICODE_BREAK_INSEPARABLE',
    'GL' => 'G_UNICODE_BREAK_NON_BREAKING_GLUE',
    'CB' => 'G_UNICODE_BREAK_CONTINGENT',
    'SP' => 'G_UNICODE_BREAK_SPACE',

    'BA' => 'G_UNICODE_BREAK_AFTER',
    'BB' => 'G_UNICODE_BREAK_BEFORE',
    'B2' => 'G_UNICODE_BREAK_BEFORE_AND_AFTER',
    'HY' => 'G_UNICODE_BREAK_HYPHEN',
    'NS' => 'G_UNICODE_BREAK_NON_STARTER',

    'OP' => 'G_UNICODE_BREAK_OPEN_PUNCTUATION',
    'CL' => 'G_UNICODE_BREAK_CLOSE_PUNCTUATION',
    'QU' => 'G_UNICODE_BREAK_QUOTATION',
    'EX' => 'G_UNICODE_BREAK_EXCLAMATION',

    'ID' => 'G_UNICODE_BREAK_IDEOGRAPHIC',
    'NU' => 'G_UNICODE_BREAK_NUMERIC',
    'IS' => 'G_UNICODE_BREAK_INFIX_SEPARATOR',
    'SY' => 'G_UNICODE_BREAK_SYMBOL',
    'AL' => 'G_UNICODE_BREAK_ALPHABETIC',
    'PR' => 'G_UNICODE_BREAK_PREFIX',
    'PO' => 'G_UNICODE_BREAK_POSTFIX',

    'SA' => 'G_UNICODE_BREAK_COMPLEX_CONTEXT',
    'AI' => 'G_UNICODE_BREAK_AMBIGUOUS',
    'XX' => 'G_UNICODE_BREAK_UNKNOWN',
);
139
# Lower/upper case mappings of title case characters, keyed by code point.
%title_to_lower = ();
%title_to_upper = ();

# Length of the longest special-case record, and the records themselves.
my $special_case_len = 0;
my @special_cases;

# Decide which tables to produce.  The property tables are the default;
# -decomp replaces them with the decomposition tables, -both adds the
# decomposition tables on top.
($do_decomp, $do_props) = (0, 1);
if (@ARGV)
{
    if ($ARGV[0] eq '-decomp')
    {
        ($do_decomp, $do_props) = (1, 0);
        shift @ARGV;
    }
    elsif ($ARGV[0] eq '-both')
    {
        $do_decomp = 1;
        shift @ARGV;
    }
}

# Exactly six positional arguments must remain after the optional flag.
unless (@ARGV == 6)
{
    $0 =~ s{.*/}{};
    die "Usage: $0 [-decomp | -both] UNICODE-VERSION UnicodeData.txt LineBreak.txt SpecialCasing.txt CaseFolding.txt CompositionExclusions.txt\n";
}

if ($do_decomp)
{
    print "Creating decomp table\n";
}
if ($do_props)
{
    print "Creating property table\n";
}
170
# Read CompositionExclusions.txt: one hexadecimal code point per line,
# with '#' starting a comment.  Every listed code point is recorded as a
# key of %composition_exclusions.
print "Composition exlusions from $ARGV[5]\n";

# Three-argument open, so an unusual file name cannot be taken for a mode,
# and a diagnostic instead of a silent exit when the file is unreadable.
open (my $exclusions_fh, '<', $ARGV[5])
    or do { print STDERR "Cannot open $ARGV[5]: $!\n"; exit 1; };

while (<$exclusions_fh>) {

    # chomp rather than chop: chop would eat a hex digit when the last
    # line of the file has no trailing newline.
    chomp;

    next if /^#/;
    next if /^\s*$/;

    s/\s*#.*//;

    s/^\s*//;
    s/\s*$//;

    # An indented comment line is empty by now; without this guard it
    # would register code point 0 as excluded.
    next if $_ eq '';

    $composition_exclusions{hex($_)} = 1;
}

close ($exclusions_fh);
191
# Read UnicodeData.txt and feed every code point of the basic plane to
# process_one, in ascending order.  Code points missing from the file are
# synthesised: either as copies of a "<..., Last>" range entry or as
# unassigned (Cn) characters.
print "Unicode data from $ARGV[1]\n";

open (my $unicode_data_fh, '<', $ARGV[1])
    or do { print STDERR "Cannot open $ARGV[1]: $!\n"; exit 1; };

$last_code = -1;
while (<$unicode_data_fh>)
{
    # chomp rather than chop, so an unterminated final line keeps its
    # last character.
    chomp;
    @fields = split (';', $_, 30);
    if ($#fields != 14)
    {
        printf STDERR ("Entry for %s has wrong number of fields (%d)\n",
                       $fields[$CODE], $#fields);
    }

    $code = hex ($fields[$CODE]);

    last if ($code > 0xFFFF); # ignore characters out of the basic plane

    if ($code > $last_code + 1)
    {
        # Found a gap.
        if ($fields[$NAME] =~ /Last>/)
        {
            # Fill the gap with the last character read,
            # since this was a range specified in the char database
            @gfields = @fields;
        }
        else
        {
            # The gap represents undefined characters.  Only the type
            # matters.
            @gfields = ('', '', 'Cn', '0', ('') x 11);
        }
        for (++$last_code; $last_code < $code; ++$last_code)
        {
            # Store the code in the record itself.  This used to be
            # written to a hash named %gfields, so the record that was
            # passed on never carried it.
            $gfields[$CODE] = sprintf ("%04x", $last_code);
            &process_one ($last_code, @gfields);
        }
    }
    &process_one ($code, @fields);
    $last_code = $code;
}

close ($unicode_data_fh);

# Everything after the last entry of the file, up to U+FFFF, is unassigned.
@gfields = ('', '', 'Cn', '0', ('') x 11);
for (++$last_code; $last_code < 0x10000; ++$last_code)
{
    $gfields[$CODE] = sprintf ("%04x", $last_code);
    &process_one ($last_code, @gfields);
}
--$last_code;                   # Want last to be 0xFFFF.
246
# Read LineBreak.txt and fill @break_props with the line break class of
# every code point of the basic plane.  Code points the file does not
# mention get XX when their general category is Cn and AL otherwise.
print "Creating line break table\n";

print "Line break data from $ARGV[2]\n";

open (my $line_break_fh, '<', $ARGV[2])
    or do { print STDERR "Cannot open $ARGV[2]: $!\n"; exit 1; };

$last_code = -1;
while (<$line_break_fh>)
{
    my ($start_code, $end_code);

    chomp;

    # Drop comments first, then skip whatever is left empty; this covers
    # comment-only lines, indented comments and blank lines alike.
    s/\s*#.*//;
    next if /^\s*$/;

    @fields = split (';', $_, 30);
    foreach my $field (@fields)
    {
        # Tolerate padding around the separator.
        $field =~ s/^\s+//;
        $field =~ s/\s+$//;
    }

    if ($#fields != 1)
    {
        printf STDERR ("Entry for %s has wrong number of fields (%d)\n",
                       $fields[$BREAK_CODE], $#fields);
        next;
    }

    # A range is written START..END.  The pattern is anchored and accepts
    # four to six hex digits: an unanchored four-digit pattern would read
    # "1D167..1D169" as D167..1D16 and corrupt the table.
    if ($fields[$BREAK_CODE] =~ /^([A-Fa-f0-9]{4,6})\.\.([A-Fa-f0-9]{4,6})$/)
    {
        $start_code = hex ($1);
        $end_code = hex ($2);
    } else {
        $start_code = $end_code = hex ($fields[$BREAK_CODE]);
    }

    last if ($start_code > 0xFFFF); # FIXME ignore characters out of the basic plane

    # A range that starts inside the basic plane must not run past it.
    $end_code = 0xFFFF if ($end_code > 0xFFFF);

    if ($start_code > $last_code + 1)
    {
        # The gap represents undefined characters. If assigned,
        # they are AL, if not assigned, XX
        for (++$last_code; $last_code < $start_code; ++$last_code)
        {
            $break_props[$last_code] = ($type[$last_code] eq 'Cn') ? 'XX' : 'AL';
        }
    }

    for my $covered ($start_code .. $end_code)
    {
        $break_props[$covered] = $fields[$BREAK_PROPERTY];
    }

    $last_code = $end_code;
}

close ($line_break_fh);

# Code points after the last entry of the file, up to U+FFFF.
for (++$last_code; $last_code < 0x10000; ++$last_code)
{
    $break_props[$last_code] = ($type[$last_code] eq 'Cn') ? 'XX' : 'AL';
}
--$last_code;                   # Want last to be 0xFFFF.

print STDERR "Last code is not 0xFFFF\n" if ($last_code != 0xFFFF);
323
# Read SpecialCasing.txt and register every unconditional special case
# through add_special_case.  Entries that carry a condition (a sixth
# field) are skipped here.
print "Reading special-casing table for case conversion\n";

open (my $special_casing_fh, '<', $ARGV[3])
    or do { print STDERR "Cannot open $ARGV[3]: $!\n"; exit 1; };

while (<$special_casing_fh>)
{
    # chomp rather than chop, so an unterminated final line keeps its
    # last character.
    chomp;

    next if /^#/;
    next if /^\s*$/;

    s/\s*#.*//;

    @fields = split ('\s*;\s*', $_, 30);

    my $raw_code = $fields[$CASE_CODE];
    my $code = hex ($raw_code);

    if ($#fields != 4 && $#fields != 5)
    {
        # Data taken from the file is passed as an argument, never as
        # part of the format string.
        printf STDERR ("Entry for %s has wrong number of fields (%d)\n",
                       $raw_code, $#fields);
        next;
    }

    if (!defined $type[$code])
    {
        print STDERR "Special case for code point: $code, which has no defined type\n";
        next;
    }

    if (defined $fields[5]) {
        # Ignore conditional special cases - we'll handle them in code
        next;
    }

    if ($type[$code] eq 'Lu')
    {
        (hex ($fields[$CASE_UPPER]) == $code)
            || die "$raw_code is Lu and UCD_Upper($raw_code) != $raw_code";

        &add_special_case ($code, $value[$code], $fields[$CASE_LOWER], $fields[$CASE_TITLE]);
    }
    elsif ($type[$code] eq 'Lt')
    {
        (hex ($fields[$CASE_TITLE]) == $code)
            || die "$raw_code is Lt and UCD_Title($raw_code) != $raw_code";

        &add_special_case ($code, undef, $fields[$CASE_LOWER], $fields[$CASE_UPPER]);
    }
    elsif ($type[$code] eq 'Ll')
    {
        (hex ($fields[$CASE_LOWER]) == $code)
            || die "$raw_code is Ll and UCD_Lower($raw_code) != $raw_code";

        &add_special_case ($code, $value[$code], $fields[$CASE_UPPER], $fields[$CASE_TITLE]);
    }
    else
    {
        print STDERR "Special case for non-alphabetic code point: $raw_code\n";
        next;
    }
}

close ($special_casing_fh);
384
# Read CaseFolding.txt and collect in @casefold every folding that cannot
# be obtained by simply lowercasing the character.  Each element is a
# pair [ code point, C string literal body holding the UTF-8 folding ].
open (my $case_folding_fh, '<', $ARGV[4])
    or do { print STDERR "Cannot open $ARGV[4]: $!\n"; exit 1; };

# Size in bytes, including the terminating NUL, of the longest folding.
my $casefoldlen = 0;
my @casefold;

while (<$case_folding_fh>)
{
    my $code;

    # chomp rather than chop, so an unterminated final line keeps its
    # last character.
    chomp;

    next if /^#/;
    next if /^\s*$/;

    s/\s*#.*//;

    @fields = split ('\s*;\s*', $_, 30);

    my $raw_code = $fields[$FOLDING_CODE];
    $code = hex ($raw_code);

    next if $code > 0xffff;     # FIXME!

    if ($#fields != 3)
    {
        printf STDERR ("Entry for %s has wrong number of fields (%d)\n",
                       $raw_code, $#fields);
        next;
    }

    next if ($fields[$FOLDING_STATUS] eq 'S');

    @values = map { hex ($_) } split /\s+/, $fields[$FOLDING_MAPPING];

    # Check simple case: a one-character folding equal to the lowercase
    # mapping already stored in the attribute table needs no entry.
    # Attribute values in 0xd800..0xdbff are special-case indices, not
    # characters, so those code points always get an entry.

    if (@values == 1 &&
        !(defined $value[$code] && $value[$code] >= 0xd800 && $value[$code] < 0xdc00) &&
        defined $type[$code]) {

        my $lower;
        if ($type[$code] eq 'Ll')
        {
            $lower = $code;
        } elsif ($type[$code] eq 'Lt')
        {
            $lower = $title_to_lower{$code};
        } elsif ($type[$code] eq 'Lu')
        {
            $lower = $value[$code];
        } else {
            $lower = $code;
        }

        if ($lower == $values[0]) {
            next;
        }
    }

    # pack "U*" yields a character string on current perls.  Turn it into
    # UTF-8 bytes explicitly so that both the length and the escaping
    # below work on bytes.  The guard keeps perls without utf8::encode
    # running as before.
    my $string = pack ("U*", @values);
    utf8::encode ($string) if defined &utf8::encode;

    # Measure before escaping: the escaped text is longer than the bytes
    # the C compiler will store.
    my $needed = 1 + length $string;
    if ($needed > $casefoldlen) {
        $casefoldlen = $needed;
    }

    $string =~ s/([\x80-\xff])/sprintf "\\x%02x",ord($1)/eg;

    push @casefold, [ $code, $string ];
}

close ($case_folding_fh);
454
# Write the headers that were asked for.  The line break header is
# written on every run.
print_tables ($last_code) if $do_props;

if ($do_decomp)
{
    print_decomp ($last_code);
    output_composition_table ();
}

print_line_break ($last_code);

exit 0;
466
# Record everything the tables need about one character.
#   $cp   - numeric code point
#   @rec  - the fields of its UnicodeData.txt entry
# Fills @type, @value, @cclass, %title_to_lower, %title_to_upper,
# @decompositions, @decompose_compat and %compositions.
sub process_one
{
    my ($cp, @rec) = @_;

    my $category = $rec[$CATEGORY];
    $type[$cp] = $category;

    # The attribute value depends on the category: the digit value for
    # decimal digits, the opposite-case mapping for cased letters.
    if ($category eq 'Nd')
    {
        $value[$cp] = int ($rec[$DECIMAL_VALUE]);
    }
    elsif ($category eq 'Ll')
    {
        $value[$cp] = hex ($rec[$UPPER]);
    }
    elsif ($category eq 'Lu')
    {
        $value[$cp] = hex ($rec[$LOWER]);
    }
    elsif ($category eq 'Lt')
    {
        # Title case letters keep both mappings in separate tables.
        $title_to_lower{$cp} = hex ($rec[$LOWER]);
        $title_to_upper{$cp} = hex ($rec[$UPPER]);
    }

    $cclass[$cp] = $rec[$COMBINING_CLASSES];

    # Handle decompositions.
    my $decomp = $rec[$DECOMPOSITION];
    return if $decomp eq '';

    # A leading <tag> marks a compatibility decomposition; it is removed
    # so that only the code points remain.
    my $is_compat = ($decomp =~ s/\<.*\>\s*//) ? 1 : 0;
    $decompose_compat[$cp] = $is_compat;

    # Canonical decompositions not listed as excluded are candidates for
    # the composition table.
    if (!$is_compat && !exists $composition_exclusions{$cp})
    {
        $compositions{$cp} = $decomp;
    }

    $decompositions[$cp] = $decomp;
}
509
# Write gunichartables.h: the character type table, the attribute table,
# the title case table, the special-case table and the casefold table.
#   $last - highest code point to cover
# Dies when the output file cannot be opened or written.
sub print_tables
{
    my ($last) = @_;
    my ($outfile) = "gunichartables.h";

    # Dynamically scoped on purpose: print_row adds to this counter.
    local ($bytes_out) = 0;

    print "Writing $outfile...\n";

    open (OUT, '>', $outfile) or die "Cannot open $outfile: $!\n";

    print OUT "/* This file is automatically generated.  DO NOT EDIT!\n";
    print OUT "   Instead, edit gen-unicode-tables.pl and re-run.  */\n\n";

    print OUT "#ifndef CHARTABLES_H\n";
    print OUT "#define CHARTABLES_H\n\n";

    print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";

    my ($count, @row);

    # Type table: print_row writes the pages that are not uniform and
    # returns the text that goes into the page index below.
    $table_index = 0;
    printf OUT "static const char type_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = &print_row ($count, 1, \&fetch_type);
    }
    printf OUT "\n};\n\n";

    print OUT "static const short type_table[256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT "  ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";


    #
    # Now print attribute table.
    #

    $table_index = 0;
    printf OUT "static const unsigned short attr_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = &print_row ($count, 2, \&fetch_attr);
    }
    printf OUT "\n};\n\n";

    print OUT "static const short attr_table[256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT "  ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    #
    # print title case table
    #

    # FIXME: type.
    print OUT "static const unsigned short title_table[][3] = {\n";
    my ($first) = 1;
    # The keys are numeric code points, so sort them numerically; the
    # default string sort only orders them correctly by accident.
    foreach my $item (sort { $a <=> $b } keys %title_to_lower)
    {
        print OUT ",\n"
            unless $first;
        $first = 0;
        printf OUT "  { 0x%04x, 0x%04x, 0x%04x }", $item, $title_to_upper{$item}, $title_to_lower{$item};
        $bytes_out += 6;
    }
    print OUT "\n};\n\n";

    #
    # And special case conversion table -- conversions that change length
    #
    &output_special_case_table (\*OUT);
    &output_casefold_table (\*OUT);

    print OUT "#endif /* CHARTABLES_H */\n";

    # Buffered write errors only surface when the handle is closed.
    close (OUT) or die "Cannot write $outfile: $!\n";

    printf STDERR "Generated %d bytes in tables\n", $bytes_out;
}
602
# Fetcher for the type table: the symbolic type name of a code point.
sub fetch_type
{
    my $category = $type[ $_[0] ];
    return $mappings{$category};
}
609
# Fetcher for the attribute table: the attribute value of a code point as
# a four-digit C hex literal, or zero when it has none.
sub fetch_attr
{
    my $attr = $value[ $_[0] ];
    return defined $attr ? sprintf ("0x%04x", $attr) : "0x0000";
}
623
# Emit one 256-entry page of a two-level table to OUT.
#   $start   - first code point of the page
#   $typsize - size in bytes of one entry, for the byte statistics
#   $fetcher - code reference mapping a code point to the text of its entry
# Returns the text to put in the page index.  A page whose entries are all
# equal is not written; its index entry is the shared value plus
# G_UNICODE_MAX_TABLE_INDEX.  Otherwise the page is written and the entry
# is its position in the data array.  Uses the caller's $table_index and
# $bytes_out.
sub print_row
{
    my ($start, $typsize, $fetcher) = @_;

    # Fetch the whole page first; each call is made in scalar context.
    my @cells = map { scalar $fetcher->($start + $_) } 0 .. 255;

    my $uniform = !grep { $_ ne $cells[0] } @cells;
    return $cells[0] . " + G_UNICODE_MAX_TABLE_INDEX" if $uniform;

    print OUT ",\n" if ($table_index != 0);
    printf OUT "  { /* page %d, index %d */\n    ", $start / 256, $table_index;

    # Entries are comma separated and wrapped to stay within 78 columns.
    my $col = 4;
    my $emitted = 0;
    foreach my $cell (@cells)
    {
        print OUT ", " if $emitted++ > 0;

        my $width = length ($cell);
        if ($width + $col + 2 > 78)
        {
            print OUT "\n    ";
            $col = 4;
        }
        print OUT $cell;
        $col += $width + 2;
    }
    print OUT "\n  }";

    $bytes_out += 256 * $typsize;

    return sprintf "%d /* page %d */", $table_index++, $start / 256;
}
668
# Generate the character decomposition header, gunidecomp.h: the
# combining class table, the decomposition table and the string holding
# the expansions.
#   $last - highest code point to cover
# Exits with status 1 when the file cannot be opened; dies when it cannot
# be written.
sub print_decomp
{
    my ($last) = @_;
    my ($outfile) = "gunidecomp.h";

    # Dynamically scoped on purpose: print_row adds to this counter.
    local ($bytes_out) = 0;

    print "Writing $outfile...\n";

    open (OUT, '>', $outfile)
        or do { print STDERR "Cannot open $outfile: $!\n"; exit 1; };

    print OUT "/* This file is automatically generated.  DO NOT EDIT! */\n\n";
    print OUT "#ifndef DECOMP_H\n";
    print OUT "#define DECOMP_H\n\n";

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";

    my ($count, @row);
    $table_index = 0;
    printf OUT "static const unsigned char cclass_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = &print_row ($count, 1, \&fetch_cclass);
    }
    printf OUT "\n};\n\n";

    print OUT "static const short combining_class_table[256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT "  ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    print OUT "typedef struct\n{\n";
    # FIXME: type.
    print OUT "  unsigned short ch;\n";
    print OUT "  unsigned char canon_offset;\n";
    print OUT "  unsigned char compat_offset;\n";
    print OUT "  unsigned short expansion_offset;\n";
    print OUT "} decomposition;\n\n";

    print OUT "static const decomposition decomp_table[] =\n{\n";
    my ($first) = 1;
    my ($decomp_string) = "";
    my ($decomp_string_offset) = 0;
    # Offset at which each distinct expansion was stored, so identical
    # expansions are stored once.
    my %decomp_offsets;
    for ($count = 0; $count <= $last; ++$count)
    {
        next unless defined $decompositions[$count];

        print OUT ",\n"
            if ! $first;
        $first = 0;

        my $canon_decomp;
        my $compat_decomp;

        if (!$decompose_compat[$count]) {
            $canon_decomp = make_decomp ($count, 0);
        }
        $compat_decomp = make_decomp ($count, 1);

        # Store the compatibility form only when it differs from the
        # canonical one.
        if (defined $canon_decomp && $compat_decomp eq $canon_decomp) {
            undef $compat_decomp;
        }

        # Each byte of an expansion is written as a four-character "\xNN"
        # escape, so a length in bytes is the text length divided by 4.
        # An offset of 0xff is written for a form that is not stored.
        my $string = "";
        my $canon_offset = 0xff;
        my $compat_offset = 0xff;

        if (defined $canon_decomp) {
            $canon_offset = 0;
            $string .= $canon_decomp;
        }
        if (defined $compat_decomp) {
            if (defined $canon_decomp) {
                # Two zero bytes separate the two forms.
                $string .= "\\x00\\x00";
            }
            $compat_offset = (length $string) / 4;
            $string .= $compat_decomp;
        }

        if (!defined($decomp_offsets{$string})) {
            $decomp_offsets{$string} = $decomp_string_offset;
            $decomp_string .= "\n  \"".$string."\\0\\0\" /* offset ".
                $decomp_string_offset." */";
            # The expansion itself plus its two terminating zero bytes.
            $decomp_string_offset += ((length $string) / 4) + 2;

            $bytes_out += (length $string) / 4 + 2;
        }

        printf OUT qq(  { 0x%04x, %u, %u, %d }),
            $count, $canon_offset, $compat_offset, $decomp_offsets{$string};
        $bytes_out += 6;
    }
    print OUT "\n};\n\n";

    printf OUT "static const guchar decomp_expansion_string[] = %s;\n\n", $decomp_string;

    print OUT "#endif /* DECOMP_H */\n";

    # Close explicitly so that a failed write is noticed instead of being
    # lost when OUT is reopened for the next header.
    close (OUT) or die "Cannot write $outfile: $!\n";

    printf STDERR "Generated %d bytes in decomp tables\n", $bytes_out;
}
779
# Write gunibreak.h, the two-level table of line break properties.
#   $last - highest code point to cover
# Dies when the output file cannot be opened or written.
sub print_line_break
{
    my ($last) = @_;
    my ($outfile) = "gunibreak.h";

    # Dynamically scoped on purpose: print_row adds to this counter.
    local ($bytes_out) = 0;

    print "Writing $outfile...\n";

    open (OUT, '>', $outfile) or die "Cannot open $outfile: $!\n";

    print OUT "/* This file is automatically generated.  DO NOT EDIT!\n";
    print OUT "   Instead, edit gen-unicode-tables.pl and re-run.  */\n\n";

    print OUT "#ifndef BREAKTABLES_H\n";
    print OUT "#define BREAKTABLES_H\n\n";

    print OUT "#define G_UNICODE_DATA_VERSION \"$ARGV[0]\"\n\n";

    printf OUT "#define G_UNICODE_LAST_CHAR 0x%04x\n\n", $last;

    printf OUT "#define G_UNICODE_MAX_TABLE_INDEX 1000\n\n";

    my ($count, @row);

    # print_row writes the pages that are not uniform and returns the
    # text that goes into the page index below.
    $table_index = 0;
    printf OUT "static const char break_property_data[][256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = &print_row ($count, 1, \&fetch_break_type);
    }
    printf OUT "\n};\n\n";

    print OUT "static const short break_property_table[256] = {\n";
    for ($count = 0; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT "  ", $row[$count / 256];
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    print OUT "#endif /* BREAKTABLES_H */\n";

    # Buffered write errors only surface when the handle is closed.
    close (OUT) or die "Cannot write $outfile: $!\n";

    printf STDERR "Generated %d bytes in break tables\n", $bytes_out;
}
826
827
# Fetcher for the break properties table: the symbolic break type name of
# a code point.
sub fetch_break_type
{
    my $break_class = $break_props[ $_[0] ];
    return $break_mappings{$break_class};
}
834
# Fetcher for the combining class table: the combining class recorded for
# a code point.
sub fetch_cclass
{
    my $code_point = shift @_;
    return $cclass[$code_point];
}
841
# Expand a character decomposition recursively.
#   $code   - code point that has an entry in @decompositions
#   $compat - true to follow compatibility decompositions as well
# Returns the list of numeric code points of the full expansion.
sub expand_decomp
{
    my ($code, $compat) = @_;

    return map {
        my $part = hex ($_);
        # A component is expanded further when it decomposes itself and
        # its decomposition is of a kind the caller asked for.
        (defined $decompositions[$part]
             && ($compat || !$decompose_compat[$part]))
            ? expand_decomp ($part, $compat)
            : $part;
    } split (' ', $decompositions[$code]);
}
865
# Build the text of a C string literal body for the full expansion of a
# character: every code point becomes two "\xNN" escapes, high byte first.
#   $code   - code point to expand
#   $compat - true to follow compatibility decompositions as well
sub make_decomp
{
    my ($code, $compat) = @_;

    my @escaped = map { sprintf "\\x%02x\\x%02x", $_ / 256, $_ & 0xff }
                      &expand_decomp ($code, $compat);

    return join ('', @escaped);
}
# Generate special case data string from two fields and register it.
#   $code   - code point the record belongs to
#   $single - single-character mapping stored first, or undef for none
#   $field1 - space-separated hex code points of the first mapping
#   $field2 - space-separated hex code points of the second mapping
# The record is appended to @special_cases, $special_case_len is raised
# when needed, and $value[$code] is set to the index of the record offset
# by 0xD800.  Working variables are lexical so that the @values and
# $result of the rest of the script are left alone.
sub add_special_case
{
    my ($code, $single, $field1, $field2) = @_;

    # Optional single mapping, first mapping, a zero separator, second
    # mapping.
    my @values = ((defined $single ? $single : ()),
                  (map { hex ($_) } split /\s+/, $field1),
                  0,
                  (map { hex ($_) } split /\s+/, $field2));

    # Two escaped bytes per value, high byte first.
    my $result = join ('', map { sprintf ("\\x%02x\\x%02x", $_ / 256, $_ & 0xff) } @values);
    $result .= "\\0";

    my $record_len = 2 * @values + 2;
    if ($record_len > $special_case_len) {
        $special_case_len = $record_len;
    }

    push @special_cases, $result;

    #
    # We encode special cases in the surrogate pair space
    #
    $value[$code] = 0xD800 + scalar(@special_cases) - 1;
}
907
# Write the special-case conversion table, built from @special_cases and
# $special_case_len, to the given file handle, and report its size on
# STDERR.
sub output_special_case_table
{
    my ($handle) = @_;

    # Leading blank line, explanatory comment, then the array header.
    print {$handle} join ("\n",
        '',
        '/* Table of special cases for case conversion; each record contains',
        ' * First, the best single character mapping to lowercase if Lu,',
        ' * and to uppercase if Ll, followed by the output mapping for the two cases',
        ' * other than the case of the codepoint, in the order [Ll],[Lu],[Lt],',
        ' * separated and terminated by a double NUL.',
        ' */',
        'static const guchar special_case_table[][' . $special_case_len . '] = {',
        '');

    # One quoted record per line.
    print {$handle} map { qq( "$_",\n) } @special_cases;

    print {$handle} "};\n\n";

    my $table_bytes = $special_case_len * scalar @special_cases;
    print STDERR "Generated $table_bytes bytes in special case table\n";
}
934
# Replace the values of a hash of counts by consecutive indices.
#   $array - reference to a hash keyed by number, holding counts
# Keys whose count is exactly 1 are removed.  The remaining keys get the
# indices 0, 1, 2, ... in ascending numeric key order.  Returns how many
# indices were handed out.
sub enumerate_ordered
{
    my ($array) = @_;

    my $next_index = 0;
    foreach my $key (sort { $a <=> $b } keys %$array) {
        if ($array->{$key} == 1) {
            delete $array->{$key};
        } else {
            $array->{$key} = $next_index++;
        }
    }

    return $next_index;
}
950
# Write gunicomp.h, the tables for canonical composition, from the
# two-character decompositions collected in %compositions.
#
# A character that starts (or ends) only one composition is a
# "singleton" and goes into a small pair list.  The other first and
# second characters are numbered and index a two-dimensional array of
# composed characters.  The lookup table maps a code point to its number
# in one of the four groups, or to 0.
#
# Dies when the file cannot be opened or written, or when a decomposition
# has more than two elements.
sub output_composition_table
{
    print STDERR "Generating composition table\n";

    # Dynamically scoped on purpose: print_row adds to this counter.
    local ($bytes_out) = 0;

    my %first;
    my %second;

    # First we need to go through and remove decompositions
    # starting with a non-starter, and single-character
    # decompositions. At the same time, count how often each
    # character starts a decomposition

    for my $code (keys %compositions) {
        my @values = map { hex ($_) } split /\s+/, $compositions{$code};
        if ($cclass[$values[0]]) {
            delete $compositions{$code};
            next;
        }
        if (@values == 1) {
            delete $compositions{$code};
            next;
        }
        if (@values != 2) {
            die "$code has more than two elements in its decomposition!\n";
        }

        if (exists $first{$values[0]}) {
            $first{$values[0]}++;
        } else {
            $first{$values[0]} = 1;
        }
    }

    # Assign integer indices, removing singletons
    my $n_first = enumerate_ordered (\%first);

    # Now count the second character of each decomposition whose first
    # character is not a singleton
    for my $code (keys %compositions) {
        my @values = map { hex ($_) } split /\s+/, $compositions{$code};

        if (exists $first{$values[0]}) {
            if (exists $second{$values[1]}) {
                $second{$values[1]}++;
            } else {
                $second{$values[1]} = 1;
            }
        }
    }

    # Assign integer indices, removing singletons
    my $n_second = enumerate_ordered (\%second);

    # Build reverse table

    my @first_singletons;
    my @second_singletons;
    my %reverse;
    for my $code (keys %compositions) {
        my @values = map { hex ($_) } split /\s+/, $compositions{$code};

        my $first = $first{$values[0]};
        my $second = $second{$values[1]};

        if (defined $first && defined $second) {
            $reverse{"$first|$second"} = $code;
        } elsif (!defined $first) {
            push @first_singletons, [ $values[0], $values[1], $code ];
        } else {
            push @second_singletons, [ $values[1], $values[0], $code ];
        }
    }

    @first_singletons = sort { $a->[0] <=> $b->[0] } @first_singletons;
    @second_singletons = sort { $a->[0] <=> $b->[0] } @second_singletons;

    my %vals;

    open (OUT, '>', "gunicomp.h") or die "Cannot open gunicomp.h: $!\n";

    # Assign values in lookup table for all code points involved.  The
    # four groups get consecutive value ranges starting at 1; $last
    # tracks the highest code point that needs an entry.

    my $total = 1;
    my $last = 0;
    my $i;
    printf OUT "#define COMPOSE_FIRST_START %d\n", $total;
    for my $code (keys %first) {
        $vals{$code} = $first{$code} + $total;
        $last = $code if $code > $last;
    }
    $total += $n_first;
    $i = 0;
    printf OUT "#define COMPOSE_FIRST_SINGLE_START %d\n", $total;
    for my $record (@first_singletons) {
        my $code = $record->[0];
        $vals{$code} = $i++ + $total;
        $last = $code if $code > $last;
    }
    $total += @first_singletons;
    printf OUT "#define COMPOSE_SECOND_START %d\n", $total;
    for my $code (keys %second) {
        $vals{$code} = $second{$code} + $total;
        $last = $code if $code > $last;
    }
    $total += $n_second;
    $i = 0;
    printf OUT "#define COMPOSE_SECOND_SINGLE_START %d\n\n", $total;
    for my $record (@second_singletons) {
        my $code = $record->[0];
        $vals{$code} = $i++ + $total;
        $last = $code if $code > $last;
    }

    # Output lookup table

    my @row;
    $table_index = 0;
    printf OUT "static const gushort compose_data[][256] = {\n";
    for (my $count = 0; $count <= $last; $count += 256)
    {
        $row[$count / 256] = &print_row ($count, 2, sub { exists $vals{$_[0]} ? $vals{$_[0]} : 0; });
    }
    printf OUT "\n};\n\n";

    print OUT "static const short compose_table[256] = {\n";
    for (my $count = 0; $count <= $last; $count += 256)
    {
        print OUT ",\n" if $count > 0;
        print OUT "  ", $row[$count / 256];
        # Counted as 2 bytes per entry, as for the other "short" page
        # indexes written by this script.
        $bytes_out += 2;
    }
    print OUT "\n};\n\n";

    # Output first singletons

    print OUT "static const gushort compose_first_single[][2] = {\n";
    $i = 0;
    for my $record (@first_singletons) {
        print OUT ",\n" if $i++ > 0;
        printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
    }
    print OUT "\n};\n";

    $bytes_out += @first_singletons * 4;

    # Output second singletons

    print OUT "static const gushort compose_second_single[][2] = {\n";
    $i = 0;
    for my $record (@second_singletons) {
        print OUT ",\n" if $i++ > 0;
        printf OUT " { %#06x, %#06x }", $record->[1], $record->[2];
    }
    print OUT "\n};\n";

    $bytes_out += @second_singletons * 4;

    # Output array of composition pairs

    printf OUT "static const gushort compose_array[%d][%d] = {\n", $n_first, $n_second;

    for (my $row_index = 0; $row_index < $n_first; $row_index++) {
        print OUT ",\n" if $row_index;
        print OUT " { ";
        for (my $col_index = 0; $col_index < $n_second; $col_index++) {
            print OUT ", " if $col_index;
            if (exists $reverse{"$row_index|$col_index"}) {
                printf OUT "%#06x", $reverse{"$row_index|$col_index"};
            } else {
                print OUT "     0";
            }
        }
        print OUT " }";
    }
    print OUT "\n";

    print OUT "};\n";

    $bytes_out += $n_first * $n_second * 2;

    # Close explicitly so that a failed write is noticed instead of being
    # lost when OUT is reopened for the next header.
    close (OUT) or die "Cannot write gunicomp.h: $!\n";

    printf STDERR "Generated %d bytes in compose tables\n", $bytes_out;
}
1137
# Write the casefold table to the given file handle, from @casefold (pairs
# of code point and C string literal body) and $casefoldlen (size of the
# data member).  @casefold is sorted by code point in place.  The size of
# the table is reported on standard output.
sub output_casefold_table
{
    my $out = shift;

    print $out <<EOT;

/* Table of casefolding cases that can't be derived by lowercasing
 */
static const struct {
  guint16 ch;
  gchar data[$casefoldlen];
} casefold_table[] = {
EOT

    @casefold = sort { $a->[0] <=> $b->[0] } @casefold;

    for my $case (@casefold) {
        my ($code, $string) = @$case;
        # The string is passed as an argument: putting it into the format
        # would let a '%' in the data be read as a conversion.
        printf $out qq({ %#04x, "%s" },\n), $code, $string;
    }

    print $out "};\n\n";

    # A record is the 16-bit code plus the data, rounded to an even size.
    my $recordlen = (2+$casefoldlen+1) & ~1;
    printf "Generated %d bytes for casefold table\n", $recordlen * @casefold;
}
1169
1170                             
1171
Note: See TracBrowser for help on using the repository browser.