source: trunk/third/gettext/lib/gen-lbrkprop.c @ 16931

Revision 16931, 29.1 KB checked in by ghudson, 23 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r16930, which included commits to RCS files with non-trunk default branches.
Line 
1/* Generate a Unicode conforming Line Break Properties tables from a
2   UnicodeData file.
3   Written by Bruno Haible <haible@clisp.cons.org>, 2000-2001.
4
5This program is free software; you can redistribute it and/or modify
6it under the terms of the GNU General Public License as published by
7the Free Software Foundation; either version 2, or (at your option)
8any later version.
9
10This program is distributed in the hope that it will be useful,
11but WITHOUT ANY WARRANTY; without even the implied warranty of
12MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13GNU General Public License for more details.
14
15You should have received a copy of the GNU General Public License
16along with this program; if not, write to the Free Software
17Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
18
19/* Usage example:
20     $ gen-lbrkprop /usr/local/share/Unidata/UnicodeData.txt \
21                    /usr/local/share/Unidata/PropList.txt \
22                    /usr/local/share/Unidata/EastAsianWidth.txt \
23                    3.0
24 */
25
26#include <stdio.h>
27#include <stdlib.h>
28#include <stdbool.h>
29#include <stdint.h>
30#include <string.h>
31#include <time.h>
32
/* One record of the UnicodeData.txt file: the values that follow the
   code point on a line of that file.  */
struct unicode_attribute
{
  const char *name;           /* Character name */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining classes */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  int mirrored;               /* mirrored */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};

/* Placeholder for a missing character valued field (upper, lower, title).
   Missing string valued fields are represented with "" instead.  */
#define NONE (~0U)

/* The entire contents of the UnicodeData.txt file, indexed by code point.
   The table has room for the code points 0x0000..0xFFFF only.  */
struct unicode_attribute unicode_attributes[0x10000];
58
59/* Stores in unicode_attributes[i] the values from the given fields.  */
60static void
61fill_attribute (unsigned int i,
62                const char *field1, const char *field2,
63                const char *field3, const char *field4,
64                const char *field5, const char *field6,
65                const char *field7, const char *field8,
66                const char *field9, const char *field10,
67                const char *field11, const char *field12,
68                const char *field13, const char *field14)
69{
70  struct unicode_attribute * uni;
71
72  if (i >= 0x10000)
73    {
74      fprintf (stderr, "index too large\n");
75      exit (1);
76    }
77  uni = &unicode_attributes[i];
78  /* Copy the strings.  */
79  uni->name          = strdup (field1);
80  uni->category      = (field2[0] == '\0' ? "" : strdup (field2));
81  uni->combining     = (field3[0] == '\0' ? "" : strdup (field3));
82  uni->bidi          = (field4[0] == '\0' ? "" : strdup (field4));
83  uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5));
84  uni->decdigit      = (field6[0] == '\0' ? "" : strdup (field6));
85  uni->digit         = (field7[0] == '\0' ? "" : strdup (field7));
86  uni->numeric       = (field8[0] == '\0' ? "" : strdup (field8));
87  uni->mirrored      = (field9[0] == 'Y');
88  uni->oldname       = (field10[0] == '\0' ? "" : strdup (field10));
89  uni->comment       = (field11[0] == '\0' ? "" : strdup (field11));
90  uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16));
91  uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16));
92  uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16));
93}
94
/* Maximum length of a field in the UnicodeData.txt file.  */
#define FIELDLEN 120

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM; DELIM itself is consumed.
   Carriage returns are dropped, so CR/LF terminated files read like LF
   terminated ones.
   Returns 1 when a field was successfully read, otherwise 0.  The end of
   the file counts as the terminator of a non-empty field only when DELIM
   is '\n', so that a last line without a trailing newline is still
   accepted while a line that is cut off in mid-record is not.
   BUFFER is NUL terminated in both cases.
   Terminates the program when the field has FIELDLEN - 1 or more
   characters.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  while ((c = getc (stream)) != EOF && c != delim)
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field too long\n");
          exit (1);
        }
      *buffer++ = c;
    }

  /* Never hand back an unterminated buffer, not even on failure.  */
  *buffer = '\0';

  if (c == EOF)
    {
      /* A read error is never a valid field; the callers report it through
         ferror() once their loop has ended.  */
      if (ferror (stream))
        return 0;
      /* Last line of a file that lacks the final newline.  */
      if (delim == '\n' && count > 0)
        return 1;
      return 0;
    }

  return 1;
}
129
130/* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt
131   file.  */
132static void
133fill_attributes (const char *unicodedata_filename)
134{
135  unsigned int i, j;
136  FILE *stream;
137  char field0[FIELDLEN];
138  char field1[FIELDLEN];
139  char field2[FIELDLEN];
140  char field3[FIELDLEN];
141  char field4[FIELDLEN];
142  char field5[FIELDLEN];
143  char field6[FIELDLEN];
144  char field7[FIELDLEN];
145  char field8[FIELDLEN];
146  char field9[FIELDLEN];
147  char field10[FIELDLEN];
148  char field11[FIELDLEN];
149  char field12[FIELDLEN];
150  char field13[FIELDLEN];
151  char field14[FIELDLEN];
152  int lineno = 0;
153
154  for (i = 0; i < 0x10000; i++)
155    unicode_attributes[i].name = NULL;
156
157  stream = fopen (unicodedata_filename, "r");
158  if (stream == NULL)
159    {
160      fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename);
161      exit (1);
162    }
163
164  for (;;)
165    {
166      int n;
167
168      lineno++;
169      n = getfield (stream, field0, ';');
170      n += getfield (stream, field1, ';');
171      n += getfield (stream, field2, ';');
172      n += getfield (stream, field3, ';');
173      n += getfield (stream, field4, ';');
174      n += getfield (stream, field5, ';');
175      n += getfield (stream, field6, ';');
176      n += getfield (stream, field7, ';');
177      n += getfield (stream, field8, ';');
178      n += getfield (stream, field9, ';');
179      n += getfield (stream, field10, ';');
180      n += getfield (stream, field11, ';');
181      n += getfield (stream, field12, ';');
182      n += getfield (stream, field13, ';');
183      n += getfield (stream, field14, '\n');
184      if (n == 0)
185        break;
186      if (n != 15)
187        {
188          fprintf (stderr, "short line in'%s':%d\n",
189                   unicodedata_filename, lineno);
190          exit (1);
191        }
192      i = strtoul (field0, NULL, 16);
193      if (field1[0] == '<'
194          && strlen (field1) >= 9
195          && !strcmp (field1 + strlen(field1) - 8, ", First>"))
196        {
197          /* Deal with a range. */
198          lineno++;
199          n = getfield (stream, field0, ';');
200          n += getfield (stream, field1, ';');
201          n += getfield (stream, field2, ';');
202          n += getfield (stream, field3, ';');
203          n += getfield (stream, field4, ';');
204          n += getfield (stream, field5, ';');
205          n += getfield (stream, field6, ';');
206          n += getfield (stream, field7, ';');
207          n += getfield (stream, field8, ';');
208          n += getfield (stream, field9, ';');
209          n += getfield (stream, field10, ';');
210          n += getfield (stream, field11, ';');
211          n += getfield (stream, field12, ';');
212          n += getfield (stream, field13, ';');
213          n += getfield (stream, field14, '\n');
214          if (n != 15)
215            {
216              fprintf (stderr, "missing end range in '%s':%d\n",
217                       unicodedata_filename, lineno);
218              exit (1);
219            }
220          if (!(field1[0] == '<'
221                && strlen (field1) >= 8
222                && !strcmp (field1 + strlen (field1) - 7, ", Last>")))
223            {
224              fprintf (stderr, "missing end range in '%s':%d\n",
225                       unicodedata_filename, lineno);
226              exit (1);
227            }
228          field1[strlen (field1) - 7] = '\0';
229          j = strtoul (field0, NULL, 16);
230          for (; i <= j; i++)
231            fill_attribute (i, field1+1, field2, field3, field4, field5,
232                               field6, field7, field8, field9, field10,
233                               field11, field12, field13, field14);
234        }
235      else
236        {
237          /* Single character line */
238          fill_attribute (i, field1, field2, field3, field4, field5,
239                             field6, field7, field8, field9, field10,
240                             field11, field12, field13, field14);
241        }
242    }
243  if (ferror (stream) || fclose (stream))
244    {
245      fprintf (stderr, "error reading from '%s'\n", unicodedata_filename);
246      exit (1);
247    }
248}
249
/* The combining property from the PropList.txt file: 1 for the code points
   listed there, 0 for all others.  */
char unicode_combining[0x10000];

/* Stores in unicode_combining[] the Combining property from the
   PropList.txt file.
   The relevant part of the file starts after the first line containing
   "(Combining)" and ends at the first line beginning with '*'.  In between,
   a line is either a range "XXXX..YYYY" or a single code point "XXXX".
   Terminates the program with a diagnostic on any I/O or syntax error.  */
static void
fill_combining (const char *proplist_filename)
{
  FILE *fp;
  char line[100+1];
  unsigned int cp;

  memset (unicode_combining, 0, sizeof unicode_combining);

  fp = fopen (proplist_filename, "r");
  if (fp == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  /* Search for the "Property dump for: 0x20000004 (Combining)" line.  */
  for (;;)
    {
      if (fscanf (fp, "%100[^\n]\n", line) < 1)
        {
          fprintf (stderr, "no combining property found in '%s'\n",
                   proplist_filename);
          exit (1);
        }
      if (strstr (line, "(Combining)") != NULL)
        break;
    }

  /* Process the entries up to the terminating '*' line.  */
  for (;;)
    {
      unsigned int first, last;
      size_t len;
      int parsed;

      if (fscanf (fp, "%100[^\n]\n", line) < 1)
        {
          fprintf (stderr, "premature end of combining property in '%s'\n",
                   proplist_filename);
          exit (1);
        }
      if (line[0] == '*')
        break;

      len = strlen (line);
      if (len >= 10 && line[4] == '.' && line[5] == '.')
        {
          /* Range of code points.  */
          parsed = !(sscanf (line, "%4X..%4X", &first, &last) < 2);
        }
      else if (len >= 4)
        {
          /* Single code point.  */
          parsed = !(sscanf (line, "%4X", &first) < 1);
          if (parsed)
            last = first;
        }
      else
        parsed = 0;

      if (!parsed)
        {
          fprintf (stderr, "parse error in combining property in '%s'\n",
                   proplist_filename);
          exit (1);
        }

      for (cp = first; cp <= last; cp++)
        unicode_combining[cp] = 1;
    }

  if (ferror (fp) || fclose (fp))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}
330
331/* The width property from the EastAsianWidth.txt file.
332   Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na".  */
333const char * unicode_width[0x10000];
334
335/* Stores in unicode_width[] the width property from the PropList.txt
336   file.  */
337static void
338fill_width (const char *width_filename)
339{
340  unsigned int i, j;
341  FILE *stream;
342  char field0[FIELDLEN];
343  char field1[FIELDLEN];
344  char field2[FIELDLEN];
345  int lineno = 0;
346
347  for (i = 0; i < 0x10000; i++)
348    unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL);
349
350  stream = fopen (width_filename, "r");
351  if (stream == NULL)
352    {
353      fprintf (stderr, "error during fopen of '%s'\n", width_filename);
354      exit (1);
355    }
356
357  for (;;)
358    {
359      int n;
360      int c;
361
362      lineno++;
363      c = getc (stream);
364      if (c == EOF)
365        break;
366      if (c == '#')
367        {
368          do c = getc (stream); while (c != EOF && c != '\n');
369          continue;
370        }
371      ungetc (c, stream);
372      n = getfield (stream, field0, ';');
373      n += getfield (stream, field1, ';');
374      n += getfield (stream, field2, '\n');
375      if (n == 0)
376        break;
377      if (n != 3)
378        {
379          fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno);
380          exit (1);
381        }
382      i = strtoul (field0, NULL, 16);
383      if (field2[0] == '<'
384          && strlen (field2) >= 9
385          && !strcmp (field2 + strlen(field2) - 8, ", First>"))
386        {
387          /* Deal with a range. */
388          lineno++;
389          n = getfield (stream, field0, ';');
390          n += getfield (stream, field1, ';');
391          n += getfield (stream, field2, '\n');
392          if (n != 3)
393            {
394              fprintf (stderr, "missing end range in '%s':%d\n",
395                       width_filename, lineno);
396              exit (1);
397            }
398          if (!(field2[0] == '<'
399                && strlen (field2) >= 8
400                && !strcmp (field2 + strlen (field2) - 7, ", Last>")))
401            {
402              fprintf (stderr, "missing end range in '%s':%d\n",
403                       width_filename, lineno);
404              exit (1);
405            }
406          field2[strlen (field2) - 7] = '\0';
407          j = strtoul (field0, NULL, 16);
408          for (; i <= j; i++)
409            unicode_width[i] = strdup (field1);
410        }
411      else
412        {
413          /* Single character line */
414          unicode_width[i] = strdup (field1);
415        }
416    }
417  if (ferror (stream) || fclose (stream))
418    {
419      fprintf (stderr, "error reading from '%s'\n", width_filename);
420      exit (1);
421    }
422}
423
/* Line breaking classification.
   The constants are listed by increasing value.  */

enum
{
  LBP_BK =  0, /* mandatory break */
  LBP_ZW =  1, /* zero width space */
  LBP_IN =  2, /* inseparable */
  LBP_GL =  3, /* non-breaking (glue) */
  LBP_BA =  4, /* break opportunity after */
  LBP_BB =  5, /* break opportunity before */
  LBP_B2 =  6, /* break opportunity before and after */
  LBP_HY =  7, /* hyphen */
  LBP_NS =  8, /* non starter */
  LBP_OP =  9, /* opening punctuation */
  LBP_CL = 10, /* closing punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_EX = 12, /* exclamation/interrogation */
  LBP_ID = 13, /* ideographic */
  LBP_NU = 14, /* numeric */
  LBP_IS = 15, /* infix separator (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_PR = 18, /* prefix (numeric) */
  LBP_PO = 19, /* postfix (numeric) */

  /* Values >= 20 are resolved at run time. */
  LBP_CM = 20, /* attached characters and combining marks */
  LBP_SP = 21, /* space */
  LBP_CB = 22, /* contingent break opportunity */
  LBP_SA = 23, /* complex context (South East Asian) */
  LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
  LBP_XX = 25  /* unknown */

  /* Classes without a constant:
     LBP_CR  carriage return - not used here because it's a DOSism
     LBP_LF  line feed - not used here because it's a DOSism
     LBP_SG  surrogates - not used here because they are not characters */
};
459
460/* Returns the line breaking classification for ch, as a bit mask.  */
461static int
462get_lbp (unsigned int ch)
463{
464  int attr = 0;
465
466  if (unicode_attributes[ch].name != NULL)
467    {
468      /* mandatory break */
469      if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */
470          || ch == 0x000C /* form feed */
471          || ch == 0x2028 /* LINE SEPARATOR */
472          || ch == 0x2029 /* PARAGRAPH SEPARATOR */)
473        attr |= 1 << LBP_BK;
474
475      /* zero width space */
476      if (ch == 0x200B /* ZERO WIDTH SPACE */)
477        attr |= 1 << LBP_ZW;
478
479      /* inseparable */
480      if (ch == 0x2024 /* ONE DOT LEADER */
481          || ch == 0x2025 /* TWO DOT LEADER */
482          || ch == 0x2026 /* HORIZONTAL ELLIPSIS */)
483        attr |= 1 << LBP_IN;
484
485      /* non-breaking (glue) */
486      if (ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */
487          || ch == 0x00A0 /* NO-BREAK SPACE */
488          || ch == 0x202F /* NARROW NO-BREAK SPACE */
489          || ch == 0x2007 /* FIGURE SPACE */
490          || ch == 0x2011 /* NON-BREAKING HYPHEN */
491          || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */)
492        attr |= 1 << LBP_GL;
493
494      /* contingent break opportunity */
495      if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */)
496        attr |= 1 << LBP_CB;
497
498      /* space */
499      if (ch == 0x0020 /* SPACE */)
500        attr |= 1 << LBP_SP;
501
502      /* break opportunity after */
503      if (ch == 0x2000 /* EN QUAD */
504          || ch == 0x2001 /* EM QUAD */
505          || ch == 0x2002 /* EN SPACE */
506          || ch == 0x2003 /* EM SPACE */
507          || ch == 0x2004 /* THREE-PER-EM SPACE */
508          || ch == 0x2005 /* FOUR-PER-EM SPACE */
509          || ch == 0x2006 /* SIX-PER-EM SPACE */
510          || ch == 0x2008 /* PUNCTUATION SPACE */
511          || ch == 0x2009 /* THIN SPACE */
512          || ch == 0x200A /* HAIR SPACE */
513          || ch == 0x0009 /* tab */
514          || ch == 0x2010 /* HYPHEN */
515          || ch == 0x058A /* ARMENIAN HYPHEN */
516          || ch == 0x00AD /* SOFT HYPHEN */
517          || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */
518          || ch == 0x1361 /* ETHIOPIC WORDSPACE */
519          || ch == 0x1680 /* OGHAM SPACE MARK */
520          || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */
521          || ch == 0x2027 /* HYPHENATION POINT */
522          || ch == 0x007C /* VERTICAL LINE */)
523        attr |= 1 << LBP_BA;
524
525      /* break opportunity before */
526      if (ch == 0x00B4 /* ACUTE ACCENT */
527          || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */
528          || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */
529          || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */)
530        attr |= 1 << LBP_BB;
531
532      /* break opportunity before and after */
533      if (ch == 0x2014 /* EM DASH */)
534        attr |= 1 << LBP_B2;
535
536      /* hyphen */
537      if (ch == 0x002D /* HYPHEN-MINUS */)
538        attr |= 1 << LBP_HY;
539
540      /* exclamation/interrogation */
541      if (ch == 0x0021 /* EXCLAMATION MARK */
542          || ch == 0x003F /* QUESTION MARK */
543          || ch == 0xFE56 /* SMALL QUESTION MARK */
544          || ch == 0xFE57 /* SMALL EXCLAMATION MARK */
545          || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */
546          || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */)
547        attr |= 1 << LBP_EX;
548
549      /* opening punctuation */
550      if (unicode_attributes[ch].category[0] == 'P'
551          && unicode_attributes[ch].category[1] == 's')
552        attr |= 1 << LBP_OP;
553
554      /* closing punctuation */
555      if (ch == 0x3001 /* IDEOGRAPHIC COMMA */
556          || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */
557          || ch == 0xFF0C /* FULLWIDTH COMMA */
558          || ch == 0xFF0E /* FULLWIDTH FULL STOP */
559          || ch == 0xFE50 /* SMALL COMMA */
560          || ch == 0xFE52 /* SMALL FULL STOP */
561          || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */
562          || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */
563          || (unicode_attributes[ch].category[0] == 'P'
564              && unicode_attributes[ch].category[1] == 'e'))
565        attr |= 1 << LBP_CL;
566
567      /* ambiguous quotation */
568      if (ch == 0x0022 /* QUOTATION MARK */
569          || ch == 0x0027 /* APOSTROPHE */
570          || (unicode_attributes[ch].category[0] == 'P'
571              && (unicode_attributes[ch].category[1] == 'f'
572                  || unicode_attributes[ch].category[1] == 'i')))
573        attr |= 1 << LBP_QU;
574
575      /* attached characters and combining marks */
576      if ((unicode_attributes[ch].category[0] == 'M'
577           && (unicode_attributes[ch].category[1] == 'n'
578               || unicode_attributes[ch].category[1] == 'c'
579               || unicode_attributes[ch].category[1] == 'e'))
580          || (ch >= 0x1160 && ch <= 0x11F9)
581          || (unicode_attributes[ch].category[0] == 'C'
582              && (unicode_attributes[ch].category[1] == 'c'
583                  || unicode_attributes[ch].category[1] == 'f')))
584        if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL))))
585          attr |= 1 << LBP_CM;
586
587      /* non starter */
588      if (ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */
589          || ch == 0x0E5B /* THAI CHARACTER KHOMUT */
590          || ch == 0x17D4 /* KHMER SIGN KHAN */
591          || ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */
592          || ch == 0x17D7 /* KHMER SIGN LEK TOO */
593          || ch == 0x17D8 /* KHMER SIGN BEYYAL */
594          || ch == 0x17D9 /* KHMER SIGN PHNAEK MUAN */
595          || ch == 0x17DA /* KHMER SIGN KOOMUUT */
596          || ch == 0x203C /* DOUBLE EXCLAMATION MARK */
597          || ch == 0x2044 /* FRACTION SLASH */
598          || ch == 0x301C /* WAVE DASH */
599          || ch == 0x30FB /* KATAKANA MIDDLE DOT */
600          || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */
601          || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */
602          || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */
603          || ch == 0x309D /* HIRAGANA ITERATION MARK */
604          || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */
605          || ch == 0x30FD /* KATAKANA ITERATION MARK */
606          || ch == 0xFE54 /* SMALL SEMICOLON */
607          || ch == 0xFE55 /* SMALL COLON */
608          || ch == 0xFF1A /* FULLWIDTH COLON */
609          || ch == 0xFF1B /* FULLWIDTH SEMICOLON */
610          || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */
611          || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */
612          || (unicode_attributes[ch].category[0] == 'L'
613              && unicode_attributes[ch].category[1] == 'm'
614              && (unicode_width[ch][0] == 'W'
615                  || unicode_width[ch][0] == 'H'))
616          || (unicode_attributes[ch].category[0] == 'S'
617              && unicode_attributes[ch].category[1] == 'k'
618              && unicode_width[ch][0] == 'W')
619          || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL
620          || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL)
621        attr |= 1 << LBP_NS;
622
623      /* numeric */
624      if (unicode_attributes[ch].category[0] == 'N'
625          && unicode_attributes[ch].category[1] == 'd'
626          && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL)
627        attr |= 1 << LBP_NU;
628
629      /* infix separator (numeric) */
630      if (ch == 0x002C /* COMMA */
631          || ch == 0x002E /* FULL STOP */
632          || ch == 0x003A /* COLON */
633          || ch == 0x003B /* SEMICOLON */
634          || ch == 0x0589 /* ARMENIAN FULL STOP */)
635        attr |= 1 << LBP_IS;
636
637      /* symbols allowing breaks */
638      if (ch == 0x002F /* SOLIDUS */)
639        attr |= 1 << LBP_SY;
640
641      /* postfix (numeric) */
642      if (ch == 0x0025 /* PERCENT SIGN */
643          || ch == 0x00A2 /* CENT SIGN */
644          || ch == 0x00B0 /* DEGREE SIGN */
645          || ch == 0x2030 /* PER MILLE SIGN */
646          || ch == 0x2031 /* PER TEN THOUSAND SIGN */
647          || ch == 0x2032 /* PRIME */
648          || ch == 0x2033 /* DOUBLE PRIME */
649          || ch == 0x2034 /* TRIPLE PRIME */
650          || ch == 0x2035 /* REVERSED PRIME */
651          || ch == 0x20A7 /* PESETA SIGN */
652          || ch == 0x2103 /* DEGREE CELSIUS */
653          || ch == 0x2109 /* DEGREE FAHRENHEIT */
654          || ch == 0x2126 /* OHM SIGN */
655          || ch == 0xFE6A /* SMALL PERCENT SIGN */
656          || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */
657          || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */)
658        attr |= 1 << LBP_PO;
659
660      /* prefix (numeric) */
661      if (ch == 0x002B /* PLUS SIGN */
662          || ch == 0x005C /* REVERSE SOLIDUS */
663          || ch == 0x00B1 /* PLUS-MINUS SIGN */
664          || ch == 0x2212 /* MINUS SIGN */
665          || ch == 0x2116 /* NUMERO SIGN */
666          || ch == 0x2213 /* MINUS-OR-PLUS SIGN */
667          || (unicode_attributes[ch].category[0] == 'S'
668              && unicode_attributes[ch].category[1] == 'c'))
669        if (!(attr & (1 << LBP_PO)))
670          attr |= 1 << LBP_PR;
671
672      /* complex context (South East Asian) */
673      if ((ch >= 0x0E00 && ch <= 0x0EFF)
674          || (ch >= 0x1000 && ch <= 0x109F)
675          || (ch >= 0x1780 && ch <= 0x17FF))
676        if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_NU) | (1 << LBP_BA) | (1 << LBP_PR))))
677          attr |= 1 << LBP_SA;
678
679      /* ideographic */
680      if ((ch >= 0x4E00 && ch <= 0x9FAF) /* CJK Ideograph */
681          || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */
682          || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK COMPATIBILITY IDEOGRAPH */
683          || ch == 0x3000 /* IDEOGRAPHIC SPACE */
684          || (ch >= 0xAC00 && ch <= 0xD7AF) /* HANGUL SYLLABLE */
685          || (ch >= 0x3130 && ch <= 0x318F) /* HANGUL LETTER */
686          || (ch >= 0x1100 && ch <= 0x115F) /* HANGUL CHOSEONG */
687          || (ch >= 0xA000 && ch <= 0xA48C) /* YI SYLLABLE */
688          || (ch >= 0xA490 && ch <= 0xACFF) /* YI RADICAL */
689          || (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */
690          || ch == 0xFE62 /* SMALL PLUS SIGN */
691          || ch == 0xFE63 /* SMALL HYPHEN-MINUS */
692          || ch == 0xFE64 /* SMALL LESS-THAN SIGN */
693          || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */
694          || ch == 0xFE66 /* SMALL EQUALS SIGN */
695          || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */
696          || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL
697          || (ch >= 0x3000 && ch <= 0x33FF
698              && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL)))))
699        {
700          /* ambiguous (ideograph) ? */
701          if (unicode_width[ch] != NULL
702              && unicode_width[ch][0] == 'A')
703            attr |= 1 << LBP_AI;
704          else
705            attr |= 1 << LBP_ID;
706        }
707
708      /* ordinary alphabetic and symbol characters */
709      if ((unicode_attributes[ch].category[0] == 'L'
710           && (unicode_attributes[ch].category[1] == 'u'
711               || unicode_attributes[ch].category[1] == 'l'
712               || unicode_attributes[ch].category[1] == 't'
713               || unicode_attributes[ch].category[1] == 'm'
714               || unicode_attributes[ch].category[1] == 'o'))
715          || (unicode_attributes[ch].category[0] == 'S'
716              && (unicode_attributes[ch].category[1] == 'm'
717                  || unicode_attributes[ch].category[1] == 'c'
718                  || unicode_attributes[ch].category[1] == 'k'
719                  || unicode_attributes[ch].category[1] == 'o')))
720        if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_ID) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SA) | (1 << LBP_CB))))
721          {
722            /* ambiguous (alphabetic) ? */
723            if (unicode_width[ch] != NULL
724                && unicode_width[ch][0] == 'A')
725              attr |= 1 << LBP_AI;
726            else
727              attr |= 1 << LBP_AL;
728          }
729    }
730
731  if (attr == 0)
732    /* unknown */
733    attr |= 1 << LBP_XX;
734
735  return attr;
736}
737
738/* Output the line breaking properties in a human readable format.  */
739static void
740debug_output_lbp (FILE *stream)
741{
742  unsigned int i;
743
744  for (i = 0; i < 0x10000; i++)
745    {
746      int attr = get_lbp (i);
747      if (attr != 1 << LBP_XX)
748        {
749          fprintf (stream, "0x%04X", i);
750#define PRINT_BIT(attr,bit) \
751  if (attr & (1 << bit)) fprintf (stream, " " ## #bit);
752          PRINT_BIT(attr,LBP_BK);
753          PRINT_BIT(attr,LBP_CM);
754          PRINT_BIT(attr,LBP_ZW);
755          PRINT_BIT(attr,LBP_IN);
756          PRINT_BIT(attr,LBP_GL);
757          PRINT_BIT(attr,LBP_CB);
758          PRINT_BIT(attr,LBP_SP);
759          PRINT_BIT(attr,LBP_BA);
760          PRINT_BIT(attr,LBP_BB);
761          PRINT_BIT(attr,LBP_B2);
762          PRINT_BIT(attr,LBP_HY);
763          PRINT_BIT(attr,LBP_NS);
764          PRINT_BIT(attr,LBP_OP);
765          PRINT_BIT(attr,LBP_CL);
766          PRINT_BIT(attr,LBP_QU);
767          PRINT_BIT(attr,LBP_EX);
768          PRINT_BIT(attr,LBP_ID);
769          PRINT_BIT(attr,LBP_NU);
770          PRINT_BIT(attr,LBP_IS);
771          PRINT_BIT(attr,LBP_SY);
772          PRINT_BIT(attr,LBP_AL);
773          PRINT_BIT(attr,LBP_PR);
774          PRINT_BIT(attr,LBP_PO);
775          PRINT_BIT(attr,LBP_SA);
776          PRINT_BIT(attr,LBP_XX);
777          PRINT_BIT(attr,LBP_AI);
778#undef PRINT_BIT
779          fprintf (stream, "\n");
780        }
781    }
782}
783
784/* Construction of sparse 3-level tables.  */
785#define TABLE lbp_table
786#define ELEMENT unsigned char
787#define DEFAULT LBP_XX
788#define xmalloc malloc
789#define xrealloc realloc
790#include "3level.h"
791
792static void
793output_lbp (FILE *stream)
794{
795  unsigned int i;
796  struct lbp_table t;
797  unsigned int level1_offset, level2_offset, level3_offset;
798
799  t.p = 7;
800  t.q = 9;
801  lbp_table_init (&t);
802
803  for (i = 0; i < 0x10000; i++)
804    {
805      int attr = get_lbp (i);
806
807      /* Now attr should contain exactly one bit.  */
808      if (attr == 0 || ((attr & (attr - 1)) != 0))
809        abort ();
810
811      if (attr != 1 << LBP_XX)
812        {
813          unsigned int log2_attr;
814          for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++);
815
816          lbp_table_add (&t, i, log2_attr);
817        }
818    }
819
820  lbp_table_finalize (&t);
821
822  level1_offset =
823    5 * sizeof (uint32_t);
824  level2_offset =
825    5 * sizeof (uint32_t)
826    + t.level1_size * sizeof (uint32_t);
827  level3_offset =
828    5 * sizeof (uint32_t)
829    + t.level1_size * sizeof (uint32_t)
830    + (t.level2_size << t.q) * sizeof (uint32_t);
831
832  for (i = 0; i < 5; i++)
833    fprintf (stream, "#define lbrkprop_header_%d %d\n", i,
834             ((uint32_t *) t.result)[i]);
835  fprintf (stream, "static const\n");
836  fprintf (stream, "struct\n");
837  fprintf (stream, "  {\n");
838  fprintf (stream, "    int level1[%d];\n", t.level1_size);
839  fprintf (stream, "    int level2[%d << %d];\n", t.level2_size, t.q);
840  fprintf (stream, "    unsigned char level3[%d << %d];\n", t.level3_size, t.p);
841  fprintf (stream, "  }\n");
842  fprintf (stream, "lbrkprop =\n");
843  fprintf (stream, "{\n");
844  fprintf (stream, "  { ");
845  for (i = 0; i < t.level1_size; i++)
846    fprintf (stream, "%d%s ",
847             (((uint32_t *) (t.result + level1_offset))[i] - level2_offset) / sizeof (uint32_t),
848             (i+1 < t.level1_size ? "," : ""));
849  fprintf (stream, "},\n");
850  fprintf (stream, "  {");
851  if (t.level2_size << t.q > 8)
852    fprintf (stream, "\n   ");
853  for (i = 0; i < t.level2_size << t.q; i++)
854    {
855      if (i > 0 && (i % 8) == 0)
856        fprintf (stream, "\n   ");
857      fprintf (stream, " %5d%s",
858               (((uint32_t *) (t.result + level2_offset))[i] - level3_offset) / sizeof (uint8_t),
859               (i+1 < t.level2_size << t.q ? "," : ""));
860    }
861  if (t.level2_size << t.q > 8)
862    fprintf (stream, "\n ");
863  fprintf (stream, " },\n");
864  fprintf (stream, "  {");
865  if (t.level3_size << t.p > 8)
866    fprintf (stream, "\n   ");
867  for (i = 0; i < t.level3_size << t.p; i++)
868    {
869      unsigned char value = ((unsigned char *) (t.result + level3_offset))[i];
870      const char *value_string;
871      switch (value)
872        {
873#define CASE(x) case x: value_string = #x; break;
874          CASE(LBP_BK);
875          CASE(LBP_CM);
876          CASE(LBP_ZW);
877          CASE(LBP_IN);
878          CASE(LBP_GL);
879          CASE(LBP_CB);
880          CASE(LBP_SP);
881          CASE(LBP_BA);
882          CASE(LBP_BB);
883          CASE(LBP_B2);
884          CASE(LBP_HY);
885          CASE(LBP_NS);
886          CASE(LBP_OP);
887          CASE(LBP_CL);
888          CASE(LBP_QU);
889          CASE(LBP_EX);
890          CASE(LBP_ID);
891          CASE(LBP_NU);
892          CASE(LBP_IS);
893          CASE(LBP_SY);
894          CASE(LBP_AL);
895          CASE(LBP_PR);
896          CASE(LBP_PO);
897          CASE(LBP_SA);
898          CASE(LBP_XX);
899          CASE(LBP_AI);
900#undef CASE
901          default:
902            abort ();
903        }
904      if (i > 0 && (i % 8) == 0)
905        fprintf (stream, "\n   ");
906      fprintf (stream, " %s%s", value_string,
907               (i+1 < t.level3_size << t.p ? "," : ""));
908    }
909  if (t.level3_size << t.p > 8)
910    fprintf (stream, "\n ");
911  fprintf (stream, " }\n");
912  fprintf (stream, "};\n");
913}
914
/* Create (or truncate) the file FILENAME and let debug_output_lbp write
   its dump into it.  If the file cannot be opened, or if writing or
   closing it fails, print a message to stderr and exit with status 1.  */
static void
debug_output_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");
  bool failed;

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_lbp (out);

  /* Only attempt to close the stream when no write error is pending.  */
  failed = ferror (out) != 0;
  if (!failed)
    failed = fclose (out) != 0;

  if (failed)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
935
/* Create (or truncate) the file FILENAME, write a two-line comment header
   that mentions the Unicode VERSION, an empty line, and then the table
   produced by output_lbp.  If the file cannot be opened, or if writing or
   closing it fails, print a message to stderr and exit with status 1.  */
static void
output_tables (const char *filename, const char *version)
{
  FILE *out = fopen (filename, "w");
  bool failed;

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* Comment header of the generated file.  */
  fputs ("/* Line breaking properties of Unicode characters.  */\n", out);
  fprintf (out,
           "/* Generated automatically by gen-lbrkprop for Unicode %s.  */\n",
           version);
  fputs ("\n", out);

  output_lbp (out);

  /* Only attempt to close the stream when no write error is pending.  */
  failed = ferror (out) != 0;
  if (!failed)
    failed = fclose (out) != 0;

  if (failed)
    {
      fprintf (stderr, "error writing to '%s'\n", filename);
      exit (1);
    }
}
961
/* Program entry point.  Expects exactly four arguments: the UnicodeData.txt
   file, the PropList.txt file, the EastAsianWidth.txt file and the Unicode
   version string.  Writes "lbrkprop.txt" and "lbrkprop.h" into the current
   directory.  With any other argument count, prints a usage message to
   stderr and exits with status 1.  */
int
main (int argc, char * argv[])
{
  const char *unicodedata_filename;
  const char *proplist_filename;
  const char *eastasianwidth_filename;
  const char *unicode_version;

  if (argc != 5)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt EastAsianWidth.txt version\n",
               argv[0]);
      exit (1);
    }

  unicodedata_filename = argv[1];
  proplist_filename = argv[2];
  eastasianwidth_filename = argv[3];
  unicode_version = argv[4];

  /* Read the three input files.  */
  fill_attributes (unicodedata_filename);
  fill_combining (proplist_filename);
  fill_width (eastasianwidth_filename);

  /* Write the debugging dump first, then the generated header.  */
  debug_output_tables ("lbrkprop.txt");
  output_tables ("lbrkprop.h", unicode_version);

  return 0;
}
Note: See TracBrowser for help on using the repository browser.