1 | /* Generate Unicode conforming Line Break Properties tables from a |
---|
2 | UnicodeData file. |
---|
3 | Written by Bruno Haible <haible@clisp.cons.org>, 2000-2001. |
---|
4 | |
---|
5 | This program is free software; you can redistribute it and/or modify |
---|
6 | it under the terms of the GNU General Public License as published by |
---|
7 | the Free Software Foundation; either version 2, or (at your option) |
---|
8 | any later version. |
---|
9 | |
---|
10 | This program is distributed in the hope that it will be useful, |
---|
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
13 | GNU General Public License for more details. |
---|
14 | |
---|
15 | You should have received a copy of the GNU General Public License |
---|
16 | along with this program; if not, write to the Free Software |
---|
17 | Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ |
---|
18 | |
---|
19 | /* Usage example: |
---|
20 | $ gen-lbrkprop /usr/local/share/Unidata/UnicodeData.txt \ |
---|
21 | /usr/local/share/Unidata/PropList.txt \ |
---|
22 | /usr/local/share/Unidata/EastAsianWidth.txt \ |
---|
23 | 3.0 |
---|
24 | */ |
---|
25 | |
---|
26 | #include <stdio.h> |
---|
27 | #include <stdlib.h> |
---|
28 | #include <stdbool.h> |
---|
29 | #include <stdint.h> |
---|
30 | #include <string.h> |
---|
31 | #include <time.h> |
---|
32 | |
---|
/* One record of the UnicodeData.txt file: the attribute fields that
   follow the code point on a line of that file.  */
struct unicode_attribute
{
  const char *name;           /* Character name; NULL if not filled in */
  const char *category;       /* General category */
  const char *combining;      /* Canonical combining classes */
  const char *bidi;           /* Bidirectional category */
  const char *decomposition;  /* Character decomposition mapping */
  const char *decdigit;       /* Decimal digit value */
  const char *digit;          /* Digit value */
  const char *numeric;        /* Numeric value */
  int mirrored;               /* Nonzero if the mirrored field is 'Y' */
  const char *oldname;        /* Old Unicode 1.0 name */
  const char *comment;        /* Comment */
  unsigned int upper;         /* Uppercase mapping */
  unsigned int lower;         /* Lowercase mapping */
  unsigned int title;         /* Titlecase mapping */
};

/* Representation of missing fields: a missing string field holds "",
   a missing character field (upper, lower, title) holds NONE.  */
#define NONE (~0U)

/* The entire contents of the UnicodeData.txt file, one slot per code
   point in the range 0 .. 0xFFFF.  */
struct unicode_attribute unicode_attributes[0x10000];
58 | |
---|
59 | /* Stores in unicode_attributes[i] the values from the given fields. */ |
---|
60 | static void |
---|
61 | fill_attribute (unsigned int i, |
---|
62 | const char *field1, const char *field2, |
---|
63 | const char *field3, const char *field4, |
---|
64 | const char *field5, const char *field6, |
---|
65 | const char *field7, const char *field8, |
---|
66 | const char *field9, const char *field10, |
---|
67 | const char *field11, const char *field12, |
---|
68 | const char *field13, const char *field14) |
---|
69 | { |
---|
70 | struct unicode_attribute * uni; |
---|
71 | |
---|
72 | if (i >= 0x10000) |
---|
73 | { |
---|
74 | fprintf (stderr, "index too large\n"); |
---|
75 | exit (1); |
---|
76 | } |
---|
77 | uni = &unicode_attributes[i]; |
---|
78 | /* Copy the strings. */ |
---|
79 | uni->name = strdup (field1); |
---|
80 | uni->category = (field2[0] == '\0' ? "" : strdup (field2)); |
---|
81 | uni->combining = (field3[0] == '\0' ? "" : strdup (field3)); |
---|
82 | uni->bidi = (field4[0] == '\0' ? "" : strdup (field4)); |
---|
83 | uni->decomposition = (field5[0] == '\0' ? "" : strdup (field5)); |
---|
84 | uni->decdigit = (field6[0] == '\0' ? "" : strdup (field6)); |
---|
85 | uni->digit = (field7[0] == '\0' ? "" : strdup (field7)); |
---|
86 | uni->numeric = (field8[0] == '\0' ? "" : strdup (field8)); |
---|
87 | uni->mirrored = (field9[0] == 'Y'); |
---|
88 | uni->oldname = (field10[0] == '\0' ? "" : strdup (field10)); |
---|
89 | uni->comment = (field11[0] == '\0' ? "" : strdup (field11)); |
---|
90 | uni->upper = (field12[0] =='\0' ? NONE : strtoul (field12, NULL, 16)); |
---|
91 | uni->lower = (field13[0] =='\0' ? NONE : strtoul (field13, NULL, 16)); |
---|
92 | uni->title = (field14[0] =='\0' ? NONE : strtoul (field14, NULL, 16)); |
---|
93 | } |
---|
94 | |
---|
/* Size of the buffers that hold one field of the UnicodeData.txt file,
   including the terminating NUL.  */
#define FIELDLEN 120

/* Reads the next field from STREAM.  The buffer BUFFER has size FIELDLEN.
   Reads up to (but excluding) DELIM; the delimiter itself is consumed.
   Carriage returns are dropped, so files with CR/LF line terminators
   read the same as files with LF terminators.
   BUFFER is always NUL-terminated on return, also when end of file is
   hit before DELIM; in that case it holds the characters read so far.
   Exits with an error message if the field does not fit into BUFFER.
   Returns 1 when a field was successfully read, otherwise 0.  */
static int
getfield (FILE *stream, char *buffer, int delim)
{
  int count = 0;
  int c;

  while ((c = getc (stream)) != EOF && c != delim)
    {
      /* The original unicode.org UnicodeData.txt file happens to have
         CR/LF line terminators.  Silently convert to LF.  */
      if (c == '\r')
        continue;

      /* Put c into the buffer, keeping room for the terminating NUL.  */
      if (++count >= FIELDLEN - 1)
        {
          fprintf (stderr, "field too long\n");
          exit (1);
        }
      *buffer++ = (char) c;
    }

  /* Terminate unconditionally, so that the caller never sees an
     unterminated buffer, not even after a failed read.  */
  *buffer = '\0';

  return (c != EOF);
}
129 | |
---|
130 | /* Stores in unicode_attributes[] the entire contents of the UnicodeData.txt |
---|
131 | file. */ |
---|
132 | static void |
---|
133 | fill_attributes (const char *unicodedata_filename) |
---|
134 | { |
---|
135 | unsigned int i, j; |
---|
136 | FILE *stream; |
---|
137 | char field0[FIELDLEN]; |
---|
138 | char field1[FIELDLEN]; |
---|
139 | char field2[FIELDLEN]; |
---|
140 | char field3[FIELDLEN]; |
---|
141 | char field4[FIELDLEN]; |
---|
142 | char field5[FIELDLEN]; |
---|
143 | char field6[FIELDLEN]; |
---|
144 | char field7[FIELDLEN]; |
---|
145 | char field8[FIELDLEN]; |
---|
146 | char field9[FIELDLEN]; |
---|
147 | char field10[FIELDLEN]; |
---|
148 | char field11[FIELDLEN]; |
---|
149 | char field12[FIELDLEN]; |
---|
150 | char field13[FIELDLEN]; |
---|
151 | char field14[FIELDLEN]; |
---|
152 | int lineno = 0; |
---|
153 | |
---|
154 | for (i = 0; i < 0x10000; i++) |
---|
155 | unicode_attributes[i].name = NULL; |
---|
156 | |
---|
157 | stream = fopen (unicodedata_filename, "r"); |
---|
158 | if (stream == NULL) |
---|
159 | { |
---|
160 | fprintf (stderr, "error during fopen of '%s'\n", unicodedata_filename); |
---|
161 | exit (1); |
---|
162 | } |
---|
163 | |
---|
164 | for (;;) |
---|
165 | { |
---|
166 | int n; |
---|
167 | |
---|
168 | lineno++; |
---|
169 | n = getfield (stream, field0, ';'); |
---|
170 | n += getfield (stream, field1, ';'); |
---|
171 | n += getfield (stream, field2, ';'); |
---|
172 | n += getfield (stream, field3, ';'); |
---|
173 | n += getfield (stream, field4, ';'); |
---|
174 | n += getfield (stream, field5, ';'); |
---|
175 | n += getfield (stream, field6, ';'); |
---|
176 | n += getfield (stream, field7, ';'); |
---|
177 | n += getfield (stream, field8, ';'); |
---|
178 | n += getfield (stream, field9, ';'); |
---|
179 | n += getfield (stream, field10, ';'); |
---|
180 | n += getfield (stream, field11, ';'); |
---|
181 | n += getfield (stream, field12, ';'); |
---|
182 | n += getfield (stream, field13, ';'); |
---|
183 | n += getfield (stream, field14, '\n'); |
---|
184 | if (n == 0) |
---|
185 | break; |
---|
186 | if (n != 15) |
---|
187 | { |
---|
188 | fprintf (stderr, "short line in'%s':%d\n", |
---|
189 | unicodedata_filename, lineno); |
---|
190 | exit (1); |
---|
191 | } |
---|
192 | i = strtoul (field0, NULL, 16); |
---|
193 | if (field1[0] == '<' |
---|
194 | && strlen (field1) >= 9 |
---|
195 | && !strcmp (field1 + strlen(field1) - 8, ", First>")) |
---|
196 | { |
---|
197 | /* Deal with a range. */ |
---|
198 | lineno++; |
---|
199 | n = getfield (stream, field0, ';'); |
---|
200 | n += getfield (stream, field1, ';'); |
---|
201 | n += getfield (stream, field2, ';'); |
---|
202 | n += getfield (stream, field3, ';'); |
---|
203 | n += getfield (stream, field4, ';'); |
---|
204 | n += getfield (stream, field5, ';'); |
---|
205 | n += getfield (stream, field6, ';'); |
---|
206 | n += getfield (stream, field7, ';'); |
---|
207 | n += getfield (stream, field8, ';'); |
---|
208 | n += getfield (stream, field9, ';'); |
---|
209 | n += getfield (stream, field10, ';'); |
---|
210 | n += getfield (stream, field11, ';'); |
---|
211 | n += getfield (stream, field12, ';'); |
---|
212 | n += getfield (stream, field13, ';'); |
---|
213 | n += getfield (stream, field14, '\n'); |
---|
214 | if (n != 15) |
---|
215 | { |
---|
216 | fprintf (stderr, "missing end range in '%s':%d\n", |
---|
217 | unicodedata_filename, lineno); |
---|
218 | exit (1); |
---|
219 | } |
---|
220 | if (!(field1[0] == '<' |
---|
221 | && strlen (field1) >= 8 |
---|
222 | && !strcmp (field1 + strlen (field1) - 7, ", Last>"))) |
---|
223 | { |
---|
224 | fprintf (stderr, "missing end range in '%s':%d\n", |
---|
225 | unicodedata_filename, lineno); |
---|
226 | exit (1); |
---|
227 | } |
---|
228 | field1[strlen (field1) - 7] = '\0'; |
---|
229 | j = strtoul (field0, NULL, 16); |
---|
230 | for (; i <= j; i++) |
---|
231 | fill_attribute (i, field1+1, field2, field3, field4, field5, |
---|
232 | field6, field7, field8, field9, field10, |
---|
233 | field11, field12, field13, field14); |
---|
234 | } |
---|
235 | else |
---|
236 | { |
---|
237 | /* Single character line */ |
---|
238 | fill_attribute (i, field1, field2, field3, field4, field5, |
---|
239 | field6, field7, field8, field9, field10, |
---|
240 | field11, field12, field13, field14); |
---|
241 | } |
---|
242 | } |
---|
243 | if (ferror (stream) || fclose (stream)) |
---|
244 | { |
---|
245 | fprintf (stderr, "error reading from '%s'\n", unicodedata_filename); |
---|
246 | exit (1); |
---|
247 | } |
---|
248 | } |
---|
249 | |
---|
/* The combining property from the PropList.txt file:
   unicode_combining[ch] is 1 if ch is listed as combining, else 0.  */
char unicode_combining[0x10000];

/* Stores in unicode_combining[] the Combining property from the
   PropList.txt file.
   The file is scanned for the first line containing "(Combining)"; the
   lines after it, up to a line starting with '*', each give either a
   single code point "XXXX" or a range "XXXX..YYYY" in hexadecimal.
   Exits with an error message if the file cannot be opened or read, if
   the section is missing or unterminated, or if a line cannot be
   parsed.  */
static void
fill_combining (const char *proplist_filename)
{
  FILE *fp;
  char line[100+1];
  unsigned int ch;

  memset (unicode_combining, 0, sizeof unicode_combining);

  fp = fopen (proplist_filename, "r");
  if (fp == NULL)
    {
      fprintf (stderr, "error during fopen of '%s'\n", proplist_filename);
      exit (1);
    }

  /* Skip ahead to the "Property dump for: 0x20000004 (Combining)" line.  */
  for (;;)
    {
      if (fscanf (fp, "%100[^\n]\n", line) < 1)
        {
          fprintf (stderr, "no combining property found in '%s'\n",
                   proplist_filename);
          exit (1);
        }
      if (strstr (line, "(Combining)") != NULL)
        break;
    }

  /* Process the entries of the section.  */
  for (;;)
    {
      unsigned int first, last;
      size_t len;
      int parsed;

      if (fscanf (fp, "%100[^\n]\n", line) < 1)
        {
          fprintf (stderr, "premature end of combining property in '%s'\n",
                   proplist_filename);
          exit (1);
        }
      /* A line of asterisks ends the section.  */
      if (line[0] == '*')
        break;

      len = strlen (line);
      if (len >= 10 && line[4] == '.' && line[5] == '.')
        {
          /* Range of code points.  */
          parsed = (sscanf (line, "%4X..%4X", &first, &last) >= 2);
        }
      else if (len >= 4)
        {
          /* Single code point.  */
          parsed = (sscanf (line, "%4X", &first) >= 1);
          if (parsed)
            last = first;
        }
      else
        parsed = 0;

      if (!parsed)
        {
          fprintf (stderr, "parse error in combining property in '%s'\n",
                   proplist_filename);
          exit (1);
        }

      for (ch = first; ch <= last; ch++)
        unicode_combining[ch] = 1;
    }

  if (ferror (fp) || fclose (fp))
    {
      fprintf (stderr, "error reading from '%s'\n", proplist_filename);
      exit (1);
    }
}
330 | |
---|
331 | /* The width property from the EastAsianWidth.txt file. |
---|
332 | Each is NULL (unassigned) or "N", "A", "H", "W", "F", "Na". */ |
---|
333 | const char * unicode_width[0x10000]; |
---|
334 | |
---|
335 | /* Stores in unicode_width[] the width property from the PropList.txt |
---|
336 | file. */ |
---|
337 | static void |
---|
338 | fill_width (const char *width_filename) |
---|
339 | { |
---|
340 | unsigned int i, j; |
---|
341 | FILE *stream; |
---|
342 | char field0[FIELDLEN]; |
---|
343 | char field1[FIELDLEN]; |
---|
344 | char field2[FIELDLEN]; |
---|
345 | int lineno = 0; |
---|
346 | |
---|
347 | for (i = 0; i < 0x10000; i++) |
---|
348 | unicode_width[i] = (unicode_attributes[i].name != NULL ? "N" : NULL); |
---|
349 | |
---|
350 | stream = fopen (width_filename, "r"); |
---|
351 | if (stream == NULL) |
---|
352 | { |
---|
353 | fprintf (stderr, "error during fopen of '%s'\n", width_filename); |
---|
354 | exit (1); |
---|
355 | } |
---|
356 | |
---|
357 | for (;;) |
---|
358 | { |
---|
359 | int n; |
---|
360 | int c; |
---|
361 | |
---|
362 | lineno++; |
---|
363 | c = getc (stream); |
---|
364 | if (c == EOF) |
---|
365 | break; |
---|
366 | if (c == '#') |
---|
367 | { |
---|
368 | do c = getc (stream); while (c != EOF && c != '\n'); |
---|
369 | continue; |
---|
370 | } |
---|
371 | ungetc (c, stream); |
---|
372 | n = getfield (stream, field0, ';'); |
---|
373 | n += getfield (stream, field1, ';'); |
---|
374 | n += getfield (stream, field2, '\n'); |
---|
375 | if (n == 0) |
---|
376 | break; |
---|
377 | if (n != 3) |
---|
378 | { |
---|
379 | fprintf (stderr, "short line in '%s':%d\n", width_filename, lineno); |
---|
380 | exit (1); |
---|
381 | } |
---|
382 | i = strtoul (field0, NULL, 16); |
---|
383 | if (field2[0] == '<' |
---|
384 | && strlen (field2) >= 9 |
---|
385 | && !strcmp (field2 + strlen(field2) - 8, ", First>")) |
---|
386 | { |
---|
387 | /* Deal with a range. */ |
---|
388 | lineno++; |
---|
389 | n = getfield (stream, field0, ';'); |
---|
390 | n += getfield (stream, field1, ';'); |
---|
391 | n += getfield (stream, field2, '\n'); |
---|
392 | if (n != 3) |
---|
393 | { |
---|
394 | fprintf (stderr, "missing end range in '%s':%d\n", |
---|
395 | width_filename, lineno); |
---|
396 | exit (1); |
---|
397 | } |
---|
398 | if (!(field2[0] == '<' |
---|
399 | && strlen (field2) >= 8 |
---|
400 | && !strcmp (field2 + strlen (field2) - 7, ", Last>"))) |
---|
401 | { |
---|
402 | fprintf (stderr, "missing end range in '%s':%d\n", |
---|
403 | width_filename, lineno); |
---|
404 | exit (1); |
---|
405 | } |
---|
406 | field2[strlen (field2) - 7] = '\0'; |
---|
407 | j = strtoul (field0, NULL, 16); |
---|
408 | for (; i <= j; i++) |
---|
409 | unicode_width[i] = strdup (field1); |
---|
410 | } |
---|
411 | else |
---|
412 | { |
---|
413 | /* Single character line */ |
---|
414 | unicode_width[i] = strdup (field1); |
---|
415 | } |
---|
416 | } |
---|
417 | if (ferror (stream) || fclose (stream)) |
---|
418 | { |
---|
419 | fprintf (stderr, "error reading from '%s'\n", width_filename); |
---|
420 | exit (1); |
---|
421 | } |
---|
422 | } |
---|
423 | |
---|
/* Line breaking classification.  */

/* The line breaking classes, listed by numeric value.
   Not represented here:
     CR (carriage return) - not used here because it's a DOSism,
     LF (line feed)       - not used here because it's a DOSism,
     SG (surrogates)      - not used here because they are not characters.  */
enum
{
  LBP_BK =  0, /* mandatory break */
  LBP_ZW =  1, /* zero width space */
  LBP_IN =  2, /* inseparable */
  LBP_GL =  3, /* non-breaking (glue) */
  LBP_BA =  4, /* break opportunity after */
  LBP_BB =  5, /* break opportunity before */
  LBP_B2 =  6, /* break opportunity before and after */
  LBP_HY =  7, /* hyphen */
  LBP_NS =  8, /* non starter */
  LBP_OP =  9, /* opening punctuation */
  LBP_CL = 10, /* closing punctuation */
  LBP_QU = 11, /* ambiguous quotation */
  LBP_EX = 12, /* exclamation/interrogation */
  LBP_ID = 13, /* ideographic */
  LBP_NU = 14, /* numeric */
  LBP_IS = 15, /* infix separator (numeric) */
  LBP_SY = 16, /* symbols allowing breaks */
  LBP_AL = 17, /* ordinary alphabetic and symbol characters */
  LBP_PR = 18, /* prefix (numeric) */
  LBP_PO = 19, /* postfix (numeric) */

  /* Values >= 20 are resolved at run time.  */
  LBP_CM = 20, /* attached characters and combining marks */
  LBP_SP = 21, /* space */
  LBP_CB = 22, /* contingent break opportunity */
  LBP_SA = 23, /* complex context (South East Asian) */
  LBP_AI = 24, /* ambiguous (alphabetic or ideograph) */
  LBP_XX = 25  /* unknown */
};
459 | |
---|
460 | /* Returns the line breaking classification for ch, as a bit mask. */ |
---|
461 | static int |
---|
462 | get_lbp (unsigned int ch) |
---|
463 | { |
---|
464 | int attr = 0; |
---|
465 | |
---|
466 | if (unicode_attributes[ch].name != NULL) |
---|
467 | { |
---|
468 | /* mandatory break */ |
---|
469 | if (ch == 0x000A || ch == 0x000D || ch == 0x0085 /* newline */ |
---|
470 | || ch == 0x000C /* form feed */ |
---|
471 | || ch == 0x2028 /* LINE SEPARATOR */ |
---|
472 | || ch == 0x2029 /* PARAGRAPH SEPARATOR */) |
---|
473 | attr |= 1 << LBP_BK; |
---|
474 | |
---|
475 | /* zero width space */ |
---|
476 | if (ch == 0x200B /* ZERO WIDTH SPACE */) |
---|
477 | attr |= 1 << LBP_ZW; |
---|
478 | |
---|
479 | /* inseparable */ |
---|
480 | if (ch == 0x2024 /* ONE DOT LEADER */ |
---|
481 | || ch == 0x2025 /* TWO DOT LEADER */ |
---|
482 | || ch == 0x2026 /* HORIZONTAL ELLIPSIS */) |
---|
483 | attr |= 1 << LBP_IN; |
---|
484 | |
---|
485 | /* non-breaking (glue) */ |
---|
486 | if (ch == 0xFEFF /* ZERO WIDTH NO-BREAK SPACE */ |
---|
487 | || ch == 0x00A0 /* NO-BREAK SPACE */ |
---|
488 | || ch == 0x202F /* NARROW NO-BREAK SPACE */ |
---|
489 | || ch == 0x2007 /* FIGURE SPACE */ |
---|
490 | || ch == 0x2011 /* NON-BREAKING HYPHEN */ |
---|
491 | || ch == 0x0F0C /* TIBETAN MARK DELIMITER TSHEG BSTAR */) |
---|
492 | attr |= 1 << LBP_GL; |
---|
493 | |
---|
494 | /* contingent break opportunity */ |
---|
495 | if (ch == 0xFFFC /* OBJECT REPLACEMENT CHARACTER */) |
---|
496 | attr |= 1 << LBP_CB; |
---|
497 | |
---|
498 | /* space */ |
---|
499 | if (ch == 0x0020 /* SPACE */) |
---|
500 | attr |= 1 << LBP_SP; |
---|
501 | |
---|
502 | /* break opportunity after */ |
---|
503 | if (ch == 0x2000 /* EN QUAD */ |
---|
504 | || ch == 0x2001 /* EM QUAD */ |
---|
505 | || ch == 0x2002 /* EN SPACE */ |
---|
506 | || ch == 0x2003 /* EM SPACE */ |
---|
507 | || ch == 0x2004 /* THREE-PER-EM SPACE */ |
---|
508 | || ch == 0x2005 /* FOUR-PER-EM SPACE */ |
---|
509 | || ch == 0x2006 /* SIX-PER-EM SPACE */ |
---|
510 | || ch == 0x2008 /* PUNCTUATION SPACE */ |
---|
511 | || ch == 0x2009 /* THIN SPACE */ |
---|
512 | || ch == 0x200A /* HAIR SPACE */ |
---|
513 | || ch == 0x0009 /* tab */ |
---|
514 | || ch == 0x2010 /* HYPHEN */ |
---|
515 | || ch == 0x058A /* ARMENIAN HYPHEN */ |
---|
516 | || ch == 0x00AD /* SOFT HYPHEN */ |
---|
517 | || ch == 0x0F0B /* TIBETAN MARK INTERSYLLABIC TSHEG */ |
---|
518 | || ch == 0x1361 /* ETHIOPIC WORDSPACE */ |
---|
519 | || ch == 0x1680 /* OGHAM SPACE MARK */ |
---|
520 | || ch == 0x17D5 /* KHMER SIGN BARIYOOSAN */ |
---|
521 | || ch == 0x2027 /* HYPHENATION POINT */ |
---|
522 | || ch == 0x007C /* VERTICAL LINE */) |
---|
523 | attr |= 1 << LBP_BA; |
---|
524 | |
---|
525 | /* break opportunity before */ |
---|
526 | if (ch == 0x00B4 /* ACUTE ACCENT */ |
---|
527 | || ch == 0x02C8 /* MODIFIER LETTER VERTICAL LINE */ |
---|
528 | || ch == 0x02CC /* MODIFIER LETTER LOW VERTICAL LINE */ |
---|
529 | || ch == 0x1806 /* MONGOLIAN TODO SOFT HYPHEN */) |
---|
530 | attr |= 1 << LBP_BB; |
---|
531 | |
---|
532 | /* break opportunity before and after */ |
---|
533 | if (ch == 0x2014 /* EM DASH */) |
---|
534 | attr |= 1 << LBP_B2; |
---|
535 | |
---|
536 | /* hyphen */ |
---|
537 | if (ch == 0x002D /* HYPHEN-MINUS */) |
---|
538 | attr |= 1 << LBP_HY; |
---|
539 | |
---|
540 | /* exclamation/interrogation */ |
---|
541 | if (ch == 0x0021 /* EXCLAMATION MARK */ |
---|
542 | || ch == 0x003F /* QUESTION MARK */ |
---|
543 | || ch == 0xFE56 /* SMALL QUESTION MARK */ |
---|
544 | || ch == 0xFE57 /* SMALL EXCLAMATION MARK */ |
---|
545 | || ch == 0xFF01 /* FULLWIDTH EXCLAMATION MARK */ |
---|
546 | || ch == 0xFF1F /* FULLWIDTH QUESTION MARK */) |
---|
547 | attr |= 1 << LBP_EX; |
---|
548 | |
---|
549 | /* opening punctuation */ |
---|
550 | if (unicode_attributes[ch].category[0] == 'P' |
---|
551 | && unicode_attributes[ch].category[1] == 's') |
---|
552 | attr |= 1 << LBP_OP; |
---|
553 | |
---|
554 | /* closing punctuation */ |
---|
555 | if (ch == 0x3001 /* IDEOGRAPHIC COMMA */ |
---|
556 | || ch == 0x3002 /* IDEOGRAPHIC FULL STOP */ |
---|
557 | || ch == 0xFF0C /* FULLWIDTH COMMA */ |
---|
558 | || ch == 0xFF0E /* FULLWIDTH FULL STOP */ |
---|
559 | || ch == 0xFE50 /* SMALL COMMA */ |
---|
560 | || ch == 0xFE52 /* SMALL FULL STOP */ |
---|
561 | || ch == 0xFF61 /* HALFWIDTH IDEOGRAPHIC FULL STOP */ |
---|
562 | || ch == 0xFF64 /* HALFWIDTH IDEOGRAPHIC COMMA */ |
---|
563 | || (unicode_attributes[ch].category[0] == 'P' |
---|
564 | && unicode_attributes[ch].category[1] == 'e')) |
---|
565 | attr |= 1 << LBP_CL; |
---|
566 | |
---|
567 | /* ambiguous quotation */ |
---|
568 | if (ch == 0x0022 /* QUOTATION MARK */ |
---|
569 | || ch == 0x0027 /* APOSTROPHE */ |
---|
570 | || (unicode_attributes[ch].category[0] == 'P' |
---|
571 | && (unicode_attributes[ch].category[1] == 'f' |
---|
572 | || unicode_attributes[ch].category[1] == 'i'))) |
---|
573 | attr |= 1 << LBP_QU; |
---|
574 | |
---|
575 | /* attached characters and combining marks */ |
---|
576 | if ((unicode_attributes[ch].category[0] == 'M' |
---|
577 | && (unicode_attributes[ch].category[1] == 'n' |
---|
578 | || unicode_attributes[ch].category[1] == 'c' |
---|
579 | || unicode_attributes[ch].category[1] == 'e')) |
---|
580 | || (ch >= 0x1160 && ch <= 0x11F9) |
---|
581 | || (unicode_attributes[ch].category[0] == 'C' |
---|
582 | && (unicode_attributes[ch].category[1] == 'c' |
---|
583 | || unicode_attributes[ch].category[1] == 'f'))) |
---|
584 | if (!(attr & ((1 << LBP_BK) | (1 << LBP_BA) | (1 << LBP_GL)))) |
---|
585 | attr |= 1 << LBP_CM; |
---|
586 | |
---|
587 | /* non starter */ |
---|
588 | if (ch == 0x0E5A /* THAI CHARACTER ANGKHANKHU */ |
---|
589 | || ch == 0x0E5B /* THAI CHARACTER KHOMUT */ |
---|
590 | || ch == 0x17D4 /* KHMER SIGN KHAN */ |
---|
591 | || ch == 0x17D6 /* KHMER SIGN CAMNUC PII KUUH */ |
---|
592 | || ch == 0x17D7 /* KHMER SIGN LEK TOO */ |
---|
593 | || ch == 0x17D8 /* KHMER SIGN BEYYAL */ |
---|
594 | || ch == 0x17D9 /* KHMER SIGN PHNAEK MUAN */ |
---|
595 | || ch == 0x17DA /* KHMER SIGN KOOMUUT */ |
---|
596 | || ch == 0x203C /* DOUBLE EXCLAMATION MARK */ |
---|
597 | || ch == 0x2044 /* FRACTION SLASH */ |
---|
598 | || ch == 0x301C /* WAVE DASH */ |
---|
599 | || ch == 0x30FB /* KATAKANA MIDDLE DOT */ |
---|
600 | || ch == 0x3005 /* IDEOGRAPHIC ITERATION MARK */ |
---|
601 | || ch == 0x309B /* KATAKANA-HIRAGANA VOICED SOUND MARK */ |
---|
602 | || ch == 0x309C /* KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK */ |
---|
603 | || ch == 0x309D /* HIRAGANA ITERATION MARK */ |
---|
604 | || ch == 0x309E /* HIRAGANA VOICED ITERATION MARK */ |
---|
605 | || ch == 0x30FD /* KATAKANA ITERATION MARK */ |
---|
606 | || ch == 0xFE54 /* SMALL SEMICOLON */ |
---|
607 | || ch == 0xFE55 /* SMALL COLON */ |
---|
608 | || ch == 0xFF1A /* FULLWIDTH COLON */ |
---|
609 | || ch == 0xFF1B /* FULLWIDTH SEMICOLON */ |
---|
610 | || ch == 0xFF65 /* HALFWIDTH KATAKANA MIDDLE DOT */ |
---|
611 | || ch == 0xFF70 /* HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK */ |
---|
612 | || (unicode_attributes[ch].category[0] == 'L' |
---|
613 | && unicode_attributes[ch].category[1] == 'm' |
---|
614 | && (unicode_width[ch][0] == 'W' |
---|
615 | || unicode_width[ch][0] == 'H')) |
---|
616 | || (unicode_attributes[ch].category[0] == 'S' |
---|
617 | && unicode_attributes[ch].category[1] == 'k' |
---|
618 | && unicode_width[ch][0] == 'W') |
---|
619 | || strstr (unicode_attributes[ch].name, "HIRAGANA LETTER SMALL ") != NULL |
---|
620 | || strstr (unicode_attributes[ch].name, "KATAKANA LETTER SMALL ") != NULL) |
---|
621 | attr |= 1 << LBP_NS; |
---|
622 | |
---|
623 | /* numeric */ |
---|
624 | if (unicode_attributes[ch].category[0] == 'N' |
---|
625 | && unicode_attributes[ch].category[1] == 'd' |
---|
626 | && strstr (unicode_attributes[ch].name, "FULLWIDTH") == NULL) |
---|
627 | attr |= 1 << LBP_NU; |
---|
628 | |
---|
629 | /* infix separator (numeric) */ |
---|
630 | if (ch == 0x002C /* COMMA */ |
---|
631 | || ch == 0x002E /* FULL STOP */ |
---|
632 | || ch == 0x003A /* COLON */ |
---|
633 | || ch == 0x003B /* SEMICOLON */ |
---|
634 | || ch == 0x0589 /* ARMENIAN FULL STOP */) |
---|
635 | attr |= 1 << LBP_IS; |
---|
636 | |
---|
637 | /* symbols allowing breaks */ |
---|
638 | if (ch == 0x002F /* SOLIDUS */) |
---|
639 | attr |= 1 << LBP_SY; |
---|
640 | |
---|
641 | /* postfix (numeric) */ |
---|
642 | if (ch == 0x0025 /* PERCENT SIGN */ |
---|
643 | || ch == 0x00A2 /* CENT SIGN */ |
---|
644 | || ch == 0x00B0 /* DEGREE SIGN */ |
---|
645 | || ch == 0x2030 /* PER MILLE SIGN */ |
---|
646 | || ch == 0x2031 /* PER TEN THOUSAND SIGN */ |
---|
647 | || ch == 0x2032 /* PRIME */ |
---|
648 | || ch == 0x2033 /* DOUBLE PRIME */ |
---|
649 | || ch == 0x2034 /* TRIPLE PRIME */ |
---|
650 | || ch == 0x2035 /* REVERSED PRIME */ |
---|
651 | || ch == 0x20A7 /* PESETA SIGN */ |
---|
652 | || ch == 0x2103 /* DEGREE CELSIUS */ |
---|
653 | || ch == 0x2109 /* DEGREE FAHRENHEIT */ |
---|
654 | || ch == 0x2126 /* OHM SIGN */ |
---|
655 | || ch == 0xFE6A /* SMALL PERCENT SIGN */ |
---|
656 | || ch == 0xFF05 /* FULLWIDTH PERCENT SIGN */ |
---|
657 | || ch == 0xFFE0 /* FULLWIDTH DIGIT ZERO */) |
---|
658 | attr |= 1 << LBP_PO; |
---|
659 | |
---|
660 | /* prefix (numeric) */ |
---|
661 | if (ch == 0x002B /* PLUS SIGN */ |
---|
662 | || ch == 0x005C /* REVERSE SOLIDUS */ |
---|
663 | || ch == 0x00B1 /* PLUS-MINUS SIGN */ |
---|
664 | || ch == 0x2212 /* MINUS SIGN */ |
---|
665 | || ch == 0x2116 /* NUMERO SIGN */ |
---|
666 | || ch == 0x2213 /* MINUS-OR-PLUS SIGN */ |
---|
667 | || (unicode_attributes[ch].category[0] == 'S' |
---|
668 | && unicode_attributes[ch].category[1] == 'c')) |
---|
669 | if (!(attr & (1 << LBP_PO))) |
---|
670 | attr |= 1 << LBP_PR; |
---|
671 | |
---|
672 | /* complex context (South East Asian) */ |
---|
673 | if ((ch >= 0x0E00 && ch <= 0x0EFF) |
---|
674 | || (ch >= 0x1000 && ch <= 0x109F) |
---|
675 | || (ch >= 0x1780 && ch <= 0x17FF)) |
---|
676 | if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_NU) | (1 << LBP_BA) | (1 << LBP_PR)))) |
---|
677 | attr |= 1 << LBP_SA; |
---|
678 | |
---|
679 | /* ideographic */ |
---|
680 | if ((ch >= 0x4E00 && ch <= 0x9FAF) /* CJK Ideograph */ |
---|
681 | || (ch >= 0x3400 && ch <= 0x4DBF) /* CJK Ideograph Extension A */ |
---|
682 | || (ch >= 0xF900 && ch <= 0xFAFF) /* CJK COMPATIBILITY IDEOGRAPH */ |
---|
683 | || ch == 0x3000 /* IDEOGRAPHIC SPACE */ |
---|
684 | || (ch >= 0xAC00 && ch <= 0xD7AF) /* HANGUL SYLLABLE */ |
---|
685 | || (ch >= 0x3130 && ch <= 0x318F) /* HANGUL LETTER */ |
---|
686 | || (ch >= 0x1100 && ch <= 0x115F) /* HANGUL CHOSEONG */ |
---|
687 | || (ch >= 0xA000 && ch <= 0xA48C) /* YI SYLLABLE */ |
---|
688 | || (ch >= 0xA490 && ch <= 0xACFF) /* YI RADICAL */ |
---|
689 | || (ch >= 0x2E80 && ch <= 0x2FFF) /* CJK RADICAL, KANGXI RADICAL, IDEOGRAPHIC DESCRIPTION */ |
---|
690 | || ch == 0xFE62 /* SMALL PLUS SIGN */ |
---|
691 | || ch == 0xFE63 /* SMALL HYPHEN-MINUS */ |
---|
692 | || ch == 0xFE64 /* SMALL LESS-THAN SIGN */ |
---|
693 | || ch == 0xFE65 /* SMALL GREATER-THAN SIGN */ |
---|
694 | || ch == 0xFE66 /* SMALL EQUALS SIGN */ |
---|
695 | || (ch >= 0xFF10 && ch <= 0xFF19) /* FULLWIDTH DIGIT */ |
---|
696 | || strstr (unicode_attributes[ch].name, "FULLWIDTH LATIN ") != NULL |
---|
697 | || (ch >= 0x3000 && ch <= 0x33FF |
---|
698 | && !(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_OP) | (1 << LBP_CL))))) |
---|
699 | { |
---|
700 | /* ambiguous (ideograph) ? */ |
---|
701 | if (unicode_width[ch] != NULL |
---|
702 | && unicode_width[ch][0] == 'A') |
---|
703 | attr |= 1 << LBP_AI; |
---|
704 | else |
---|
705 | attr |= 1 << LBP_ID; |
---|
706 | } |
---|
707 | |
---|
708 | /* ordinary alphabetic and symbol characters */ |
---|
709 | if ((unicode_attributes[ch].category[0] == 'L' |
---|
710 | && (unicode_attributes[ch].category[1] == 'u' |
---|
711 | || unicode_attributes[ch].category[1] == 'l' |
---|
712 | || unicode_attributes[ch].category[1] == 't' |
---|
713 | || unicode_attributes[ch].category[1] == 'm' |
---|
714 | || unicode_attributes[ch].category[1] == 'o')) |
---|
715 | || (unicode_attributes[ch].category[0] == 'S' |
---|
716 | && (unicode_attributes[ch].category[1] == 'm' |
---|
717 | || unicode_attributes[ch].category[1] == 'c' |
---|
718 | || unicode_attributes[ch].category[1] == 'k' |
---|
719 | || unicode_attributes[ch].category[1] == 'o'))) |
---|
720 | if (!(attr & ((1 << LBP_CM) | (1 << LBP_NS) | (1 << LBP_ID) | (1 << LBP_BA) | (1 << LBP_BB) | (1 << LBP_PO) | (1 << LBP_PR) | (1 << LBP_SA) | (1 << LBP_CB)))) |
---|
721 | { |
---|
722 | /* ambiguous (alphabetic) ? */ |
---|
723 | if (unicode_width[ch] != NULL |
---|
724 | && unicode_width[ch][0] == 'A') |
---|
725 | attr |= 1 << LBP_AI; |
---|
726 | else |
---|
727 | attr |= 1 << LBP_AL; |
---|
728 | } |
---|
729 | } |
---|
730 | |
---|
731 | if (attr == 0) |
---|
732 | /* unknown */ |
---|
733 | attr |= 1 << LBP_XX; |
---|
734 | |
---|
735 | return attr; |
---|
736 | } |
---|
737 | |
---|
738 | /* Output the line breaking properties in a human readable format. */ |
---|
739 | static void |
---|
740 | debug_output_lbp (FILE *stream) |
---|
741 | { |
---|
742 | unsigned int i; |
---|
743 | |
---|
744 | for (i = 0; i < 0x10000; i++) |
---|
745 | { |
---|
746 | int attr = get_lbp (i); |
---|
747 | if (attr != 1 << LBP_XX) |
---|
748 | { |
---|
749 | fprintf (stream, "0x%04X", i); |
---|
750 | #define PRINT_BIT(attr,bit) \ |
---|
751 | if (attr & (1 << bit)) fprintf (stream, " " ## #bit); |
---|
752 | PRINT_BIT(attr,LBP_BK); |
---|
753 | PRINT_BIT(attr,LBP_CM); |
---|
754 | PRINT_BIT(attr,LBP_ZW); |
---|
755 | PRINT_BIT(attr,LBP_IN); |
---|
756 | PRINT_BIT(attr,LBP_GL); |
---|
757 | PRINT_BIT(attr,LBP_CB); |
---|
758 | PRINT_BIT(attr,LBP_SP); |
---|
759 | PRINT_BIT(attr,LBP_BA); |
---|
760 | PRINT_BIT(attr,LBP_BB); |
---|
761 | PRINT_BIT(attr,LBP_B2); |
---|
762 | PRINT_BIT(attr,LBP_HY); |
---|
763 | PRINT_BIT(attr,LBP_NS); |
---|
764 | PRINT_BIT(attr,LBP_OP); |
---|
765 | PRINT_BIT(attr,LBP_CL); |
---|
766 | PRINT_BIT(attr,LBP_QU); |
---|
767 | PRINT_BIT(attr,LBP_EX); |
---|
768 | PRINT_BIT(attr,LBP_ID); |
---|
769 | PRINT_BIT(attr,LBP_NU); |
---|
770 | PRINT_BIT(attr,LBP_IS); |
---|
771 | PRINT_BIT(attr,LBP_SY); |
---|
772 | PRINT_BIT(attr,LBP_AL); |
---|
773 | PRINT_BIT(attr,LBP_PR); |
---|
774 | PRINT_BIT(attr,LBP_PO); |
---|
775 | PRINT_BIT(attr,LBP_SA); |
---|
776 | PRINT_BIT(attr,LBP_XX); |
---|
777 | PRINT_BIT(attr,LBP_AI); |
---|
778 | #undef PRINT_BIT |
---|
779 | fprintf (stream, "\n"); |
---|
780 | } |
---|
781 | } |
---|
782 | } |
---|
783 | |
---|
784 | /* Construction of sparse 3-level tables. */ |
---|
785 | #define TABLE lbp_table |
---|
786 | #define ELEMENT unsigned char |
---|
787 | #define DEFAULT LBP_XX |
---|
788 | #define xmalloc malloc |
---|
789 | #define xrealloc realloc |
---|
790 | #include "3level.h" |
---|
791 | |
---|
792 | static void |
---|
793 | output_lbp (FILE *stream) |
---|
794 | { |
---|
795 | unsigned int i; |
---|
796 | struct lbp_table t; |
---|
797 | unsigned int level1_offset, level2_offset, level3_offset; |
---|
798 | |
---|
799 | t.p = 7; |
---|
800 | t.q = 9; |
---|
801 | lbp_table_init (&t); |
---|
802 | |
---|
803 | for (i = 0; i < 0x10000; i++) |
---|
804 | { |
---|
805 | int attr = get_lbp (i); |
---|
806 | |
---|
807 | /* Now attr should contain exactly one bit. */ |
---|
808 | if (attr == 0 || ((attr & (attr - 1)) != 0)) |
---|
809 | abort (); |
---|
810 | |
---|
811 | if (attr != 1 << LBP_XX) |
---|
812 | { |
---|
813 | unsigned int log2_attr; |
---|
814 | for (log2_attr = 0; attr > 1; attr >>= 1, log2_attr++); |
---|
815 | |
---|
816 | lbp_table_add (&t, i, log2_attr); |
---|
817 | } |
---|
818 | } |
---|
819 | |
---|
820 | lbp_table_finalize (&t); |
---|
821 | |
---|
822 | level1_offset = |
---|
823 | 5 * sizeof (uint32_t); |
---|
824 | level2_offset = |
---|
825 | 5 * sizeof (uint32_t) |
---|
826 | + t.level1_size * sizeof (uint32_t); |
---|
827 | level3_offset = |
---|
828 | 5 * sizeof (uint32_t) |
---|
829 | + t.level1_size * sizeof (uint32_t) |
---|
830 | + (t.level2_size << t.q) * sizeof (uint32_t); |
---|
831 | |
---|
832 | for (i = 0; i < 5; i++) |
---|
833 | fprintf (stream, "#define lbrkprop_header_%d %d\n", i, |
---|
834 | ((uint32_t *) t.result)[i]); |
---|
835 | fprintf (stream, "static const\n"); |
---|
836 | fprintf (stream, "struct\n"); |
---|
837 | fprintf (stream, " {\n"); |
---|
838 | fprintf (stream, " int level1[%d];\n", t.level1_size); |
---|
839 | fprintf (stream, " int level2[%d << %d];\n", t.level2_size, t.q); |
---|
840 | fprintf (stream, " unsigned char level3[%d << %d];\n", t.level3_size, t.p); |
---|
841 | fprintf (stream, " }\n"); |
---|
842 | fprintf (stream, "lbrkprop =\n"); |
---|
843 | fprintf (stream, "{\n"); |
---|
844 | fprintf (stream, " { "); |
---|
845 | for (i = 0; i < t.level1_size; i++) |
---|
846 | fprintf (stream, "%d%s ", |
---|
847 | (((uint32_t *) (t.result + level1_offset))[i] - level2_offset) / sizeof (uint32_t), |
---|
848 | (i+1 < t.level1_size ? "," : "")); |
---|
849 | fprintf (stream, "},\n"); |
---|
850 | fprintf (stream, " {"); |
---|
851 | if (t.level2_size << t.q > 8) |
---|
852 | fprintf (stream, "\n "); |
---|
853 | for (i = 0; i < t.level2_size << t.q; i++) |
---|
854 | { |
---|
855 | if (i > 0 && (i % 8) == 0) |
---|
856 | fprintf (stream, "\n "); |
---|
857 | fprintf (stream, " %5d%s", |
---|
858 | (((uint32_t *) (t.result + level2_offset))[i] - level3_offset) / sizeof (uint8_t), |
---|
859 | (i+1 < t.level2_size << t.q ? "," : "")); |
---|
860 | } |
---|
861 | if (t.level2_size << t.q > 8) |
---|
862 | fprintf (stream, "\n "); |
---|
863 | fprintf (stream, " },\n"); |
---|
864 | fprintf (stream, " {"); |
---|
865 | if (t.level3_size << t.p > 8) |
---|
866 | fprintf (stream, "\n "); |
---|
867 | for (i = 0; i < t.level3_size << t.p; i++) |
---|
868 | { |
---|
869 | unsigned char value = ((unsigned char *) (t.result + level3_offset))[i]; |
---|
870 | const char *value_string; |
---|
871 | switch (value) |
---|
872 | { |
---|
873 | #define CASE(x) case x: value_string = #x; break; |
---|
874 | CASE(LBP_BK); |
---|
875 | CASE(LBP_CM); |
---|
876 | CASE(LBP_ZW); |
---|
877 | CASE(LBP_IN); |
---|
878 | CASE(LBP_GL); |
---|
879 | CASE(LBP_CB); |
---|
880 | CASE(LBP_SP); |
---|
881 | CASE(LBP_BA); |
---|
882 | CASE(LBP_BB); |
---|
883 | CASE(LBP_B2); |
---|
884 | CASE(LBP_HY); |
---|
885 | CASE(LBP_NS); |
---|
886 | CASE(LBP_OP); |
---|
887 | CASE(LBP_CL); |
---|
888 | CASE(LBP_QU); |
---|
889 | CASE(LBP_EX); |
---|
890 | CASE(LBP_ID); |
---|
891 | CASE(LBP_NU); |
---|
892 | CASE(LBP_IS); |
---|
893 | CASE(LBP_SY); |
---|
894 | CASE(LBP_AL); |
---|
895 | CASE(LBP_PR); |
---|
896 | CASE(LBP_PO); |
---|
897 | CASE(LBP_SA); |
---|
898 | CASE(LBP_XX); |
---|
899 | CASE(LBP_AI); |
---|
900 | #undef CASE |
---|
901 | default: |
---|
902 | abort (); |
---|
903 | } |
---|
904 | if (i > 0 && (i % 8) == 0) |
---|
905 | fprintf (stream, "\n "); |
---|
906 | fprintf (stream, " %s%s", value_string, |
---|
907 | (i+1 < t.level3_size << t.p ? "," : "")); |
---|
908 | } |
---|
909 | if (t.level3_size << t.p > 8) |
---|
910 | fprintf (stream, "\n "); |
---|
911 | fprintf (stream, " }\n"); |
---|
912 | fprintf (stream, "};\n"); |
---|
913 | } |
---|
914 | |
---|
/* Create (or truncate) the file FILENAME and write the human readable
   property dump produced by debug_output_lbp into it.  On failure to
   open or to write the file, a message is printed to stderr and the
   program exits with status 1.  */
static void
debug_output_tables (const char *filename)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  debug_output_lbp (out);

  /* Success means: no write error is pending and the close succeeds.
     The close is only attempted when no error is pending.  */
  if (!ferror (out) && fclose (out) == 0)
    return;

  fprintf (stderr, "error writing to '%s'\n", filename);
  exit (1);
}
---|
935 | |
---|
/* Create (or truncate) the file FILENAME and write the generated C
   source into it: a two-line comment header mentioning the Unicode
   VERSION, an empty line, and the table emitted by output_lbp.  On
   failure to open or to write the file, a message is printed to stderr
   and the program exits with status 1.  */
static void
output_tables (const char *filename, const char *version)
{
  FILE *out = fopen (filename, "w");

  if (out == NULL)
    {
      fprintf (stderr, "cannot open '%s' for writing\n", filename);
      exit (1);
    }

  /* File header.  */
  fputs ("/* Line breaking properties of Unicode characters. */\n", out);
  fprintf (out,
           "/* Generated automatically by gen-lbrkprop for Unicode %s. */\n",
           version);
  fputs ("\n", out);

  /* Table body.  */
  output_lbp (out);

  /* Success means: no write error is pending and the close succeeds.
     The close is only attempted when no error is pending.  */
  if (!ferror (out) && fclose (out) == 0)
    return;

  fprintf (stderr, "error writing to '%s'\n", filename);
  exit (1);
}
---|
961 | |
---|
/* Program entry point.
   Requires exactly four command line arguments; otherwise a usage
   message is printed to stderr and the program exits with status 1.
   The first three arguments are handed to fill_attributes,
   fill_combining and fill_width respectively; the fourth is the version
   string embedded in the generated header.  Afterwards the files
   "lbrkprop.txt" (debug dump) and "lbrkprop.h" (generated table) are
   written.  */
int
main (int argc, char * argv[])
{
  const char *unicodedata_filename;
  const char *proplist_filename;
  const char *width_filename;
  const char *version;

  if (argc != 5)
    {
      fprintf (stderr, "Usage: %s UnicodeData.txt PropList.txt EastAsianWidth.txt version\n",
               argv[0]);
      exit (1);
    }

  unicodedata_filename = argv[1];
  proplist_filename = argv[2];
  width_filename = argv[3];
  version = argv[4];

  /* Load the input data.  */
  fill_attributes (unicodedata_filename);
  fill_combining (proplist_filename);
  fill_width (width_filename);

  /* Produce the outputs.  */
  debug_output_tables ("lbrkprop.txt");
  output_tables ("lbrkprop.h", version);

  return 0;
}
---|