1 | /* decomp.c - Character decomposition. |
---|
2 | * |
---|
3 | * Copyright (C) 1999, 2000 Tom Tromey |
---|
4 | * Copyright 2000 Red Hat, Inc. |
---|
5 | * |
---|
6 | * The Gnome Library is free software; you can redistribute it and/or |
---|
7 | * modify it under the terms of the GNU Lesser General Public License as |
---|
8 | * published by the Free Software Foundation; either version 2 of the |
---|
9 | * License, or (at your option) any later version. |
---|
10 | * |
---|
11 | * The Gnome Library is distributed in the hope that it will be useful, |
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
---|
14 | * Lesser General Public License for more details. |
---|
15 | * |
---|
16 | * You should have received a copy of the GNU Lesser General Public |
---|
17 | * License along with the Gnome Library; see the file COPYING.LIB. If not, |
---|
18 | * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
---|
19 | * Boston, MA 02111-1307, USA. |
---|
20 | */ |
---|
21 | |
---|
22 | #include "config.h" |
---|
23 | |
---|
24 | #include <stdlib.h> |
---|
25 | |
---|
26 | #include "glib.h" |
---|
27 | #include "gunidecomp.h" |
---|
28 | #include "gunicomp.h" |
---|
29 | #include "gunicodeprivate.h" |
---|
30 | |
---|
31 | |
---|
/* Two-level lookup of a character's combining class.
 *
 * A combining_class_table_partN[] entry is either a row number in
 * cclass_data (the row is then indexed with the low byte of the
 * character), or, when it is at least G_UNICODE_MAX_TABLE_INDEX, the
 * class itself stored with G_UNICODE_MAX_TABLE_INDEX added to it.
 */
#define CC_PART1(page, low) \
  ((combining_class_table_part1[page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[combining_class_table_part1[page]][low]) \
   : (combining_class_table_part1[page] - G_UNICODE_MAX_TABLE_INDEX))

#define CC_PART2(page, low) \
  ((combining_class_table_part2[page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (cclass_data[combining_class_table_part2[page]][low]) \
   : (combining_class_table_part2[page] - G_UNICODE_MAX_TABLE_INDEX))

/* Combining class of the character uc.  Characters up to
 * G_UNICODE_LAST_CHAR_PART1 use the first table, characters from 0xe0000
 * up to G_UNICODE_LAST_CHAR use the second one, and everything else
 * yields 0.  Note that uc is evaluated several times.
 */
#define COMBINING_CLASS(uc) \
  (((uc) > G_UNICODE_LAST_CHAR_PART1) \
   ? (((uc) >= 0xe0000 && (uc) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((uc) - 0xe0000) >> 8, (uc) & 0xff) \
      : 0) \
   : CC_PART1 ((uc) >> 8, (uc) & 0xff))
---|
48 | |
---|
49 | gint |
---|
50 | _g_unichar_combining_class (gunichar uc) |
---|
51 | { |
---|
52 | return COMBINING_CLASS (uc); |
---|
53 | } |
---|
54 | |
---|
/* Constants for Hangul syllable [de]composition.
 *
 * SBase, LBase, VBase and TBase are the code points that the syllable,
 * L, V and T indices are counted from; LCount, VCount and TCount are the
 * sizes of the corresponding index ranges.
 */
enum
{
  SBase  = 0xAC00,
  LBase  = 0x1100,
  VBase  = 0x1161,
  TBase  = 0x11A7,
  LCount = 19,
  VCount = 21,
  TCount = 28,
  NCount = VCount * TCount,   /* syllables sharing one L */
  SCount = LCount * NCount    /* size of the whole syllable block */
};
---|
65 | |
---|
66 | /** |
---|
67 | * g_unicode_canonical_ordering: |
---|
68 | * @string: a UCS-4 encoded string. |
---|
69 | * @len: the maximum length of @string to use. |
---|
70 | * |
---|
71 | * Computes the canonical ordering of a string in-place. |
---|
72 | * This rearranges decomposed characters in the string |
---|
73 | * according to their combining classes. See the Unicode |
---|
74 | * manual for more information. |
---|
75 | **/ |
---|
76 | void |
---|
77 | g_unicode_canonical_ordering (gunichar *string, |
---|
78 | gsize len) |
---|
79 | { |
---|
80 | gsize i; |
---|
81 | int swap = 1; |
---|
82 | |
---|
83 | while (swap) |
---|
84 | { |
---|
85 | int last; |
---|
86 | swap = 0; |
---|
87 | last = COMBINING_CLASS (string[0]); |
---|
88 | for (i = 0; i < len - 1; ++i) |
---|
89 | { |
---|
90 | int next = COMBINING_CLASS (string[i + 1]); |
---|
91 | if (next != 0 && last > next) |
---|
92 | { |
---|
93 | gsize j; |
---|
94 | /* Percolate item leftward through string. */ |
---|
95 | for (j = i + 1; j > 0; --j) |
---|
96 | { |
---|
97 | gunichar t; |
---|
98 | if (COMBINING_CLASS (string[j - 1]) <= next) |
---|
99 | break; |
---|
100 | t = string[j]; |
---|
101 | string[j] = string[j - 1]; |
---|
102 | string[j - 1] = t; |
---|
103 | swap = 1; |
---|
104 | } |
---|
105 | /* We're re-entering the loop looking at the old |
---|
106 | character again. */ |
---|
107 | next = last; |
---|
108 | } |
---|
109 | last = next; |
---|
110 | } |
---|
111 | } |
---|
112 | } |
---|
113 | |
---|
114 | /* http://www.unicode.org/unicode/reports/tr15/#Hangul |
---|
115 | * r should be null or have sufficient space. Calling with r == NULL will |
---|
116 | * only calculate the result_len; however, a buffer with space for three |
---|
117 | * characters will always be big enough. */ |
---|
118 | static void |
---|
119 | decompose_hangul (gunichar s, |
---|
120 | gunichar *r, |
---|
121 | gsize *result_len) |
---|
122 | { |
---|
123 | gint SIndex = s - SBase; |
---|
124 | |
---|
125 | /* not a hangul syllable */ |
---|
126 | if (SIndex < 0 || SIndex >= SCount) |
---|
127 | { |
---|
128 | if (r) |
---|
129 | r[0] = s; |
---|
130 | *result_len = 1; |
---|
131 | } |
---|
132 | else |
---|
133 | { |
---|
134 | gunichar L = LBase + SIndex / NCount; |
---|
135 | gunichar V = VBase + (SIndex % NCount) / TCount; |
---|
136 | gunichar T = TBase + SIndex % TCount; |
---|
137 | |
---|
138 | if (r) |
---|
139 | { |
---|
140 | r[0] = L; |
---|
141 | r[1] = V; |
---|
142 | } |
---|
143 | |
---|
144 | if (T != TBase) |
---|
145 | { |
---|
146 | if (r) |
---|
147 | r[2] = T; |
---|
148 | *result_len = 3; |
---|
149 | } |
---|
150 | else |
---|
151 | *result_len = 2; |
---|
152 | } |
---|
153 | } |
---|
154 | |
---|
155 | /* returns a pointer to a null-terminated UTF-8 string */ |
---|
156 | static const gchar * |
---|
157 | find_decomposition (gunichar ch, |
---|
158 | gboolean compat) |
---|
159 | { |
---|
160 | int start = 0; |
---|
161 | int end = G_N_ELEMENTS (decomp_table); |
---|
162 | |
---|
163 | if (ch >= decomp_table[start].ch && |
---|
164 | ch <= decomp_table[end - 1].ch) |
---|
165 | { |
---|
166 | while (TRUE) |
---|
167 | { |
---|
168 | int half = (start + end) / 2; |
---|
169 | if (ch == decomp_table[half].ch) |
---|
170 | { |
---|
171 | int offset; |
---|
172 | |
---|
173 | if (compat) |
---|
174 | { |
---|
175 | offset = decomp_table[half].compat_offset; |
---|
176 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) |
---|
177 | offset = decomp_table[half].canon_offset; |
---|
178 | } |
---|
179 | else |
---|
180 | { |
---|
181 | offset = decomp_table[half].canon_offset; |
---|
182 | if (offset == G_UNICODE_NOT_PRESENT_OFFSET) |
---|
183 | return NULL; |
---|
184 | } |
---|
185 | |
---|
186 | return &(decomp_expansion_string[offset]); |
---|
187 | } |
---|
188 | else if (half == start) |
---|
189 | break; |
---|
190 | else if (ch > decomp_table[half].ch) |
---|
191 | start = half; |
---|
192 | else |
---|
193 | end = half; |
---|
194 | } |
---|
195 | } |
---|
196 | |
---|
197 | return NULL; |
---|
198 | } |
---|
199 | |
---|
200 | /** |
---|
201 | * g_unicode_canonical_decomposition: |
---|
202 | * @ch: a Unicode character. |
---|
203 | * @result_len: location to store the length of the return value. |
---|
204 | * |
---|
205 | * Computes the canonical decomposition of a Unicode character. |
---|
206 | * |
---|
207 | * Return value: a newly allocated string of Unicode characters. |
---|
208 | * @result_len is set to the resulting length of the string. |
---|
209 | **/ |
---|
210 | gunichar * |
---|
211 | g_unicode_canonical_decomposition (gunichar ch, |
---|
212 | gsize *result_len) |
---|
213 | { |
---|
214 | const gchar *decomp; |
---|
215 | const gchar *p; |
---|
216 | gunichar *r; |
---|
217 | |
---|
218 | /* Hangul syllable */ |
---|
219 | if (ch >= 0xac00 && ch <= 0xd7af) |
---|
220 | { |
---|
221 | decompose_hangul (ch, NULL, result_len); |
---|
222 | r = g_malloc (*result_len * sizeof (gunichar)); |
---|
223 | decompose_hangul (ch, r, result_len); |
---|
224 | } |
---|
225 | else if ((decomp = find_decomposition (ch, FALSE)) != NULL) |
---|
226 | { |
---|
227 | /* Found it. */ |
---|
228 | int i; |
---|
229 | |
---|
230 | *result_len = g_utf8_strlen (decomp, -1); |
---|
231 | r = g_malloc (*result_len * sizeof (gunichar)); |
---|
232 | |
---|
233 | for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++) |
---|
234 | r[i] = g_utf8_get_char (p); |
---|
235 | } |
---|
236 | else |
---|
237 | { |
---|
238 | /* Not in our table. */ |
---|
239 | r = g_malloc (sizeof (gunichar)); |
---|
240 | *r = ch; |
---|
241 | *result_len = 1; |
---|
242 | } |
---|
243 | |
---|
244 | /* Supposedly following the Unicode 2.1.9 table means that the |
---|
245 | decompositions come out in canonical order. I haven't tested |
---|
246 | this, but we rely on it here. */ |
---|
247 | return r; |
---|
248 | } |
---|
249 | |
---|
250 | /* L,V => LV and LV,T => LVT */ |
---|
251 | static gboolean |
---|
252 | combine_hangul (gunichar a, |
---|
253 | gunichar b, |
---|
254 | gunichar *result) |
---|
255 | { |
---|
256 | gint LIndex = a - LBase; |
---|
257 | gint SIndex = a - SBase; |
---|
258 | |
---|
259 | gint VIndex = b - VBase; |
---|
260 | gint TIndex = b - TBase; |
---|
261 | |
---|
262 | if (0 <= LIndex && LIndex < LCount |
---|
263 | && 0 <= VIndex && VIndex < VCount) |
---|
264 | { |
---|
265 | *result = SBase + (LIndex * VCount + VIndex) * TCount; |
---|
266 | return TRUE; |
---|
267 | } |
---|
268 | else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0 |
---|
269 | && 0 <= TIndex && TIndex <= TCount) |
---|
270 | { |
---|
271 | *result = a + TIndex; |
---|
272 | return TRUE; |
---|
273 | } |
---|
274 | |
---|
275 | return FALSE; |
---|
276 | } |
---|
277 | |
---|
/* Two-level lookup of a character's composition index.
 *
 * A compose_table[] entry is either a row number in compose_data (the
 * row is then indexed with the low byte of the character), or, when it
 * is at least G_UNICODE_MAX_TABLE_INDEX, the index itself stored with
 * G_UNICODE_MAX_TABLE_INDEX added to it.
 */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of Char, or 0 when its page lies beyond
 * COMPOSE_TABLE_LAST.  Every use of the argument is parenthesized so
 * that an expression argument cannot change the meaning through
 * operator precedence; note that Char is still evaluated several times.
 */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
---|
285 | |
---|
286 | static gboolean |
---|
287 | combine (gunichar a, |
---|
288 | gunichar b, |
---|
289 | gunichar *result) |
---|
290 | { |
---|
291 | gushort index_a, index_b; |
---|
292 | |
---|
293 | if (combine_hangul (a, b, result)) |
---|
294 | return TRUE; |
---|
295 | |
---|
296 | index_a = COMPOSE_INDEX(a); |
---|
297 | |
---|
298 | if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START) |
---|
299 | { |
---|
300 | if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0]) |
---|
301 | { |
---|
302 | *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1]; |
---|
303 | return TRUE; |
---|
304 | } |
---|
305 | else |
---|
306 | return FALSE; |
---|
307 | } |
---|
308 | |
---|
309 | index_b = COMPOSE_INDEX(b); |
---|
310 | |
---|
311 | if (index_b >= COMPOSE_SECOND_SINGLE_START) |
---|
312 | { |
---|
313 | if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0]) |
---|
314 | { |
---|
315 | *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1]; |
---|
316 | return TRUE; |
---|
317 | } |
---|
318 | else |
---|
319 | return FALSE; |
---|
320 | } |
---|
321 | |
---|
322 | if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START && |
---|
323 | index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START) |
---|
324 | { |
---|
325 | gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START]; |
---|
326 | |
---|
327 | if (res) |
---|
328 | { |
---|
329 | *result = res; |
---|
330 | return TRUE; |
---|
331 | } |
---|
332 | } |
---|
333 | |
---|
334 | return FALSE; |
---|
335 | } |
---|
336 | |
---|
337 | gunichar * |
---|
338 | _g_utf8_normalize_wc (const gchar *str, |
---|
339 | gssize max_len, |
---|
340 | GNormalizeMode mode) |
---|
341 | { |
---|
342 | gsize n_wc; |
---|
343 | gunichar *wc_buffer; |
---|
344 | const char *p; |
---|
345 | gsize last_start; |
---|
346 | gboolean do_compat = (mode == G_NORMALIZE_NFKC || |
---|
347 | mode == G_NORMALIZE_NFKD); |
---|
348 | gboolean do_compose = (mode == G_NORMALIZE_NFC || |
---|
349 | mode == G_NORMALIZE_NFKC); |
---|
350 | |
---|
351 | n_wc = 0; |
---|
352 | p = str; |
---|
353 | while ((max_len < 0 || p < str + max_len) && *p) |
---|
354 | { |
---|
355 | const gchar *decomp; |
---|
356 | gunichar wc = g_utf8_get_char (p); |
---|
357 | |
---|
358 | if (wc >= 0xac00 && wc <= 0xd7af) |
---|
359 | { |
---|
360 | gsize result_len; |
---|
361 | decompose_hangul (wc, NULL, &result_len); |
---|
362 | n_wc += result_len; |
---|
363 | } |
---|
364 | else |
---|
365 | { |
---|
366 | decomp = find_decomposition (wc, do_compat); |
---|
367 | |
---|
368 | if (decomp) |
---|
369 | n_wc += g_utf8_strlen (decomp, -1); |
---|
370 | else |
---|
371 | n_wc++; |
---|
372 | } |
---|
373 | |
---|
374 | p = g_utf8_next_char (p); |
---|
375 | } |
---|
376 | |
---|
377 | wc_buffer = g_new (gunichar, n_wc + 1); |
---|
378 | |
---|
379 | last_start = 0; |
---|
380 | n_wc = 0; |
---|
381 | p = str; |
---|
382 | while ((max_len < 0 || p < str + max_len) && *p) |
---|
383 | { |
---|
384 | gunichar wc = g_utf8_get_char (p); |
---|
385 | const gchar *decomp; |
---|
386 | int cc; |
---|
387 | gsize old_n_wc = n_wc; |
---|
388 | |
---|
389 | if (wc >= 0xac00 && wc <= 0xd7af) |
---|
390 | { |
---|
391 | gsize result_len; |
---|
392 | decompose_hangul (wc, wc_buffer + n_wc, &result_len); |
---|
393 | n_wc += result_len; |
---|
394 | } |
---|
395 | else |
---|
396 | { |
---|
397 | decomp = find_decomposition (wc, do_compat); |
---|
398 | |
---|
399 | if (decomp) |
---|
400 | { |
---|
401 | const char *pd; |
---|
402 | for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd)) |
---|
403 | wc_buffer[n_wc++] = g_utf8_get_char (pd); |
---|
404 | } |
---|
405 | else |
---|
406 | wc_buffer[n_wc++] = wc; |
---|
407 | } |
---|
408 | |
---|
409 | if (n_wc > 0) |
---|
410 | { |
---|
411 | cc = COMBINING_CLASS (wc_buffer[old_n_wc]); |
---|
412 | |
---|
413 | if (cc == 0) |
---|
414 | { |
---|
415 | g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start); |
---|
416 | last_start = old_n_wc; |
---|
417 | } |
---|
418 | } |
---|
419 | |
---|
420 | p = g_utf8_next_char (p); |
---|
421 | } |
---|
422 | |
---|
423 | if (n_wc > 0) |
---|
424 | { |
---|
425 | g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start); |
---|
426 | last_start = n_wc; |
---|
427 | } |
---|
428 | |
---|
429 | wc_buffer[n_wc] = 0; |
---|
430 | |
---|
431 | /* All decomposed and reordered */ |
---|
432 | |
---|
433 | if (do_compose && n_wc > 0) |
---|
434 | { |
---|
435 | gsize i, j; |
---|
436 | int last_cc = 0; |
---|
437 | last_start = 0; |
---|
438 | |
---|
439 | for (i = 0; i < n_wc; i++) |
---|
440 | { |
---|
441 | int cc = COMBINING_CLASS (wc_buffer[i]); |
---|
442 | |
---|
443 | if (i > 0 && |
---|
444 | (last_cc == 0 || last_cc != cc) && |
---|
445 | combine (wc_buffer[last_start], wc_buffer[i], |
---|
446 | &wc_buffer[last_start])) |
---|
447 | { |
---|
448 | for (j = i + 1; j < n_wc; j++) |
---|
449 | wc_buffer[j-1] = wc_buffer[j]; |
---|
450 | n_wc--; |
---|
451 | i--; |
---|
452 | |
---|
453 | if (i == last_start) |
---|
454 | last_cc = 0; |
---|
455 | else |
---|
456 | last_cc = COMBINING_CLASS (wc_buffer[i-1]); |
---|
457 | |
---|
458 | continue; |
---|
459 | } |
---|
460 | |
---|
461 | if (cc == 0) |
---|
462 | last_start = i; |
---|
463 | |
---|
464 | last_cc = cc; |
---|
465 | } |
---|
466 | } |
---|
467 | |
---|
468 | wc_buffer[n_wc] = 0; |
---|
469 | |
---|
470 | return wc_buffer; |
---|
471 | } |
---|
472 | |
---|
473 | /** |
---|
474 | * g_utf8_normalize: |
---|
475 | * @str: a UTF-8 encoded string. |
---|
476 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
---|
477 | * @mode: the type of normalization to perform. |
---|
478 | * |
---|
479 | * Converts a string into canonical form, standardizing |
---|
480 | * such issues as whether a character with an accent |
---|
481 | * is represented as a base character and combining |
---|
482 | * accent or as a single precomposed character. You |
---|
483 | * should generally call g_utf8_normalize() before |
---|
484 | * comparing two Unicode strings. |
---|
485 | * |
---|
486 | * The normalization mode %G_NORMALIZE_DEFAULT only |
---|
487 | * standardizes differences that do not affect the |
---|
488 | * text content, such as the above-mentioned accent |
---|
489 | * representation. %G_NORMALIZE_ALL also standardizes |
---|
490 | * the "compatibility" characters in Unicode, such |
---|
491 | * as SUPERSCRIPT THREE to the standard forms |
---|
492 | * (in this case DIGIT THREE). Formatting information |
---|
493 | * may be lost but for most text operations such |
---|
494 | * characters should be considered the same. |
---|
495 | * For example, g_utf8_collate() normalizes |
---|
496 | * with %G_NORMALIZE_ALL as its first step. |
---|
497 | * |
---|
498 | * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE |
---|
499 | * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, |
---|
500 | * but returned a result with composed forms rather |
---|
501 | * than a maximally decomposed form. This is often |
---|
502 | * useful if you intend to convert the string to |
---|
503 | * a legacy encoding or pass it to a system with |
---|
504 | * less capable Unicode handling. |
---|
505 | * |
---|
506 | * Return value: a newly allocated string, that is the |
---|
507 | * normalized form of @str. |
---|
508 | **/ |
---|
509 | gchar * |
---|
510 | g_utf8_normalize (const gchar *str, |
---|
511 | gssize len, |
---|
512 | GNormalizeMode mode) |
---|
513 | { |
---|
514 | gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode); |
---|
515 | gchar *result; |
---|
516 | |
---|
517 | result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL); |
---|
518 | g_free (result_wc); |
---|
519 | |
---|
520 | return result; |
---|
521 | } |
---|