1 | /* decomp.c - Character decomposition. |
---|
2 | * |
---|
3 | * Copyright (C) 1999, 2000 Tom Tromey |
---|
4 | * Copyright 2000 Red Hat, Inc. |
---|
5 | * |
---|
6 | * The Gnome Library is free software; you can redistribute it and/or |
---|
7 | * modify it under the terms of the GNU Lesser General Public License as |
---|
8 | * published by the Free Software Foundation; either version 2 of the |
---|
9 | * License, or (at your option) any later version. |
---|
10 | * |
---|
11 | * The Gnome Library is distributed in the hope that it will be useful, |
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
---|
14 | * Lesser General Public License for more details. |
---|
15 | * |
---|
16 | * You should have received a copy of the GNU Lesser General Public |
---|
17 | * License along with the Gnome Library; see the file COPYING.LIB. If not, |
---|
18 | * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
---|
19 | * Boston, MA 02111-1307, USA. |
---|
20 | */ |
---|
21 | |
---|
22 | #include "config.h" |
---|
23 | |
---|
24 | #include <stdlib.h> |
---|
25 | |
---|
26 | #include "glib.h" |
---|
27 | #include "gunidecomp.h" |
---|
28 | #include "gunicomp.h" |
---|
29 | |
---|
30 | |
---|
31 | #define CC(Page, Char) \ |
---|
32 | ((combining_class_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ |
---|
33 | ? (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \ |
---|
34 | : (cclass_data[combining_class_table[Page]][Char])) |
---|
35 | |
---|
36 | #define COMBINING_CLASS(Char) \ |
---|
37 | (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff)) |
---|
38 | |
---|
39 | /** |
---|
40 | * g_unicode_canonical_ordering: |
---|
41 | * @string: a UCS-4 encoded string. |
---|
42 | * @len: the maximum length of @string to use. |
---|
43 | * |
---|
44 | * Computes the canonical ordering of a string in-place. |
---|
45 | * This rearranges decomposed characters in the string |
---|
46 | * according to their combining classes. See the Unicode |
---|
47 | * manual for more information. |
---|
48 | **/ |
---|
49 | void |
---|
50 | g_unicode_canonical_ordering (gunichar *string, |
---|
51 | gsize len) |
---|
52 | { |
---|
53 | gsize i; |
---|
54 | int swap = 1; |
---|
55 | |
---|
56 | while (swap) |
---|
57 | { |
---|
58 | int last; |
---|
59 | swap = 0; |
---|
60 | last = COMBINING_CLASS (string[0]); |
---|
61 | for (i = 0; i < len - 1; ++i) |
---|
62 | { |
---|
63 | int next = COMBINING_CLASS (string[i + 1]); |
---|
64 | if (next != 0 && last > next) |
---|
65 | { |
---|
66 | gsize j; |
---|
67 | /* Percolate item leftward through string. */ |
---|
68 | for (j = i; j > 0; --j) |
---|
69 | { |
---|
70 | gunichar t; |
---|
71 | if (COMBINING_CLASS (string[j]) <= next) |
---|
72 | break; |
---|
73 | t = string[j + 1]; |
---|
74 | string[j + 1] = string[j]; |
---|
75 | string[j] = t; |
---|
76 | swap = 1; |
---|
77 | } |
---|
78 | /* We're re-entering the loop looking at the old |
---|
79 | character again. */ |
---|
80 | next = last; |
---|
81 | } |
---|
82 | last = next; |
---|
83 | } |
---|
84 | } |
---|
85 | } |
---|
86 | |
---|
87 | static const guchar * |
---|
88 | find_decomposition (gunichar ch, |
---|
89 | gboolean compat) |
---|
90 | { |
---|
91 | int start = 0; |
---|
92 | int end = G_N_ELEMENTS (decomp_table); |
---|
93 | |
---|
94 | if (ch >= decomp_table[start].ch && |
---|
95 | ch <= decomp_table[end - 1].ch) |
---|
96 | { |
---|
97 | while (TRUE) |
---|
98 | { |
---|
99 | int half = (start + end) / 2; |
---|
100 | if (ch == decomp_table[half].ch) |
---|
101 | { |
---|
102 | int offset; |
---|
103 | |
---|
104 | if (compat) |
---|
105 | { |
---|
106 | offset = decomp_table[half].compat_offset; |
---|
107 | if (offset == 0xff) |
---|
108 | offset = decomp_table[half].canon_offset; |
---|
109 | } |
---|
110 | else |
---|
111 | { |
---|
112 | offset = decomp_table[half].canon_offset; |
---|
113 | if (offset == 0xff) |
---|
114 | return NULL; |
---|
115 | } |
---|
116 | |
---|
117 | return &(decomp_expansion_string[decomp_table[half].expansion_offset + offset]); |
---|
118 | } |
---|
119 | else if (half == start) |
---|
120 | break; |
---|
121 | else if (ch > decomp_table[half].ch) |
---|
122 | start = half; |
---|
123 | else |
---|
124 | end = half; |
---|
125 | } |
---|
126 | } |
---|
127 | |
---|
128 | return NULL; |
---|
129 | } |
---|
130 | |
---|
131 | /** |
---|
132 | * g_unicode_canonical_decomposition: |
---|
133 | * @ch: a Unicode character. |
---|
134 | * @result_len: location to store the length of the return value. |
---|
135 | * |
---|
136 | * Computes the canonical decomposition of a Unicode character. |
---|
137 | * |
---|
138 | * Return value: a newly allocated string of Unicode characters. |
---|
139 | * @result_len is set to the resulting length of the string. |
---|
140 | **/ |
---|
141 | gunichar * |
---|
142 | g_unicode_canonical_decomposition (gunichar ch, |
---|
143 | gsize *result_len) |
---|
144 | { |
---|
145 | const guchar *decomp = find_decomposition (ch, FALSE); |
---|
146 | gunichar *r; |
---|
147 | |
---|
148 | if (decomp) |
---|
149 | { |
---|
150 | /* Found it. */ |
---|
151 | int i, len; |
---|
152 | /* We store as a double-nul terminated string. */ |
---|
153 | for (len = 0; (decomp[len] || decomp[len + 1]); |
---|
154 | len += 2) |
---|
155 | ; |
---|
156 | |
---|
157 | /* We've counted twice as many bytes as there are |
---|
158 | characters. */ |
---|
159 | *result_len = len / 2; |
---|
160 | r = malloc (len / 2 * sizeof (gunichar)); |
---|
161 | |
---|
162 | for (i = 0; i < len; i += 2) |
---|
163 | { |
---|
164 | r[i / 2] = (decomp[i] << 8 | decomp[i + 1]); |
---|
165 | } |
---|
166 | } |
---|
167 | else |
---|
168 | { |
---|
169 | /* Not in our table. */ |
---|
170 | r = malloc (sizeof (gunichar)); |
---|
171 | *r = ch; |
---|
172 | *result_len = 1; |
---|
173 | } |
---|
174 | |
---|
175 | /* Supposedly following the Unicode 2.1.9 table means that the |
---|
176 | decompositions come out in canonical order. I haven't tested |
---|
177 | this, but we rely on it here. */ |
---|
178 | return r; |
---|
179 | } |
---|
180 | |
---|
181 | #define CI(Page, Char) \ |
---|
182 | ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \ |
---|
183 | ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \ |
---|
184 | : (compose_data[compose_table[Page]][Char])) |
---|
185 | |
---|
186 | #define COMPOSE_INDEX(Char) \ |
---|
187 | (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CI((Char) >> 8, (Char) & 0xff)) |
---|
188 | |
---|
189 | gboolean |
---|
190 | combine (gunichar a, |
---|
191 | gunichar b, |
---|
192 | gunichar *result) |
---|
193 | { |
---|
194 | gushort index_a, index_b; |
---|
195 | |
---|
196 | index_a = COMPOSE_INDEX(a); |
---|
197 | if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START) |
---|
198 | { |
---|
199 | if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0]) |
---|
200 | { |
---|
201 | *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1]; |
---|
202 | return TRUE; |
---|
203 | } |
---|
204 | else |
---|
205 | return FALSE; |
---|
206 | } |
---|
207 | |
---|
208 | index_b = COMPOSE_INDEX(b); |
---|
209 | if (index_b >= COMPOSE_SECOND_SINGLE_START) |
---|
210 | { |
---|
211 | if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0]) |
---|
212 | { |
---|
213 | *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1]; |
---|
214 | return TRUE; |
---|
215 | } |
---|
216 | else |
---|
217 | return FALSE; |
---|
218 | } |
---|
219 | |
---|
220 | if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START && |
---|
221 | index_b >= COMPOSE_SECOND_START && index_a < COMPOSE_SECOND_SINGLE_START) |
---|
222 | { |
---|
223 | gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START]; |
---|
224 | |
---|
225 | if (res) |
---|
226 | { |
---|
227 | *result = res; |
---|
228 | return TRUE; |
---|
229 | } |
---|
230 | } |
---|
231 | |
---|
232 | return FALSE; |
---|
233 | } |
---|
234 | |
---|
235 | gunichar * |
---|
236 | _g_utf8_normalize_wc (const gchar *str, |
---|
237 | gssize max_len, |
---|
238 | GNormalizeMode mode) |
---|
239 | { |
---|
240 | gsize n_wc; |
---|
241 | gunichar *wc_buffer; |
---|
242 | const char *p; |
---|
243 | gsize last_start; |
---|
244 | gboolean do_compat = (mode == G_NORMALIZE_NFKC || |
---|
245 | mode == G_NORMALIZE_NFKD); |
---|
246 | gboolean do_compose = (mode == G_NORMALIZE_NFC || |
---|
247 | mode == G_NORMALIZE_NFKC); |
---|
248 | |
---|
249 | n_wc = 0; |
---|
250 | p = str; |
---|
251 | while ((max_len < 0 || p < str + max_len) && *p) |
---|
252 | { |
---|
253 | gunichar wc = g_utf8_get_char (p); |
---|
254 | |
---|
255 | const guchar *decomp = find_decomposition (wc, do_compat); |
---|
256 | |
---|
257 | if (decomp) |
---|
258 | { |
---|
259 | int len; |
---|
260 | /* We store as a double-nul terminated string. */ |
---|
261 | for (len = 0; (decomp[len] || decomp[len + 1]); |
---|
262 | len += 2) |
---|
263 | ; |
---|
264 | n_wc += len / 2; |
---|
265 | } |
---|
266 | else |
---|
267 | n_wc++; |
---|
268 | |
---|
269 | p = g_utf8_next_char (p); |
---|
270 | } |
---|
271 | |
---|
272 | wc_buffer = g_new (gunichar, n_wc + 1); |
---|
273 | |
---|
274 | last_start = 0; |
---|
275 | n_wc = 0; |
---|
276 | p = str; |
---|
277 | while ((max_len < 0 || p < str + max_len) && *p) |
---|
278 | { |
---|
279 | gunichar wc = g_utf8_get_char (p); |
---|
280 | const guchar *decomp; |
---|
281 | int cc; |
---|
282 | gsize old_n_wc = n_wc; |
---|
283 | |
---|
284 | decomp = find_decomposition (wc, do_compat); |
---|
285 | |
---|
286 | if (decomp) |
---|
287 | { |
---|
288 | int len; |
---|
289 | /* We store as a double-nul terminated string. */ |
---|
290 | for (len = 0; (decomp[len] || decomp[len + 1]); |
---|
291 | len += 2) |
---|
292 | wc_buffer[n_wc++] = (decomp[len] << 8 | decomp[len + 1]); |
---|
293 | } |
---|
294 | else |
---|
295 | wc_buffer[n_wc++] = wc; |
---|
296 | |
---|
297 | if (n_wc > 0) |
---|
298 | { |
---|
299 | cc = COMBINING_CLASS (wc_buffer[old_n_wc]); |
---|
300 | |
---|
301 | if (cc == 0) |
---|
302 | { |
---|
303 | g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start); |
---|
304 | last_start = old_n_wc; |
---|
305 | } |
---|
306 | } |
---|
307 | |
---|
308 | p = g_utf8_next_char (p); |
---|
309 | } |
---|
310 | |
---|
311 | if (n_wc > 0) |
---|
312 | { |
---|
313 | g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start); |
---|
314 | last_start = n_wc; |
---|
315 | } |
---|
316 | |
---|
317 | wc_buffer[n_wc] = 0; |
---|
318 | |
---|
319 | /* All decomposed and reordered */ |
---|
320 | |
---|
321 | |
---|
322 | if (do_compose && n_wc > 0) |
---|
323 | { |
---|
324 | gsize i, j; |
---|
325 | int last_cc = 0; |
---|
326 | last_start = 0; |
---|
327 | |
---|
328 | for (i = 0; i < n_wc; i++) |
---|
329 | { |
---|
330 | int cc = COMBINING_CLASS (wc_buffer[i]); |
---|
331 | |
---|
332 | if (i > 0 && |
---|
333 | (last_cc == 0 || last_cc != cc) && |
---|
334 | combine (wc_buffer[last_start], wc_buffer[i], |
---|
335 | &wc_buffer[last_start])) |
---|
336 | { |
---|
337 | for (j = i + 1; j < n_wc; j++) |
---|
338 | wc_buffer[j-1] = wc_buffer[j]; |
---|
339 | n_wc--; |
---|
340 | i--; |
---|
341 | |
---|
342 | if (i == last_start) |
---|
343 | last_cc = 0; |
---|
344 | else |
---|
345 | last_cc = COMBINING_CLASS (wc_buffer[i-1]); |
---|
346 | |
---|
347 | continue; |
---|
348 | } |
---|
349 | |
---|
350 | if (cc == 0) |
---|
351 | last_start = i; |
---|
352 | |
---|
353 | last_cc = cc; |
---|
354 | } |
---|
355 | } |
---|
356 | |
---|
357 | wc_buffer[n_wc] = 0; |
---|
358 | |
---|
359 | return wc_buffer; |
---|
360 | } |
---|
361 | |
---|
362 | /** |
---|
363 | * g_utf8_normalize: |
---|
364 | * @str: a UTF-8 encoded string. |
---|
365 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
---|
366 | * @mode: the type of normalization to perform. |
---|
367 | * |
---|
368 | * Converts a string into canonical form, standardizing |
---|
369 | * such issues as whether a character with an accent |
---|
370 | * is represented as a base character and combining |
---|
371 | * accent or as a single precomposed character. You |
---|
372 | * should generally call g_utf8_normalize() before |
---|
373 | * comparing two Unicode strings. |
---|
374 | * |
---|
375 | * The normalization mode %G_NORMALIZE_DEFAULT only |
---|
376 | * standardizes differences that do not affect the |
---|
377 | * text content, such as the above-mentioned accent |
---|
378 | * representation. %G_NORMALIZE_ALL also standardizes |
---|
379 | * the "compatibility" characters in Unicode, such |
---|
380 | * as SUPERSCRIPT THREE to the standard forms |
---|
381 | * (in this case DIGIT THREE). Formatting information |
---|
382 | * may be lost but for most text operations such |
---|
383 | * characters should be considered the same. |
---|
384 | * For example, g_utf8_collate() normalizes |
---|
385 | * with %G_NORMALIZE_ALL as its first step. |
---|
386 | * |
---|
387 | * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE |
---|
388 | * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL, |
---|
389 | * but returned a result with composed forms rather |
---|
390 | * than a maximally decomposed form. This is often |
---|
391 | * useful if you intend to convert the string to |
---|
392 | * a legacy encoding or pass it to a system with |
---|
393 | * less capable Unicode handling. |
---|
394 | * |
---|
395 | * Return value: a newly allocated string, that is the |
---|
396 | * normalized form of @str. |
---|
397 | **/ |
---|
398 | gchar * |
---|
399 | g_utf8_normalize (const gchar *str, |
---|
400 | gssize len, |
---|
401 | GNormalizeMode mode) |
---|
402 | { |
---|
403 | gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode); |
---|
404 | gchar *result; |
---|
405 | |
---|
406 | result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL); |
---|
407 | g_free (result_wc); |
---|
408 | |
---|
409 | return result; |
---|
410 | } |
---|