source: trunk/third/glib2/glib/gunidecomp.c @ 20721

Revision 20721, 12.7 KB checked in by ghudson, 20 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r20720, which included commits to RCS files with non-trunk default branches.
Line 
1/* decomp.c - Character decomposition.
2 *
3 *  Copyright (C) 1999, 2000 Tom Tromey
4 *  Copyright 2000 Red Hat, Inc.
5 *
6 * The Gnome Library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
10 *
11 * The Gnome Library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
18 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 *   Boston, MA 02111-1307, USA.
20 */
21
22#include "config.h"
23
24#include <stdlib.h>
25
26#include "glib.h"
27#include "gunidecomp.h"
28#include "gunicomp.h"
29#include "gunicodeprivate.h"
30
31
/* Combining class for the character at offset Low within page PageNo
 * of the first lookup table.  A table entry that is at least
 * G_UNICODE_MAX_TABLE_INDEX stands for one class shared by the whole
 * page (the entry minus that constant); a smaller entry selects a row
 * of cclass_data that is then indexed by Low. */
#define CC_PART1(PageNo, Low) \
  ((combining_class_table_part1[PageNo] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part1[PageNo] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part1[PageNo]][Low]))

/* Same lookup as CC_PART1, but through the second lookup table. */
#define CC_PART2(PageNo, Low) \
  ((combining_class_table_part2[PageNo] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (combining_class_table_part2[PageNo] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[combining_class_table_part2[PageNo]][Low]))

/* Combining class of the code point Ch.  Code points up to
 * G_UNICODE_LAST_CHAR_PART1 go through the first table, those from
 * 0xe0000 up to G_UNICODE_LAST_CHAR through the second (with pages
 * counted from 0xe0000), and everything else has class 0.  Note that
 * Ch is evaluated more than once. */
#define COMBINING_CLASS(Ch) \
  (((Ch) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Ch) >> 8, (Ch) & 0xff) \
   : (((Ch) >= 0xe0000 && (Ch) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Ch) - 0xe0000) >> 8, (Ch) & 0xff) \
      : 0))
48
49gint
50_g_unichar_combining_class (gunichar uc)
51{
52  return COMBINING_CLASS (uc);
53}
54
/* Constants for hangul syllable [de]composition.
 *
 * SBase is the first precomposed syllable handled here; LBase, VBase
 * and TBase are the values the leading, vowel and trailing indices are
 * added to.  A trailing index of 0 (i.e. TBase itself) means "no
 * trailing consonant". */
#define SBase  0xac00
#define LBase  0x1100
#define VBase  0x1161
#define TBase  0x11a7
#define LCount 19
#define VCount 21
#define TCount 28
/* Syllables per leading consonant, and syllables in total. */
#define NCount (VCount * TCount)
#define SCount (LCount * NCount)
65
66/**
67 * g_unicode_canonical_ordering:
68 * @string: a UCS-4 encoded string.
69 * @len: the maximum length of @string to use.
70 *
71 * Computes the canonical ordering of a string in-place. 
72 * This rearranges decomposed characters in the string
73 * according to their combining classes.  See the Unicode
74 * manual for more information.
75 **/
76void
77g_unicode_canonical_ordering (gunichar *string,
78                              gsize     len)
79{
80  gsize i;
81  int swap = 1;
82
83  while (swap)
84    {
85      int last;
86      swap = 0;
87      last = COMBINING_CLASS (string[0]);
88      for (i = 0; i < len - 1; ++i)
89        {
90          int next = COMBINING_CLASS (string[i + 1]);
91          if (next != 0 && last > next)
92            {
93              gsize j;
94              /* Percolate item leftward through string.  */
95              for (j = i + 1; j > 0; --j)
96                {
97                  gunichar t;
98                  if (COMBINING_CLASS (string[j - 1]) <= next)
99                    break;
100                  t = string[j];
101                  string[j] = string[j - 1];
102                  string[j - 1] = t;
103                  swap = 1;
104                }
105              /* We're re-entering the loop looking at the old
106                 character again.  */
107              next = last;
108            }
109          last = next;
110        }
111    }
112}
113
114/* http://www.unicode.org/unicode/reports/tr15/#Hangul
115 * r should be null or have sufficient space. Calling with r == NULL will
116 * only calculate the result_len; however, a buffer with space for three
117 * characters will always be big enough. */
118static void
119decompose_hangul (gunichar s,
120                  gunichar *r,
121                  gsize *result_len)
122{
123  gint SIndex = s - SBase;
124
125  /* not a hangul syllable */
126  if (SIndex < 0 || SIndex >= SCount)
127    {
128      if (r)
129        r[0] = s;
130      *result_len = 1;
131    }
132  else
133    {
134      gunichar L = LBase + SIndex / NCount;
135      gunichar V = VBase + (SIndex % NCount) / TCount;
136      gunichar T = TBase + SIndex % TCount;
137
138      if (r)
139        {
140          r[0] = L;
141          r[1] = V;
142        }
143
144      if (T != TBase)
145        {
146          if (r)
147            r[2] = T;
148          *result_len = 3;
149        }
150      else
151        *result_len = 2;
152    }
153}
154
155/* returns a pointer to a null-terminated UTF-8 string */
156static const gchar *
157find_decomposition (gunichar ch,
158                    gboolean compat)
159{
160  int start = 0;
161  int end = G_N_ELEMENTS (decomp_table);
162 
163  if (ch >= decomp_table[start].ch &&
164      ch <= decomp_table[end - 1].ch)
165    {
166      while (TRUE)
167        {
168          int half = (start + end) / 2;
169          if (ch == decomp_table[half].ch)
170            {
171              int offset;
172
173              if (compat)
174                {
175                  offset = decomp_table[half].compat_offset;
176                  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
177                    offset = decomp_table[half].canon_offset;
178                }
179              else
180                {
181                  offset = decomp_table[half].canon_offset;
182                  if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
183                    return NULL;
184                }
185             
186              return &(decomp_expansion_string[offset]);
187            }
188          else if (half == start)
189            break;
190          else if (ch > decomp_table[half].ch)
191            start = half;
192          else
193            end = half;
194        }
195    }
196
197  return NULL;
198}
199
200/**
201 * g_unicode_canonical_decomposition:
202 * @ch: a Unicode character.
203 * @result_len: location to store the length of the return value.
204 *
205 * Computes the canonical decomposition of a Unicode character. 
206 *
207 * Return value: a newly allocated string of Unicode characters.
208 *   @result_len is set to the resulting length of the string.
209 **/
210gunichar *
211g_unicode_canonical_decomposition (gunichar ch,
212                                   gsize   *result_len)
213{
214  const gchar *decomp;
215  const gchar *p;
216  gunichar *r;
217
218  /* Hangul syllable */
219  if (ch >= 0xac00 && ch <= 0xd7af)
220    {
221      decompose_hangul (ch, NULL, result_len);
222      r = g_malloc (*result_len * sizeof (gunichar));
223      decompose_hangul (ch, r, result_len);
224    }
225  else if ((decomp = find_decomposition (ch, FALSE)) != NULL)
226    {
227      /* Found it.  */
228      int i;
229     
230      *result_len = g_utf8_strlen (decomp, -1);
231      r = g_malloc (*result_len * sizeof (gunichar));
232     
233      for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++)
234        r[i] = g_utf8_get_char (p);
235    }
236  else
237    {
238      /* Not in our table.  */
239      r = g_malloc (sizeof (gunichar));
240      *r = ch;
241      *result_len = 1;
242    }
243
244  /* Supposedly following the Unicode 2.1.9 table means that the
245     decompositions come out in canonical order.  I haven't tested
246     this, but we rely on it here.  */
247  return r;
248}
249
250/* L,V => LV and LV,T => LVT  */
251static gboolean
252combine_hangul (gunichar a,
253                gunichar b,
254                gunichar *result)
255{
256  gint LIndex = a - LBase;
257  gint SIndex = a - SBase;
258
259  gint VIndex = b - VBase;
260  gint TIndex = b - TBase;
261
262  if (0 <= LIndex && LIndex < LCount
263      && 0 <= VIndex && VIndex < VCount)
264    {
265      *result = SBase + (LIndex * VCount + VIndex) * TCount;
266      return TRUE;
267    }
268  else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
269           && 0 <= TIndex && TIndex <= TCount)
270    {
271      *result = a + TIndex;
272      return TRUE;
273    }
274
275  return FALSE;
276}
277
/* Composition index for the character at offset Char within page Page.
 * A compose_table entry that is at least G_UNICODE_MAX_TABLE_INDEX
 * stands for one index shared by the whole page (the entry minus that
 * constant); a smaller entry selects a row of compose_data. */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of the code point Char, or 0 when its page lies
 * beyond COMPOSE_TABLE_LAST.  Every use of the argument is
 * parenthesized so that an expression can be passed safely; note that
 * it is still evaluated more than once. */
#define COMPOSE_INDEX(Char) \
     ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
285
286static gboolean
287combine (gunichar  a,
288         gunichar  b,
289         gunichar *result)
290{
291  gushort index_a, index_b;
292
293  if (combine_hangul (a, b, result))
294    return TRUE;
295
296  index_a = COMPOSE_INDEX(a);
297
298  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
299    {
300      if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
301        {
302          *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
303          return TRUE;
304        }
305      else
306        return FALSE;
307    }
308 
309  index_b = COMPOSE_INDEX(b);
310
311  if (index_b >= COMPOSE_SECOND_SINGLE_START)
312    {
313      if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
314        {
315          *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
316          return TRUE;
317        }
318      else
319        return FALSE;
320    }
321
322  if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START &&
323      index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START)
324    {
325      gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START];
326
327      if (res)
328        {
329          *result = res;
330          return TRUE;
331        }
332    }
333
334  return FALSE;
335}
336
337gunichar *
338_g_utf8_normalize_wc (const gchar    *str,
339                      gssize          max_len,
340                      GNormalizeMode  mode)
341{
342  gsize n_wc;
343  gunichar *wc_buffer;
344  const char *p;
345  gsize last_start;
346  gboolean do_compat = (mode == G_NORMALIZE_NFKC ||
347                        mode == G_NORMALIZE_NFKD);
348  gboolean do_compose = (mode == G_NORMALIZE_NFC ||
349                         mode == G_NORMALIZE_NFKC);
350
351  n_wc = 0;
352  p = str;
353  while ((max_len < 0 || p < str + max_len) && *p)
354    {
355      const gchar *decomp;
356      gunichar wc = g_utf8_get_char (p);
357
358      if (wc >= 0xac00 && wc <= 0xd7af)
359        {
360          gsize result_len;
361          decompose_hangul (wc, NULL, &result_len);
362          n_wc += result_len;
363        }
364      else
365        {
366          decomp = find_decomposition (wc, do_compat);
367
368          if (decomp)
369            n_wc += g_utf8_strlen (decomp, -1);
370          else
371            n_wc++;
372        }
373
374      p = g_utf8_next_char (p);
375    }
376
377  wc_buffer = g_new (gunichar, n_wc + 1);
378
379  last_start = 0;
380  n_wc = 0;
381  p = str;
382  while ((max_len < 0 || p < str + max_len) && *p)
383    {
384      gunichar wc = g_utf8_get_char (p);
385      const gchar *decomp;
386      int cc;
387      gsize old_n_wc = n_wc;
388         
389      if (wc >= 0xac00 && wc <= 0xd7af)
390        {
391          gsize result_len;
392          decompose_hangul (wc, wc_buffer + n_wc, &result_len);
393          n_wc += result_len;
394        }
395      else
396        {
397          decomp = find_decomposition (wc, do_compat);
398         
399          if (decomp)
400            {
401              const char *pd;
402              for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
403                wc_buffer[n_wc++] = g_utf8_get_char (pd);
404            }
405          else
406            wc_buffer[n_wc++] = wc;
407        }
408
409      if (n_wc > 0)
410        {
411          cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
412
413          if (cc == 0)
414            {
415              g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
416              last_start = old_n_wc;
417            }
418        }
419     
420      p = g_utf8_next_char (p);
421    }
422
423  if (n_wc > 0)
424    {
425      g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
426      last_start = n_wc;
427    }
428         
429  wc_buffer[n_wc] = 0;
430
431  /* All decomposed and reordered */
432
433  if (do_compose && n_wc > 0)
434    {
435      gsize i, j;
436      int last_cc = 0;
437      last_start = 0;
438     
439      for (i = 0; i < n_wc; i++)
440        {
441          int cc = COMBINING_CLASS (wc_buffer[i]);
442
443          if (i > 0 &&
444              (last_cc == 0 || last_cc != cc) &&
445              combine (wc_buffer[last_start], wc_buffer[i],
446                       &wc_buffer[last_start]))
447            {
448              for (j = i + 1; j < n_wc; j++)
449                wc_buffer[j-1] = wc_buffer[j];
450              n_wc--;
451              i--;
452             
453              if (i == last_start)
454                last_cc = 0;
455              else
456                last_cc = COMBINING_CLASS (wc_buffer[i-1]);
457             
458              continue;
459            }
460
461          if (cc == 0)
462            last_start = i;
463
464          last_cc = cc;
465        }
466    }
467
468  wc_buffer[n_wc] = 0;
469
470  return wc_buffer;
471}
472
473/**
474 * g_utf8_normalize:
475 * @str: a UTF-8 encoded string.
476 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
477 * @mode: the type of normalization to perform.
478 *
479 * Converts a string into canonical form, standardizing
480 * such issues as whether a character with an accent
481 * is represented as a base character and combining
482 * accent or as a single precomposed character. You
483 * should generally call g_utf8_normalize() before
484 * comparing two Unicode strings.
485 *
486 * The normalization mode %G_NORMALIZE_DEFAULT only
487 * standardizes differences that do not affect the
488 * text content, such as the above-mentioned accent
489 * representation. %G_NORMALIZE_ALL also standardizes
490 * the "compatibility" characters in Unicode, such
491 * as SUPERSCRIPT THREE to the standard forms
492 * (in this case DIGIT THREE). Formatting information
493 * may be lost but for most text operations such
494 * characters should be considered the same.
495 * For example, g_utf8_collate() normalizes
496 * with %G_NORMALIZE_ALL as its first step.
497 *
498 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
499 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
500 * but returned a result with composed forms rather
501 * than a maximally decomposed form. This is often
502 * useful if you intend to convert the string to
503 * a legacy encoding or pass it to a system with
504 * less capable Unicode handling.
505 *
506 * Return value: a newly allocated string, that is the
507 *   normalized form of @str.
508 **/
509gchar *
510g_utf8_normalize (const gchar    *str,
511                  gssize          len,
512                  GNormalizeMode  mode)
513{
514  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
515  gchar *result;
516
517  result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
518  g_free (result_wc);
519
520  return result;
521}
Note: See TracBrowser for help on using the repository browser.