source: trunk/third/glib2/glib/gunidecomp.c @ 18159

Revision 18159, 9.9 KB checked in by ghudson, 22 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r18158, which included commits to RCS files with non-trunk default branches.
Line 
1/* decomp.c - Character decomposition.
2 *
3 *  Copyright (C) 1999, 2000 Tom Tromey
4 *  Copyright 2000 Red Hat, Inc.
5 *
6 * The Gnome Library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
10 *
11 * The Gnome Library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
18 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 *   Boston, MA 02111-1307, USA.
20 */
21
22#include "config.h"
23
24#include <stdlib.h>
25
26#include "glib.h"
27#include "gunidecomp.h"
28#include "gunicomp.h"
29
30
31#define CC(Page, Char) \
32  ((combining_class_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
33   ? (combining_class_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
34   : (cclass_data[combining_class_table[Page]][Char]))
35
36#define COMBINING_CLASS(Char) \
37     (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CC((Char) >> 8, (Char) & 0xff))
38
39/**
40 * g_unicode_canonical_ordering:
41 * @string: a UCS-4 encoded string.
42 * @len: the maximum length of @string to use.
43 *
44 * Computes the canonical ordering of a string in-place. 
45 * This rearranges decomposed characters in the string
46 * according to their combining classes.  See the Unicode
47 * manual for more information.
48 **/
49void
50g_unicode_canonical_ordering (gunichar *string,
51                              gsize     len)
52{
53  gsize i;
54  int swap = 1;
55
56  while (swap)
57    {
58      int last;
59      swap = 0;
60      last = COMBINING_CLASS (string[0]);
61      for (i = 0; i < len - 1; ++i)
62        {
63          int next = COMBINING_CLASS (string[i + 1]);
64          if (next != 0 && last > next)
65            {
66              gsize j;
67              /* Percolate item leftward through string.  */
68              for (j = i; j > 0; --j)
69                {
70                  gunichar t;
71                  if (COMBINING_CLASS (string[j]) <= next)
72                    break;
73                  t = string[j + 1];
74                  string[j + 1] = string[j];
75                  string[j] = t;
76                  swap = 1;
77                }
78              /* We're re-entering the loop looking at the old
79                 character again.  */
80              next = last;
81            }
82          last = next;
83        }
84    }
85}
86
87static const guchar *
88find_decomposition (gunichar ch,
89                    gboolean compat)
90{
91  int start = 0;
92  int end = G_N_ELEMENTS (decomp_table);
93 
94  if (ch >= decomp_table[start].ch &&
95      ch <= decomp_table[end - 1].ch)
96    {
97      while (TRUE)
98        {
99          int half = (start + end) / 2;
100          if (ch == decomp_table[half].ch)
101            {
102              int offset;
103
104              if (compat)
105                {
106                  offset = decomp_table[half].compat_offset;
107                  if (offset == 0xff)
108                    offset = decomp_table[half].canon_offset;
109                }
110              else
111                {
112                  offset = decomp_table[half].canon_offset;
113                  if (offset == 0xff)
114                    return NULL;
115                }
116             
117              return &(decomp_expansion_string[decomp_table[half].expansion_offset + offset]);
118            }
119          else if (half == start)
120            break;
121          else if (ch > decomp_table[half].ch)
122            start = half;
123          else
124            end = half;
125        }
126    }
127
128  return NULL;
129}
130
131/**
132 * g_unicode_canonical_decomposition:
133 * @ch: a Unicode character.
134 * @result_len: location to store the length of the return value.
135 *
136 * Computes the canonical decomposition of a Unicode character. 
137 *
138 * Return value: a newly allocated string of Unicode characters.
139 *   @result_len is set to the resulting length of the string.
140 **/
141gunichar *
142g_unicode_canonical_decomposition (gunichar ch,
143                                   gsize   *result_len)
144{
145  const guchar *decomp = find_decomposition (ch, FALSE);
146  gunichar *r;
147
148  if (decomp)
149    {
150      /* Found it.  */
151      int i, len;
152      /* We store as a double-nul terminated string.  */
153      for (len = 0; (decomp[len] || decomp[len + 1]);
154           len += 2)
155        ;
156     
157      /* We've counted twice as many bytes as there are
158         characters.  */
159      *result_len = len / 2;
160      r = malloc (len / 2 * sizeof (gunichar));
161     
162      for (i = 0; i < len; i += 2)
163        {
164          r[i / 2] = (decomp[i] << 8 | decomp[i + 1]);
165        }
166    }
167  else
168    {
169      /* Not in our table.  */
170      r = malloc (sizeof (gunichar));
171      *r = ch;
172      *result_len = 1;
173    }
174
175  /* Supposedly following the Unicode 2.1.9 table means that the
176     decompositions come out in canonical order.  I haven't tested
177     this, but we rely on it here.  */
178  return r;
179}
180
181#define CI(Page, Char) \
182  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
183   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
184   : (compose_data[compose_table[Page]][Char]))
185
186#define COMPOSE_INDEX(Char) \
187     (((Char) > (G_UNICODE_LAST_CHAR)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
188
189gboolean
190combine (gunichar  a,
191         gunichar  b,
192         gunichar *result)
193{
194  gushort index_a, index_b;
195
196  index_a = COMPOSE_INDEX(a);
197  if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
198    {
199      if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
200        {
201          *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
202          return TRUE;
203        }
204      else
205        return FALSE;
206    }
207 
208  index_b = COMPOSE_INDEX(b);
209  if (index_b >= COMPOSE_SECOND_SINGLE_START)
210    {
211      if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
212        {
213          *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
214          return TRUE;
215        }
216      else
217        return FALSE;
218    }
219
220  if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START &&
221      index_b >= COMPOSE_SECOND_START && index_a < COMPOSE_SECOND_SINGLE_START)
222    {
223      gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START];
224
225      if (res)
226        {
227          *result = res;
228          return TRUE;
229        }
230    }
231
232  return FALSE;
233}
234
235gunichar *
236_g_utf8_normalize_wc (const gchar    *str,
237                      gssize          max_len,
238                      GNormalizeMode  mode)
239{
240  gsize n_wc;
241  gunichar *wc_buffer;
242  const char *p;
243  gsize last_start;
244  gboolean do_compat = (mode == G_NORMALIZE_NFKC ||
245                        mode == G_NORMALIZE_NFKD);
246  gboolean do_compose = (mode == G_NORMALIZE_NFC ||
247                         mode == G_NORMALIZE_NFKC);
248
249  n_wc = 0;
250  p = str;
251  while ((max_len < 0 || p < str + max_len) && *p)
252    {
253      gunichar wc = g_utf8_get_char (p);
254
255      const guchar *decomp = find_decomposition (wc, do_compat);
256
257      if (decomp)
258        {
259          int len;
260          /* We store as a double-nul terminated string.  */
261          for (len = 0; (decomp[len] || decomp[len + 1]);
262               len += 2)
263            ;
264          n_wc += len / 2;
265        }
266      else
267        n_wc++;
268
269      p = g_utf8_next_char (p);
270    }
271
272  wc_buffer = g_new (gunichar, n_wc + 1);
273
274  last_start = 0;
275  n_wc = 0;
276  p = str;
277  while ((max_len < 0 || p < str + max_len) && *p)
278    {
279      gunichar wc = g_utf8_get_char (p);
280      const guchar *decomp;
281      int cc;
282      gsize old_n_wc = n_wc;
283         
284      decomp = find_decomposition (wc, do_compat);
285         
286      if (decomp)
287        {
288          int len;
289          /* We store as a double-nul terminated string.  */
290          for (len = 0; (decomp[len] || decomp[len + 1]);
291               len += 2)
292            wc_buffer[n_wc++] = (decomp[len] << 8 | decomp[len + 1]);
293        }
294      else
295        wc_buffer[n_wc++] = wc;
296
297      if (n_wc > 0)
298        {
299          cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
300
301          if (cc == 0)
302            {
303              g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
304              last_start = old_n_wc;
305            }
306        }
307     
308      p = g_utf8_next_char (p);
309    }
310
311  if (n_wc > 0)
312    {
313      g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
314      last_start = n_wc;
315    }
316         
317  wc_buffer[n_wc] = 0;
318
319  /* All decomposed and reordered */
320
321
322  if (do_compose && n_wc > 0)
323    {
324      gsize i, j;
325      int last_cc = 0;
326      last_start = 0;
327     
328      for (i = 0; i < n_wc; i++)
329        {
330          int cc = COMBINING_CLASS (wc_buffer[i]);
331
332          if (i > 0 &&
333              (last_cc == 0 || last_cc != cc) &&
334              combine (wc_buffer[last_start], wc_buffer[i],
335                       &wc_buffer[last_start]))
336            {
337              for (j = i + 1; j < n_wc; j++)
338                wc_buffer[j-1] = wc_buffer[j];
339              n_wc--;
340              i--;
341             
342              if (i == last_start)
343                last_cc = 0;
344              else
345                last_cc = COMBINING_CLASS (wc_buffer[i-1]);
346             
347              continue;
348            }
349
350          if (cc == 0)
351            last_start = i;
352
353          last_cc = cc;
354        }
355    }
356
357  wc_buffer[n_wc] = 0;
358
359  return wc_buffer;
360}
361
362/**
363 * g_utf8_normalize:
364 * @str: a UTF-8 encoded string.
365 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
366 * @mode: the type of normalization to perform.
367 *
368 * Converts a string into canonical form, standardizing
369 * such issues as whether a character with an accent
370 * is represented as a base character and combining
371 * accent or as a single precomposed character. You
372 * should generally call g_utf8_normalize() before
373 * comparing two Unicode strings.
374 *
375 * The normalization mode %G_NORMALIZE_DEFAULT only
376 * standardizes differences that do not affect the
377 * text content, such as the above-mentioned accent
378 * representation. %G_NORMALIZE_ALL also standardizes
379 * the "compatibility" characters in Unicode, such
380 * as SUPERSCRIPT THREE to the standard forms
381 * (in this case DIGIT THREE). Formatting information
382 * may be lost but for most text operations such
383 * characters should be considered the same.
384 * For example, g_utf8_collate() normalizes
385 * with %G_NORMALIZE_ALL as its first step.
386 *
387 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
388 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
389 * but returned a result with composed forms rather
390 * than a maximally decomposed form. This is often
391 * useful if you intend to convert the string to
392 * a legacy encoding or pass it to a system with
393 * less capable Unicode handling.
394 *
395 * Return value: a newly allocated string, that is the
396 *   normalized form of @str.
397 **/
398gchar *
399g_utf8_normalize (const gchar    *str,
400                  gssize          len,
401                  GNormalizeMode  mode)
402{
403  gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
404  gchar *result;
405 
406  result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
407  g_free (result_wc);
408
409  return result;
410}
Note: See TracBrowser for help on using the repository browser.