source: trunk/third/glib2/glib/guniprop.c @ 18159

Revision 18159, 21.8 KB checked in by ghudson, 22 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r18158, which included commits to RCS files with non-trunk default branches.
Line 
1/* guniprop.c - Unicode character properties.
2 *
3 * Copyright (C) 1999 Tom Tromey
4 * Copyright (C) 2000 Red Hat, Inc.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#include "config.h"
23
24#include <stddef.h>
25#include <string.h>
26#include <locale.h>
27
28#include "glib.h"
29#include "gunichartables.h"
30
31
/* Fetch the attribute value (used in this file for case mappings and
 * digit values) of character Char on 256-character page Page.  A page
 * whose attr_table entry is G_UNICODE_MAX_TABLE_INDEX has no attribute
 * data and yields 0.
 */
#define ATTTABLE(Page, Char) \
  ((attr_table[Page] != G_UNICODE_MAX_TABLE_INDEX) \
   ? (attr_data[attr_table[Page]][Char]) \
   : 0)

/* Fetch the type of character Char on page Page.  A type_table entry
 * of G_UNICODE_MAX_TABLE_INDEX or more means the whole page shares a
 * single type, stored as an offset from G_UNICODE_MAX_TABLE_INDEX;
 * any smaller entry is an index into type_data.
 */
#define TTYPE(Page, Char) \
  ((type_table[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_data[type_table[Page]][Char]) \
   : (type_table[Page] - G_UNICODE_MAX_TABLE_INDEX))


/* Type of an arbitrary code point; anything beyond G_UNICODE_LAST_CHAR
 * is reported as unassigned without touching the tables.
 */
#define TYPE(Char) \
  (((Char) <= (G_UNICODE_LAST_CHAR)) \
   ? TTYPE ((Char) >> 8, (Char) & 0xff) \
   : G_UNICODE_UNASSIGNED)
44
/* Any numeric type: decimal digit, letter number or other number.  */
#define ISDIGIT(Type) \
  ((Type) == G_UNICODE_DECIMAL_NUMBER || \
   (Type) == G_UNICODE_LETTER_NUMBER || \
   (Type) == G_UNICODE_OTHER_NUMBER)

/* Any letter type: lowercase, uppercase, titlecase, modifier or other.  */
#define ISALPHA(Type) \
  ((Type) == G_UNICODE_LOWERCASE_LETTER || \
   (Type) == G_UNICODE_UPPERCASE_LETTER || \
   (Type) == G_UNICODE_TITLECASE_LETTER || \
   (Type) == G_UNICODE_MODIFIER_LETTER || \
   (Type) == G_UNICODE_OTHER_LETTER)

/* Any mark type: non-spacing, combining or enclosing.  */
#define ISMARK(Type) \
  ((Type) == G_UNICODE_NON_SPACING_MARK || \
   (Type) == G_UNICODE_COMBINING_MARK || \
   (Type) == G_UNICODE_ENCLOSING_MARK)
58                     
59
60/**
61 * g_unichar_isalnum:
62 * @c: a Unicode character
63 *
64 * Determines whether a character is alphanumeric.
65 * Given some UTF-8 text, obtain a character value
66 * with g_utf8_get_char().
67 *
68 * Return value: %TRUE if @c is an alphanumeric character
69 **/
70gboolean
71g_unichar_isalnum (gunichar c)
72{
73  int t = TYPE (c);
74  return ISDIGIT (t) || ISALPHA (t);
75}
76
77/**
78 * g_unichar_isalpha:
79 * @c: a Unicode character
80 *
81 * Determines whether a character is alphabetic (i.e. a letter).
82 * Given some UTF-8 text, obtain a character value with
83 * g_utf8_get_char().
84 *
85 * Return value: %TRUE if @c is an alphabetic character
86 **/
87gboolean
88g_unichar_isalpha (gunichar c)
89{
90  int t = TYPE (c);
91  return ISALPHA (t);
92}
93
94
95/**
96 * g_unichar_iscntrl:
97 * @c: a Unicode character
98 *
99 * Determines whether a character is a control character.
100 * Given some UTF-8 text, obtain a character value with
101 * g_utf8_get_char().
102 *
103 * Return value: %TRUE if @c is a control character
104 **/
105gboolean
106g_unichar_iscntrl (gunichar c)
107{
108  return TYPE (c) == G_UNICODE_CONTROL;
109}
110
111/**
112 * g_unichar_isdigit:
113 * @c: a Unicode character
114 *
115 * Determines whether a character is numeric (i.e. a digit).  This
116 * covers ASCII 0-9 and also digits in other languages/scripts.  Given
117 * some UTF-8 text, obtain a character value with g_utf8_get_char().
118 *
119 * Return value: %TRUE if @c is a digit
120 **/
121gboolean
122g_unichar_isdigit (gunichar c)
123{
124  return TYPE (c) == G_UNICODE_DECIMAL_NUMBER;
125}
126
127
128/**
129 * g_unichar_isgraph:
130 * @c: a Unicode character
131 *
132 * Determines whether a character is printable and not a space
133 * (returns %FALSE for control characters, format characters, and
134 * spaces). g_unichar_isprint() is similar, but returns %TRUE for
135 * spaces. Given some UTF-8 text, obtain a character value with
136 * g_utf8_get_char().
137 *
138 * Return value: %TRUE if @c is printable unless it's a space
139 **/
140gboolean
141g_unichar_isgraph (gunichar c)
142{
143  int t = TYPE (c);
144  return (t != G_UNICODE_CONTROL
145          && t != G_UNICODE_FORMAT
146          && t != G_UNICODE_UNASSIGNED
147          && t != G_UNICODE_PRIVATE_USE
148          && t != G_UNICODE_SURROGATE
149          && t != G_UNICODE_SPACE_SEPARATOR);
150}
151
152/**
153 * g_unichar_islower:
154 * @c: a Unicode character
155 *
156 * Determines whether a character is a lowercase letter.
157 * Given some UTF-8 text, obtain a character value with
158 * g_utf8_get_char().
159 *
160 * Return value: %TRUE if @c is a lowercase letter
161 **/
162gboolean
163g_unichar_islower (gunichar c)
164{
165  return TYPE (c) == G_UNICODE_LOWERCASE_LETTER;
166}
167
168
169/**
170 * g_unichar_isprint:
171 * @c: a Unicode character
172 *
173 * Determines whether a character is printable.
174 * Unlike g_unichar_isgraph(), returns %TRUE for spaces.
175 * Given some UTF-8 text, obtain a character value with
176 * g_utf8_get_char().
177 *
178 * Return value: %TRUE if @c is printable
179 **/
180gboolean
181g_unichar_isprint (gunichar c)
182{
183  int t = TYPE (c);
184  return (t != G_UNICODE_CONTROL
185          && t != G_UNICODE_FORMAT
186          && t != G_UNICODE_UNASSIGNED
187          && t != G_UNICODE_PRIVATE_USE
188          && t != G_UNICODE_SURROGATE);
189}
190
191/**
192 * g_unichar_ispunct:
193 * @c: a Unicode character
194 *
195 * Determines whether a character is punctuation or a symbol.
196 * Given some UTF-8 text, obtain a character value with
197 * g_utf8_get_char().
198 *
199 * Return value: %TRUE if @c is a punctuation or symbol character
200 **/
201gboolean
202g_unichar_ispunct (gunichar c)
203{
204  int t = TYPE (c);
205  return (t == G_UNICODE_CONNECT_PUNCTUATION || t == G_UNICODE_DASH_PUNCTUATION
206          || t == G_UNICODE_CLOSE_PUNCTUATION || t == G_UNICODE_FINAL_PUNCTUATION
207          || t == G_UNICODE_INITIAL_PUNCTUATION || t == G_UNICODE_OTHER_PUNCTUATION
208          || t == G_UNICODE_OPEN_PUNCTUATION || t == G_UNICODE_CURRENCY_SYMBOL
209          || t == G_UNICODE_MODIFIER_SYMBOL || t == G_UNICODE_MATH_SYMBOL
210          || t == G_UNICODE_OTHER_SYMBOL);
211}
212
213/**
214 * g_unichar_isspace:
215 * @c: a Unicode character
216 *
217 * Determines whether a character is a space, tab, or line separator
218 * (newline, carriage return, etc.).  Given some UTF-8 text, obtain a
219 * character value with g_utf8_get_char().
220 *
221 * (Note: don't use this to do word breaking; you have to use
222 * Pango or equivalent to get word breaking right, the algorithm
223 * is fairly complex.)
224 * 
225 * Return value: %TRUE if @c is a punctuation character
226 **/
227gboolean
228g_unichar_isspace (gunichar c)
229{
230  switch (c)
231    {
232      /* special-case these since Unicode thinks they are not spaces */
233    case '\t':
234    case '\n':
235    case '\r':
236    case '\f':
237      return TRUE;
238      break;
239     
240    default:
241      {
242        int t = TYPE (c);
243        return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR
244                || t == G_UNICODE_PARAGRAPH_SEPARATOR);
245      }
246      break;
247    }
248}
249
250/**
251 * g_unichar_isupper:
252 * @c: a Unicode character
253 *
254 * Determines if a character is uppercase.
255 *
256 * Return value: %TRUE if @c is an uppercase character
257 **/
258gboolean
259g_unichar_isupper (gunichar c)
260{
261  return TYPE (c) == G_UNICODE_UPPERCASE_LETTER;
262}
263
264/**
265 * g_unichar_istitle:
266 * @c: a Unicode character
267 *
268 * Determines if a character is titlecase. Some characters in
269 * Unicode which are composites, such as the DZ digraph
270 * have three case variants instead of just two. The titlecase
271 * form is used at the beginning of a word where only the
272 * first letter is capitalized. The titlecase form of the DZ
273 * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z.
274 *
275 * Return value: %TRUE if the character is titlecase
276 **/
277gboolean
278g_unichar_istitle (gunichar c)
279{
280  unsigned int i;
281  for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
282    if (title_table[i][0] == c)
283      return 1;
284  return 0;
285}
286
287/**
288 * g_unichar_isxdigit:
289 * @c: a Unicode character.
290 *
291 * Determines if a character is a hexidecimal digit.
292 *
293 * Return value: %TRUE if the character is a hexadecimal digit
294 **/
295gboolean
296g_unichar_isxdigit (gunichar c)
297{
298  int t = TYPE (c);
299  return ((c >= 'a' && c <= 'f')
300          || (c >= 'A' && c <= 'F')
301          || ISDIGIT (t));
302}
303
304/**
305 * g_unichar_isdefined:
306 * @c: a Unicode character
307 *
308 * Determines if a given character is assigned in the Unicode
309 * standard.
310 *
311 * Return value: %TRUE if the character has an assigned value
312 **/
313gboolean
314g_unichar_isdefined (gunichar c)
315{
316  int t = TYPE (c);
317  return t != G_UNICODE_UNASSIGNED;
318}
319
320/**
321 * g_unichar_iswide:
322 * @c: a Unicode character
323 *
324 * Determines if a character is typically rendered in a double-width
325 * cell.
326 *
327 * Return value: %TRUE if the character is wide
328 **/
329/* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>.  */
330gboolean
331g_unichar_iswide (gunichar c)
332{
333  if (c < 0x1100)
334    return 0;
335
336  return ((c >= 0x1100 && c <= 0x115f)     /* Hangul Jamo */
337          || (c >= 0x2e80 && c <= 0xa4cf && (c & ~0x0011) != 0x300a &&
338              c != 0x303f)                 /* CJK ... Yi */
339          || (c >= 0xac00 && c <= 0xd7a3)  /* Hangul Syllables */
340          || (c >= 0xf900 && c <= 0xfaff)  /* CJK Compatibility Ideographs */
341          || (c >= 0xfe30 && c <= 0xfe6f)  /* CJK Compatibility Forms */
342          || (c >= 0xff00 && c <= 0xff5f)  /* Fullwidth Forms */
343          || (c >= 0xffe0 && c <= 0xffe6));
344}
345
346/**
347 * g_unichar_toupper:
348 * @c: a Unicode character
349 *
350 * Converts a character to uppercase.
351 *
352 * Return value: the result of converting @c to uppercase.
353 *               If @c is not an lowercase or titlecase character,
354 *               or has no upper case equivalent @c is returned unchanged.
355 **/
356gunichar
357g_unichar_toupper (gunichar c)
358{
359  int t = TYPE (c);
360  if (t == G_UNICODE_LOWERCASE_LETTER)
361    {
362      gunichar val = ATTTABLE (c >> 8, c & 0xff);
363      if (val >= 0xd800 && val < 0xdc00)
364        {
365          const guchar *p = special_case_table[val - 0xd800];
366          return p[0] * 256 + p[1];
367        }
368      else
369        return val ? val : c;
370    }
371  else if (t == G_UNICODE_TITLECASE_LETTER)
372    {
373      unsigned int i;
374      for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
375        {
376          if (title_table[i][0] == c)
377            return title_table[i][1];
378        }
379    }
380  return c;
381}
382
383/**
384 * g_unichar_tolower:
385 * @c: a Unicode character.
386 *
387 * Converts a character to lower case.
388 *
389 * Return value: the result of converting @c to lower case.
390 *               If @c is not an upperlower or titlecase character,
391 *               or has no lowercase equivalent @c is returned unchanged.
392 **/
393gunichar
394g_unichar_tolower (gunichar c)
395{
396  int t = TYPE (c);
397  if (t == G_UNICODE_UPPERCASE_LETTER)
398    {
399      gunichar val = ATTTABLE (c >> 8, c & 0xff);
400      if (val >= 0xd800 && val < 0xdc00)
401        {
402          const guchar *p = special_case_table[val - 0xd800];
403          return p[0] * 256 + p[1];
404        }
405      else
406        return val ? val : c;
407    }
408  else if (t == G_UNICODE_TITLECASE_LETTER)
409    {
410      unsigned int i;
411      for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
412        {
413          if (title_table[i][0] == c)
414            return title_table[i][2];
415        }
416    }
417  return c;
418}
419
420/**
421 * g_unichar_totitle:
422 * @c: a Unicode character
423 *
424 * Converts a character to the titlecase.
425 *
426 * Return value: the result of converting @c to titlecase.
427 *               If @c is not an uppercase or lowercase character,
428 *               @c is returned unchanged.
429 **/
430gunichar
431g_unichar_totitle (gunichar c)
432{
433  unsigned int i;
434  for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
435    {
436      if (title_table[i][0] == c || title_table[i][1] == c
437          || title_table[i][2] == c)
438        return title_table[i][0];
439    }
440  return (TYPE (c) == G_UNICODE_LOWERCASE_LETTER
441          ? ATTTABLE (c >> 8, c & 0xff)
442          : c);
443}
444
445/**
446 * g_unichar_digit_value:
447 * @c: a Unicode character
448 *
449 * Determines the numeric value of a character as a decimal
450 * digit.
451 *
452 * Return value: If @c is a decimal digit (according to
453 * g_unichar_isdigit()), its numeric value. Otherwise, -1.
454 **/
455int
456g_unichar_digit_value (gunichar c)
457{
458  if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
459    return ATTTABLE (c >> 8, c & 0xff);
460  return -1;
461}
462
463/**
464 * g_unichar_xdigit_value:
465 * @c: a Unicode character
466 *
467 * Determines the numeric value of a character as a hexidecimal
468 * digit.
469 *
470 * Return value: If @c is a hex digit (according to
471 * g_unichar_isxdigit()), its numeric value. Otherwise, -1.
472 **/
473int
474g_unichar_xdigit_value (gunichar c)
475{
476  if (c >= 'A' && c <= 'F')
477    return c - 'A' + 10;
478  if (c >= 'a' && c <= 'f')
479    return c - 'a' + 10;
480  if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
481    return ATTTABLE (c >> 8, c & 0xff);
482  return -1;
483}
484
485/**
486 * g_unichar_type:
487 * @c: a Unicode character
488 *
489 * Classifies a Unicode character by type.
490 *
491 * Return value: the type of the character.
492 **/
493GUnicodeType
494g_unichar_type (gunichar c)
495{
496  return TYPE (c);
497}
498
499/*
500 * Case mapping functions
501 */
502
/* Locale families that need language-specific case mapping rules.  */
typedef enum {
  LOCALE_NORMAL,
  LOCALE_TURKIC,
  LOCALE_LITHUANIAN
} LocaleType;

/* Classify the current LC_CTYPE locale by the first two characters of
 * its name: "az" and "tr" are Turkic, "lt" is Lithuanian, and
 * everything else is LOCALE_NORMAL.
 */
static LocaleType
get_locale_type (void)
{
  const char *locale = setlocale (LC_CTYPE, NULL);

  /* setlocale() may return NULL; treat that as an ordinary locale
   * instead of dereferencing it.
   */
  if (locale == NULL)
    return LOCALE_NORMAL;

  /* If locale[0] matched a letter it is not the terminator, so
   * reading locale[1] is always in bounds.
   */
  switch (locale[0])
    {
    case 'a':
      if (locale[1] == 'z')
        return LOCALE_TURKIC;
      break;
    case 'l':
      if (locale[1] == 't')
        return LOCALE_LITHUANIAN;
      break;
    case 't':
      if (locale[1] == 'r')
        return LOCALE_TURKIC;
      break;
    default:
      break;
    }

  return LOCALE_NORMAL;
}
532
533static int
534output_marks (const char **p_inout,
535              char        *out_buffer,
536              int          len,
537              gboolean     remove_dot)
538{
539  const char *p = *p_inout;
540 
541  while (*p)
542    {
543      gunichar c = g_utf8_get_char (p);
544      int t = TYPE(c);
545     
546      if (ISMARK(t))
547        {
548          if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */)
549            len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL);
550          p = g_utf8_next_char (p);
551        }
552      else
553        break;
554    }
555
556  *p_inout = p;
557  return len;
558}
559
560static gsize
561output_special_case (gchar *out_buffer,
562                     gsize  len,
563                     int    index,
564                     int    type,
565                     int    which)
566{
567  const guchar *p = special_case_table[index];
568
569  if (type != G_UNICODE_TITLECASE_LETTER)
570    p += 2; /* +2 to skip over "best single match" */
571
572  if (which == 1)
573    {
574      while (p[0] && p[1])
575        p += 2;
576      p += 2;
577    }
578
579  while (TRUE)
580    {
581      gunichar ch = p[0] * 256 + p[1];
582      if (!ch)
583        break;
584
585      len += g_unichar_to_utf8 (ch, out_buffer ? out_buffer + len : NULL);
586      p += 2;
587    }
588
589  return len;
590}
591
592static gsize
593real_toupper (const gchar *str,
594              gssize       max_len,
595              gchar       *out_buffer,
596              LocaleType   locale_type)
597{
598  const gchar *p = str;
599  const char *last = NULL;
600  gsize len = 0;
601  gboolean last_was_i = FALSE;
602
603  while ((max_len < 0 || p < str + max_len) && *p)
604    {
605      gunichar c = g_utf8_get_char (p);
606      int t = TYPE (c);
607      gunichar val;
608
609      last = p;
610      p = g_utf8_next_char (p);
611
612      if (locale_type == LOCALE_LITHUANIAN)
613        {
614          if (c == 'i')
615            last_was_i = TRUE;
616          else
617            {
618              if (last_was_i)
619                {
620                  /* Nasty, need to remove any dot above. Though
621                   * I think only E WITH DOT ABOVE occurs in practice
622                   * which could simplify this considerably.
623                   */
624                  gsize decomp_len, i;
625                  gunichar *decomp;
626
627                  decomp = g_unicode_canonical_decomposition (c, &decomp_len);
628                  for (i=0; i < decomp_len; i++)
629                    {
630                      if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */)
631                        len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL);
632                    }
633                  g_free (decomp);
634                 
635                  len = output_marks (&p, out_buffer, len, TRUE);
636
637                  continue;
638                }
639
640              if (!ISMARK(t))
641                last_was_i = FALSE;
642            }
643        }
644     
645      if (locale_type == LOCALE_TURKIC && c == 'i')
646        {
647          /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */
648          len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL);
649        }
650      else if (c == 0x0345)     /* COMBINING GREEK YPOGEGRAMMENI */
651        {
652          /* Nasty, need to move it after other combining marks .. this would go away if
653           * we normalized first.
654           */
655          len = output_marks (&p, out_buffer, len, FALSE);
656
657          /* And output as GREEK CAPITAL LETTER IOTA */
658          len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL);         
659        }
660      else if (t == G_UNICODE_LOWERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
661        {
662          val = ATTTABLE (c >> 8, c & 0xff);
663
664          if (val >= 0xd800 && val < 0xdc00)
665            {
666              len += output_special_case (out_buffer, len, val - 0xd800, t,
667                                          t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1);
668            }
669          else
670            {
671              if (t == G_UNICODE_TITLECASE_LETTER)
672                {
673                  unsigned int i;
674                  for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
675                    {
676                      if (title_table[i][0] == c)
677                        val = title_table[i][1];
678                    }
679                }
680
681              len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
682            }
683        }
684      else
685        {
686          gsize char_len = g_utf8_skip[*(guchar *)last];
687
688          if (out_buffer)
689            memcpy (out_buffer + len, last, char_len);
690
691          len += char_len;
692        }
693
694    }
695
696  return len;
697}
698
699/**
700 * g_utf8_strup:
701 * @str: a UTF-8 encoded string
702 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
703 *
704 * Converts all Unicode characters in the string that have a case
705 * to uppercase. The exact manner that this is done depends
706 * on the current locale, and may result in the number of
707 * characters in the string increasing. (For instance, the
708 * German ess-zet will be changed to SS.)
709 *
710 * Return value: a newly allocated string, with all characters
711 *    converted to uppercase. 
712 **/
713gchar *
714g_utf8_strup (const gchar *str,
715              gssize       len)
716{
717  gsize result_len;
718  LocaleType locale_type;
719  gchar *result;
720
721  g_return_val_if_fail (str != NULL, NULL);
722
723  locale_type = get_locale_type ();
724 
725  /*
726   * We use a two pass approach to keep memory management simple
727   */
728  result_len = real_toupper (str, len, NULL, locale_type);
729  result = g_malloc (result_len + 1);
730  real_toupper (str, len, result, locale_type);
731  result[result_len] = '\0';
732
733  return result;
734}
735
736static gsize
737real_tolower (const gchar *str,
738              gssize       max_len,
739              gchar       *out_buffer,
740              LocaleType   locale_type)
741{
742  const gchar *p = str;
743  const char *last = NULL;
744  gsize len = 0;
745
746  while ((max_len < 0 || p < str + max_len) && *p)
747    {
748      gunichar c = g_utf8_get_char (p);
749      int t = TYPE (c);
750      gunichar val;
751
752      last = p;
753      p = g_utf8_next_char (p);
754
755      if (locale_type == LOCALE_TURKIC && c == 'I')
756        {
757          /* I => LATIN SMALL LETTER DOTLESS I */
758          len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL);
759        }
760      else if (c == 0x03A3)     /* GREEK CAPITAL LETTER SIGMA */
761        {
762          if ((max_len < 0 || p < str + max_len) && *p)
763            {
764              gunichar next_c = g_utf8_get_char (p);
765              int next_type = TYPE(next_c);
766
767              /* SIGMA mapps differently depending on whether it is
768               * final or not. The following simplified test would
769               * fail in the case of combining marks following the
770               * sigma, but I don't think that occurs in real text.
771               * The test here matches that in ICU.
772               */
773              if (ISALPHA(next_type)) /* Lu,Ll,Lt,Lm,Lo */
774                val = 0x3c3;    /* GREEK SMALL SIGMA */
775              else
776                val = 0x3c2;    /* GREEK SMALL FINAL SIGMA */
777            }
778          else
779            val = 0x3c2;        /* GREEK SMALL FINAL SIGMA */
780
781          len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
782        }
783      else if (t == G_UNICODE_UPPERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
784        {
785          val = ATTTABLE (c >> 8, c & 0xff);
786
787          if (val >= 0xd800 && val < 0xdc00)
788            {
789              len += output_special_case (out_buffer, len, val - 0xd800, t, 0);
790            }
791          else
792            {
793              if (t == G_UNICODE_TITLECASE_LETTER)
794                {
795                  unsigned int i;
796                  for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
797                    {
798                      if (title_table[i][0] == c)
799                        val = title_table[i][2];
800                    }
801                }
802
803              len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
804            }
805        }
806      else
807        {
808          gsize char_len = g_utf8_skip[*(guchar *)last];
809
810          if (out_buffer)
811            memcpy (out_buffer + len, last, char_len);
812
813          len += char_len;
814        }
815
816    }
817
818  return len;
819}
820
821/**
822 * g_utf8_strdown:
823 * @str: a UTF-8 encoded string
824 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
825 *
826 * Converts all Unicode characters in the string that have a case
827 * to lowercase. The exact manner that this is done depends
828 * on the current locale, and may result in the number of
829 * characters in the string changing.
830 *
831 * Return value: a newly allocated string, with all characters
832 *    converted to lowercase. 
833 **/
834gchar *
835g_utf8_strdown (const gchar *str,
836                gssize       len)
837{
838  gsize result_len;
839  LocaleType locale_type;
840  gchar *result;
841
842  g_return_val_if_fail (str != NULL, NULL);
843
844  locale_type = get_locale_type ();
845 
846  /*
847   * We use a two pass approach to keep memory management simple
848   */
849  result_len = real_tolower (str, len, NULL, locale_type);
850  result = g_malloc (result_len + 1);
851  real_tolower (str, len, result, locale_type);
852  result[result_len] = '\0';
853
854  return result;
855}
856
857/**
858 * g_utf8_casefold:
859 * @str: a UTF-8 encoded string
860 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
861 *
862 * Converts a string into a form that is independent of case. The
863 * result will not correspond to any particular case, but can be
864 * compared for equality or ordered with the results of calling
865 * g_utf8_casefold() on other strings.
866 *
867 * Note that calling g_utf8_casefold() followed by g_utf8_collate() is
868 * only an approximation to the correct linguistic case insensitive
869 * ordering, though it is a fairly good one. Getting this exactly
870 * right would require a more sophisticated collation function that
871 * takes case sensitivity into account. GLib does not currently
872 * provide such a function.
873 *
874 * Return value: a newly allocated string, that is a
875 *   case independent form of @str.
876 **/
877gchar *
878g_utf8_casefold (const gchar *str,
879                 gssize       len)
880{
881  GString *result = g_string_new (NULL);
882  const char *p;
883
884  p = str;
885  while ((len < 0 || p < str + len) && *p)
886    {
887      gunichar ch = g_utf8_get_char (p);
888
889      int start = 0;
890      int end = G_N_ELEMENTS (casefold_table);
891
892      if (ch >= casefold_table[start].ch &&
893          ch <= casefold_table[end - 1].ch)
894        {
895          while (TRUE)
896            {
897              int half = (start + end) / 2;
898              if (ch == casefold_table[half].ch)
899                {
900                  g_string_append (result, casefold_table[half].data);
901                  goto next;
902                }
903              else if (half == start)
904                break;
905              else if (ch > casefold_table[half].ch)
906                start = half;
907              else
908                end = half;
909            }
910        }
911
912      g_string_append_unichar (result, g_unichar_tolower (ch));
913     
914    next:
915      p = g_utf8_next_char (p);
916    }
917
918  return g_string_free (result, FALSE);
919}
Note: See TracBrowser for help on using the repository browser.