source: trunk/third/glib2/glib/guniprop.c @ 20721

Revision 20721, 26.2 KB checked in by ghudson, 20 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r20720, which included commits to RCS files with non-trunk default branches.
Line 
1/* guniprop.c - Unicode character properties.
2 *
3 * Copyright (C) 1999 Tom Tromey
4 * Copyright (C) 2000 Red Hat, Inc.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#include "config.h"
23
24#include <stddef.h>
25#include <string.h>
26#include <locale.h>
27
28#include "glib.h"
29#include "gunichartables.h"
30#include "gunicodeprivate.h"
31
/* Page entry of the attribute table for a given page number.  Pages
 * above G_UNICODE_LAST_PAGE_PART1 are read from attr_table_part2 (after
 * subtracting 0xe00); all other pages are read from attr_table_part1.
 * No range check is performed here, and Page is expanded more than once.
 */
#define ATTR_TABLE(Page) \
  (((Page) > G_UNICODE_LAST_PAGE_PART1) \
   ? attr_table_part2[(Page) - 0xe00] \
   : attr_table_part1[Page])

/* Attribute value stored for character Char of page Page.  A page entry
 * equal to G_UNICODE_MAX_TABLE_INDEX yields 0; otherwise the page entry
 * selects a row of attr_data.
 */
#define ATTTABLE(Page, Char) \
  ((ATTR_TABLE(Page) != G_UNICODE_MAX_TABLE_INDEX) \
   ? (attr_data[ATTR_TABLE(Page)][Char]) \
   : 0)

/* Type lookup in the first table.  A page entry at or above
 * G_UNICODE_MAX_TABLE_INDEX encodes the type directly (entry minus
 * G_UNICODE_MAX_TABLE_INDEX); a smaller entry selects a row of type_data.
 */
#define TTYPE_PART1(Page, Char) \
  ((type_table_part1[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_data[type_table_part1[Page]][Char]) \
   : (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Same as TTYPE_PART1, but for the second table. */
#define TTYPE_PART2(Page, Char) \
  ((type_table_part2[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_data[type_table_part2[Page]][Char]) \
   : (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Type of a character.  Characters up to G_UNICODE_LAST_CHAR_PART1 use
 * the first table, characters in 0xe0000 .. G_UNICODE_LAST_CHAR use the
 * second table, and everything else is G_UNICODE_UNASSIGNED.
 * Char is expanded more than once, so it must be free of side effects.
 */
#define TYPE(Char) \
  (((Char) > G_UNICODE_LAST_CHAR_PART1) \
   ? (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : G_UNICODE_UNASSIGNED) \
   : TTYPE_PART1 ((Char) >> 8, (Char) & 0xff))
55
56
/* True for any of the three number types (decimal, letter, other). */
#define ISDIGIT(Type) \
  ((Type) == G_UNICODE_DECIMAL_NUMBER \
   || (Type) == G_UNICODE_LETTER_NUMBER \
   || (Type) == G_UNICODE_OTHER_NUMBER)

/* True for any of the five letter types. */
#define ISALPHA(Type) \
  ((Type) == G_UNICODE_LOWERCASE_LETTER \
   || (Type) == G_UNICODE_UPPERCASE_LETTER \
   || (Type) == G_UNICODE_TITLECASE_LETTER \
   || (Type) == G_UNICODE_MODIFIER_LETTER \
   || (Type) == G_UNICODE_OTHER_LETTER)

/* True for any of the three mark types (non-spacing, combining, enclosing). */
#define ISMARK(Type) \
  ((Type) == G_UNICODE_NON_SPACING_MARK \
   || (Type) == G_UNICODE_COMBINING_MARK \
   || (Type) == G_UNICODE_ENCLOSING_MARK)
70                     
71
72/**
73 * g_unichar_isalnum:
74 * @c: a Unicode character
75 *
76 * Determines whether a character is alphanumeric.
77 * Given some UTF-8 text, obtain a character value
78 * with g_utf8_get_char().
79 *
80 * Return value: %TRUE if @c is an alphanumeric character
81 **/
82gboolean
83g_unichar_isalnum (gunichar c)
84{
85  int t = TYPE (c);
86  return ISDIGIT (t) || ISALPHA (t);
87}
88
89/**
90 * g_unichar_isalpha:
91 * @c: a Unicode character
92 *
93 * Determines whether a character is alphabetic (i.e. a letter).
94 * Given some UTF-8 text, obtain a character value with
95 * g_utf8_get_char().
96 *
97 * Return value: %TRUE if @c is an alphabetic character
98 **/
99gboolean
100g_unichar_isalpha (gunichar c)
101{
102  int t = TYPE (c);
103  return ISALPHA (t);
104}
105
106
107/**
108 * g_unichar_iscntrl:
109 * @c: a Unicode character
110 *
111 * Determines whether a character is a control character.
112 * Given some UTF-8 text, obtain a character value with
113 * g_utf8_get_char().
114 *
115 * Return value: %TRUE if @c is a control character
116 **/
117gboolean
118g_unichar_iscntrl (gunichar c)
119{
120  return TYPE (c) == G_UNICODE_CONTROL;
121}
122
123/**
124 * g_unichar_isdigit:
125 * @c: a Unicode character
126 *
127 * Determines whether a character is numeric (i.e. a digit).  This
128 * covers ASCII 0-9 and also digits in other languages/scripts.  Given
129 * some UTF-8 text, obtain a character value with g_utf8_get_char().
130 *
131 * Return value: %TRUE if @c is a digit
132 **/
133gboolean
134g_unichar_isdigit (gunichar c)
135{
136  return TYPE (c) == G_UNICODE_DECIMAL_NUMBER;
137}
138
139
140/**
141 * g_unichar_isgraph:
142 * @c: a Unicode character
143 *
144 * Determines whether a character is printable and not a space
145 * (returns %FALSE for control characters, format characters, and
146 * spaces). g_unichar_isprint() is similar, but returns %TRUE for
147 * spaces. Given some UTF-8 text, obtain a character value with
148 * g_utf8_get_char().
149 *
150 * Return value: %TRUE if @c is printable unless it's a space
151 **/
152gboolean
153g_unichar_isgraph (gunichar c)
154{
155  int t = TYPE (c);
156  return (t != G_UNICODE_CONTROL
157          && t != G_UNICODE_FORMAT
158          && t != G_UNICODE_UNASSIGNED
159          && t != G_UNICODE_PRIVATE_USE
160          && t != G_UNICODE_SURROGATE
161          && t != G_UNICODE_SPACE_SEPARATOR);
162}
163
164/**
165 * g_unichar_islower:
166 * @c: a Unicode character
167 *
168 * Determines whether a character is a lowercase letter.
169 * Given some UTF-8 text, obtain a character value with
170 * g_utf8_get_char().
171 *
172 * Return value: %TRUE if @c is a lowercase letter
173 **/
174gboolean
175g_unichar_islower (gunichar c)
176{
177  return TYPE (c) == G_UNICODE_LOWERCASE_LETTER;
178}
179
180
181/**
182 * g_unichar_isprint:
183 * @c: a Unicode character
184 *
185 * Determines whether a character is printable.
186 * Unlike g_unichar_isgraph(), returns %TRUE for spaces.
187 * Given some UTF-8 text, obtain a character value with
188 * g_utf8_get_char().
189 *
190 * Return value: %TRUE if @c is printable
191 **/
192gboolean
193g_unichar_isprint (gunichar c)
194{
195  int t = TYPE (c);
196  return (t != G_UNICODE_CONTROL
197          && t != G_UNICODE_FORMAT
198          && t != G_UNICODE_UNASSIGNED
199          && t != G_UNICODE_PRIVATE_USE
200          && t != G_UNICODE_SURROGATE);
201}
202
203/**
204 * g_unichar_ispunct:
205 * @c: a Unicode character
206 *
207 * Determines whether a character is punctuation or a symbol.
208 * Given some UTF-8 text, obtain a character value with
209 * g_utf8_get_char().
210 *
211 * Return value: %TRUE if @c is a punctuation or symbol character
212 **/
213gboolean
214g_unichar_ispunct (gunichar c)
215{
216  int t = TYPE (c);
217  return (t == G_UNICODE_CONNECT_PUNCTUATION || t == G_UNICODE_DASH_PUNCTUATION
218          || t == G_UNICODE_CLOSE_PUNCTUATION || t == G_UNICODE_FINAL_PUNCTUATION
219          || t == G_UNICODE_INITIAL_PUNCTUATION || t == G_UNICODE_OTHER_PUNCTUATION
220          || t == G_UNICODE_OPEN_PUNCTUATION || t == G_UNICODE_CURRENCY_SYMBOL
221          || t == G_UNICODE_MODIFIER_SYMBOL || t == G_UNICODE_MATH_SYMBOL
222          || t == G_UNICODE_OTHER_SYMBOL);
223}
224
225/**
226 * g_unichar_isspace:
227 * @c: a Unicode character
228 *
229 * Determines whether a character is a space, tab, or line separator
230 * (newline, carriage return, etc.).  Given some UTF-8 text, obtain a
231 * character value with g_utf8_get_char().
232 *
233 * (Note: don't use this to do word breaking; you have to use
234 * Pango or equivalent to get word breaking right, the algorithm
235 * is fairly complex.)
236 * 
237 * Return value: %TRUE if @c is a punctuation character
238 **/
239gboolean
240g_unichar_isspace (gunichar c)
241{
242  switch (c)
243    {
244      /* special-case these since Unicode thinks they are not spaces */
245    case '\t':
246    case '\n':
247    case '\r':
248    case '\f':
249      return TRUE;
250      break;
251     
252    default:
253      {
254        int t = TYPE (c);
255        return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR
256                || t == G_UNICODE_PARAGRAPH_SEPARATOR);
257      }
258      break;
259    }
260}
261
262/**
263 * g_unichar_isupper:
264 * @c: a Unicode character
265 *
266 * Determines if a character is uppercase.
267 *
268 * Return value: %TRUE if @c is an uppercase character
269 **/
270gboolean
271g_unichar_isupper (gunichar c)
272{
273  return TYPE (c) == G_UNICODE_UPPERCASE_LETTER;
274}
275
276/**
277 * g_unichar_istitle:
278 * @c: a Unicode character
279 *
280 * Determines if a character is titlecase. Some characters in
281 * Unicode which are composites, such as the DZ digraph
282 * have three case variants instead of just two. The titlecase
283 * form is used at the beginning of a word where only the
284 * first letter is capitalized. The titlecase form of the DZ
285 * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z.
286 *
287 * Return value: %TRUE if the character is titlecase
288 **/
289gboolean
290g_unichar_istitle (gunichar c)
291{
292  unsigned int i;
293  for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
294    if (title_table[i][0] == c)
295      return 1;
296  return 0;
297}
298
299/**
300 * g_unichar_isxdigit:
301 * @c: a Unicode character.
302 *
303 * Determines if a character is a hexidecimal digit.
304 *
305 * Return value: %TRUE if the character is a hexadecimal digit
306 **/
307gboolean
308g_unichar_isxdigit (gunichar c)
309{
310  int t = TYPE (c);
311  return ((c >= 'a' && c <= 'f')
312          || (c >= 'A' && c <= 'F')
313          || ISDIGIT (t));
314}
315
316/**
317 * g_unichar_isdefined:
318 * @c: a Unicode character
319 *
320 * Determines if a given character is assigned in the Unicode
321 * standard.
322 *
323 * Return value: %TRUE if the character has an assigned value
324 **/
325gboolean
326g_unichar_isdefined (gunichar c)
327{
328  int t = TYPE (c);
329  return t != G_UNICODE_UNASSIGNED;
330}
331
332/**
333 * g_unichar_iswide:
334 * @c: a Unicode character
335 *
336 * Determines if a character is typically rendered in a double-width
337 * cell.
338 *
339 * Return value: %TRUE if the character is wide
340 **/
341/* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>.  */
342gboolean
343g_unichar_iswide (gunichar c)
344{
345  if (c < 0x1100)
346    return FALSE;
347
348  return (c <= 0x115f  /* Hangul Jamo init. consonants */
349          || c == 0x2329 || c == 0x232a     /* angle brackets */
350          || (c >= 0x2e80 && c <= 0xa4cf && (c < 0x302a || c > 0x302f)
351              && c != 0x303f && c != 0x3099 && c!= 0x309a) /* CJK ... Yi */
352          || (c >= 0xac00 && c <= 0xd7a3)   /* Hangul Syllables */
353          || (c >= 0xf900 && c <= 0xfaff)   /* CJK Compatibility Ideographs */
354          || (c >= 0xfe30 && c <= 0xfe6f)   /* CJK Compatibility Forms */
355          || (c >= 0xff00 && c <= 0xff60)   /* Fullwidth Forms */
356          || (c >= 0xffe0 && c <= 0xffe6)   /* Fullwidth Forms */
357          || (c >= 0x20000 && c <= 0x2fffd) /* CJK extra stuff */
358          || (c >= 0x30000 && c <= 0x3fffd));
359}
360
361/**
362 * g_unichar_toupper:
363 * @c: a Unicode character
364 *
365 * Converts a character to uppercase.
366 *
367 * Return value: the result of converting @c to uppercase.
368 *               If @c is not an lowercase or titlecase character,
369 *               or has no upper case equivalent @c is returned unchanged.
370 **/
371gunichar
372g_unichar_toupper (gunichar c)
373{
374  int t = TYPE (c);
375  if (t == G_UNICODE_LOWERCASE_LETTER)
376    {
377      gunichar val = ATTTABLE (c >> 8, c & 0xff);
378      if (val >= 0x1000000)
379        {
380          const gchar *p = special_case_table + val - 0x1000000;
381          return g_utf8_get_char (p);
382        }
383      else
384        return val ? val : c;
385    }
386  else if (t == G_UNICODE_TITLECASE_LETTER)
387    {
388      unsigned int i;
389      for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
390        {
391          if (title_table[i][0] == c)
392            return title_table[i][1];
393        }
394    }
395  return c;
396}
397
398/**
399 * g_unichar_tolower:
400 * @c: a Unicode character.
401 *
402 * Converts a character to lower case.
403 *
404 * Return value: the result of converting @c to lower case.
405 *               If @c is not an upperlower or titlecase character,
406 *               or has no lowercase equivalent @c is returned unchanged.
407 **/
408gunichar
409g_unichar_tolower (gunichar c)
410{
411  int t = TYPE (c);
412  if (t == G_UNICODE_UPPERCASE_LETTER)
413    {
414      gunichar val = ATTTABLE (c >> 8, c & 0xff);
415      if (val >= 0x1000000)
416        {
417          const gchar *p = special_case_table + val - 0x1000000;
418          return g_utf8_get_char (p);
419        }
420      else
421        return val ? val : c;
422    }
423  else if (t == G_UNICODE_TITLECASE_LETTER)
424    {
425      unsigned int i;
426      for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
427        {
428          if (title_table[i][0] == c)
429            return title_table[i][2];
430        }
431    }
432  return c;
433}
434
435/**
436 * g_unichar_totitle:
437 * @c: a Unicode character
438 *
439 * Converts a character to the titlecase.
440 *
441 * Return value: the result of converting @c to titlecase.
442 *               If @c is not an uppercase or lowercase character,
443 *               @c is returned unchanged.
444 **/
445gunichar
446g_unichar_totitle (gunichar c)
447{
448  unsigned int i;
449  for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
450    {
451      if (title_table[i][0] == c || title_table[i][1] == c
452          || title_table[i][2] == c)
453        return title_table[i][0];
454    }
455  return (TYPE (c) == G_UNICODE_LOWERCASE_LETTER
456          ? ATTTABLE (c >> 8, c & 0xff)
457          : c);
458}
459
460/**
461 * g_unichar_digit_value:
462 * @c: a Unicode character
463 *
464 * Determines the numeric value of a character as a decimal
465 * digit.
466 *
467 * Return value: If @c is a decimal digit (according to
468 * g_unichar_isdigit()), its numeric value. Otherwise, -1.
469 **/
470int
471g_unichar_digit_value (gunichar c)
472{
473  if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
474    return ATTTABLE (c >> 8, c & 0xff);
475  return -1;
476}
477
478/**
479 * g_unichar_xdigit_value:
480 * @c: a Unicode character
481 *
482 * Determines the numeric value of a character as a hexidecimal
483 * digit.
484 *
485 * Return value: If @c is a hex digit (according to
486 * g_unichar_isxdigit()), its numeric value. Otherwise, -1.
487 **/
488int
489g_unichar_xdigit_value (gunichar c)
490{
491  if (c >= 'A' && c <= 'F')
492    return c - 'A' + 10;
493  if (c >= 'a' && c <= 'f')
494    return c - 'a' + 10;
495  if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER)
496    return ATTTABLE (c >> 8, c & 0xff);
497  return -1;
498}
499
500/**
501 * g_unichar_type:
502 * @c: a Unicode character
503 *
504 * Classifies a Unicode character by type.
505 *
506 * Return value: the type of the character.
507 **/
508GUnicodeType
509g_unichar_type (gunichar c)
510{
511  return TYPE (c);
512}
513
514/*
515 * Case mapping functions
516 */
517
/* Locale families that need special treatment in the case mapping
 * functions below.
 */
typedef enum {
  LOCALE_NORMAL,      /* no locale-specific rules */
  LOCALE_TURKIC,      /* locale name starts with "az" or "tr" */
  LOCALE_LITHUANIAN   /* locale name starts with "lt" */
} LocaleType;

/* Classifies the current LC_CTYPE locale by the first two characters
 * of its name.  Returns LOCALE_NORMAL for every name that is not
 * recognized, and also when the locale name cannot be queried.
 */
static LocaleType
get_locale_type (void)
{
  const char *locale = setlocale (LC_CTYPE, NULL);

  /* setlocale() returns NULL when the request cannot be honored. */
  if (locale == NULL)
    return LOCALE_NORMAL;

  /* locale[1] is only read when locale[0] is a letter, so an empty
   * name never causes a read past the terminating nul.
   */
  switch (locale[0])
    {
    case 'a':
      if (locale[1] == 'z')
        return LOCALE_TURKIC;
      break;
    case 'l':
      if (locale[1] == 't')
        return LOCALE_LITHUANIAN;
      break;
    case 't':
      if (locale[1] == 'r')
        return LOCALE_TURKIC;
      break;
    default:
      break;
    }

  return LOCALE_NORMAL;
}
547
548static gint
549output_marks (const char **p_inout,
550              char        *out_buffer,
551              gboolean     remove_dot)
552{
553  const char *p = *p_inout;
554  gint len = 0;
555 
556  while (*p)
557    {
558      gunichar c = g_utf8_get_char (p);
559      int t = TYPE(c);
560     
561      if (ISMARK(t))
562        {
563          if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */)
564            len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL);
565          p = g_utf8_next_char (p);
566        }
567      else
568        break;
569    }
570
571  *p_inout = p;
572  return len;
573}
574
575static gint
576output_special_case (gchar *out_buffer,
577                     int    offset,
578                     int    type,
579                     int    which)
580{
581  const gchar *p = special_case_table + offset;
582  gint len;
583
584  if (type != G_UNICODE_TITLECASE_LETTER)
585    p = g_utf8_next_char (p);
586
587  if (which == 1)
588    p += strlen (p) + 1;
589
590  len = strlen (p);
591  if (out_buffer)
592    memcpy (out_buffer, p, len);
593
594  return len;
595}
596
597static gsize
598real_toupper (const gchar *str,
599              gssize       max_len,
600              gchar       *out_buffer,
601              LocaleType   locale_type)
602{
603  const gchar *p = str;
604  const char *last = NULL;
605  gsize len = 0;
606  gboolean last_was_i = FALSE;
607
608  while ((max_len < 0 || p < str + max_len) && *p)
609    {
610      gunichar c = g_utf8_get_char (p);
611      int t = TYPE (c);
612      gunichar val;
613
614      last = p;
615      p = g_utf8_next_char (p);
616
617      if (locale_type == LOCALE_LITHUANIAN)
618        {
619          if (c == 'i')
620            last_was_i = TRUE;
621          else
622            {
623              if (last_was_i)
624                {
625                  /* Nasty, need to remove any dot above. Though
626                   * I think only E WITH DOT ABOVE occurs in practice
627                   * which could simplify this considerably.
628                   */
629                  gsize decomp_len, i;
630                  gunichar *decomp;
631
632                  decomp = g_unicode_canonical_decomposition (c, &decomp_len);
633                  for (i=0; i < decomp_len; i++)
634                    {
635                      if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */)
636                        len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL);
637                    }
638                  g_free (decomp);
639                 
640                  len += output_marks (&p, out_buffer ? out_buffer + len : NULL, TRUE);
641
642                  continue;
643                }
644
645              if (!ISMARK(t))
646                last_was_i = FALSE;
647            }
648        }
649     
650      if (locale_type == LOCALE_TURKIC && c == 'i')
651        {
652          /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */
653          len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL);
654        }
655      else if (c == 0x0345)     /* COMBINING GREEK YPOGEGRAMMENI */
656        {
657          /* Nasty, need to move it after other combining marks .. this would go away if
658           * we normalized first.
659           */
660          len += output_marks (&p, out_buffer ? out_buffer + len : NULL, FALSE);
661
662          /* And output as GREEK CAPITAL LETTER IOTA */
663          len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL);         
664        }
665      else if (t == G_UNICODE_LOWERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
666        {
667          val = ATTTABLE (c >> 8, c & 0xff);
668
669          if (val >= 0x1000000)
670            {
671              len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t,
672                                          t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1);
673            }
674          else
675            {
676              if (t == G_UNICODE_TITLECASE_LETTER)
677                {
678                  unsigned int i;
679                  for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
680                    {
681                      if (title_table[i][0] == c)
682                        val = title_table[i][1];
683                    }
684                }
685
686              len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
687            }
688        }
689      else
690        {
691          gsize char_len = g_utf8_skip[*(guchar *)last];
692
693          if (out_buffer)
694            memcpy (out_buffer + len, last, char_len);
695
696          len += char_len;
697        }
698
699    }
700
701  return len;
702}
703
704/**
705 * g_utf8_strup:
706 * @str: a UTF-8 encoded string
707 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
708 *
709 * Converts all Unicode characters in the string that have a case
710 * to uppercase. The exact manner that this is done depends
711 * on the current locale, and may result in the number of
712 * characters in the string increasing. (For instance, the
713 * German ess-zet will be changed to SS.)
714 *
715 * Return value: a newly allocated string, with all characters
716 *    converted to uppercase. 
717 **/
718gchar *
719g_utf8_strup (const gchar *str,
720              gssize       len)
721{
722  gsize result_len;
723  LocaleType locale_type;
724  gchar *result;
725
726  g_return_val_if_fail (str != NULL, NULL);
727
728  locale_type = get_locale_type ();
729 
730  /*
731   * We use a two pass approach to keep memory management simple
732   */
733  result_len = real_toupper (str, len, NULL, locale_type);
734  result = g_malloc (result_len + 1);
735  real_toupper (str, len, result, locale_type);
736  result[result_len] = '\0';
737
738  return result;
739}
740
741/* traverses the string checking for characters with combining class == 230
742 * until a base character is found */
743static gboolean
744has_more_above (const gchar *str)
745{
746  const gchar *p = str;
747  gint combining_class;
748
749  while (*p)
750    {
751      combining_class = _g_unichar_combining_class (g_utf8_get_char (p));
752      if (combining_class == 230)
753        return TRUE;
754      else if (combining_class == 0)
755        break;
756
757      p = g_utf8_next_char (p);
758    }
759
760  return FALSE;
761}
762
763static gsize
764real_tolower (const gchar *str,
765              gssize       max_len,
766              gchar       *out_buffer,
767              LocaleType   locale_type)
768{
769  const gchar *p = str;
770  const char *last = NULL;
771  gsize len = 0;
772
773  while ((max_len < 0 || p < str + max_len) && *p)
774    {
775      gunichar c = g_utf8_get_char (p);
776      int t = TYPE (c);
777      gunichar val;
778
779      last = p;
780      p = g_utf8_next_char (p);
781
782      if (locale_type == LOCALE_TURKIC && c == 'I')
783        {
784          if (g_utf8_get_char (p) == 0x0307)
785            {
786              /* I + COMBINING DOT ABOVE => i (U+0069) */
787              len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL);
788              p = g_utf8_next_char (p);
789            }
790          else
791            {
792              /* I => LATIN SMALL LETTER DOTLESS I */
793              len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL);
794            }
795        }
796      /* Introduce an explicit dot above when lowercasing capital I's and J's
797       * whenever there are more accents above. [SpecialCasing.txt] */
798      else if (locale_type == LOCALE_LITHUANIAN &&
799               (c == 0x00cc || c == 0x00cd || c == 0x0128))
800        {
801          len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL);
802          len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL);
803
804          switch (c)
805            {
806            case 0x00cc:
807              len += g_unichar_to_utf8 (0x0300, out_buffer ? out_buffer + len : NULL);
808              break;
809            case 0x00cd:
810              len += g_unichar_to_utf8 (0x0301, out_buffer ? out_buffer + len : NULL);
811              break;
812            case 0x0128:
813              len += g_unichar_to_utf8 (0x0303, out_buffer ? out_buffer + len : NULL);
814              break;
815            }
816        }
817      else if (locale_type == LOCALE_LITHUANIAN &&
818               (c == 'I' || c == 'J' || c == 0x012e) &&
819               has_more_above (p))
820        {
821          len += g_unichar_to_utf8 (g_unichar_tolower (c), out_buffer ? out_buffer + len : NULL);
822          len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL);
823        }
824      else if (c == 0x03A3)     /* GREEK CAPITAL LETTER SIGMA */
825        {
826          if ((max_len < 0 || p < str + max_len) && *p)
827            {
828              gunichar next_c = g_utf8_get_char (p);
829              int next_type = TYPE(next_c);
830
831              /* SIGMA mapps differently depending on whether it is
832               * final or not. The following simplified test would
833               * fail in the case of combining marks following the
834               * sigma, but I don't think that occurs in real text.
835               * The test here matches that in ICU.
836               */
837              if (ISALPHA(next_type)) /* Lu,Ll,Lt,Lm,Lo */
838                val = 0x3c3;    /* GREEK SMALL SIGMA */
839              else
840                val = 0x3c2;    /* GREEK SMALL FINAL SIGMA */
841            }
842          else
843            val = 0x3c2;        /* GREEK SMALL FINAL SIGMA */
844
845          len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
846        }
847      else if (t == G_UNICODE_UPPERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER)
848        {
849          val = ATTTABLE (c >> 8, c & 0xff);
850
851          if (val >= 0x1000000)
852            {
853              len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, 0);
854            }
855          else
856            {
857              if (t == G_UNICODE_TITLECASE_LETTER)
858                {
859                  unsigned int i;
860                  for (i = 0; i < G_N_ELEMENTS (title_table); ++i)
861                    {
862                      if (title_table[i][0] == c)
863                        val = title_table[i][2];
864                    }
865                }
866
867              len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL);
868            }
869        }
870      else
871        {
872          gsize char_len = g_utf8_skip[*(guchar *)last];
873
874          if (out_buffer)
875            memcpy (out_buffer + len, last, char_len);
876
877          len += char_len;
878        }
879
880    }
881
882  return len;
883}
884
885/**
886 * g_utf8_strdown:
887 * @str: a UTF-8 encoded string
888 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
889 *
890 * Converts all Unicode characters in the string that have a case
891 * to lowercase. The exact manner that this is done depends
892 * on the current locale, and may result in the number of
893 * characters in the string changing.
894 *
895 * Return value: a newly allocated string, with all characters
896 *    converted to lowercase. 
897 **/
898gchar *
899g_utf8_strdown (const gchar *str,
900                gssize       len)
901{
902  gsize result_len;
903  LocaleType locale_type;
904  gchar *result;
905
906  g_return_val_if_fail (str != NULL, NULL);
907
908  locale_type = get_locale_type ();
909 
910  /*
911   * We use a two pass approach to keep memory management simple
912   */
913  result_len = real_tolower (str, len, NULL, locale_type);
914  result = g_malloc (result_len + 1);
915  real_tolower (str, len, result, locale_type);
916  result[result_len] = '\0';
917
918  return result;
919}
920
921/**
922 * g_utf8_casefold:
923 * @str: a UTF-8 encoded string
924 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
925 *
926 * Converts a string into a form that is independent of case. The
927 * result will not correspond to any particular case, but can be
928 * compared for equality or ordered with the results of calling
929 * g_utf8_casefold() on other strings.
930 *
931 * Note that calling g_utf8_casefold() followed by g_utf8_collate() is
932 * only an approximation to the correct linguistic case insensitive
933 * ordering, though it is a fairly good one. Getting this exactly
934 * right would require a more sophisticated collation function that
935 * takes case sensitivity into account. GLib does not currently
936 * provide such a function.
937 *
938 * Return value: a newly allocated string, that is a
939 *   case independent form of @str.
940 **/
941gchar *
942g_utf8_casefold (const gchar *str,
943                 gssize       len)
944{
945  GString *result;
946  const char *p;
947
948  g_return_val_if_fail (str != NULL, NULL);
949
950  result = g_string_new (NULL);
951  p = str;
952  while ((len < 0 || p < str + len) && *p)
953    {
954      gunichar ch = g_utf8_get_char (p);
955
956      int start = 0;
957      int end = G_N_ELEMENTS (casefold_table);
958
959      if (ch >= casefold_table[start].ch &&
960          ch <= casefold_table[end - 1].ch)
961        {
962          while (TRUE)
963            {
964              int half = (start + end) / 2;
965              if (ch == casefold_table[half].ch)
966                {
967                  g_string_append (result, casefold_table[half].data);
968                  goto next;
969                }
970              else if (half == start)
971                break;
972              else if (ch > casefold_table[half].ch)
973                start = half;
974              else
975                end = half;
976            }
977        }
978
979      g_string_append_unichar (result, g_unichar_tolower (ch));
980     
981    next:
982      p = g_utf8_next_char (p);
983    }
984
985  return g_string_free (result, FALSE);
986}
987
988/**
989 * g_unichar_get_mirror_char:
990 * @ch: a unicode character
991 * @mirrored_ch: location to store the mirrored character
992 *
993 * In Unicode, some characters are <firstterm>mirrored</firstterm>. This
994 * means that their images are mirrored horizontally in text that is laid
995 * out from right to left. For instance, "(" would become its mirror image,
996 * ")", in right-to-left text.
997 *
998 * If @ch has the Unicode mirrored property and there is another unicode
999 * character that typically has a glyph that is the mirror image of @ch's
1000 * glyph, puts that character in the address pointed to by @mirrored_ch.
1001 *
1002 * Return value: %TRUE if @ch has a mirrored character and @mirrored_ch is
1003 * filled in, %FALSE otherwise
1004 *
1005 * Since: 2.4
1006 **/
1007/* This code is adapted from FriBidi (http://fribidi.sourceforge.net/).
1008 * FriBidi is: Copyright (C) 1999,2000 Dov Grobgeld, and
1009 *             Copyright (C) 2001,2002 Behdad Esfahbod.
1010 */
1011gboolean
1012g_unichar_get_mirror_char (gunichar ch,
1013                           gunichar *mirrored_ch)
1014{
1015  gint pos, step, size;
1016  gboolean found;
1017
1018  size = G_N_ELEMENTS (bidi_mirroring_table);
1019  pos = step = (size / 2) + 1;
1020
1021  while (step > 1)
1022    {
1023      gunichar cmp_ch = bidi_mirroring_table[pos].ch;
1024      step = (step + 1) / 2;
1025
1026      if (cmp_ch < ch)
1027        {
1028          pos += step;
1029          if (pos > size - 1)
1030            pos = size - 1;
1031        }
1032      else if (cmp_ch > ch)
1033        {
1034          pos -= step;
1035          if (pos < 0)
1036            pos = 0;
1037        }
1038      else
1039        break;
1040    }
1041  found = bidi_mirroring_table[pos].ch == ch;
1042  if (mirrored_ch)
1043    *mirrored_ch = found ? bidi_mirroring_table[pos].mirrored_ch : ch;
1044
1045  return found;
1046
1047}
Note: See TracBrowser for help on using the repository browser.