source: trunk/third/glib2/glib/gutf8.c @ 18159

Revision 18159, 38.9 KB checked in by ghudson, 22 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r18158, which included commits to RCS files with non-trunk default branches.
Line 
1/* gutf8.c - Operations on UTF-8 strings.
2 *
3 * Copyright (C) 1999 Tom Tromey
4 * Copyright (C) 2000 Red Hat, Inc.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#include <config.h>
23
24#include <stdlib.h>
25#ifdef HAVE_CODESET
26#include <langinfo.h>
27#endif
28#include <string.h>
29
30#include "glib.h"
31
32#ifdef G_PLATFORM_WIN32
33#include <stdio.h>
34#define STRICT
35#include <windows.h>
36#undef STRICT
37#endif
38
39#include "libcharset/libcharset.h"
40
41#include "glibintl.h"
42
/* Classify the UTF-8 lead byte Char.
 *
 * On return Len holds the number of bytes in the sequence that Char
 * starts (1..6) and Mask holds the bit mask selecting the payload bits
 * of the lead byte.  If Char cannot start a sequence, Len is set to -1
 * and Mask is left untouched.
 *
 * Char is evaluated several times, so it must be free of side effects.
 * The body is wrapped in do { } while (0) so the macro behaves as one
 * statement, e.g. inside an unbraced if/else.
 */
#define UTF8_COMPUTE(Char, Mask, Len)                                         \
  do                                                                          \
    {                                                                         \
      if ((Char) < 128)                                                       \
        {                                                                     \
          (Len) = 1;                                                          \
          (Mask) = 0x7f;                                                      \
        }                                                                     \
      else if (((Char) & 0xe0) == 0xc0)                                       \
        {                                                                     \
          (Len) = 2;                                                          \
          (Mask) = 0x1f;                                                      \
        }                                                                     \
      else if (((Char) & 0xf0) == 0xe0)                                       \
        {                                                                     \
          (Len) = 3;                                                          \
          (Mask) = 0x0f;                                                      \
        }                                                                     \
      else if (((Char) & 0xf8) == 0xf0)                                       \
        {                                                                     \
          (Len) = 4;                                                          \
          (Mask) = 0x07;                                                      \
        }                                                                     \
      else if (((Char) & 0xfc) == 0xf8)                                       \
        {                                                                     \
          (Len) = 5;                                                          \
          (Mask) = 0x03;                                                      \
        }                                                                     \
      else if (((Char) & 0xfe) == 0xfc)                                       \
        {                                                                     \
          (Len) = 6;                                                          \
          (Mask) = 0x01;                                                      \
        }                                                                     \
      else                                                                    \
        (Len) = -1;                                                           \
    }                                                                         \
  while (0)
76
/* Number of bytes (1..6) used to encode the code point Char in UTF-8.
 * Char is evaluated more than once; pass a side-effect-free expression.
 */
#define UTF8_LENGTH(Char)            \
  ((Char) < 0x80      ? 1 :          \
   (Char) < 0x800     ? 2 :          \
   (Char) < 0x10000   ? 3 :          \
   (Char) < 0x200000  ? 4 :          \
   (Char) < 0x4000000 ? 5 : 6)
83   
84
/* Decode one UTF-8 sequence of Len bytes starting at Chars into Result.
 *
 * Mask selects the payload bits of the lead byte.  Count is a scratch
 * loop variable.  Each following byte must be a continuation byte
 * (10xxxxxx); if one is not, Result is set to -1 and decoding stops.
 *
 * Wrapped in do { } while (0) so that the assignment and the loop form
 * a single statement.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
  do                                                                          \
    {                                                                         \
      (Result) = (Chars)[0] & (Mask);                                         \
      for ((Count) = 1; (Count) < (Len); ++(Count))                           \
        {                                                                     \
          if (((Chars)[(Count)] & 0xc0) != 0x80)                              \
            {                                                                 \
              (Result) = -1;                                                  \
              break;                                                          \
            }                                                                 \
          (Result) <<= 6;                                                     \
          (Result) |= ((Chars)[(Count)] & 0x3f);                              \
        }                                                                     \
    }                                                                         \
  while (0)
97
/* Non-zero when Char is an acceptable Unicode code point: below
 * 0x110000, not in the surrogate range 0xD800..0xDFFF, and neither
 * 0xFFFE nor 0xFFFF.  Char is evaluated more than once.
 */
#define UNICODE_VALID(Char)                          \
    (!((Char) >= 0x110000 ||                         \
       ((Char) >= 0xD800 && (Char) < 0xE000) ||      \
       (Char) == 0xFFFE || (Char) == 0xFFFF))
102   
103     
104static const gchar utf8_skip_data[256] = {
105  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
106  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
107  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
108  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
109  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
110  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
111  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
112  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
113};
114
115const gchar * const g_utf8_skip = utf8_skip_data;
116
117/**
118 * g_utf8_find_prev_char:
119 * @str: pointer to the beginning of a UTF-8 encoded string
120 * @p: pointer to some position within @str
121 *
122 * Given a position @p with a UTF-8 encoded string @str, find the start
123 * of the previous UTF-8 character starting before @p. Returns %NULL if no
124 * UTF-8 characters are present in @p before @str.
125 *
126 * @p does not have to be at the beginning of a UTF-8 character. No check
127 * is made to see if the character found is actually valid other than
128 * it starts with an appropriate byte.
129 *
130 * Return value: a pointer to the found character or %NULL.
131 **/
132gchar *
133g_utf8_find_prev_char (const char *str,
134                       const char *p)
135{
136  for (--p; p >= str; --p)
137    {
138      if ((*p & 0xc0) != 0x80)
139        return (gchar *)p;
140    }
141  return NULL;
142}
143
144/**
145 * g_utf8_find_next_char:
146 * @p: a pointer to a position within a UTF-8 encoded string
147 * @end: a pointer to the end of the string, or %NULL to indicate
148 *        that the string is nul-terminated, in which case
149 *        the returned value will be
150 *
151 * Finds the start of the next UTF-8 character in the string after @p.
152 *
153 * @p does not have to be at the beginning of a UTF-8 character. No check
154 * is made to see if the character found is actually valid other than
155 * it starts with an appropriate byte.
156 *
157 * Return value: a pointer to the found character or %NULL
158 **/
159gchar *
160g_utf8_find_next_char (const gchar *p,
161                       const gchar *end)
162{
163  if (*p)
164    {
165      if (end)
166        for (++p; p < end && (*p & 0xc0) == 0x80; ++p)
167          ;
168      else
169        for (++p; (*p & 0xc0) == 0x80; ++p)
170          ;
171    }
172  return (p == end) ? NULL : (gchar *)p;
173}
174
175/**
176 * g_utf8_prev_char:
177 * @p: a pointer to a position within a UTF-8 encoded string
178 *
179 * Finds the previous UTF-8 character in the string before @p.
180 *
181 * @p does not have to be at the beginning of a UTF-8 character. No check
182 * is made to see if the character found is actually valid other than
183 * it starts with an appropriate byte. If @p might be the first
184 * character of the string, you must use g_utf8_find_prev_char() instead.
185 *
186 * Return value: a pointer to the found character.
187 **/
188gchar *
189g_utf8_prev_char (const gchar *p)
190{
191  while (TRUE)
192    {
193      p--;
194      if ((*p & 0xc0) != 0x80)
195        return (gchar *)p;
196    }
197}
198
199/**
200 * g_utf8_strlen:
201 * @p: pointer to the start of a UTF-8 encoded string.
202 * @max: the maximum number of bytes to examine. If @max
203 *       is less than 0, then the string is assumed to be
204 *       nul-terminated.
205 *
206 * Returns the length of the string in characters.
207 *
208 * Return value: the length of the string in characters
209 **/
210glong
211g_utf8_strlen (const gchar *p,
212               gssize       max)
213{
214  glong len = 0;
215  const gchar *start = p;
216
217  if (max < 0)
218    {
219      while (*p)
220        {
221          p = g_utf8_next_char (p);
222          ++len;
223        }
224    }
225  else
226    {
227      if (max == 0 || !*p)
228        return 0;
229     
230      p = g_utf8_next_char (p);         
231
232      while (p - start < max && *p)
233        {
234          ++len;
235          p = g_utf8_next_char (p);         
236        }
237
238      /* only do the last len increment if we got a complete
239       * char (don't count partial chars)
240       */
241      if (p - start == max)
242        ++len;
243    }
244
245  return len;
246}
247
248/**
249 * g_utf8_get_char:
250 * @p: a pointer to Unicode character encoded as UTF-8
251 *
252 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
253 * If @p does not point to a valid UTF-8 encoded character, results are
254 * undefined. If you are not sure that the bytes are complete
255 * valid Unicode characters, you should use g_utf8_get_char_validated()
256 * instead.
257 *
258 * Return value: the resulting character
259 **/
260gunichar
261g_utf8_get_char (const gchar *p)
262{
263  int i, mask = 0, len;
264  gunichar result;
265  unsigned char c = (unsigned char) *p;
266
267  UTF8_COMPUTE (c, mask, len);
268  if (len == -1)
269    return (gunichar)-1;
270  UTF8_GET (result, p, i, mask, len);
271
272  return result;
273}
274
275/**
276 * g_utf8_offset_to_pointer:
277 * @str: a UTF-8 encoded string
278 * @offset: a character offset within @str
279 *
280 * Converts from an integer character offset to a pointer to a position
281 * within the string.
282 *
283 * Return value: the resulting pointer
284 **/
285gchar *
286g_utf8_offset_to_pointer  (const gchar *str,
287                           glong        offset)   
288{
289  const gchar *s = str;
290  while (offset--)
291    s = g_utf8_next_char (s);
292 
293  return (gchar *)s;
294}
295
296/**
297 * g_utf8_pointer_to_offset:
298 * @str: a UTF-8 encoded string
299 * @pos: a pointer to a position within @str
300 *
301 * Converts from a pointer to position within a string to a integer
302 * character offset.
303 *
304 * Return value: the resulting character offset
305 **/
306glong   
307g_utf8_pointer_to_offset (const gchar *str,
308                          const gchar *pos)
309{
310  const gchar *s = str;
311  glong offset = 0;   
312 
313  while (s < pos)
314    {
315      s = g_utf8_next_char (s);
316      offset++;
317    }
318
319  return offset;
320}
321
322
323/**
324 * g_utf8_strncpy:
325 * @dest: buffer to fill with characters from @src
326 * @src: UTF-8 encoded string
327 * @n: character count
328 *
329 * Like the standard C <function>strncpy()</function> function, but
330 * copies a given number of characters instead of a given number of
331 * bytes. The @src string must be valid UTF-8 encoded text.
332 * (Use g_utf8_validate() on all text before trying to use UTF-8
333 * utility functions with it.)
334 *
335 * Return value: @dest
336 **/
337gchar *
338g_utf8_strncpy (gchar       *dest,
339                const gchar *src,
340                gsize        n)
341{
342  const gchar *s = src;
343  while (n && *s)
344    {
345      s = g_utf8_next_char(s);
346      n--;
347    }
348  strncpy(dest, src, s - src);
349  dest[s - src] = 0;
350  return dest;
351}
352
353G_LOCK_DEFINE_STATIC (aliases);
354
355static GHashTable *
356get_alias_hash (void)
357{
358  static GHashTable *alias_hash = NULL;
359  const char *aliases;
360
361  G_LOCK (aliases);
362
363  if (!alias_hash)
364    {
365      alias_hash = g_hash_table_new (g_str_hash, g_str_equal);
366     
367      aliases = _g_locale_get_charset_aliases ();
368      while (*aliases != '\0')
369        {
370          const char *canonical;
371          const char *alias;
372          const char **alias_array;
373          int count = 0;
374         
375          alias = aliases;
376          aliases += strlen (aliases) + 1;
377          canonical = aliases;
378          aliases += strlen (aliases) + 1;
379         
380          alias_array = g_hash_table_lookup (alias_hash, canonical);
381          if (alias_array)
382            {
383              while (alias_array[count])
384                count++;
385            }
386         
387          alias_array = g_renew (const char *, alias_array, count + 2);
388          alias_array[count] = alias;
389          alias_array[count + 1] = NULL;
390         
391          g_hash_table_insert (alias_hash, (char *)canonical, alias_array);
392        }
393    }
394
395  G_UNLOCK (aliases);
396
397  return alias_hash;
398}
399
/* As an abuse of the alias table, this routine gets the charsets
 * that are aliases for the given canonical name.  The result is the
 * NULL-terminated array stored in the alias table, or NULL when the
 * name has no entry.
 */
const char **
_g_charset_get_aliases (const char *canonical_name)
{
  return g_hash_table_lookup (get_alias_hash (), canonical_name);
}
410
411static gboolean
412g_utf8_get_charset_internal (const char  *raw_data,
413                             const char **a)
414{
415  const char *charset = getenv("CHARSET");
416
417  if (charset && *charset)
418    {
419      *a = charset;
420
421      if (charset && strstr (charset, "UTF-8"))
422        return TRUE;
423      else
424        return FALSE;
425    }
426
427  /* The libcharset code tries to be thread-safe without
428   * a lock, but has a memory leak and a missing memory
429   * barrier, so we lock for it
430   */
431  G_LOCK (aliases);
432  charset = _g_locale_charset_unalias (raw_data);
433  G_UNLOCK (aliases);
434 
435  if (charset && *charset)
436    {
437      *a = charset;
438     
439      if (charset && strstr (charset, "UTF-8"))
440        return TRUE;
441      else
442        return FALSE;
443    }
444
445  /* Assume this for compatibility at present.  */
446  *a = "US-ASCII";
447 
448  return FALSE;
449}
450
451typedef struct _GCharsetCache GCharsetCache;
452
453struct _GCharsetCache {
454  gboolean is_utf8;
455  gchar *raw;
456  gchar *charset;
457};
458
459static void
460charset_cache_free (gpointer data)
461{
462  GCharsetCache *cache = data;
463  g_free (cache->raw);
464  g_free (cache->charset);
465  g_free (cache);
466}
467
468/**
469 * g_get_charset:
470 * @charset: return location for character set name
471 *
472 * Obtains the character set for the current locale; you might use
473 * this character set as an argument to g_convert(), to convert from
474 * the current locale's encoding to some other encoding. (Frequently
475 * g_locale_to_utf8() and g_locale_from_utf8() are nice shortcuts,
476 * though.)
477 *
478 * The return value is %TRUE if the locale's encoding is UTF-8, in that
479 * case you can perhaps avoid calling g_convert().
480 *
481 * The string returned in @charset is not allocated, and should not be
482 * freed.
483 *
484 * Return value: %TRUE if the returned charset is UTF-8
485 **/
486gboolean
487g_get_charset (G_CONST_RETURN char **charset)
488{
489  static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT;
490  GCharsetCache *cache = g_static_private_get (&cache_private);
491  const gchar *raw;
492
493  if (!cache)
494    {
495      cache = g_new0 (GCharsetCache, 1);
496      g_static_private_set (&cache_private, cache, charset_cache_free);
497    }
498
499  raw = _g_locale_charset_raw ();
500 
501  if (!(cache->raw && strcmp (cache->raw, raw) == 0))
502    {
503      const gchar *new_charset;
504           
505      g_free (cache->raw);
506      g_free (cache->charset);
507      cache->raw = g_strdup (raw);
508      cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset);
509      cache->charset = g_strdup (new_charset);
510    }
511
512  if (charset)
513    *charset = cache->charset;
514 
515  return cache->is_utf8;
516}
517
518/* unicode_strchr */
519
520/**
521 * g_unichar_to_utf8:
522 * @c: a ISO10646 character code
523 * @outbuf: output buffer, must have at least 6 bytes of space.
524 *       If %NULL, the length will be computed and returned
525 *       and nothing will be written to @outbuf.
526 *
527 * Converts a single character to UTF-8.
528 *
529 * Return value: number of bytes written
530 **/
531int
532g_unichar_to_utf8 (gunichar c,
533                   gchar   *outbuf)
534{
535  guint len = 0;   
536  int first;
537  int i;
538
539  if (c < 0x80)
540    {
541      first = 0;
542      len = 1;
543    }
544  else if (c < 0x800)
545    {
546      first = 0xc0;
547      len = 2;
548    }
549  else if (c < 0x10000)
550    {
551      first = 0xe0;
552      len = 3;
553    }
554   else if (c < 0x200000)
555    {
556      first = 0xf0;
557      len = 4;
558    }
559  else if (c < 0x4000000)
560    {
561      first = 0xf8;
562      len = 5;
563    }
564  else
565    {
566      first = 0xfc;
567      len = 6;
568    }
569
570  if (outbuf)
571    {
572      for (i = len - 1; i > 0; --i)
573        {
574          outbuf[i] = (c & 0x3f) | 0x80;
575          c >>= 6;
576        }
577      outbuf[0] = c | first;
578    }
579
580  return len;
581}
582
583/**
584 * g_utf8_strchr:
585 * @p: a nul-terminated UTF-8 encoded string
586 * @len: the maximum length of @p
587 * @c: a ISO10646 character
588 *
589 * Finds the leftmost occurrence of the given ISO10646 character
590 * in a UTF-8 encoded string, while limiting the search to @len bytes.
591 * If @len is -1, allow unbounded search.
592 *
593 * Return value: %NULL if the string does not contain the character,
594 *   otherwise, a pointer to the start of the leftmost occurrence of
595 *   the character in the string.
596 **/
597gchar *
598g_utf8_strchr (const char *p,
599               gssize      len,
600               gunichar    c)
601{
602  gchar ch[10];
603
604  gint charlen = g_unichar_to_utf8 (c, ch);
605  ch[charlen] = '\0';
606 
607  return g_strstr_len (p, len, ch);
608}
609
610
611/**
612 * g_utf8_strrchr:
613 * @p: a nul-terminated UTF-8 encoded string
614 * @len: the maximum length of @p
615 * @c: a ISO10646 character
616 *
617 * Find the rightmost occurrence of the given ISO10646 character
618 * in a UTF-8 encoded string, while limiting the search to @len bytes.
619 * If @len is -1, allow unbounded search.
620 *
621 * Return value: %NULL if the string does not contain the character,
622 *   otherwise, a pointer to the start of the rightmost occurrence of the
623 *   character in the string.
624 **/
625gchar *
626g_utf8_strrchr (const char *p,
627                gssize      len,
628                gunichar    c)
629{
630  gchar ch[10];
631
632  gint charlen = g_unichar_to_utf8 (c, ch);
633  ch[charlen] = '\0';
634 
635  return g_strrstr_len (p, len, ch);
636}
637
638
639/* Like g_utf8_get_char, but take a maximum length
640 * and return (gunichar)-2 on incomplete trailing character
641 */
642static inline gunichar
643g_utf8_get_char_extended (const  gchar *p,
644                          gssize max_len) 
645{
646  guint i, len;
647  gunichar wc = (guchar) *p;
648
649  if (wc < 0x80)
650    {
651      return wc;
652    }
653  else if (wc < 0xc0)
654    {
655      return (gunichar)-1;
656    }
657  else if (wc < 0xe0)
658    {
659      len = 2;
660      wc &= 0x1f;
661    }
662  else if (wc < 0xf0)
663    {
664      len = 3;
665      wc &= 0x0f;
666    }
667  else if (wc < 0xf8)
668    {
669      len = 4;
670      wc &= 0x07;
671    }
672  else if (wc < 0xfc)
673    {
674      len = 5;
675      wc &= 0x03;
676    }
677  else if (wc < 0xfe)
678    {
679      len = 6;
680      wc &= 0x01;
681    }
682  else
683    {
684      return (gunichar)-1;
685    }
686 
687  if (max_len >= 0 && len > max_len)
688    {
689      for (i = 1; i < max_len; i++)
690        {
691          if ((((guchar *)p)[i] & 0xc0) != 0x80)
692            return (gunichar)-1;
693        }
694      return (gunichar)-2;
695    }
696
697  for (i = 1; i < len; ++i)
698    {
699      gunichar ch = ((guchar *)p)[i];
700     
701      if ((ch & 0xc0) != 0x80)
702        {
703          if (ch)
704            return (gunichar)-1;
705          else
706            return (gunichar)-2;
707        }
708
709      wc <<= 6;
710      wc |= (ch & 0x3f);
711    }
712
713  if (UTF8_LENGTH(wc) != len)
714    return (gunichar)-1;
715 
716  return wc;
717}
718
719/**
720 * g_utf8_get_char_validated:
721 * @p: a pointer to Unicode character encoded as UTF-8
722 * @max_len: the maximum number of bytes to read, or -1, for no maximum.
723 *
724 * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
725 * This function checks for incomplete characters, for invalid characters
726 * such as characters that are out of the range of Unicode, and for
727 * overlong encodings of valid characters.
728 *
729 * Return value: the resulting character. If @p points to a partial
730 *    sequence at the end of a string that could begin a valid character,
731 *    returns (gunichar)-2; otherwise, if @p does not point to a valid
732 *    UTF-8 encoded Unicode character, returns (gunichar)-1.
733 **/
734gunichar
735g_utf8_get_char_validated (const  gchar *p,
736                           gssize max_len)
737{
738  gunichar result = g_utf8_get_char_extended (p, max_len);
739
740  if (result & 0x80000000)
741    return result;
742  else if (!UNICODE_VALID (result))
743    return (gunichar)-1;
744  else
745    return result;
746}
747
748/**
749 * g_utf8_to_ucs4_fast:
750 * @str: a UTF-8 encoded string
751 * @len: the maximum length of @str to use. If @len < 0, then
752 *       the string is nul-terminated.
753 * @items_written: location to store the number of characters in the
754 *                 result, or %NULL.
755 *
756 * Convert a string from UTF-8 to a 32-bit fixed width
757 * representation as UCS-4, assuming valid UTF-8 input.
758 * This function is roughly twice as fast as g_utf8_to_ucs4()
759 * but does no error checking on the input.
760 *
761 * Return value: a pointer to a newly allocated UCS-4 string.
762 *               This value must be freed with g_free().
763 **/
764gunichar *
765g_utf8_to_ucs4_fast (const gchar *str,
766                     glong        len,             
767                     glong       *items_written)   
768{
769  gint j, charlen;
770  gunichar *result;
771  gint n_chars, i;
772  const gchar *p;
773
774  g_return_val_if_fail (str != NULL, NULL);
775
776  p = str;
777  n_chars = 0;
778  if (len < 0)
779    {
780      while (*p)
781        {
782          p = g_utf8_next_char (p);
783          ++n_chars;
784        }
785    }
786  else
787    {
788      while (p < str + len && *p)
789        {
790          p = g_utf8_next_char (p);
791          ++n_chars;
792        }
793    }
794 
795  result = g_new (gunichar, n_chars + 1);
796 
797  p = str;
798  for (i=0; i < n_chars; i++)
799    {
800      gunichar wc = ((unsigned char *)p)[0];
801
802      if (wc < 0x80)
803        {
804          result[i] = wc;
805          p++;
806        }
807      else
808        {
809          if (wc < 0xe0)
810            {
811              charlen = 2;
812              wc &= 0x1f;
813            }
814          else if (wc < 0xf0)
815            {
816              charlen = 3;
817              wc &= 0x0f;
818            }
819          else if (wc < 0xf8)
820            {
821              charlen = 4;
822              wc &= 0x07;
823            }
824          else if (wc < 0xfc)
825            {
826              charlen = 5;
827              wc &= 0x03;
828            }
829          else
830            {
831              charlen = 6;
832              wc &= 0x01;
833            }
834
835          for (j = 1; j < charlen; j++)
836            {
837              wc <<= 6;
838              wc |= ((unsigned char *)p)[j] & 0x3f;
839            }
840
841          result[i] = wc;
842          p += charlen;
843        }
844    }
845  result[i] = 0;
846
847  if (items_written)
848    *items_written = i;
849
850  return result;
851}
852
853/**
854 * g_utf8_to_ucs4:
855 * @str: a UTF-8 encoded string
856 * @len: the maximum length of @str to use. If @len < 0, then
857 *       the string is nul-terminated.
858 * @items_read: location to store number of bytes read, or %NULL.
859 *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
860 *              returned in case @str contains a trailing partial
861 *              character. If an error occurs then the index of the
862 *              invalid input is stored here.
863 * @items_written: location to store number of characters written or %NULL.
864 *                 The value here stored does not include the trailing 0
865 *                 character.
866 * @error: location to store the error occuring, or %NULL to ignore
867 *         errors. Any of the errors in #GConvertError other than
868 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
869 *
870 * Convert a string from UTF-8 to a 32-bit fixed width
871 * representation as UCS-4. A trailing 0 will be added to the
872 * string after the converted text.
873 *
874 * Return value: a pointer to a newly allocated UCS-4 string.
875 *               This value must be freed with g_free(). If an
876 *               error occurs, %NULL will be returned and
877 *               @error set.
878 **/
879gunichar *
880g_utf8_to_ucs4 (const gchar *str,
881                glong        len,             
882                glong       *items_read,     
883                glong       *items_written,   
884                GError     **error)
885{
886  gunichar *result = NULL;
887  gint n_chars, i;
888  const gchar *in;
889 
890  in = str;
891  n_chars = 0;
892  while ((len < 0 || str + len - in > 0) && *in)
893    {
894      gunichar wc = g_utf8_get_char_extended (in, str + len - in);
895      if (wc & 0x80000000)
896        {
897          if (wc == (gunichar)-2)
898            {
899              if (items_read)
900                break;
901              else
902                g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
903                             _("Partial character sequence at end of input"));
904            }
905          else
906            g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
907                         _("Invalid byte sequence in conversion input"));
908
909          goto err_out;
910        }
911
912      n_chars++;
913
914      in = g_utf8_next_char (in);
915    }
916
917  result = g_new (gunichar, n_chars + 1);
918 
919  in = str;
920  for (i=0; i < n_chars; i++)
921    {
922      result[i] = g_utf8_get_char (in);
923      in = g_utf8_next_char (in);
924    }
925  result[i] = 0;
926
927  if (items_written)
928    *items_written = n_chars;
929
930 err_out:
931  if (items_read)
932    *items_read = in - str;
933
934  return result;
935}
936
937/**
938 * g_ucs4_to_utf8:
939 * @str: a UCS-4 encoded string
940 * @len: the maximum length of @str to use. If @len < 0, then
941 *       the string is terminated with a 0 character.
942 * @items_read: location to store number of characters read read, or %NULL.
943 * @items_written: location to store number of bytes written or %NULL.
944 *                 The value here stored does not include the trailing 0
945 *                 byte.
946 * @error: location to store the error occuring, or %NULL to ignore
947 *         errors. Any of the errors in #GConvertError other than
948 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
949 *
950 * Convert a string from a 32-bit fixed width representation as UCS-4.
951 * to UTF-8. The result will be terminated with a 0 byte.
952 *
953 * Return value: a pointer to a newly allocated UTF-8 string.
954 *               This value must be freed with g_free(). If an
955 *               error occurs, %NULL will be returned and
956 *               @error set.
957 **/
958gchar *
959g_ucs4_to_utf8 (const gunichar *str,
960                glong           len,             
961                glong          *items_read,       
962                glong          *items_written,   
963                GError        **error)
964{
965  gint result_length;
966  gchar *result = NULL;
967  gchar *p;
968  gint i;
969
970  result_length = 0;
971  for (i = 0; len < 0 || i < len ; i++)
972    {
973      if (!str[i])
974        break;
975
976      if (str[i] >= 0x80000000)
977        {
978          if (items_read)
979            *items_read = i;
980         
981          g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
982                       _("Character out of range for UTF-8"));
983          goto err_out;
984        }
985     
986      result_length += UTF8_LENGTH (str[i]);
987    }
988
989  result = g_malloc (result_length + 1);
990  p = result;
991
992  i = 0;
993  while (p < result + result_length)
994    p += g_unichar_to_utf8 (str[i++], p);
995 
996  *p = '\0';
997
998  if (items_written)
999    *items_written = p - result;
1000
1001 err_out:
1002  if (items_read)
1003    *items_read = i;
1004
1005  return result;
1006}
1007
/* Combine a UTF-16 high surrogate h (0xd800..0xdbff) and low surrogate
 * l (0xdc00..0xdfff) into the code point they encode.
 */
#define SURROGATE_VALUE(h,l) \
  (0x10000 + ((h) - 0xd800) * 0x400 + ((l) - 0xdc00))
1009
1010/**
1011 * g_utf16_to_utf8:
1012 * @str: a UTF-16 encoded string
1013 * @len: the maximum length of @str to use. If @len < 0, then
1014 *       the string is terminated with a 0 character.
1015 * @items_read: location to store number of words read, or %NULL.
1016 *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1017 *              returned in case @str contains a trailing partial
1018 *              character. If an error occurs then the index of the
1019 *              invalid input is stored here.
1020 * @items_written: location to store number of bytes written, or %NULL.
1021 *                 The value stored here does not include the trailing
1022 *                 0 byte.
1023 * @error: location to store the error occuring, or %NULL to ignore
1024 *         errors. Any of the errors in #GConvertError other than
1025 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1026 *
1027 * Convert a string from UTF-16 to UTF-8. The result will be
1028 * terminated with a 0 byte.
1029 *
1030 * Return value: a pointer to a newly allocated UTF-8 string.
1031 *               This value must be freed with g_free(). If an
1032 *               error occurs, %NULL will be returned and
1033 *               @error set.
1034 **/
1035gchar *
1036g_utf16_to_utf8 (const gunichar2  *str,
1037                 glong             len,             
1038                 glong            *items_read,       
1039                 glong            *items_written,   
1040                 GError          **error)
1041{
1042  /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ
1043   * are marked.
1044   */
1045  const gunichar2 *in;
1046  gchar *out;
1047  gchar *result = NULL;
1048  gint n_bytes;
1049  gunichar high_surrogate;
1050
1051  g_return_val_if_fail (str != 0, NULL);
1052
1053  n_bytes = 0;
1054  in = str;
1055  high_surrogate = 0;
1056  while ((len < 0 || in - str < len) && *in)
1057    {
1058      gunichar2 c = *in;
1059      gunichar wc;
1060
1061      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1062        {
1063          if (high_surrogate)
1064            {
1065              wc = SURROGATE_VALUE (high_surrogate, c);
1066              high_surrogate = 0;
1067            }
1068          else
1069            {
1070              g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1071                           _("Invalid sequence in conversion input"));
1072              goto err_out;
1073            }
1074        }
1075      else
1076        {
1077          if (high_surrogate)
1078            {
1079              g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1080                           _("Invalid sequence in conversion input"));
1081              goto err_out;
1082            }
1083
1084          if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1085            {
1086              high_surrogate = c;
1087              goto next1;
1088            }
1089          else
1090            wc = c;
1091        }
1092
1093      /********** DIFFERENT for UTF8/UCS4 **********/
1094      n_bytes += UTF8_LENGTH (wc);
1095
1096    next1:
1097      in++;
1098    }
1099
1100  if (high_surrogate && !items_read)
1101    {
1102      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1103                   _("Partial character sequence at end of input"));
1104      goto err_out;
1105    }
1106 
1107  /* At this point, everything is valid, and we just need to convert
1108   */
1109  /********** DIFFERENT for UTF8/UCS4 **********/
1110  result = g_malloc (n_bytes + 1);
1111 
1112  high_surrogate = 0;
1113  out = result;
1114  in = str;
1115  while (out < result + n_bytes)
1116    {
1117      gunichar2 c = *in;
1118      gunichar wc;
1119
1120      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1121        {
1122          wc = SURROGATE_VALUE (high_surrogate, c);
1123          high_surrogate = 0;
1124        }
1125      else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1126        {
1127          high_surrogate = c;
1128          goto next2;
1129        }
1130      else
1131        wc = c;
1132
1133      /********** DIFFERENT for UTF8/UCS4 **********/
1134      out += g_unichar_to_utf8 (wc, out);
1135
1136    next2:
1137      in++;
1138    }
1139 
1140  /********** DIFFERENT for UTF8/UCS4 **********/
1141  *out = '\0';
1142
1143  if (items_written)
1144    /********** DIFFERENT for UTF8/UCS4 **********/
1145    *items_written = out - result;
1146
1147 err_out:
1148  if (items_read)
1149    *items_read = in - str;
1150
1151  return result;
1152}
1153
1154/**
1155 * g_utf16_to_ucs4:
1156 * @str: a UTF-16 encoded string
1157 * @len: the maximum length of @str to use. If @len < 0, then
1158 *       the string is terminated with a 0 character.
1159 * @items_read: location to store number of words read, or %NULL.
1160 *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1161 *              returned in case @str contains a trailing partial
1162 *              character. If an error occurs then the index of the
1163 *              invalid input is stored here.
1164 * @items_written: location to store number of characters written, or %NULL.
1165 *                 The value stored here does not include the trailing
1166 *                 0 character.
1167 * @error: location to store the error occuring, or %NULL to ignore
1168 *         errors. Any of the errors in #GConvertError other than
1169 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1170 *
1171 * Convert a string from UTF-16 to UCS-4. The result will be
1172 * terminated with a 0 character.
1173 *
1174 * Return value: a pointer to a newly allocated UCS-4 string.
1175 *               This value must be freed with g_free(). If an
1176 *               error occurs, %NULL will be returned and
1177 *               @error set.
1178 **/
1179gunichar *
1180g_utf16_to_ucs4 (const gunichar2  *str,
1181                 glong             len,             
1182                 glong            *items_read,       
1183                 glong            *items_written,   
1184                 GError          **error)
1185{
1186  const gunichar2 *in;
1187  gchar *out;
1188  gchar *result = NULL;
1189  gint n_bytes;
1190  gunichar high_surrogate;
1191
1192  g_return_val_if_fail (str != 0, NULL);
1193
1194  n_bytes = 0;
1195  in = str;
1196  high_surrogate = 0;
1197  while ((len < 0 || in - str < len) && *in)
1198    {
1199      gunichar2 c = *in;
1200      gunichar wc;
1201
1202      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1203        {
1204          if (high_surrogate)
1205            {
1206              wc = SURROGATE_VALUE (high_surrogate, c);
1207              high_surrogate = 0;
1208            }
1209          else
1210            {
1211              g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1212                           _("Invalid sequence in conversion input"));
1213              goto err_out;
1214            }
1215        }
1216      else
1217        {
1218          if (high_surrogate)
1219            {
1220              g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1221                           _("Invalid sequence in conversion input"));
1222              goto err_out;
1223            }
1224
1225          if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1226            {
1227              high_surrogate = c;
1228              goto next1;
1229            }
1230          else
1231            wc = c;
1232        }
1233
1234      /********** DIFFERENT for UTF8/UCS4 **********/
1235      n_bytes += sizeof (gunichar);
1236
1237    next1:
1238      in++;
1239    }
1240
1241  if (high_surrogate && !items_read)
1242    {
1243      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1244                   _("Partial character sequence at end of input"));
1245      goto err_out;
1246    }
1247 
1248  /* At this point, everything is valid, and we just need to convert
1249   */
1250  /********** DIFFERENT for UTF8/UCS4 **********/
1251  result = g_malloc (n_bytes + 4);
1252 
1253  high_surrogate = 0;
1254  out = result;
1255  in = str;
1256  while (out < result + n_bytes)
1257    {
1258      gunichar2 c = *in;
1259      gunichar wc;
1260
1261      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1262        {
1263          wc = SURROGATE_VALUE (high_surrogate, c);
1264          high_surrogate = 0;
1265        }
1266      else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1267        {
1268          high_surrogate = c;
1269          goto next2;
1270        }
1271      else
1272        wc = c;
1273
1274      /********** DIFFERENT for UTF8/UCS4 **********/
1275      *(gunichar *)out = wc;
1276      out += sizeof (gunichar);
1277
1278    next2:
1279      in++;
1280    }
1281
1282  /********** DIFFERENT for UTF8/UCS4 **********/
1283  *(gunichar *)out = 0;
1284
1285  if (items_written)
1286    /********** DIFFERENT for UTF8/UCS4 **********/
1287    *items_written = (out - result) / sizeof (gunichar);
1288
1289 err_out:
1290  if (items_read)
1291    *items_read = in - str;
1292
1293  return (gunichar *)result;
1294}
1295
1296/**
1297 * g_utf8_to_utf16:
1298 * @str: a UTF-8 encoded string
1299 * @len: the maximum length of @str to use. If @len < 0, then
1300 *       the string is nul-terminated.
1301 * @items_read: location to store number of bytes read, or %NULL.
1302 *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1303 *              returned in case @str contains a trailing partial
1304 *              character. If an error occurs then the index of the
1305 *              invalid input is stored here.
1306 * @items_written: location to store number of words written, or %NULL.
1307 *                 The value stored here does not include the trailing
1308 *                 0 word.
1309 * @error: location to store the error occuring, or %NULL to ignore
1310 *         errors. Any of the errors in #GConvertError other than
1311 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1312 *
1313 * Convert a string from UTF-8 to UTF-16. A 0 word will be
1314 * added to the result after the converted text.
1315 *
1316 * Return value: a pointer to a newly allocated UTF-16 string.
1317 *               This value must be freed with g_free(). If an
1318 *               error occurs, %NULL will be returned and
1319 *               @error set.
1320 **/
1321gunichar2 *
1322g_utf8_to_utf16 (const gchar *str,
1323                 glong        len,             
1324                 glong       *items_read,       
1325                 glong       *items_written,   
1326                 GError     **error)
1327{
1328  gunichar2 *result = NULL;
1329  gint n16;
1330  const gchar *in;
1331  gint i;
1332
1333  g_return_val_if_fail (str != NULL, NULL);
1334
1335  in = str;
1336  n16 = 0;
1337  while ((len < 0 || str + len - in > 0) && *in)
1338    {
1339      gunichar wc = g_utf8_get_char_extended (in, str + len - in);
1340      if (wc & 0x80000000)
1341        {
1342          if (wc == (gunichar)-2)
1343            {
1344              if (items_read)
1345                break;
1346              else
1347                g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1348                             _("Partial character sequence at end of input"));
1349            }
1350          else
1351            g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1352                         _("Invalid byte sequence in conversion input"));
1353
1354          goto err_out;
1355        }
1356
1357      if (wc < 0xd800)
1358        n16 += 1;
1359      else if (wc < 0xe000)
1360        {
1361          g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1362                       _("Invalid sequence in conversion input"));
1363
1364          goto err_out;
1365        }
1366      else if (wc < 0x10000)
1367        n16 += 1;
1368      else if (wc < 0x110000)
1369        n16 += 2;
1370      else
1371        {
1372          g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1373                       _("Character out of range for UTF-16"));
1374
1375          goto err_out;
1376        }
1377     
1378      in = g_utf8_next_char (in);
1379    }
1380
1381  result = g_new (gunichar2, n16 + 1);
1382 
1383  in = str;
1384  for (i = 0; i < n16;)
1385    {
1386      gunichar wc = g_utf8_get_char (in);
1387
1388      if (wc < 0x10000)
1389        {
1390          result[i++] = wc;
1391        }
1392      else
1393        {
1394          result[i++] = (wc - 0x10000) / 0x400 + 0xd800;
1395          result[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
1396        }
1397     
1398      in = g_utf8_next_char (in);
1399    }
1400
1401  result[i] = 0;
1402
1403  if (items_written)
1404    *items_written = n16;
1405
1406 err_out:
1407  if (items_read)
1408    *items_read = in - str;
1409 
1410  return result;
1411}
1412
1413/**
1414 * g_ucs4_to_utf16:
1415 * @str: a UCS-4 encoded string
1416 * @len: the maximum length of @str to use. If @len < 0, then
1417 *       the string is terminated with a 0 character.
1418 * @items_read: location to store number of bytes read, or %NULL.
1419 *              If an error occurs then the index of the invalid input
1420 *              is stored here.
1421 * @items_written: location to store number of words written, or %NULL.
1422 *                 The value stored here does not include the trailing
1423 *                 0 word.
1424 * @error: location to store the error occuring, or %NULL to ignore
1425 *         errors. Any of the errors in #GConvertError other than
1426 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1427 *
1428 * Convert a string from UCS-4 to UTF-16. A 0 word will be
1429 * added to the result after the converted text.
1430 *
1431 * Return value: a pointer to a newly allocated UTF-16 string.
1432 *               This value must be freed with g_free(). If an
1433 *               error occurs, %NULL will be returned and
1434 *               @error set.
1435 **/
1436gunichar2 *
1437g_ucs4_to_utf16 (const gunichar  *str,
1438                 glong            len,             
1439                 glong           *items_read,       
1440                 glong           *items_written,   
1441                 GError         **error)
1442{
1443  gunichar2 *result = NULL;
1444  gint n16;
1445  gint i, j;
1446
1447  n16 = 0;
1448  i = 0;
1449  while ((len < 0 || i < len) && str[i])
1450    {
1451      gunichar wc = str[i];
1452
1453      if (wc < 0xd800)
1454        n16 += 1;
1455      else if (wc < 0xe000)
1456        {
1457          g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1458                       _("Invalid sequence in conversion input"));
1459
1460          goto err_out;
1461        }
1462      else if (wc < 0x10000)
1463        n16 += 1;
1464      else if (wc < 0x110000)
1465        n16 += 2;
1466      else
1467        {
1468          g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1469                       _("Character out of range for UTF-16"));
1470
1471          goto err_out;
1472        }
1473
1474      i++;
1475    }
1476 
1477  result = g_new (gunichar2, n16 + 1);
1478 
1479  for (i = 0, j = 0; j < n16; i++)
1480    {
1481      gunichar wc = str[i];
1482
1483      if (wc < 0x10000)
1484        {
1485          result[j++] = wc;
1486        }
1487      else
1488        {
1489          result[j++] = (wc - 0x10000) / 0x400 + 0xd800;
1490          result[j++] = (wc - 0x10000) % 0x400 + 0xdc00;
1491        }
1492    }
1493  result[j] = 0;
1494
1495  if (items_written)
1496    *items_written = n16;
1497 
1498 err_out:
1499  if (items_read)
1500    *items_read = i;
1501 
1502  return result;
1503}
1504
1505/**
1506 * g_utf8_validate:
1507 * @str: a pointer to character data
1508 * @max_len: max bytes to validate, or -1 to go until nul
1509 * @end: return location for end of valid data
1510 *
1511 * Validates UTF-8 encoded text. @str is the text to validate;
1512 * if @str is nul-terminated, then @max_len can be -1, otherwise
1513 * @max_len should be the number of bytes to validate.
1514 * If @end is non-%NULL, then the end of the valid range
1515 * will be stored there (i.e. the address of the first invalid byte
1516 * if some bytes were invalid, or the end of the text being validated
1517 * otherwise).
1518 *
1519 * Returns %TRUE if all of @str was valid. Many GLib and GTK+
1520 * routines <emphasis>require</emphasis> valid UTF-8 as input;
1521 * so data read from a file or the network should be checked
1522 * with g_utf8_validate() before doing anything else with it.
1523 *
1524 * Return value: %TRUE if the text was valid UTF-8
1525 **/
1526gboolean
1527g_utf8_validate (const gchar  *str,
1528                 gssize        max_len,   
1529                 const gchar **end)
1530{
1531
1532  const gchar *p;
1533
1534  g_return_val_if_fail (str != NULL, FALSE);
1535 
1536  if (end)
1537    *end = str;
1538 
1539  p = str;
1540 
1541  while ((max_len < 0 || (p - str) < max_len) && *p)
1542    {
1543      int i, mask = 0, len;
1544      gunichar result;
1545      unsigned char c = (unsigned char) *p;
1546     
1547      UTF8_COMPUTE (c, mask, len);
1548
1549      if (len == -1)
1550        break;
1551
1552      /* check that the expected number of bytes exists in str */
1553      if (max_len >= 0 &&
1554          ((max_len - (p - str)) < len))
1555        break;
1556       
1557      UTF8_GET (result, p, i, mask, len);
1558
1559      if (UTF8_LENGTH (result) != len) /* Check for overlong UTF-8 */
1560        break;
1561
1562      if (result == (gunichar)-1)
1563        break;
1564
1565      if (!UNICODE_VALID (result))
1566        break;
1567     
1568      p += len;
1569    }
1570
1571  if (end)
1572    *end = p;
1573
1574  /* See that we covered the entire length if a length was
1575   * passed in, or that we ended on a nul if not
1576   */
1577  if (max_len >= 0 &&
1578      p != (str + max_len))
1579    return FALSE;
1580  else if (max_len < 0 &&
1581           *p != '\0')
1582    return FALSE;
1583  else
1584    return TRUE;
1585}
1586
1587/**
1588 * g_unichar_validate:
1589 * @ch: a Unicode character
1590 *
1591 * Checks whether @ch is a valid Unicode character. Some possible
1592 * integer values of @ch will not be valid. 0 is considered a valid
1593 * character, though it's normally a string terminator.
1594 *
1595 * Return value: %TRUE if @ch is a valid Unicode character
1596 **/
1597gboolean
1598g_unichar_validate (gunichar ch)
1599{
1600  return UNICODE_VALID (ch);
1601}
1602
1603/**
1604 * g_utf8_strreverse:
1605 * @str: a UTF-8 encoded string
1606 * @len: the maximum length of @str to use. If @len < 0, then
1607 *       the string is nul-terminated.
1608 *
1609 * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text.
1610 * (Use g_utf8_validate() on all text before trying to use UTF-8
1611 * utility functions with it.)
1612 *
1613 * Note that unlike g_strreverse(), this function returns
1614 * newly-allocated memory, which should be freed with g_free() when
1615 * no longer needed.
1616 *
1617 * Returns: a newly-allocated string which is the reverse of @str.
1618 *
1619 * Since: 2.2
1620 */
1621gchar *
1622g_utf8_strreverse (const gchar *str,
1623                   gssize len)
1624{
1625  gchar *result;
1626  const gchar *p;
1627  gchar *m, *r, skip;
1628
1629  if (len < 0)
1630    len = strlen (str);
1631
1632  result = g_new (gchar, len + 1);
1633  r = result + len;
1634  p = str;
1635  while (*p)
1636    {
1637      skip = g_utf8_skip[*(guchar*)p];
1638      r -= skip;
1639      for (m = r; skip; skip--)
1640        *m++ = *p++;
1641    }
1642  result[len] = 0;
1643
1644  return result;
1645}
Note: See TracBrowser for help on using the repository browser.