source: trunk/third/glib2/glib/gutf8.c @ 20721

Revision 20721, 39.1 KB checked in by ghudson, 20 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r20720, which included commits to RCS files with non-trunk default branches.
Line 
1/* gutf8.c - Operations on UTF-8 strings.
2 *
3 * Copyright (C) 1999 Tom Tromey
4 * Copyright (C) 2000 Red Hat, Inc.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the
18 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22#include <config.h>
23
24#include <stdlib.h>
25#ifdef HAVE_CODESET
26#include <langinfo.h>
27#endif
28#include <string.h>
29
30#include "glib.h"
31
32#ifdef G_PLATFORM_WIN32
33#include <stdio.h>
34#define STRICT
35#include <windows.h>
36#undef STRICT
37#endif
38
39#include "libcharset/libcharset.h"
40
41#include "glibintl.h"
42
/* UTF8_COMPUTE:
 * Classify Char, the first byte of a UTF-8 sequence.
 *
 * On success Len receives the total length of the sequence in bytes
 * (1 to 6) and Mask receives the bits of the first byte that carry
 * payload. If Char cannot start a sequence, Len is set to -1 and Mask
 * is left untouched.
 *
 * Char is evaluated several times, so it must be free of side effects;
 * pass it as an unsigned byte value, since a negative value would be
 * classified as a one-byte sequence by the first test.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and can be used safely in an unbraced if/else.
 * Invoke it as "UTF8_COMPUTE (c, mask, len);" with the semicolon.
 */
#define UTF8_COMPUTE(Char, Mask, Len)                                         \
  do                                                                          \
    {                                                                         \
      if ((Char) < 128)                                                       \
        {                                                                     \
          (Len) = 1;                                                          \
          (Mask) = 0x7f;                                                      \
        }                                                                     \
      else if (((Char) & 0xe0) == 0xc0)                                       \
        {                                                                     \
          (Len) = 2;                                                          \
          (Mask) = 0x1f;                                                      \
        }                                                                     \
      else if (((Char) & 0xf0) == 0xe0)                                       \
        {                                                                     \
          (Len) = 3;                                                          \
          (Mask) = 0x0f;                                                      \
        }                                                                     \
      else if (((Char) & 0xf8) == 0xf0)                                       \
        {                                                                     \
          (Len) = 4;                                                          \
          (Mask) = 0x07;                                                      \
        }                                                                     \
      else if (((Char) & 0xfc) == 0xf8)                                       \
        {                                                                     \
          (Len) = 5;                                                          \
          (Mask) = 0x03;                                                      \
        }                                                                     \
      else if (((Char) & 0xfe) == 0xfc)                                       \
        {                                                                     \
          (Len) = 6;                                                          \
          (Mask) = 0x01;                                                      \
        }                                                                     \
      else                                                                    \
        (Len) = -1;                                                           \
    }                                                                         \
  while (0)
76
/* UTF8_LENGTH:
 * Number of bytes (1 to 6) needed to encode the code point Char in
 * UTF-8. Char is evaluated more than once.
 */
#define UTF8_LENGTH(Char)          \
  ((Char) < 0x80      ? 1 :        \
   (Char) < 0x800     ? 2 :        \
   (Char) < 0x10000   ? 3 :        \
   (Char) < 0x200000  ? 4 :        \
   (Char) < 0x4000000 ? 5 : 6)
83   
84
/* UTF8_GET:
 * Decode the Len-byte UTF-8 sequence starting at Chars into Result.
 *
 * Mask selects the payload bits of the first byte (as produced by
 * UTF8_COMPUTE); every following byte must have the form 10xxxxxx and
 * contributes its low six bits. If a following byte does not have that
 * form, Result is set to -1 and decoding stops. Count is a scratch
 * loop variable supplied by the caller.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to
 * exactly one statement and can be used safely in an unbraced if/else.
 * Invoke it as "UTF8_GET (wc, p, i, mask, len);" with the semicolon.
 */
#define UTF8_GET(Result, Chars, Count, Mask, Len)                             \
  do                                                                          \
    {                                                                         \
      (Result) = (Chars)[0] & (Mask);                                         \
      for ((Count) = 1; (Count) < (Len); ++(Count))                           \
        {                                                                     \
          if (((Chars)[(Count)] & 0xc0) != 0x80)                              \
            {                                                                 \
              (Result) = -1;                                                  \
              break;                                                          \
            }                                                                 \
          (Result) <<= 6;                                                     \
          (Result) |= ((Chars)[(Count)] & 0x3f);                              \
        }                                                                     \
    }                                                                         \
  while (0)
97
/* UNICODE_VALID:
 * True when Char is below 0x110000, is not in the surrogate block
 * 0xD800-0xDFFF, is not in 0xFDD0-0xFDEF, and does not have 0xFFFE or
 * 0xFFFF as its low 16 bits. Char is evaluated more than once.
 */
#define UNICODE_VALID(Char)                        \
    ((Char) < 0x110000 &&                          \
     ((Char) & 0xFFFFF800) != 0xD800 &&            \
     !((Char) >= 0xFDD0 && (Char) <= 0xFDEF) &&    \
     ((Char) & 0xFFFE) != 0xFFFE)
103   
104     
105static const gchar utf8_skip_data[256] = {
106  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
107  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
108  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
109  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
110  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
111  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
112  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
113  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
114};
115
116const gchar * const g_utf8_skip = utf8_skip_data;
117
118/**
119 * g_utf8_find_prev_char:
120 * @str: pointer to the beginning of a UTF-8 encoded string
121 * @p: pointer to some position within @str
122 *
123 * Given a position @p with a UTF-8 encoded string @str, find the start
124 * of the previous UTF-8 character starting before @p. Returns %NULL if no
125 * UTF-8 characters are present in @p before @str.
126 *
127 * @p does not have to be at the beginning of a UTF-8 character. No check
128 * is made to see if the character found is actually valid other than
129 * it starts with an appropriate byte.
130 *
131 * Return value: a pointer to the found character or %NULL.
132 **/
133gchar *
134g_utf8_find_prev_char (const char *str,
135                       const char *p)
136{
137  for (--p; p >= str; --p)
138    {
139      if ((*p & 0xc0) != 0x80)
140        return (gchar *)p;
141    }
142  return NULL;
143}
144
145/**
146 * g_utf8_find_next_char:
147 * @p: a pointer to a position within a UTF-8 encoded string
148 * @end: a pointer to the end of the string, or %NULL to indicate
149 *        that the string is nul-terminated, in which case
150 *        the returned value will be
151 *
152 * Finds the start of the next UTF-8 character in the string after @p.
153 *
154 * @p does not have to be at the beginning of a UTF-8 character. No check
155 * is made to see if the character found is actually valid other than
156 * it starts with an appropriate byte.
157 *
158 * Return value: a pointer to the found character or %NULL
159 **/
160gchar *
161g_utf8_find_next_char (const gchar *p,
162                       const gchar *end)
163{
164  if (*p)
165    {
166      if (end)
167        for (++p; p < end && (*p & 0xc0) == 0x80; ++p)
168          ;
169      else
170        for (++p; (*p & 0xc0) == 0x80; ++p)
171          ;
172    }
173  return (p == end) ? NULL : (gchar *)p;
174}
175
176/**
177 * g_utf8_prev_char:
178 * @p: a pointer to a position within a UTF-8 encoded string
179 *
180 * Finds the previous UTF-8 character in the string before @p.
181 *
182 * @p does not have to be at the beginning of a UTF-8 character. No check
183 * is made to see if the character found is actually valid other than
184 * it starts with an appropriate byte. If @p might be the first
185 * character of the string, you must use g_utf8_find_prev_char() instead.
186 *
187 * Return value: a pointer to the found character.
188 **/
189gchar *
190g_utf8_prev_char (const gchar *p)
191{
192  while (TRUE)
193    {
194      p--;
195      if ((*p & 0xc0) != 0x80)
196        return (gchar *)p;
197    }
198}
199
200/**
201 * g_utf8_strlen:
202 * @p: pointer to the start of a UTF-8 encoded string.
203 * @max: the maximum number of bytes to examine. If @max
204 *       is less than 0, then the string is assumed to be
205 *       nul-terminated. If @max is 0, @p will not be examined and
206 *       may be %NULL.
207 *
208 * Returns the length of the string in characters.
209 *
210 * Return value: the length of the string in characters
211 **/
212glong
213g_utf8_strlen (const gchar *p,
214               gssize       max)
215{
216  glong len = 0;
217  const gchar *start = p;
218  g_return_val_if_fail (p != NULL || max == 0, 0);
219
220  if (max < 0)
221    {
222      while (*p)
223        {
224          p = g_utf8_next_char (p);
225          ++len;
226        }
227    }
228  else
229    {
230      if (max == 0 || !*p)
231        return 0;
232     
233      p = g_utf8_next_char (p);         
234
235      while (p - start < max && *p)
236        {
237          ++len;
238          p = g_utf8_next_char (p);         
239        }
240
241      /* only do the last len increment if we got a complete
242       * char (don't count partial chars)
243       */
244      if (p - start == max)
245        ++len;
246    }
247
248  return len;
249}
250
251/**
252 * g_utf8_get_char:
253 * @p: a pointer to Unicode character encoded as UTF-8
254 *
255 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
256 * If @p does not point to a valid UTF-8 encoded character, results are
257 * undefined. If you are not sure that the bytes are complete
258 * valid Unicode characters, you should use g_utf8_get_char_validated()
259 * instead.
260 *
261 * Return value: the resulting character
262 **/
263gunichar
264g_utf8_get_char (const gchar *p)
265{
266  int i, mask = 0, len;
267  gunichar result;
268  unsigned char c = (unsigned char) *p;
269
270  UTF8_COMPUTE (c, mask, len);
271  if (len == -1)
272    return (gunichar)-1;
273  UTF8_GET (result, p, i, mask, len);
274
275  return result;
276}
277
278/**
279 * g_utf8_offset_to_pointer:
280 * @str: a UTF-8 encoded string
281 * @offset: a character offset within @str
282 *
283 * Converts from an integer character offset to a pointer to a position
284 * within the string.
285 *
286 * Return value: the resulting pointer
287 **/
288gchar *
289g_utf8_offset_to_pointer  (const gchar *str,
290                           glong        offset)   
291{
292  const gchar *s = str;
293  while (offset--)
294    s = g_utf8_next_char (s);
295 
296  return (gchar *)s;
297}
298
299/**
300 * g_utf8_pointer_to_offset:
301 * @str: a UTF-8 encoded string
302 * @pos: a pointer to a position within @str
303 *
304 * Converts from a pointer to position within a string to a integer
305 * character offset.
306 *
307 * Return value: the resulting character offset
308 **/
309glong   
310g_utf8_pointer_to_offset (const gchar *str,
311                          const gchar *pos)
312{
313  const gchar *s = str;
314  glong offset = 0;   
315 
316  while (s < pos)
317    {
318      s = g_utf8_next_char (s);
319      offset++;
320    }
321
322  return offset;
323}
324
325
326/**
327 * g_utf8_strncpy:
328 * @dest: buffer to fill with characters from @src
329 * @src: UTF-8 encoded string
330 * @n: character count
331 *
332 * Like the standard C strncpy() function, but
333 * copies a given number of characters instead of a given number of
334 * bytes. The @src string must be valid UTF-8 encoded text.
335 * (Use g_utf8_validate() on all text before trying to use UTF-8
336 * utility functions with it.)
337 *
338 * Return value: @dest
339 **/
340gchar *
341g_utf8_strncpy (gchar       *dest,
342                const gchar *src,
343                gsize        n)
344{
345  const gchar *s = src;
346  while (n && *s)
347    {
348      s = g_utf8_next_char(s);
349      n--;
350    }
351  strncpy(dest, src, s - src);
352  dest[s - src] = 0;
353  return dest;
354}
355
356G_LOCK_DEFINE_STATIC (aliases);
357
358static GHashTable *
359get_alias_hash (void)
360{
361  static GHashTable *alias_hash = NULL;
362  const char *aliases;
363
364  G_LOCK (aliases);
365
366  if (!alias_hash)
367    {
368      alias_hash = g_hash_table_new (g_str_hash, g_str_equal);
369     
370      aliases = _g_locale_get_charset_aliases ();
371      while (*aliases != '\0')
372        {
373          const char *canonical;
374          const char *alias;
375          const char **alias_array;
376          int count = 0;
377         
378          alias = aliases;
379          aliases += strlen (aliases) + 1;
380          canonical = aliases;
381          aliases += strlen (aliases) + 1;
382         
383          alias_array = g_hash_table_lookup (alias_hash, canonical);
384          if (alias_array)
385            {
386              while (alias_array[count])
387                count++;
388            }
389         
390          alias_array = g_renew (const char *, alias_array, count + 2);
391          alias_array[count] = alias;
392          alias_array[count + 1] = NULL;
393         
394          g_hash_table_insert (alias_hash, (char *)canonical, alias_array);
395        }
396    }
397
398  G_UNLOCK (aliases);
399
400  return alias_hash;
401}
402
/* As an abuse of the alias table, the following routine gets
 * the charsets that are aliases for the canonical name.
 *
 * Returns the array stored in the alias table for @canonical_name,
 * or whatever g_hash_table_lookup() yields when there is no entry.
 */
const char **
_g_charset_get_aliases (const char *canonical_name)
{
  return g_hash_table_lookup (get_alias_hash (), canonical_name);
}
413
414static gboolean
415g_utf8_get_charset_internal (const char  *raw_data,
416                             const char **a)
417{
418  const char *charset = getenv("CHARSET");
419
420  if (charset && *charset)
421    {
422      *a = charset;
423
424      if (charset && strstr (charset, "UTF-8"))
425        return TRUE;
426      else
427        return FALSE;
428    }
429
430  /* The libcharset code tries to be thread-safe without
431   * a lock, but has a memory leak and a missing memory
432   * barrier, so we lock for it
433   */
434  G_LOCK (aliases);
435  charset = _g_locale_charset_unalias (raw_data);
436  G_UNLOCK (aliases);
437 
438  if (charset && *charset)
439    {
440      *a = charset;
441     
442      if (charset && strstr (charset, "UTF-8"))
443        return TRUE;
444      else
445        return FALSE;
446    }
447
448  /* Assume this for compatibility at present.  */
449  *a = "US-ASCII";
450 
451  return FALSE;
452}
453
454typedef struct _GCharsetCache GCharsetCache;
455
456struct _GCharsetCache {
457  gboolean is_utf8;
458  gchar *raw;
459  gchar *charset;
460};
461
462static void
463charset_cache_free (gpointer data)
464{
465  GCharsetCache *cache = data;
466  g_free (cache->raw);
467  g_free (cache->charset);
468  g_free (cache);
469}
470
471/**
472 * g_get_charset:
473 * @charset: return location for character set name
474 *
475 * Obtains the character set for the current locale; you might use
476 * this character set as an argument to g_convert(), to convert from
477 * the current locale's encoding to some other encoding. (Frequently
478 * g_locale_to_utf8() and g_locale_from_utf8() are nice shortcuts,
479 * though.)
480 *
481 * The return value is %TRUE if the locale's encoding is UTF-8, in that
482 * case you can perhaps avoid calling g_convert().
483 *
484 * The string returned in @charset is not allocated, and should not be
485 * freed.
486 *
487 * Return value: %TRUE if the returned charset is UTF-8
488 **/
489gboolean
490g_get_charset (G_CONST_RETURN char **charset)
491{
492  static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT;
493  GCharsetCache *cache = g_static_private_get (&cache_private);
494  const gchar *raw;
495
496  if (!cache)
497    {
498      cache = g_new0 (GCharsetCache, 1);
499      g_static_private_set (&cache_private, cache, charset_cache_free);
500    }
501
502  raw = _g_locale_charset_raw ();
503 
504  if (!(cache->raw && strcmp (cache->raw, raw) == 0))
505    {
506      const gchar *new_charset;
507           
508      g_free (cache->raw);
509      g_free (cache->charset);
510      cache->raw = g_strdup (raw);
511      cache->is_utf8 = g_utf8_get_charset_internal (raw, &new_charset);
512      cache->charset = g_strdup (new_charset);
513    }
514
515  if (charset)
516    *charset = cache->charset;
517 
518  return cache->is_utf8;
519}
520
521/* unicode_strchr */
522
523/**
524 * g_unichar_to_utf8:
525 * @c: a ISO10646 character code
526 * @outbuf: output buffer, must have at least 6 bytes of space.
527 *       If %NULL, the length will be computed and returned
528 *       and nothing will be written to @outbuf.
529 *
530 * Converts a single character to UTF-8.
531 *
532 * Return value: number of bytes written
533 **/
534int
535g_unichar_to_utf8 (gunichar c,
536                   gchar   *outbuf)
537{
538  guint len = 0;   
539  int first;
540  int i;
541
542  if (c < 0x80)
543    {
544      first = 0;
545      len = 1;
546    }
547  else if (c < 0x800)
548    {
549      first = 0xc0;
550      len = 2;
551    }
552  else if (c < 0x10000)
553    {
554      first = 0xe0;
555      len = 3;
556    }
557   else if (c < 0x200000)
558    {
559      first = 0xf0;
560      len = 4;
561    }
562  else if (c < 0x4000000)
563    {
564      first = 0xf8;
565      len = 5;
566    }
567  else
568    {
569      first = 0xfc;
570      len = 6;
571    }
572
573  if (outbuf)
574    {
575      for (i = len - 1; i > 0; --i)
576        {
577          outbuf[i] = (c & 0x3f) | 0x80;
578          c >>= 6;
579        }
580      outbuf[0] = c | first;
581    }
582
583  return len;
584}
585
586/**
587 * g_utf8_strchr:
588 * @p: a nul-terminated UTF-8 encoded string
589 * @len: the maximum length of @p
590 * @c: a ISO10646 character
591 *
592 * Finds the leftmost occurrence of the given ISO10646 character
593 * in a UTF-8 encoded string, while limiting the search to @len bytes.
594 * If @len is -1, allow unbounded search.
595 *
596 * Return value: %NULL if the string does not contain the character,
597 *   otherwise, a pointer to the start of the leftmost occurrence of
598 *   the character in the string.
599 **/
600gchar *
601g_utf8_strchr (const char *p,
602               gssize      len,
603               gunichar    c)
604{
605  gchar ch[10];
606
607  gint charlen = g_unichar_to_utf8 (c, ch);
608  ch[charlen] = '\0';
609 
610  return g_strstr_len (p, len, ch);
611}
612
613
614/**
615 * g_utf8_strrchr:
616 * @p: a nul-terminated UTF-8 encoded string
617 * @len: the maximum length of @p
618 * @c: a ISO10646 character
619 *
620 * Find the rightmost occurrence of the given ISO10646 character
621 * in a UTF-8 encoded string, while limiting the search to @len bytes.
622 * If @len is -1, allow unbounded search.
623 *
624 * Return value: %NULL if the string does not contain the character,
625 *   otherwise, a pointer to the start of the rightmost occurrence of the
626 *   character in the string.
627 **/
628gchar *
629g_utf8_strrchr (const char *p,
630                gssize      len,
631                gunichar    c)
632{
633  gchar ch[10];
634
635  gint charlen = g_unichar_to_utf8 (c, ch);
636  ch[charlen] = '\0';
637 
638  return g_strrstr_len (p, len, ch);
639}
640
641
642/* Like g_utf8_get_char, but take a maximum length
643 * and return (gunichar)-2 on incomplete trailing character
644 */
645static inline gunichar
646g_utf8_get_char_extended (const  gchar *p,
647                          gssize max_len) 
648{
649  guint i, len;
650  gunichar wc = (guchar) *p;
651
652  if (wc < 0x80)
653    {
654      return wc;
655    }
656  else if (wc < 0xc0)
657    {
658      return (gunichar)-1;
659    }
660  else if (wc < 0xe0)
661    {
662      len = 2;
663      wc &= 0x1f;
664    }
665  else if (wc < 0xf0)
666    {
667      len = 3;
668      wc &= 0x0f;
669    }
670  else if (wc < 0xf8)
671    {
672      len = 4;
673      wc &= 0x07;
674    }
675  else if (wc < 0xfc)
676    {
677      len = 5;
678      wc &= 0x03;
679    }
680  else if (wc < 0xfe)
681    {
682      len = 6;
683      wc &= 0x01;
684    }
685  else
686    {
687      return (gunichar)-1;
688    }
689 
690  if (max_len >= 0 && len > max_len)
691    {
692      for (i = 1; i < max_len; i++)
693        {
694          if ((((guchar *)p)[i] & 0xc0) != 0x80)
695            return (gunichar)-1;
696        }
697      return (gunichar)-2;
698    }
699
700  for (i = 1; i < len; ++i)
701    {
702      gunichar ch = ((guchar *)p)[i];
703     
704      if ((ch & 0xc0) != 0x80)
705        {
706          if (ch)
707            return (gunichar)-1;
708          else
709            return (gunichar)-2;
710        }
711
712      wc <<= 6;
713      wc |= (ch & 0x3f);
714    }
715
716  if (UTF8_LENGTH(wc) != len)
717    return (gunichar)-1;
718 
719  return wc;
720}
721
722/**
723 * g_utf8_get_char_validated:
724 * @p: a pointer to Unicode character encoded as UTF-8
725 * @max_len: the maximum number of bytes to read, or -1, for no maximum.
726 *
727 * Convert a sequence of bytes encoded as UTF-8 to a Unicode character.
728 * This function checks for incomplete characters, for invalid characters
729 * such as characters that are out of the range of Unicode, and for
730 * overlong encodings of valid characters.
731 *
732 * Return value: the resulting character. If @p points to a partial
733 *    sequence at the end of a string that could begin a valid
734 *    character, returns (gunichar)-2; otherwise, if @p does not point
735 *    to a valid UTF-8 encoded Unicode character, returns (gunichar)-1.
736 **/
737gunichar
738g_utf8_get_char_validated (const  gchar *p,
739                           gssize max_len)
740{
741  gunichar result = g_utf8_get_char_extended (p, max_len);
742
743  if (result & 0x80000000)
744    return result;
745  else if (!UNICODE_VALID (result))
746    return (gunichar)-1;
747  else
748    return result;
749}
750
751/**
752 * g_utf8_to_ucs4_fast:
753 * @str: a UTF-8 encoded string
754 * @len: the maximum length of @str to use. If @len < 0, then
755 *       the string is nul-terminated.
756 * @items_written: location to store the number of characters in the
757 *                 result, or %NULL.
758 *
759 * Convert a string from UTF-8 to a 32-bit fixed width
760 * representation as UCS-4, assuming valid UTF-8 input.
761 * This function is roughly twice as fast as g_utf8_to_ucs4()
762 * but does no error checking on the input.
763 *
764 * Return value: a pointer to a newly allocated UCS-4 string.
765 *               This value must be freed with g_free().
766 **/
767gunichar *
768g_utf8_to_ucs4_fast (const gchar *str,
769                     glong        len,             
770                     glong       *items_written)   
771{
772  gint j, charlen;
773  gunichar *result;
774  gint n_chars, i;
775  const gchar *p;
776
777  g_return_val_if_fail (str != NULL, NULL);
778
779  p = str;
780  n_chars = 0;
781  if (len < 0)
782    {
783      while (*p)
784        {
785          p = g_utf8_next_char (p);
786          ++n_chars;
787        }
788    }
789  else
790    {
791      while (p < str + len && *p)
792        {
793          p = g_utf8_next_char (p);
794          ++n_chars;
795        }
796    }
797 
798  result = g_new (gunichar, n_chars + 1);
799 
800  p = str;
801  for (i=0; i < n_chars; i++)
802    {
803      gunichar wc = ((unsigned char *)p)[0];
804
805      if (wc < 0x80)
806        {
807          result[i] = wc;
808          p++;
809        }
810      else
811        {
812          if (wc < 0xe0)
813            {
814              charlen = 2;
815              wc &= 0x1f;
816            }
817          else if (wc < 0xf0)
818            {
819              charlen = 3;
820              wc &= 0x0f;
821            }
822          else if (wc < 0xf8)
823            {
824              charlen = 4;
825              wc &= 0x07;
826            }
827          else if (wc < 0xfc)
828            {
829              charlen = 5;
830              wc &= 0x03;
831            }
832          else
833            {
834              charlen = 6;
835              wc &= 0x01;
836            }
837
838          for (j = 1; j < charlen; j++)
839            {
840              wc <<= 6;
841              wc |= ((unsigned char *)p)[j] & 0x3f;
842            }
843
844          result[i] = wc;
845          p += charlen;
846        }
847    }
848  result[i] = 0;
849
850  if (items_written)
851    *items_written = i;
852
853  return result;
854}
855
856/**
857 * g_utf8_to_ucs4:
858 * @str: a UTF-8 encoded string
859 * @len: the maximum length of @str to use. If @len < 0, then
860 *       the string is nul-terminated.
861 * @items_read: location to store number of bytes read, or %NULL.
862 *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
863 *              returned in case @str contains a trailing partial
864 *              character. If an error occurs then the index of the
865 *              invalid input is stored here.
866 * @items_written: location to store number of characters written or %NULL.
867 *                 The value here stored does not include the trailing 0
868 *                 character.
869 * @error: location to store the error occuring, or %NULL to ignore
870 *         errors. Any of the errors in #GConvertError other than
871 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
872 *
873 * Convert a string from UTF-8 to a 32-bit fixed width
874 * representation as UCS-4. A trailing 0 will be added to the
875 * string after the converted text.
876 *
877 * Return value: a pointer to a newly allocated UCS-4 string.
878 *               This value must be freed with g_free(). If an
879 *               error occurs, %NULL will be returned and
880 *               @error set.
881 **/
882gunichar *
883g_utf8_to_ucs4 (const gchar *str,
884                glong        len,             
885                glong       *items_read,     
886                glong       *items_written,   
887                GError     **error)
888{
889  gunichar *result = NULL;
890  gint n_chars, i;
891  const gchar *in;
892 
893  in = str;
894  n_chars = 0;
895  while ((len < 0 || str + len - in > 0) && *in)
896    {
897      gunichar wc = g_utf8_get_char_extended (in, str + len - in);
898      if (wc & 0x80000000)
899        {
900          if (wc == (gunichar)-2)
901            {
902              if (items_read)
903                break;
904              else
905                g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
906                             _("Partial character sequence at end of input"));
907            }
908          else
909            g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
910                         _("Invalid byte sequence in conversion input"));
911
912          goto err_out;
913        }
914
915      n_chars++;
916
917      in = g_utf8_next_char (in);
918    }
919
920  result = g_new (gunichar, n_chars + 1);
921 
922  in = str;
923  for (i=0; i < n_chars; i++)
924    {
925      result[i] = g_utf8_get_char (in);
926      in = g_utf8_next_char (in);
927    }
928  result[i] = 0;
929
930  if (items_written)
931    *items_written = n_chars;
932
933 err_out:
934  if (items_read)
935    *items_read = in - str;
936
937  return result;
938}
939
940/**
941 * g_ucs4_to_utf8:
942 * @str: a UCS-4 encoded string
943 * @len: the maximum length of @str to use. If @len < 0, then
944 *       the string is terminated with a 0 character.
945 * @items_read: location to store number of characters read read, or %NULL.
946 * @items_written: location to store number of bytes written or %NULL.
947 *                 The value here stored does not include the trailing 0
948 *                 byte.
949 * @error: location to store the error occuring, or %NULL to ignore
950 *         errors. Any of the errors in #GConvertError other than
951 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
952 *
953 * Convert a string from a 32-bit fixed width representation as UCS-4.
954 * to UTF-8. The result will be terminated with a 0 byte.
955 *
956 * Return value: a pointer to a newly allocated UTF-8 string.
957 *               This value must be freed with g_free(). If an
958 *               error occurs, %NULL will be returned and
959 *               @error set.
960 **/
961gchar *
962g_ucs4_to_utf8 (const gunichar *str,
963                glong           len,             
964                glong          *items_read,       
965                glong          *items_written,   
966                GError        **error)
967{
968  gint result_length;
969  gchar *result = NULL;
970  gchar *p;
971  gint i;
972
973  result_length = 0;
974  for (i = 0; len < 0 || i < len ; i++)
975    {
976      if (!str[i])
977        break;
978
979      if (str[i] >= 0x80000000)
980        {
981          if (items_read)
982            *items_read = i;
983         
984          g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
985                       _("Character out of range for UTF-8"));
986          goto err_out;
987        }
988     
989      result_length += UTF8_LENGTH (str[i]);
990    }
991
992  result = g_malloc (result_length + 1);
993  p = result;
994
995  i = 0;
996  while (p < result + result_length)
997    p += g_unichar_to_utf8 (str[i++], p);
998 
999  *p = '\0';
1000
1001  if (items_written)
1002    *items_written = p - result;
1003
1004 err_out:
1005  if (items_read)
1006    *items_read = i;
1007
1008  return result;
1009}
1010
/* Combine UTF-16 high surrogate h (0xd800-0xdbff) and low surrogate l
 * (0xdc00-0xdfff) into the code point they encode (0x10000 and up).
 */
#define SURROGATE_VALUE(h,l) \
  (((h) - 0xd800) * 0x400 + (l) - 0xdc00 + 0x10000)
1012
1013/**
1014 * g_utf16_to_utf8:
1015 * @str: a UTF-16 encoded string
1016 * @len: the maximum length of @str to use. If @len < 0, then
1017 *       the string is terminated with a 0 character.
1018 * @items_read: location to store number of words read, or %NULL.
1019 *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1020 *              returned in case @str contains a trailing partial
1021 *              character. If an error occurs then the index of the
1022 *              invalid input is stored here.
1023 * @items_written: location to store number of bytes written, or %NULL.
1024 *                 The value stored here does not include the trailing
1025 *                 0 byte.
1026 * @error: location to store the error occuring, or %NULL to ignore
1027 *         errors. Any of the errors in #GConvertError other than
1028 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1029 *
1030 * Convert a string from UTF-16 to UTF-8. The result will be
1031 * terminated with a 0 byte.
1032 *
1033 * Return value: a pointer to a newly allocated UTF-8 string.
1034 *               This value must be freed with g_free(). If an
1035 *               error occurs, %NULL will be returned and
1036 *               @error set.
1037 **/
1038gchar *
1039g_utf16_to_utf8 (const gunichar2  *str,
1040                 glong             len,             
1041                 glong            *items_read,       
1042                 glong            *items_written,   
1043                 GError          **error)
1044{
1045  /* This function and g_utf16_to_ucs4 are almost exactly identical - The lines that differ
1046   * are marked.
1047   */
1048  const gunichar2 *in;
1049  gchar *out;
1050  gchar *result = NULL;
1051  gint n_bytes;
1052  gunichar high_surrogate;
1053
1054  g_return_val_if_fail (str != 0, NULL);
1055
1056  n_bytes = 0;
1057  in = str;
1058  high_surrogate = 0;
1059  while ((len < 0 || in - str < len) && *in)
1060    {
1061      gunichar2 c = *in;
1062      gunichar wc;
1063
1064      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1065        {
1066          if (high_surrogate)
1067            {
1068              wc = SURROGATE_VALUE (high_surrogate, c);
1069              high_surrogate = 0;
1070            }
1071          else
1072            {
1073              g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1074                           _("Invalid sequence in conversion input"));
1075              goto err_out;
1076            }
1077        }
1078      else
1079        {
1080          if (high_surrogate)
1081            {
1082              g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1083                           _("Invalid sequence in conversion input"));
1084              goto err_out;
1085            }
1086
1087          if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1088            {
1089              high_surrogate = c;
1090              goto next1;
1091            }
1092          else
1093            wc = c;
1094        }
1095
1096      /********** DIFFERENT for UTF8/UCS4 **********/
1097      n_bytes += UTF8_LENGTH (wc);
1098
1099    next1:
1100      in++;
1101    }
1102
1103  if (high_surrogate && !items_read)
1104    {
1105      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1106                   _("Partial character sequence at end of input"));
1107      goto err_out;
1108    }
1109 
1110  /* At this point, everything is valid, and we just need to convert
1111   */
1112  /********** DIFFERENT for UTF8/UCS4 **********/
1113  result = g_malloc (n_bytes + 1);
1114 
1115  high_surrogate = 0;
1116  out = result;
1117  in = str;
1118  while (out < result + n_bytes)
1119    {
1120      gunichar2 c = *in;
1121      gunichar wc;
1122
1123      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1124        {
1125          wc = SURROGATE_VALUE (high_surrogate, c);
1126          high_surrogate = 0;
1127        }
1128      else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1129        {
1130          high_surrogate = c;
1131          goto next2;
1132        }
1133      else
1134        wc = c;
1135
1136      /********** DIFFERENT for UTF8/UCS4 **********/
1137      out += g_unichar_to_utf8 (wc, out);
1138
1139    next2:
1140      in++;
1141    }
1142 
1143  /********** DIFFERENT for UTF8/UCS4 **********/
1144  *out = '\0';
1145
1146  if (items_written)
1147    /********** DIFFERENT for UTF8/UCS4 **********/
1148    *items_written = out - result;
1149
1150 err_out:
1151  if (items_read)
1152    *items_read = in - str;
1153
1154  return result;
1155}
1156
1157/**
1158 * g_utf16_to_ucs4:
1159 * @str: a UTF-16 encoded string
1160 * @len: the maximum length of @str to use. If @len < 0, then
1161 *       the string is terminated with a 0 character.
1162 * @items_read: location to store number of words read, or %NULL.
1163 *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1164 *              returned in case @str contains a trailing partial
1165 *              character. If an error occurs then the index of the
1166 *              invalid input is stored here.
1167 * @items_written: location to store number of characters written, or %NULL.
1168 *                 The value stored here does not include the trailing
1169 *                 0 character.
1170 * @error: location to store the error occuring, or %NULL to ignore
1171 *         errors. Any of the errors in #GConvertError other than
1172 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1173 *
1174 * Convert a string from UTF-16 to UCS-4. The result will be
1175 * terminated with a 0 character.
1176 *
1177 * Return value: a pointer to a newly allocated UCS-4 string.
1178 *               This value must be freed with g_free(). If an
1179 *               error occurs, %NULL will be returned and
1180 *               @error set.
1181 **/
1182gunichar *
1183g_utf16_to_ucs4 (const gunichar2  *str,
1184                 glong             len,             
1185                 glong            *items_read,       
1186                 glong            *items_written,   
1187                 GError          **error)
1188{
1189  const gunichar2 *in;
1190  gchar *out;
1191  gchar *result = NULL;
1192  gint n_bytes;
1193  gunichar high_surrogate;
1194
1195  g_return_val_if_fail (str != 0, NULL);
1196
1197  n_bytes = 0;
1198  in = str;
1199  high_surrogate = 0;
1200  while ((len < 0 || in - str < len) && *in)
1201    {
1202      gunichar2 c = *in;
1203      gunichar wc;
1204
1205      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1206        {
1207          if (high_surrogate)
1208            {
1209              wc = SURROGATE_VALUE (high_surrogate, c);
1210              high_surrogate = 0;
1211            }
1212          else
1213            {
1214              g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1215                           _("Invalid sequence in conversion input"));
1216              goto err_out;
1217            }
1218        }
1219      else
1220        {
1221          if (high_surrogate)
1222            {
1223              g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1224                           _("Invalid sequence in conversion input"));
1225              goto err_out;
1226            }
1227
1228          if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1229            {
1230              high_surrogate = c;
1231              goto next1;
1232            }
1233          else
1234            wc = c;
1235        }
1236
1237      /********** DIFFERENT for UTF8/UCS4 **********/
1238      n_bytes += sizeof (gunichar);
1239
1240    next1:
1241      in++;
1242    }
1243
1244  if (high_surrogate && !items_read)
1245    {
1246      g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1247                   _("Partial character sequence at end of input"));
1248      goto err_out;
1249    }
1250 
1251  /* At this point, everything is valid, and we just need to convert
1252   */
1253  /********** DIFFERENT for UTF8/UCS4 **********/
1254  result = g_malloc (n_bytes + 4);
1255 
1256  high_surrogate = 0;
1257  out = result;
1258  in = str;
1259  while (out < result + n_bytes)
1260    {
1261      gunichar2 c = *in;
1262      gunichar wc;
1263
1264      if (c >= 0xdc00 && c < 0xe000) /* low surrogate */
1265        {
1266          wc = SURROGATE_VALUE (high_surrogate, c);
1267          high_surrogate = 0;
1268        }
1269      else if (c >= 0xd800 && c < 0xdc00) /* high surrogate */
1270        {
1271          high_surrogate = c;
1272          goto next2;
1273        }
1274      else
1275        wc = c;
1276
1277      /********** DIFFERENT for UTF8/UCS4 **********/
1278      *(gunichar *)out = wc;
1279      out += sizeof (gunichar);
1280
1281    next2:
1282      in++;
1283    }
1284
1285  /********** DIFFERENT for UTF8/UCS4 **********/
1286  *(gunichar *)out = 0;
1287
1288  if (items_written)
1289    /********** DIFFERENT for UTF8/UCS4 **********/
1290    *items_written = (out - result) / sizeof (gunichar);
1291
1292 err_out:
1293  if (items_read)
1294    *items_read = in - str;
1295
1296  return (gunichar *)result;
1297}
1298
1299/**
1300 * g_utf8_to_utf16:
1301 * @str: a UTF-8 encoded string
1302 * @len: the maximum length of @str to use. If @len < 0, then
1303 *       the string is nul-terminated.
1304 * @items_read: location to store number of bytes read, or %NULL.
1305 *              If %NULL, then %G_CONVERT_ERROR_PARTIAL_INPUT will be
1306 *              returned in case @str contains a trailing partial
1307 *              character. If an error occurs then the index of the
1308 *              invalid input is stored here.
1309 * @items_written: location to store number of words written, or %NULL.
1310 *                 The value stored here does not include the trailing
1311 *                 0 word.
1312 * @error: location to store the error occuring, or %NULL to ignore
1313 *         errors. Any of the errors in #GConvertError other than
1314 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1315 *
1316 * Convert a string from UTF-8 to UTF-16. A 0 word will be
1317 * added to the result after the converted text.
1318 *
1319 * Return value: a pointer to a newly allocated UTF-16 string.
1320 *               This value must be freed with g_free(). If an
1321 *               error occurs, %NULL will be returned and
1322 *               @error set.
1323 **/
1324gunichar2 *
1325g_utf8_to_utf16 (const gchar *str,
1326                 glong        len,             
1327                 glong       *items_read,       
1328                 glong       *items_written,   
1329                 GError     **error)
1330{
1331  gunichar2 *result = NULL;
1332  gint n16;
1333  const gchar *in;
1334  gint i;
1335
1336  g_return_val_if_fail (str != NULL, NULL);
1337
1338  in = str;
1339  n16 = 0;
1340  while ((len < 0 || str + len - in > 0) && *in)
1341    {
1342      gunichar wc = g_utf8_get_char_extended (in, str + len - in);
1343      if (wc & 0x80000000)
1344        {
1345          if (wc == (gunichar)-2)
1346            {
1347              if (items_read)
1348                break;
1349              else
1350                g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT,
1351                             _("Partial character sequence at end of input"));
1352            }
1353          else
1354            g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1355                         _("Invalid byte sequence in conversion input"));
1356
1357          goto err_out;
1358        }
1359
1360      if (wc < 0xd800)
1361        n16 += 1;
1362      else if (wc < 0xe000)
1363        {
1364          g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1365                       _("Invalid sequence in conversion input"));
1366
1367          goto err_out;
1368        }
1369      else if (wc < 0x10000)
1370        n16 += 1;
1371      else if (wc < 0x110000)
1372        n16 += 2;
1373      else
1374        {
1375          g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1376                       _("Character out of range for UTF-16"));
1377
1378          goto err_out;
1379        }
1380     
1381      in = g_utf8_next_char (in);
1382    }
1383
1384  result = g_new (gunichar2, n16 + 1);
1385 
1386  in = str;
1387  for (i = 0; i < n16;)
1388    {
1389      gunichar wc = g_utf8_get_char (in);
1390
1391      if (wc < 0x10000)
1392        {
1393          result[i++] = wc;
1394        }
1395      else
1396        {
1397          result[i++] = (wc - 0x10000) / 0x400 + 0xd800;
1398          result[i++] = (wc - 0x10000) % 0x400 + 0xdc00;
1399        }
1400     
1401      in = g_utf8_next_char (in);
1402    }
1403
1404  result[i] = 0;
1405
1406  if (items_written)
1407    *items_written = n16;
1408
1409 err_out:
1410  if (items_read)
1411    *items_read = in - str;
1412 
1413  return result;
1414}
1415
1416/**
1417 * g_ucs4_to_utf16:
1418 * @str: a UCS-4 encoded string
1419 * @len: the maximum length of @str to use. If @len < 0, then
1420 *       the string is terminated with a 0 character.
1421 * @items_read: location to store number of bytes read, or %NULL.
1422 *              If an error occurs then the index of the invalid input
1423 *              is stored here.
1424 * @items_written: location to store number of words written, or %NULL.
1425 *                 The value stored here does not include the trailing
1426 *                 0 word.
1427 * @error: location to store the error occuring, or %NULL to ignore
1428 *         errors. Any of the errors in #GConvertError other than
1429 *         %G_CONVERT_ERROR_NO_CONVERSION may occur.
1430 *
1431 * Convert a string from UCS-4 to UTF-16. A 0 word will be
1432 * added to the result after the converted text.
1433 *
1434 * Return value: a pointer to a newly allocated UTF-16 string.
1435 *               This value must be freed with g_free(). If an
1436 *               error occurs, %NULL will be returned and
1437 *               @error set.
1438 **/
1439gunichar2 *
1440g_ucs4_to_utf16 (const gunichar  *str,
1441                 glong            len,             
1442                 glong           *items_read,       
1443                 glong           *items_written,   
1444                 GError         **error)
1445{
1446  gunichar2 *result = NULL;
1447  gint n16;
1448  gint i, j;
1449
1450  n16 = 0;
1451  i = 0;
1452  while ((len < 0 || i < len) && str[i])
1453    {
1454      gunichar wc = str[i];
1455
1456      if (wc < 0xd800)
1457        n16 += 1;
1458      else if (wc < 0xe000)
1459        {
1460          g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1461                       _("Invalid sequence in conversion input"));
1462
1463          goto err_out;
1464        }
1465      else if (wc < 0x10000)
1466        n16 += 1;
1467      else if (wc < 0x110000)
1468        n16 += 2;
1469      else
1470        {
1471          g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
1472                       _("Character out of range for UTF-16"));
1473
1474          goto err_out;
1475        }
1476
1477      i++;
1478    }
1479 
1480  result = g_new (gunichar2, n16 + 1);
1481 
1482  for (i = 0, j = 0; j < n16; i++)
1483    {
1484      gunichar wc = str[i];
1485
1486      if (wc < 0x10000)
1487        {
1488          result[j++] = wc;
1489        }
1490      else
1491        {
1492          result[j++] = (wc - 0x10000) / 0x400 + 0xd800;
1493          result[j++] = (wc - 0x10000) % 0x400 + 0xdc00;
1494        }
1495    }
1496  result[j] = 0;
1497
1498  if (items_written)
1499    *items_written = n16;
1500 
1501 err_out:
1502  if (items_read)
1503    *items_read = i;
1504 
1505  return result;
1506}
1507
1508/**
1509 * g_utf8_validate:
1510 * @str: a pointer to character data
1511 * @max_len: max bytes to validate, or -1 to go until nul
1512 * @end: return location for end of valid data
1513 *
1514 * Validates UTF-8 encoded text. @str is the text to validate;
1515 * if @str is nul-terminated, then @max_len can be -1, otherwise
1516 * @max_len should be the number of bytes to validate.
1517 * If @end is non-%NULL, then the end of the valid range
1518 * will be stored there (i.e. the address of the first invalid byte
1519 * if some bytes were invalid, or the end of the text being validated
1520 * otherwise).
1521 *
1522 * Returns %TRUE if all of @str was valid. Many GLib and GTK+
1523 * routines <emphasis>require</emphasis> valid UTF-8 as input;
1524 * so data read from a file or the network should be checked
1525 * with g_utf8_validate() before doing anything else with it.
1526 *
1527 * Return value: %TRUE if the text was valid UTF-8
1528 **/
1529gboolean
1530g_utf8_validate (const gchar  *str,
1531                 gssize        max_len,   
1532                 const gchar **end)
1533{
1534
1535  const gchar *p;
1536
1537  g_return_val_if_fail (str != NULL, FALSE);
1538 
1539  if (end)
1540    *end = str;
1541 
1542  p = str;
1543 
1544  while ((max_len < 0 || (p - str) < max_len) && *p)
1545    {
1546      int i, mask = 0, len;
1547      gunichar result;
1548      unsigned char c = (unsigned char) *p;
1549     
1550      UTF8_COMPUTE (c, mask, len);
1551
1552      if (len == -1)
1553        break;
1554
1555      /* check that the expected number of bytes exists in str */
1556      if (max_len >= 0 &&
1557          ((max_len - (p - str)) < len))
1558        break;
1559       
1560      UTF8_GET (result, p, i, mask, len);
1561
1562      if (UTF8_LENGTH (result) != len) /* Check for overlong UTF-8 */
1563        break;
1564
1565      if (result == (gunichar)-1)
1566        break;
1567
1568      if (!UNICODE_VALID (result))
1569        break;
1570     
1571      p += len;
1572    }
1573
1574  if (end)
1575    *end = p;
1576
1577  /* See that we covered the entire length if a length was
1578   * passed in, or that we ended on a nul if not
1579   */
1580  if (max_len >= 0 &&
1581      p != (str + max_len))
1582    return FALSE;
1583  else if (max_len < 0 &&
1584           *p != '\0')
1585    return FALSE;
1586  else
1587    return TRUE;
1588}
1589
1590/**
1591 * g_unichar_validate:
1592 * @ch: a Unicode character
1593 *
1594 * Checks whether @ch is a valid Unicode character. Some possible
1595 * integer values of @ch will not be valid. 0 is considered a valid
1596 * character, though it's normally a string terminator.
1597 *
1598 * Return value: %TRUE if @ch is a valid Unicode character
1599 **/
1600gboolean
1601g_unichar_validate (gunichar ch)
1602{
1603  return UNICODE_VALID (ch);
1604}
1605
1606/**
1607 * g_utf8_strreverse:
1608 * @str: a UTF-8 encoded string
1609 * @len: the maximum length of @str to use. If @len < 0, then
1610 *       the string is nul-terminated.
1611 *
1612 * Reverses a UTF-8 string. @str must be valid UTF-8 encoded text.
1613 * (Use g_utf8_validate() on all text before trying to use UTF-8
1614 * utility functions with it.)
1615 *
1616 * Note that unlike g_strreverse(), this function returns
1617 * newly-allocated memory, which should be freed with g_free() when
1618 * no longer needed.
1619 *
1620 * Returns: a newly-allocated string which is the reverse of @str.
1621 *
1622 * Since: 2.2
1623 */
1624gchar *
1625g_utf8_strreverse (const gchar *str,
1626                   gssize len)
1627{
1628  gchar *result;
1629  const gchar *p;
1630  gchar *m, *r, skip;
1631
1632  if (len < 0)
1633    len = strlen (str);
1634
1635  result = g_new (gchar, len + 1);
1636  r = result + len;
1637  p = str;
1638  while (*p)
1639    {
1640      skip = g_utf8_skip[*(guchar*)p];
1641      r -= skip;
1642      for (m = r; skip; skip--)
1643        *m++ = *p++;
1644    }
1645  result[len] = 0;
1646
1647  return result;
1648}
Note: See TracBrowser for help on using the repository browser.