source: trunk/third/glib2/glib/gunicollate.c @ 21369

Revision 21369, 6.4 KB checked in by ghudson, 20 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r21368, which included commits to RCS files with non-trunk default branches.
Line 
1/* gunicollate.c - Collation
2 *
3 *  Copyright 2001 Red Hat, Inc.
4 *
5 * The Gnome Library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public License as
7 * published by the Free Software Foundation; either version 2 of the
8 * License, or (at your option) any later version.
9 *
10 * The Gnome Library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13 * Lesser General Public License for more details.
14 *
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
17 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 *   Boston, MA 02111-1307, USA.
19 */
20
21#include "config.h"
22
23#include <locale.h>
24#include <string.h>
25#ifdef __STDC_ISO_10646__
26#include <wchar.h>
27#endif
28
29#include "glib.h"
30#include "gunicodeprivate.h"
31
32/**
33 * g_utf8_collate:
34 * @str1: a UTF-8 encoded string
35 * @str2: a UTF-8 encoded string
36 *
37 * Compares two strings for ordering using the linguistically
38 * correct rules for the current locale. When sorting a large
39 * number of strings, it will be significantly faster to
40 * obtain collation keys with g_utf8_collate_key() and
41 * compare the keys with strcmp() when
42 * sorting instead of sorting the original strings.
43 *
44 * Return value: -1 if @str1 compares before @str2, 0 if they
45 *   compare equal, 1 if @str1 compares after @str2.
46 **/
47gint
48g_utf8_collate (const gchar *str1,
49                const gchar *str2)
50{
51  gint result;
52 
53#ifdef __STDC_ISO_10646__
54
55  gunichar *str1_norm;
56  gunichar *str2_norm;
57
58  g_return_val_if_fail (str1 != NULL, 0);
59  g_return_val_if_fail (str2 != NULL, 0);
60
61  str1_norm = _g_utf8_normalize_wc (str1, -1, G_NORMALIZE_ALL_COMPOSE);
62  str2_norm = _g_utf8_normalize_wc (str2, -1, G_NORMALIZE_ALL_COMPOSE);
63
64  result = wcscoll ((wchar_t *)str1_norm, (wchar_t *)str2_norm);
65
66  g_free (str1_norm);
67  g_free (str2_norm);
68
69#else /* !__STDC_ISO_10646__ */
70
71  const gchar *charset;
72  gchar *str1_norm;
73  gchar *str2_norm;
74
75  g_return_val_if_fail (str1 != NULL, 0);
76  g_return_val_if_fail (str2 != NULL, 0);
77
78  str1_norm = g_utf8_normalize (str1, -1, G_NORMALIZE_ALL_COMPOSE);
79  str2_norm = g_utf8_normalize (str2, -1, G_NORMALIZE_ALL_COMPOSE);
80
81  if (g_get_charset (&charset))
82    {
83      result = strcoll (str1_norm, str2_norm);
84    }
85  else
86    {
87      gchar *str1_locale = g_convert (str1_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
88      gchar *str2_locale = g_convert (str2_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
89
90      if (str1_locale && str2_locale)
91        result =  strcoll (str1_locale, str2_locale);
92      else if (str1_locale)
93        result = -1;
94      else if (str2_locale)
95        result = 1;
96      else
97        result = strcmp (str1_norm, str2_norm);
98
99      g_free (str1_locale);
100      g_free (str2_locale);
101    }
102
103  g_free (str1_norm);
104  g_free (str2_norm);
105
106#endif /* __STDC_ISO_10646__ */
107
108  return result;
109}
110
111#ifdef __STDC_ISO_10646__
112/* We need UTF-8 encoding of numbers to encode the weights if
113 * we are using wcsxfrm. However, we aren't encoding Unicode
114 * characters, so we can't simply use g_unichar_to_utf8.
115 *
116 * The following routine is taken (with modification) from GNU
117 * libc's strxfrm routine:
118 *
119 * Copyright (C) 1995-1999,2000,2001 Free Software Foundation, Inc.
120 * Written by Ulrich Drepper <drepper@cygnus.com>, 1995.
121 */
122static inline int
123utf8_encode (char *buf, wchar_t val)
124{
125  int retval;
126
127  if (val < 0x80)
128    {
129      if (buf)
130        *buf++ = (char) val;
131      retval = 1;
132    }
133  else
134    {
135      int step;
136
137      for (step = 2; step < 6; ++step)
138        if ((val & (~(guint32)0 << (5 * step + 1))) == 0)
139          break;
140      retval = step;
141
142      if (buf)
143        {
144          *buf = (unsigned char) (~0xff >> step);
145          --step;
146          do
147            {
148              buf[step] = 0x80 | (val & 0x3f);
149              val >>= 6;
150            }
151          while (--step > 0);
152          *buf |= val;
153        }
154    }
155
156  return retval;
157}
158#endif /* __STDC_ISO_10646__ */
159
160/**
161 * g_utf8_collate_key:
162 * @str: a UTF-8 encoded string.
163 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
164 *
165 * Converts a string into a collation key that can be compared
166 * with other collation keys using strcmp().
167 * The results of comparing the collation keys of two strings
168 * with strcmp() will always be the same as
169 * comparing the two original keys with g_utf8_collate().
170 *
171 * Return value: a newly allocated string. This string should
172 *   be freed with g_free() when you are done with it.
173 **/
174gchar *
175g_utf8_collate_key (const gchar *str,
176                    gssize       len)
177{
178  gchar *result;
179  size_t xfrm_len;
180 
181#ifdef __STDC_ISO_10646__
182
183  gunichar *str_norm;
184  wchar_t *result_wc;
185  size_t i;
186  size_t result_len = 0;
187
188  g_return_val_if_fail (str != NULL, NULL);
189
190  str_norm = _g_utf8_normalize_wc (str, len, G_NORMALIZE_ALL_COMPOSE);
191
192  setlocale (LC_COLLATE, "");
193
194  xfrm_len = wcsxfrm (NULL, (wchar_t *)str_norm, 0);
195  result_wc = g_new (wchar_t, xfrm_len + 1);
196  wcsxfrm (result_wc, (wchar_t *)str_norm, xfrm_len + 1);
197
198  for (i=0; i < xfrm_len; i++)
199    result_len += utf8_encode (NULL, result_wc[i]);
200
201  result = g_malloc (result_len + 1);
202  result_len = 0;
203  for (i=0; i < xfrm_len; i++)
204    result_len += utf8_encode (result + result_len, result_wc[i]);
205
206  result[result_len] = '\0';
207
208  g_free (result_wc);
209  g_free (str_norm);
210
211  return result;
212#else /* !__STDC_ISO_10646__ */
213
214  const gchar *charset;
215  gchar *str_norm;
216
217  g_return_val_if_fail (str != NULL, NULL);
218
219  str_norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL_COMPOSE);
220
221  if (g_get_charset (&charset))
222    {
223      xfrm_len = strxfrm (NULL, str_norm, 0);
224      result = g_malloc (xfrm_len + 1);
225      strxfrm (result, str_norm, xfrm_len + 1);
226    }
227  else
228    {
229      gchar *str_locale = g_convert (str_norm, -1, charset, "UTF-8", NULL, NULL, NULL);
230
231      if (str_locale)
232        {
233          xfrm_len = strxfrm (NULL, str_locale, 0);
234          if (xfrm_len < 0 || xfrm_len >= G_MAXINT - 2)
235            {
236              g_free (str_locale);
237              str_locale = NULL;
238            }
239        }
240      if (str_locale)
241        {
242          result = g_malloc (xfrm_len + 2);
243          result[0] = 'A';
244          strxfrm (result + 1, str_locale, xfrm_len + 1);
245         
246          g_free (str_locale);
247        }
248      else
249        {
250          xfrm_len = strlen (str_norm);
251          result = g_malloc (xfrm_len + 2);
252          result[0] = 'B';
253          memcpy (result + 1, str_norm, xfrm_len);
254          result[xfrm_len+1] = '\0';
255        }
256    }
257
258  g_free (str_norm);
259#endif /* __STDC_ISO_10646__ */
260
261  return result;
262}
Note: See TracBrowser for help on using the repository browser.