1 | /* GLIB - Library of useful routines for C programming |
---|
2 | * |
---|
3 | * gconvert.c: Convert between character sets using iconv |
---|
4 | * Copyright Red Hat Inc., 2000 |
---|
5 | * Authors: Havoc Pennington <hp@redhat.com>, Owen Taylor <otaylor@redhat.com> |
---|
6 | * |
---|
7 | * This library is free software; you can redistribute it and/or |
---|
8 | * modify it under the terms of the GNU Lesser General Public |
---|
9 | * License as published by the Free Software Foundation; either |
---|
10 | * version 2 of the License, or (at your option) any later version. |
---|
11 | * |
---|
12 | * This library is distributed in the hope that it will be useful, |
---|
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
---|
15 | * Lesser General Public License for more details. |
---|
16 | * |
---|
17 | * You should have received a copy of the GNU Lesser General Public |
---|
18 | * License along with this library; if not, write to the |
---|
19 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
---|
20 | * Boston, MA 02111-1307, USA. |
---|
21 | */ |
---|
22 | |
---|
23 | #include "config.h" |
---|
24 | |
---|
25 | #include <iconv.h> |
---|
26 | #include <errno.h> |
---|
27 | #include <stdio.h> |
---|
28 | #include <string.h> |
---|
29 | #include <stdlib.h> |
---|
30 | |
---|
31 | #include "glib.h" |
---|
32 | #include "gprintfint.h" |
---|
33 | #include "gthreadinit.h" |
---|
34 | |
---|
35 | #ifdef G_PLATFORM_WIN32 |
---|
36 | #define STRICT |
---|
37 | #include <windows.h> |
---|
38 | #undef STRICT |
---|
39 | #endif |
---|
40 | |
---|
41 | #include "glibintl.h" |
---|
42 | |
---|
43 | #if defined(USE_LIBICONV_GNU) && !defined (_LIBICONV_H) |
---|
44 | #error GNU libiconv in use but included iconv.h not from libiconv |
---|
45 | #endif |
---|
46 | #if !defined(USE_LIBICONV_GNU) && defined (_LIBICONV_H) |
---|
47 | #error GNU libiconv not in use but included iconv.h is from libiconv |
---|
48 | #endif |
---|
49 | |
---|
50 | GQuark |
---|
51 | g_convert_error_quark (void) |
---|
52 | { |
---|
53 | static GQuark quark; |
---|
54 | if (!quark) |
---|
55 | quark = g_quark_from_static_string ("g_convert_error"); |
---|
56 | |
---|
57 | return quark; |
---|
58 | } |
---|
59 | |
---|
60 | static gboolean |
---|
61 | try_conversion (const char *to_codeset, |
---|
62 | const char *from_codeset, |
---|
63 | iconv_t *cd) |
---|
64 | { |
---|
65 | *cd = iconv_open (to_codeset, from_codeset); |
---|
66 | |
---|
67 | if (*cd == (iconv_t)-1 && errno == EINVAL) |
---|
68 | return FALSE; |
---|
69 | else |
---|
70 | return TRUE; |
---|
71 | } |
---|
72 | |
---|
73 | static gboolean |
---|
74 | try_to_aliases (const char **to_aliases, |
---|
75 | const char *from_codeset, |
---|
76 | iconv_t *cd) |
---|
77 | { |
---|
78 | if (to_aliases) |
---|
79 | { |
---|
80 | const char **p = to_aliases; |
---|
81 | while (*p) |
---|
82 | { |
---|
83 | if (try_conversion (*p, from_codeset, cd)) |
---|
84 | return TRUE; |
---|
85 | |
---|
86 | p++; |
---|
87 | } |
---|
88 | } |
---|
89 | |
---|
90 | return FALSE; |
---|
91 | } |
---|
92 | |
---|
93 | extern const char **_g_charset_get_aliases (const char *canonical_name); |
---|
94 | |
---|
95 | /** |
---|
96 | * g_iconv_open: |
---|
97 | * @to_codeset: destination codeset |
---|
98 | * @from_codeset: source codeset |
---|
99 | * |
---|
100 | * Same as the standard UNIX routine iconv_open(), but |
---|
101 | * may be implemented via libiconv on UNIX flavors that lack |
---|
102 | * a native implementation. |
---|
103 | * |
---|
104 | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
---|
105 | * more convenient than the raw iconv wrappers. |
---|
106 | * |
---|
107 | * Return value: a "conversion descriptor", or (GIConv)-1 if |
---|
108 | * opening the converter failed. |
---|
109 | **/ |
---|
110 | GIConv |
---|
111 | g_iconv_open (const gchar *to_codeset, |
---|
112 | const gchar *from_codeset) |
---|
113 | { |
---|
114 | iconv_t cd; |
---|
115 | |
---|
116 | if (!try_conversion (to_codeset, from_codeset, &cd)) |
---|
117 | { |
---|
118 | const char **to_aliases = _g_charset_get_aliases (to_codeset); |
---|
119 | const char **from_aliases = _g_charset_get_aliases (from_codeset); |
---|
120 | |
---|
121 | if (from_aliases) |
---|
122 | { |
---|
123 | const char **p = from_aliases; |
---|
124 | while (*p) |
---|
125 | { |
---|
126 | if (try_conversion (to_codeset, *p, &cd)) |
---|
127 | goto out; |
---|
128 | |
---|
129 | if (try_to_aliases (to_aliases, *p, &cd)) |
---|
130 | goto out; |
---|
131 | |
---|
132 | p++; |
---|
133 | } |
---|
134 | } |
---|
135 | |
---|
136 | if (try_to_aliases (to_aliases, from_codeset, &cd)) |
---|
137 | goto out; |
---|
138 | } |
---|
139 | |
---|
140 | out: |
---|
141 | return (cd == (iconv_t)-1) ? (GIConv)-1 : (GIConv)cd; |
---|
142 | } |
---|
143 | |
---|
144 | /** |
---|
145 | * g_iconv: |
---|
146 | * @converter: conversion descriptor from g_iconv_open() |
---|
147 | * @inbuf: bytes to convert |
---|
148 | * @inbytes_left: inout parameter, bytes remaining to convert in @inbuf |
---|
149 | * @outbuf: converted output bytes |
---|
150 | * @outbytes_left: inout parameter, bytes available to fill in @outbuf |
---|
151 | * |
---|
152 | * Same as the standard UNIX routine iconv(), but |
---|
153 | * may be implemented via libiconv on UNIX flavors that lack |
---|
154 | * a native implementation. |
---|
155 | * |
---|
156 | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
---|
157 | * more convenient than the raw iconv wrappers. |
---|
158 | * |
---|
159 | * Return value: count of non-reversible conversions, or -1 on error |
---|
160 | **/ |
---|
161 | size_t |
---|
162 | g_iconv (GIConv converter, |
---|
163 | gchar **inbuf, |
---|
164 | gsize *inbytes_left, |
---|
165 | gchar **outbuf, |
---|
166 | gsize *outbytes_left) |
---|
167 | { |
---|
168 | iconv_t cd = (iconv_t)converter; |
---|
169 | |
---|
170 | return iconv (cd, inbuf, inbytes_left, outbuf, outbytes_left); |
---|
171 | } |
---|
172 | |
---|
173 | /** |
---|
174 | * g_iconv_close: |
---|
175 | * @converter: a conversion descriptor from g_iconv_open() |
---|
176 | * |
---|
177 | * Same as the standard UNIX routine iconv_close(), but |
---|
178 | * may be implemented via libiconv on UNIX flavors that lack |
---|
179 | * a native implementation. Should be called to clean up |
---|
180 | * the conversion descriptor from g_iconv_open() when |
---|
181 | * you are done converting things. |
---|
182 | * |
---|
183 | * GLib provides g_convert() and g_locale_to_utf8() which are likely |
---|
184 | * more convenient than the raw iconv wrappers. |
---|
185 | * |
---|
186 | * Return value: -1 on error, 0 on success |
---|
187 | **/ |
---|
188 | gint |
---|
189 | g_iconv_close (GIConv converter) |
---|
190 | { |
---|
191 | iconv_t cd = (iconv_t)converter; |
---|
192 | |
---|
193 | return iconv_close (cd); |
---|
194 | } |
---|
195 | |
---|
196 | |
---|
197 | #define ICONV_CACHE_SIZE (16) |
---|
198 | |
---|
199 | struct _iconv_cache_bucket { |
---|
200 | gchar *key; |
---|
201 | guint32 refcount; |
---|
202 | gboolean used; |
---|
203 | GIConv cd; |
---|
204 | }; |
---|
205 | |
---|
206 | static GList *iconv_cache_list; |
---|
207 | static GHashTable *iconv_cache; |
---|
208 | static GHashTable *iconv_open_hash; |
---|
209 | static guint iconv_cache_size = 0; |
---|
210 | G_LOCK_DEFINE_STATIC (iconv_cache_lock); |
---|
211 | |
---|
212 | /* caller *must* hold the iconv_cache_lock */ |
---|
213 | static void |
---|
214 | iconv_cache_init (void) |
---|
215 | { |
---|
216 | static gboolean initialized = FALSE; |
---|
217 | |
---|
218 | if (initialized) |
---|
219 | return; |
---|
220 | |
---|
221 | iconv_cache_list = NULL; |
---|
222 | iconv_cache = g_hash_table_new (g_str_hash, g_str_equal); |
---|
223 | iconv_open_hash = g_hash_table_new (g_direct_hash, g_direct_equal); |
---|
224 | |
---|
225 | initialized = TRUE; |
---|
226 | } |
---|
227 | |
---|
228 | |
---|
229 | /** |
---|
230 | * iconv_cache_bucket_new: |
---|
231 | * @key: cache key |
---|
232 | * @cd: iconv descriptor |
---|
233 | * |
---|
234 | * Creates a new cache bucket, inserts it into the cache and |
---|
235 | * increments the cache size. |
---|
236 | * |
---|
237 | * Returns a pointer to the newly allocated cache bucket. |
---|
238 | **/ |
---|
239 | static struct _iconv_cache_bucket * |
---|
240 | iconv_cache_bucket_new (const gchar *key, GIConv cd) |
---|
241 | { |
---|
242 | struct _iconv_cache_bucket *bucket; |
---|
243 | |
---|
244 | bucket = g_new (struct _iconv_cache_bucket, 1); |
---|
245 | bucket->key = g_strdup (key); |
---|
246 | bucket->refcount = 1; |
---|
247 | bucket->used = TRUE; |
---|
248 | bucket->cd = cd; |
---|
249 | |
---|
250 | g_hash_table_insert (iconv_cache, bucket->key, bucket); |
---|
251 | |
---|
252 | /* FIXME: if we sorted the list so items with few refcounts were |
---|
253 | first, then we could expire them faster in iconv_cache_expire_unused () */ |
---|
254 | iconv_cache_list = g_list_prepend (iconv_cache_list, bucket); |
---|
255 | |
---|
256 | iconv_cache_size++; |
---|
257 | |
---|
258 | return bucket; |
---|
259 | } |
---|
260 | |
---|
261 | |
---|
262 | /** |
---|
263 | * iconv_cache_bucket_expire: |
---|
264 | * @node: cache bucket's node |
---|
265 | * @bucket: cache bucket |
---|
266 | * |
---|
267 | * Expires a single cache bucket @bucket. This should only ever be |
---|
268 | * called on a bucket that currently has no used iconv descriptors |
---|
269 | * open. |
---|
270 | * |
---|
271 | * @node is not a required argument. If @node is not supplied, we |
---|
272 | * search for it ourselves. |
---|
273 | **/ |
---|
274 | static void |
---|
275 | iconv_cache_bucket_expire (GList *node, struct _iconv_cache_bucket *bucket) |
---|
276 | { |
---|
277 | g_hash_table_remove (iconv_cache, bucket->key); |
---|
278 | |
---|
279 | if (node == NULL) |
---|
280 | node = g_list_find (iconv_cache_list, bucket); |
---|
281 | |
---|
282 | g_assert (node != NULL); |
---|
283 | |
---|
284 | if (node->prev) |
---|
285 | { |
---|
286 | node->prev->next = node->next; |
---|
287 | if (node->next) |
---|
288 | node->next->prev = node->prev; |
---|
289 | } |
---|
290 | else |
---|
291 | { |
---|
292 | iconv_cache_list = node->next; |
---|
293 | if (node->next) |
---|
294 | node->next->prev = NULL; |
---|
295 | } |
---|
296 | |
---|
297 | g_list_free_1 (node); |
---|
298 | |
---|
299 | g_free (bucket->key); |
---|
300 | g_iconv_close (bucket->cd); |
---|
301 | g_free (bucket); |
---|
302 | |
---|
303 | iconv_cache_size--; |
---|
304 | } |
---|
305 | |
---|
306 | |
---|
307 | /** |
---|
308 | * iconv_cache_expire_unused: |
---|
309 | * |
---|
310 | * Expires as many unused cache buckets as it needs to in order to get |
---|
311 | * the total number of buckets < ICONV_CACHE_SIZE. |
---|
312 | **/ |
---|
313 | static void |
---|
314 | iconv_cache_expire_unused (void) |
---|
315 | { |
---|
316 | struct _iconv_cache_bucket *bucket; |
---|
317 | GList *node, *next; |
---|
318 | |
---|
319 | node = iconv_cache_list; |
---|
320 | while (node && iconv_cache_size >= ICONV_CACHE_SIZE) |
---|
321 | { |
---|
322 | next = node->next; |
---|
323 | |
---|
324 | bucket = node->data; |
---|
325 | if (bucket->refcount == 0) |
---|
326 | iconv_cache_bucket_expire (node, bucket); |
---|
327 | |
---|
328 | node = next; |
---|
329 | } |
---|
330 | } |
---|
331 | |
---|
332 | static GIConv |
---|
333 | open_converter (const gchar *to_codeset, |
---|
334 | const gchar *from_codeset, |
---|
335 | GError **error) |
---|
336 | { |
---|
337 | struct _iconv_cache_bucket *bucket; |
---|
338 | gchar *key; |
---|
339 | GIConv cd; |
---|
340 | |
---|
341 | /* create our key */ |
---|
342 | key = g_alloca (strlen (from_codeset) + strlen (to_codeset) + 2); |
---|
343 | _g_sprintf (key, "%s:%s", from_codeset, to_codeset); |
---|
344 | |
---|
345 | G_LOCK (iconv_cache_lock); |
---|
346 | |
---|
347 | /* make sure the cache has been initialized */ |
---|
348 | iconv_cache_init (); |
---|
349 | |
---|
350 | bucket = g_hash_table_lookup (iconv_cache, key); |
---|
351 | if (bucket) |
---|
352 | { |
---|
353 | if (bucket->used) |
---|
354 | { |
---|
355 | cd = g_iconv_open (to_codeset, from_codeset); |
---|
356 | if (cd == (GIConv) -1) |
---|
357 | goto error; |
---|
358 | } |
---|
359 | else |
---|
360 | { |
---|
361 | /* Apparently iconv on Solaris <= 7 segfaults if you pass in |
---|
362 | * NULL for anything but inbuf; work around that. (NULL outbuf |
---|
363 | * or NULL *outbuf is allowed by Unix98.) |
---|
364 | */ |
---|
365 | gsize inbytes_left = 0; |
---|
366 | gchar *outbuf = NULL; |
---|
367 | gsize outbytes_left = 0; |
---|
368 | |
---|
369 | cd = bucket->cd; |
---|
370 | bucket->used = TRUE; |
---|
371 | |
---|
372 | /* reset the descriptor */ |
---|
373 | g_iconv (cd, NULL, &inbytes_left, &outbuf, &outbytes_left); |
---|
374 | } |
---|
375 | |
---|
376 | bucket->refcount++; |
---|
377 | } |
---|
378 | else |
---|
379 | { |
---|
380 | cd = g_iconv_open (to_codeset, from_codeset); |
---|
381 | if (cd == (GIConv) -1) |
---|
382 | goto error; |
---|
383 | |
---|
384 | iconv_cache_expire_unused (); |
---|
385 | |
---|
386 | bucket = iconv_cache_bucket_new (key, cd); |
---|
387 | } |
---|
388 | |
---|
389 | g_hash_table_insert (iconv_open_hash, cd, bucket->key); |
---|
390 | |
---|
391 | G_UNLOCK (iconv_cache_lock); |
---|
392 | |
---|
393 | return cd; |
---|
394 | |
---|
395 | error: |
---|
396 | |
---|
397 | G_UNLOCK (iconv_cache_lock); |
---|
398 | |
---|
399 | /* Something went wrong. */ |
---|
400 | if (errno == EINVAL) |
---|
401 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION, |
---|
402 | _("Conversion from character set '%s' to '%s' is not supported"), |
---|
403 | from_codeset, to_codeset); |
---|
404 | else |
---|
405 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, |
---|
406 | _("Could not open converter from '%s' to '%s'"), |
---|
407 | from_codeset, to_codeset); |
---|
408 | |
---|
409 | return cd; |
---|
410 | } |
---|
411 | |
---|
412 | static int |
---|
413 | close_converter (GIConv converter) |
---|
414 | { |
---|
415 | struct _iconv_cache_bucket *bucket; |
---|
416 | const gchar *key; |
---|
417 | GIConv cd; |
---|
418 | |
---|
419 | cd = converter; |
---|
420 | |
---|
421 | if (cd == (GIConv) -1) |
---|
422 | return 0; |
---|
423 | |
---|
424 | G_LOCK (iconv_cache_lock); |
---|
425 | |
---|
426 | key = g_hash_table_lookup (iconv_open_hash, cd); |
---|
427 | if (key) |
---|
428 | { |
---|
429 | g_hash_table_remove (iconv_open_hash, cd); |
---|
430 | |
---|
431 | bucket = g_hash_table_lookup (iconv_cache, key); |
---|
432 | g_assert (bucket); |
---|
433 | |
---|
434 | bucket->refcount--; |
---|
435 | |
---|
436 | if (cd == bucket->cd) |
---|
437 | bucket->used = FALSE; |
---|
438 | else |
---|
439 | g_iconv_close (cd); |
---|
440 | |
---|
441 | if (!bucket->refcount && iconv_cache_size > ICONV_CACHE_SIZE) |
---|
442 | { |
---|
443 | /* expire this cache bucket */ |
---|
444 | iconv_cache_bucket_expire (NULL, bucket); |
---|
445 | } |
---|
446 | } |
---|
447 | else |
---|
448 | { |
---|
449 | G_UNLOCK (iconv_cache_lock); |
---|
450 | |
---|
451 | g_warning ("This iconv context wasn't opened using open_converter"); |
---|
452 | |
---|
453 | return g_iconv_close (converter); |
---|
454 | } |
---|
455 | |
---|
456 | G_UNLOCK (iconv_cache_lock); |
---|
457 | |
---|
458 | return 0; |
---|
459 | } |
---|
460 | |
---|
461 | |
---|
462 | /** |
---|
463 | * g_convert: |
---|
464 | * @str: the string to convert |
---|
465 | * @len: the length of the string |
---|
466 | * @to_codeset: name of character set into which to convert @str |
---|
467 | * @from_codeset: character set of @str. |
---|
468 | * @bytes_read: location to store the number of bytes in the |
---|
469 | * input string that were successfully converted, or %NULL. |
---|
470 | * Even if the conversion was successful, this may be |
---|
471 | * less than @len if there were partial characters |
---|
472 | * at the end of the input. If the error |
---|
473 | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
---|
474 | * stored will the byte offset after the last valid |
---|
475 | * input sequence. |
---|
476 | * @bytes_written: the number of bytes stored in the output buffer (not |
---|
477 | * including the terminating nul). |
---|
478 | * @error: location to store the error occuring, or %NULL to ignore |
---|
479 | * errors. Any of the errors in #GConvertError may occur. |
---|
480 | * |
---|
481 | * Converts a string from one character set to another. |
---|
482 | * |
---|
483 | * Return value: If the conversion was successful, a newly allocated |
---|
484 | * nul-terminated string, which must be freed with |
---|
485 | * g_free(). Otherwise %NULL and @error will be set. |
---|
486 | **/ |
---|
487 | gchar* |
---|
488 | g_convert (const gchar *str, |
---|
489 | gssize len, |
---|
490 | const gchar *to_codeset, |
---|
491 | const gchar *from_codeset, |
---|
492 | gsize *bytes_read, |
---|
493 | gsize *bytes_written, |
---|
494 | GError **error) |
---|
495 | { |
---|
496 | gchar *res; |
---|
497 | GIConv cd; |
---|
498 | |
---|
499 | g_return_val_if_fail (str != NULL, NULL); |
---|
500 | g_return_val_if_fail (to_codeset != NULL, NULL); |
---|
501 | g_return_val_if_fail (from_codeset != NULL, NULL); |
---|
502 | |
---|
503 | cd = open_converter (to_codeset, from_codeset, error); |
---|
504 | |
---|
505 | if (cd == (GIConv) -1) |
---|
506 | { |
---|
507 | if (bytes_read) |
---|
508 | *bytes_read = 0; |
---|
509 | |
---|
510 | if (bytes_written) |
---|
511 | *bytes_written = 0; |
---|
512 | |
---|
513 | return NULL; |
---|
514 | } |
---|
515 | |
---|
516 | res = g_convert_with_iconv (str, len, cd, |
---|
517 | bytes_read, bytes_written, |
---|
518 | error); |
---|
519 | |
---|
520 | close_converter (cd); |
---|
521 | |
---|
522 | return res; |
---|
523 | } |
---|
524 | |
---|
525 | /** |
---|
526 | * g_convert_with_iconv: |
---|
527 | * @str: the string to convert |
---|
528 | * @len: the length of the string |
---|
529 | * @converter: conversion descriptor from g_iconv_open() |
---|
530 | * @bytes_read: location to store the number of bytes in the |
---|
531 | * input string that were successfully converted, or %NULL. |
---|
532 | * Even if the conversion was successful, this may be |
---|
533 | * less than @len if there were partial characters |
---|
534 | * at the end of the input. If the error |
---|
535 | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
---|
536 | * stored will the byte offset after the last valid |
---|
537 | * input sequence. |
---|
538 | * @bytes_written: the number of bytes stored in the output buffer (not |
---|
539 | * including the terminating nul). |
---|
540 | * @error: location to store the error occuring, or %NULL to ignore |
---|
541 | * errors. Any of the errors in #GConvertError may occur. |
---|
542 | * |
---|
543 | * Converts a string from one character set to another. |
---|
544 | * |
---|
545 | * Return value: If the conversion was successful, a newly allocated |
---|
546 | * nul-terminated string, which must be freed with |
---|
547 | * g_free(). Otherwise %NULL and @error will be set. |
---|
548 | **/ |
---|
549 | gchar* |
---|
550 | g_convert_with_iconv (const gchar *str, |
---|
551 | gssize len, |
---|
552 | GIConv converter, |
---|
553 | gsize *bytes_read, |
---|
554 | gsize *bytes_written, |
---|
555 | GError **error) |
---|
556 | { |
---|
557 | gchar *dest; |
---|
558 | gchar *outp; |
---|
559 | const gchar *p; |
---|
560 | gsize inbytes_remaining; |
---|
561 | gsize outbytes_remaining; |
---|
562 | gsize err; |
---|
563 | gsize outbuf_size; |
---|
564 | gboolean have_error = FALSE; |
---|
565 | |
---|
566 | g_return_val_if_fail (str != NULL, NULL); |
---|
567 | g_return_val_if_fail (converter != (GIConv) -1, NULL); |
---|
568 | |
---|
569 | if (len < 0) |
---|
570 | len = strlen (str); |
---|
571 | |
---|
572 | p = str; |
---|
573 | inbytes_remaining = len; |
---|
574 | outbuf_size = len + 1; /* + 1 for nul in case len == 1 */ |
---|
575 | |
---|
576 | outbytes_remaining = outbuf_size - 1; /* -1 for nul */ |
---|
577 | outp = dest = g_malloc (outbuf_size); |
---|
578 | |
---|
579 | again: |
---|
580 | |
---|
581 | err = g_iconv (converter, (char **)&p, &inbytes_remaining, &outp, &outbytes_remaining); |
---|
582 | |
---|
583 | if (err == (size_t) -1) |
---|
584 | { |
---|
585 | switch (errno) |
---|
586 | { |
---|
587 | case EINVAL: |
---|
588 | /* Incomplete text, do not report an error */ |
---|
589 | break; |
---|
590 | case E2BIG: |
---|
591 | { |
---|
592 | size_t used = outp - dest; |
---|
593 | |
---|
594 | outbuf_size *= 2; |
---|
595 | dest = g_realloc (dest, outbuf_size); |
---|
596 | |
---|
597 | outp = dest + used; |
---|
598 | outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */ |
---|
599 | |
---|
600 | goto again; |
---|
601 | } |
---|
602 | case EILSEQ: |
---|
603 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
---|
604 | _("Invalid byte sequence in conversion input")); |
---|
605 | have_error = TRUE; |
---|
606 | break; |
---|
607 | default: |
---|
608 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, |
---|
609 | _("Error during conversion: %s"), |
---|
610 | g_strerror (errno)); |
---|
611 | have_error = TRUE; |
---|
612 | break; |
---|
613 | } |
---|
614 | } |
---|
615 | |
---|
616 | *outp = '\0'; |
---|
617 | |
---|
618 | if (bytes_read) |
---|
619 | *bytes_read = p - str; |
---|
620 | else |
---|
621 | { |
---|
622 | if ((p - str) != len) |
---|
623 | { |
---|
624 | if (!have_error) |
---|
625 | { |
---|
626 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT, |
---|
627 | _("Partial character sequence at end of input")); |
---|
628 | have_error = TRUE; |
---|
629 | } |
---|
630 | } |
---|
631 | } |
---|
632 | |
---|
633 | if (bytes_written) |
---|
634 | *bytes_written = outp - dest; /* Doesn't include '\0' */ |
---|
635 | |
---|
636 | if (have_error) |
---|
637 | { |
---|
638 | g_free (dest); |
---|
639 | return NULL; |
---|
640 | } |
---|
641 | else |
---|
642 | return dest; |
---|
643 | } |
---|
644 | |
---|
645 | /** |
---|
646 | * g_convert_with_fallback: |
---|
647 | * @str: the string to convert |
---|
648 | * @len: the length of the string |
---|
649 | * @to_codeset: name of character set into which to convert @str |
---|
650 | * @from_codeset: character set of @str. |
---|
651 | * @fallback: UTF-8 string to use in place of character not |
---|
652 | * present in the target encoding. (This must be |
---|
653 | * in the target encoding), if %NULL, characters |
---|
654 | * not in the target encoding will be represented |
---|
655 | * as Unicode escapes \uxxxx or \Uxxxxyyyy. |
---|
656 | * @bytes_read: location to store the number of bytes in the |
---|
657 | * input string that were successfully converted, or %NULL. |
---|
658 | * Even if the conversion was successful, this may be |
---|
659 | * less than @len if there were partial characters |
---|
660 | * at the end of the input. |
---|
661 | * @bytes_written: the number of bytes stored in the output buffer (not |
---|
662 | * including the terminating nul). |
---|
663 | * @error: location to store the error occuring, or %NULL to ignore |
---|
664 | * errors. Any of the errors in #GConvertError may occur. |
---|
665 | * |
---|
666 | * Converts a string from one character set to another, possibly |
---|
667 | * including fallback sequences for characters not representable |
---|
668 | * in the output. Note that it is not guaranteed that the specification |
---|
669 | * for the fallback sequences in @fallback will be honored. Some |
---|
670 | * systems may do a approximate conversion from @from_codeset |
---|
671 | * to @to_codeset in their iconv() functions, |
---|
672 | * in which case GLib will simply return that approximate conversion. |
---|
673 | * |
---|
674 | * Return value: If the conversion was successful, a newly allocated |
---|
675 | * nul-terminated string, which must be freed with |
---|
676 | * g_free(). Otherwise %NULL and @error will be set. |
---|
677 | **/ |
---|
678 | gchar* |
---|
679 | g_convert_with_fallback (const gchar *str, |
---|
680 | gssize len, |
---|
681 | const gchar *to_codeset, |
---|
682 | const gchar *from_codeset, |
---|
683 | gchar *fallback, |
---|
684 | gsize *bytes_read, |
---|
685 | gsize *bytes_written, |
---|
686 | GError **error) |
---|
687 | { |
---|
688 | gchar *utf8; |
---|
689 | gchar *dest; |
---|
690 | gchar *outp; |
---|
691 | const gchar *insert_str = NULL; |
---|
692 | const gchar *p; |
---|
693 | gsize inbytes_remaining; |
---|
694 | const gchar *save_p = NULL; |
---|
695 | gsize save_inbytes = 0; |
---|
696 | gsize outbytes_remaining; |
---|
697 | gsize err; |
---|
698 | GIConv cd; |
---|
699 | gsize outbuf_size; |
---|
700 | gboolean have_error = FALSE; |
---|
701 | gboolean done = FALSE; |
---|
702 | |
---|
703 | GError *local_error = NULL; |
---|
704 | |
---|
705 | g_return_val_if_fail (str != NULL, NULL); |
---|
706 | g_return_val_if_fail (to_codeset != NULL, NULL); |
---|
707 | g_return_val_if_fail (from_codeset != NULL, NULL); |
---|
708 | |
---|
709 | if (len < 0) |
---|
710 | len = strlen (str); |
---|
711 | |
---|
712 | /* Try an exact conversion; we only proceed if this fails |
---|
713 | * due to an illegal sequence in the input string. |
---|
714 | */ |
---|
715 | dest = g_convert (str, len, to_codeset, from_codeset, |
---|
716 | bytes_read, bytes_written, &local_error); |
---|
717 | if (!local_error) |
---|
718 | return dest; |
---|
719 | |
---|
720 | if (!g_error_matches (local_error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE)) |
---|
721 | { |
---|
722 | g_propagate_error (error, local_error); |
---|
723 | return NULL; |
---|
724 | } |
---|
725 | else |
---|
726 | g_error_free (local_error); |
---|
727 | |
---|
728 | local_error = NULL; |
---|
729 | |
---|
730 | /* No go; to proceed, we need a converter from "UTF-8" to |
---|
731 | * to_codeset, and the string as UTF-8. |
---|
732 | */ |
---|
733 | cd = open_converter (to_codeset, "UTF-8", error); |
---|
734 | if (cd == (GIConv) -1) |
---|
735 | { |
---|
736 | if (bytes_read) |
---|
737 | *bytes_read = 0; |
---|
738 | |
---|
739 | if (bytes_written) |
---|
740 | *bytes_written = 0; |
---|
741 | |
---|
742 | return NULL; |
---|
743 | } |
---|
744 | |
---|
745 | utf8 = g_convert (str, len, "UTF-8", from_codeset, |
---|
746 | bytes_read, &inbytes_remaining, error); |
---|
747 | if (!utf8) |
---|
748 | { |
---|
749 | close_converter (cd); |
---|
750 | if (bytes_written) |
---|
751 | *bytes_written = 0; |
---|
752 | return NULL; |
---|
753 | } |
---|
754 | |
---|
755 | /* Now the heart of the code. We loop through the UTF-8 string, and |
---|
756 | * whenever we hit an offending character, we form fallback, convert |
---|
757 | * the fallback to the target codeset, and then go back to |
---|
758 | * converting the original string after finishing with the fallback. |
---|
759 | * |
---|
760 | * The variables save_p and save_inbytes store the input state |
---|
761 | * for the original string while we are converting the fallback |
---|
762 | */ |
---|
763 | p = utf8; |
---|
764 | |
---|
765 | outbuf_size = len + 1; /* + 1 for nul in case len == 1 */ |
---|
766 | outbytes_remaining = outbuf_size - 1; /* -1 for nul */ |
---|
767 | outp = dest = g_malloc (outbuf_size); |
---|
768 | |
---|
769 | while (!done && !have_error) |
---|
770 | { |
---|
771 | size_t inbytes_tmp = inbytes_remaining; |
---|
772 | err = g_iconv (cd, (char **)&p, &inbytes_tmp, &outp, &outbytes_remaining); |
---|
773 | inbytes_remaining = inbytes_tmp; |
---|
774 | |
---|
775 | if (err == (size_t) -1) |
---|
776 | { |
---|
777 | switch (errno) |
---|
778 | { |
---|
779 | case EINVAL: |
---|
780 | g_assert_not_reached(); |
---|
781 | break; |
---|
782 | case E2BIG: |
---|
783 | { |
---|
784 | size_t used = outp - dest; |
---|
785 | |
---|
786 | outbuf_size *= 2; |
---|
787 | dest = g_realloc (dest, outbuf_size); |
---|
788 | |
---|
789 | outp = dest + used; |
---|
790 | outbytes_remaining = outbuf_size - used - 1; /* -1 for nul */ |
---|
791 | |
---|
792 | break; |
---|
793 | } |
---|
794 | case EILSEQ: |
---|
795 | if (save_p) |
---|
796 | { |
---|
797 | /* Error converting fallback string - fatal |
---|
798 | */ |
---|
799 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
---|
800 | _("Cannot convert fallback '%s' to codeset '%s'"), |
---|
801 | insert_str, to_codeset); |
---|
802 | have_error = TRUE; |
---|
803 | break; |
---|
804 | } |
---|
805 | else |
---|
806 | { |
---|
807 | if (!fallback) |
---|
808 | { |
---|
809 | gunichar ch = g_utf8_get_char (p); |
---|
810 | insert_str = g_strdup_printf (ch < 0x10000 ? "\\u%04x" : "\\U%08x", |
---|
811 | ch); |
---|
812 | } |
---|
813 | else |
---|
814 | insert_str = fallback; |
---|
815 | |
---|
816 | save_p = g_utf8_next_char (p); |
---|
817 | save_inbytes = inbytes_remaining - (save_p - p); |
---|
818 | p = insert_str; |
---|
819 | inbytes_remaining = strlen (p); |
---|
820 | } |
---|
821 | break; |
---|
822 | default: |
---|
823 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_FAILED, |
---|
824 | _("Error during conversion: %s"), |
---|
825 | g_strerror (errno)); |
---|
826 | have_error = TRUE; |
---|
827 | break; |
---|
828 | } |
---|
829 | } |
---|
830 | else |
---|
831 | { |
---|
832 | if (save_p) |
---|
833 | { |
---|
834 | if (!fallback) |
---|
835 | g_free ((gchar *)insert_str); |
---|
836 | p = save_p; |
---|
837 | inbytes_remaining = save_inbytes; |
---|
838 | save_p = NULL; |
---|
839 | } |
---|
840 | else |
---|
841 | done = TRUE; |
---|
842 | } |
---|
843 | } |
---|
844 | |
---|
845 | /* Cleanup |
---|
846 | */ |
---|
847 | *outp = '\0'; |
---|
848 | |
---|
849 | close_converter (cd); |
---|
850 | |
---|
851 | if (bytes_written) |
---|
852 | *bytes_written = outp - dest; /* Doesn't include '\0' */ |
---|
853 | |
---|
854 | g_free (utf8); |
---|
855 | |
---|
856 | if (have_error) |
---|
857 | { |
---|
858 | if (save_p && !fallback) |
---|
859 | g_free ((gchar *)insert_str); |
---|
860 | g_free (dest); |
---|
861 | return NULL; |
---|
862 | } |
---|
863 | else |
---|
864 | return dest; |
---|
865 | } |
---|
866 | |
---|
867 | /* |
---|
868 | * g_locale_to_utf8 |
---|
869 | * |
---|
870 | * |
---|
871 | */ |
---|
872 | |
---|
873 | static gchar * |
---|
874 | strdup_len (const gchar *string, |
---|
875 | gssize len, |
---|
876 | gsize *bytes_written, |
---|
877 | gsize *bytes_read, |
---|
878 | GError **error) |
---|
879 | |
---|
880 | { |
---|
881 | gsize real_len; |
---|
882 | |
---|
883 | if (!g_utf8_validate (string, len, NULL)) |
---|
884 | { |
---|
885 | if (bytes_read) |
---|
886 | *bytes_read = 0; |
---|
887 | if (bytes_written) |
---|
888 | *bytes_written = 0; |
---|
889 | |
---|
890 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
---|
891 | _("Invalid byte sequence in conversion input")); |
---|
892 | return NULL; |
---|
893 | } |
---|
894 | |
---|
895 | if (len < 0) |
---|
896 | real_len = strlen (string); |
---|
897 | else |
---|
898 | { |
---|
899 | real_len = 0; |
---|
900 | |
---|
901 | while (real_len < len && string[real_len]) |
---|
902 | real_len++; |
---|
903 | } |
---|
904 | |
---|
905 | if (bytes_read) |
---|
906 | *bytes_read = real_len; |
---|
907 | if (bytes_written) |
---|
908 | *bytes_written = real_len; |
---|
909 | |
---|
910 | return g_strndup (string, real_len); |
---|
911 | } |
---|
912 | |
---|
913 | /** |
---|
914 | * g_locale_to_utf8: |
---|
915 | * @opsysstring: a string in the encoding of the current locale |
---|
916 | * @len: the length of the string, or -1 if the string is |
---|
917 | * nul-terminated. |
---|
918 | * @bytes_read: location to store the number of bytes in the |
---|
919 | * input string that were successfully converted, or %NULL. |
---|
920 | * Even if the conversion was successful, this may be |
---|
921 | * less than @len if there were partial characters |
---|
922 | * at the end of the input. If the error |
---|
923 | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
---|
924 | * stored will the byte offset after the last valid |
---|
925 | * input sequence. |
---|
926 | * @bytes_written: the number of bytes stored in the output buffer (not |
---|
927 | * including the terminating nul). |
---|
928 | * @error: location to store the error occuring, or %NULL to ignore |
---|
929 | * errors. Any of the errors in #GConvertError may occur. |
---|
930 | * |
---|
931 | * Converts a string which is in the encoding used for strings by |
---|
932 | * the C runtime (usually the same as that used by the operating |
---|
933 | * system) in the current locale into a UTF-8 string. |
---|
934 | * |
---|
935 | * Return value: The converted string, or %NULL on an error. |
---|
936 | **/ |
---|
937 | gchar * |
---|
938 | g_locale_to_utf8 (const gchar *opsysstring, |
---|
939 | gssize len, |
---|
940 | gsize *bytes_read, |
---|
941 | gsize *bytes_written, |
---|
942 | GError **error) |
---|
943 | { |
---|
944 | const char *charset; |
---|
945 | |
---|
946 | if (g_get_charset (&charset)) |
---|
947 | return strdup_len (opsysstring, len, bytes_read, bytes_written, error); |
---|
948 | else |
---|
949 | return g_convert (opsysstring, len, |
---|
950 | "UTF-8", charset, bytes_read, bytes_written, error); |
---|
951 | } |
---|
952 | |
---|
953 | /** |
---|
954 | * g_locale_from_utf8: |
---|
955 | * @utf8string: a UTF-8 encoded string |
---|
956 | * @len: the length of the string, or -1 if the string is |
---|
957 | * nul-terminated. |
---|
958 | * @bytes_read: location to store the number of bytes in the |
---|
959 | * input string that were successfully converted, or %NULL. |
---|
960 | * Even if the conversion was successful, this may be |
---|
961 | * less than @len if there were partial characters |
---|
962 | * at the end of the input. If the error |
---|
963 | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
---|
964 | * stored will the byte offset after the last valid |
---|
965 | * input sequence. |
---|
966 | * @bytes_written: the number of bytes stored in the output buffer (not |
---|
967 | * including the terminating nul). |
---|
968 | * @error: location to store the error occuring, or %NULL to ignore |
---|
969 | * errors. Any of the errors in #GConvertError may occur. |
---|
970 | * |
---|
971 | * Converts a string from UTF-8 to the encoding used for strings by |
---|
972 | * the C runtime (usually the same as that used by the operating |
---|
973 | * system) in the current locale. |
---|
974 | * |
---|
975 | * Return value: The converted string, or %NULL on an error. |
---|
976 | **/ |
---|
977 | gchar * |
---|
978 | g_locale_from_utf8 (const gchar *utf8string, |
---|
979 | gssize len, |
---|
980 | gsize *bytes_read, |
---|
981 | gsize *bytes_written, |
---|
982 | GError **error) |
---|
983 | { |
---|
984 | const gchar *charset; |
---|
985 | |
---|
986 | if (g_get_charset (&charset)) |
---|
987 | return strdup_len (utf8string, len, bytes_read, bytes_written, error); |
---|
988 | else |
---|
989 | return g_convert (utf8string, len, |
---|
990 | charset, "UTF-8", bytes_read, bytes_written, error); |
---|
991 | } |
---|
992 | |
---|
993 | #ifndef G_PLATFORM_WIN32 |
---|
994 | |
---|
995 | typedef struct _GFilenameCharsetCache GFilenameCharsetCache; |
---|
996 | |
---|
997 | struct _GFilenameCharsetCache { |
---|
998 | gboolean is_utf8; |
---|
999 | gchar *charset; |
---|
1000 | gchar *filename_charset; |
---|
1001 | }; |
---|
1002 | |
---|
1003 | static void |
---|
1004 | filename_charset_cache_free (gpointer data) |
---|
1005 | { |
---|
1006 | GFilenameCharsetCache *cache = data; |
---|
1007 | g_free (cache->charset); |
---|
1008 | g_free (cache->filename_charset); |
---|
1009 | g_free (cache); |
---|
1010 | } |
---|
1011 | |
---|
1012 | /* |
---|
1013 | * get_filename_charset: |
---|
1014 | * @charset: return location for the name of the filename encoding |
---|
1015 | * |
---|
1016 | * Determines the character set used for filenames by consulting the |
---|
1017 | * environment variables G_FILENAME_ENCODING and G_BROKEN_FILENAMES. |
---|
1018 | * |
---|
1019 | * G_FILENAME_ENCODING may be set to a comma-separated list of character |
---|
1020 | * set names. The special token "@locale" is taken to mean the character set |
---|
1021 | * for the current locale. The first character set from the list is taken |
---|
1022 | * as the filename encoding. |
---|
1023 | * If G_FILENAME_ENCODING is not set, but G_BROKEN_FILENAMES is, the |
---|
1024 | * character set of the current locale is taken as the filename encoding. |
---|
1025 | * |
---|
1026 | * The returned @charset belongs to GLib and must not be freed. |
---|
1027 | * |
---|
1028 | * Return value: %TRUE if the charset used for filename is UTF-8. |
---|
1029 | */ |
---|
1030 | static gboolean |
---|
1031 | get_filename_charset (const gchar **filename_charset) |
---|
1032 | { |
---|
1033 | static GStaticPrivate cache_private = G_STATIC_PRIVATE_INIT; |
---|
1034 | GFilenameCharsetCache *cache = g_static_private_get (&cache_private); |
---|
1035 | const gchar *charset; |
---|
1036 | |
---|
1037 | if (!cache) |
---|
1038 | { |
---|
1039 | cache = g_new0 (GFilenameCharsetCache, 1); |
---|
1040 | g_static_private_set (&cache_private, cache, filename_charset_cache_free); |
---|
1041 | } |
---|
1042 | |
---|
1043 | g_get_charset (&charset); |
---|
1044 | |
---|
1045 | if (!(cache->charset && strcmp (cache->charset, charset) == 0)) |
---|
1046 | { |
---|
1047 | const gchar *new_charset; |
---|
1048 | gchar *p, *q; |
---|
1049 | |
---|
1050 | g_free (cache->charset); |
---|
1051 | g_free (cache->filename_charset); |
---|
1052 | cache->charset = g_strdup (charset); |
---|
1053 | |
---|
1054 | p = getenv ("G_FILENAME_ENCODING"); |
---|
1055 | if (p != NULL) |
---|
1056 | { |
---|
1057 | q = strchr (p, ','); |
---|
1058 | if (!q) |
---|
1059 | q = p + strlen (p); |
---|
1060 | |
---|
1061 | if (strncmp ("@locale", p, q - p) == 0) |
---|
1062 | { |
---|
1063 | cache->is_utf8 = g_get_charset (&new_charset); |
---|
1064 | cache->filename_charset = g_strdup (new_charset); |
---|
1065 | } |
---|
1066 | else |
---|
1067 | { |
---|
1068 | cache->filename_charset = g_strndup (p, q - p); |
---|
1069 | cache->is_utf8 = (strcmp (cache->filename_charset, "UTF-8") == 0); |
---|
1070 | } |
---|
1071 | } |
---|
1072 | else if (getenv ("G_BROKEN_FILENAMES") != NULL) |
---|
1073 | { |
---|
1074 | cache->is_utf8 = g_get_charset (&new_charset); |
---|
1075 | cache->filename_charset = g_strdup (new_charset); |
---|
1076 | } |
---|
1077 | else |
---|
1078 | { |
---|
1079 | cache->filename_charset = g_strdup ("UTF-8"); |
---|
1080 | cache->is_utf8 = TRUE; |
---|
1081 | } |
---|
1082 | } |
---|
1083 | |
---|
1084 | if (filename_charset) |
---|
1085 | *filename_charset = cache->filename_charset; |
---|
1086 | |
---|
1087 | return cache->is_utf8; |
---|
1088 | } |
---|
1089 | |
---|
1090 | #else /* G_PLATFORM_WIN32 */ |
---|
1091 | static gboolean |
---|
1092 | get_filename_charset (const gchar **filename_charset) |
---|
1093 | { |
---|
1094 | g_get_charset (filename_charset); |
---|
1095 | return FALSE; |
---|
1096 | } |
---|
1097 | #endif /* G_PLATFORM_WIN32 */ |
---|
1098 | |
---|
1099 | /* This is called from g_thread_init(). It's used to |
---|
1100 | * initialize some static data in a threadsafe way. |
---|
1101 | */ |
---|
1102 | void |
---|
1103 | _g_convert_thread_init (void) |
---|
1104 | { |
---|
1105 | const gchar *dummy; |
---|
1106 | (void) get_filename_charset (&dummy); |
---|
1107 | } |
---|
1108 | |
---|
1109 | /** |
---|
1110 | * g_filename_to_utf8: |
---|
1111 | * @opsysstring: a string in the encoding for filenames |
---|
1112 | * @len: the length of the string, or -1 if the string is |
---|
1113 | * nul-terminated. |
---|
1114 | * @bytes_read: location to store the number of bytes in the |
---|
1115 | * input string that were successfully converted, or %NULL. |
---|
1116 | * Even if the conversion was successful, this may be |
---|
1117 | * less than @len if there were partial characters |
---|
1118 | * at the end of the input. If the error |
---|
1119 | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
---|
1120 | * stored will the byte offset after the last valid |
---|
1121 | * input sequence. |
---|
1122 | * @bytes_written: the number of bytes stored in the output buffer (not |
---|
1123 | * including the terminating nul). |
---|
1124 | * @error: location to store the error occuring, or %NULL to ignore |
---|
1125 | * errors. Any of the errors in #GConvertError may occur. |
---|
1126 | * |
---|
1127 | * Converts a string which is in the encoding used for filenames |
---|
1128 | * into a UTF-8 string. |
---|
1129 | * |
---|
1130 | * Return value: The converted string, or %NULL on an error. |
---|
1131 | **/ |
---|
1132 | gchar* |
---|
1133 | g_filename_to_utf8 (const gchar *opsysstring, |
---|
1134 | gssize len, |
---|
1135 | gsize *bytes_read, |
---|
1136 | gsize *bytes_written, |
---|
1137 | GError **error) |
---|
1138 | { |
---|
1139 | const gchar *charset; |
---|
1140 | |
---|
1141 | if (get_filename_charset (&charset)) |
---|
1142 | return strdup_len (opsysstring, len, bytes_read, bytes_written, error); |
---|
1143 | else |
---|
1144 | return g_convert (opsysstring, len, |
---|
1145 | "UTF-8", charset, bytes_read, bytes_written, error); |
---|
1146 | } |
---|
1147 | |
---|
1148 | /** |
---|
1149 | * g_filename_from_utf8: |
---|
1150 | * @utf8string: a UTF-8 encoded string. |
---|
1151 | * @len: the length of the string, or -1 if the string is |
---|
1152 | * nul-terminated. |
---|
1153 | * @bytes_read: location to store the number of bytes in the |
---|
1154 | * input string that were successfully converted, or %NULL. |
---|
1155 | * Even if the conversion was successful, this may be |
---|
1156 | * less than @len if there were partial characters |
---|
1157 | * at the end of the input. If the error |
---|
1158 | * #G_CONVERT_ERROR_ILLEGAL_SEQUENCE occurs, the value |
---|
1159 | * stored will the byte offset after the last valid |
---|
1160 | * input sequence. |
---|
1161 | * @bytes_written: the number of bytes stored in the output buffer (not |
---|
1162 | * including the terminating nul). |
---|
1163 | * @error: location to store the error occuring, or %NULL to ignore |
---|
1164 | * errors. Any of the errors in #GConvertError may occur. |
---|
1165 | * |
---|
1166 | * Converts a string from UTF-8 to the encoding used for filenames. |
---|
1167 | * |
---|
1168 | * Return value: The converted string, or %NULL on an error. |
---|
1169 | **/ |
---|
1170 | gchar* |
---|
1171 | g_filename_from_utf8 (const gchar *utf8string, |
---|
1172 | gssize len, |
---|
1173 | gsize *bytes_read, |
---|
1174 | gsize *bytes_written, |
---|
1175 | GError **error) |
---|
1176 | { |
---|
1177 | const gchar *charset; |
---|
1178 | |
---|
1179 | if (get_filename_charset (&charset)) |
---|
1180 | return strdup_len (utf8string, len, bytes_read, bytes_written, error); |
---|
1181 | else |
---|
1182 | return g_convert (utf8string, len, |
---|
1183 | charset, "UTF-8", bytes_read, bytes_written, error); |
---|
1184 | } |
---|
1185 | |
---|
1186 | /* Test of haystack has the needle prefix, comparing case |
---|
1187 | * insensitive. haystack may be UTF-8, but needle must |
---|
1188 | * contain only ascii. */ |
---|
1189 | static gboolean |
---|
1190 | has_case_prefix (const gchar *haystack, const gchar *needle) |
---|
1191 | { |
---|
1192 | const gchar *h, *n; |
---|
1193 | |
---|
1194 | /* Eat one character at a time. */ |
---|
1195 | h = haystack; |
---|
1196 | n = needle; |
---|
1197 | |
---|
1198 | while (*n && *h && |
---|
1199 | g_ascii_tolower (*n) == g_ascii_tolower (*h)) |
---|
1200 | { |
---|
1201 | n++; |
---|
1202 | h++; |
---|
1203 | } |
---|
1204 | |
---|
1205 | return *n == '\0'; |
---|
1206 | } |
---|
1207 | |
---|
/* Escaping policies. Each value is a single bit that is tested against
 * the per-character bitmasks in the acceptable[] table.
 */
typedef enum {
  UNSAFE_ALL        = 1 << 0,  /* Escape all unsafe characters   */
  UNSAFE_ALLOW_PLUS = 1 << 1,  /* Allows '+'  */
  UNSAFE_PATH       = 1 << 3,  /* Allows '/', '&', '=', ':', '@', '+', '$' and ',' */
  UNSAFE_HOST       = 1 << 4,  /* Allows '/' and ':' and '@' */
  UNSAFE_SLASHES    = 1 << 5   /* Allows all characters except for '/' and '%' */
} UnsafeCharacterSet;
---|
1215 | |
---|
/* Per-character escaping table, indexed by (character - 32).
 * Each entry is a bitmask of UnsafeCharacterSet values; the ACCEPTABLE()
 * macro in g_escape_uri_string() leaves a character unescaped when the
 * entry has the bit of the mask in use set.
 */
static const guchar acceptable[96] = {
  /* A table of the ASCII chars from space (32) to DEL (127) */
  /*      !    "    #    $    %    &    '    (    )    *    +    ,    -    .    / */
  0x00,0x3F,0x20,0x20,0x28,0x00,0x2C,0x3F,0x3F,0x3F,0x3F,0x2A,0x28,0x3F,0x3F,0x1C,
  /* 0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ? */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x38,0x20,0x20,0x2C,0x20,0x20,
  /* @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O */
  0x38,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
  /* P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _ */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x20,0x3F,
  /* `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o */
  0x20,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,
  /* p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  DEL */
  0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x3F,0x20,0x20,0x20,0x3F,0x20
};
---|
1231 | |
---|
/* Upper-case hexadecimal digits, indexed by nibble value. The array holds
 * exactly 16 characters, so there is no terminating nul; index it only.
 */
static const gchar hex[16] = "0123456789ABCDEF";
---|
1233 | |
---|
1234 | /* Note: This escape function works on file: URIs, but if you want to |
---|
1235 | * escape something else, please read RFC-2396 */ |
---|
1236 | static gchar * |
---|
1237 | g_escape_uri_string (const gchar *string, |
---|
1238 | UnsafeCharacterSet mask) |
---|
1239 | { |
---|
1240 | #define ACCEPTABLE(a) ((a)>=32 && (a)<128 && (acceptable[(a)-32] & use_mask)) |
---|
1241 | |
---|
1242 | const gchar *p; |
---|
1243 | gchar *q; |
---|
1244 | gchar *result; |
---|
1245 | int c; |
---|
1246 | gint unacceptable; |
---|
1247 | UnsafeCharacterSet use_mask; |
---|
1248 | |
---|
1249 | g_return_val_if_fail (mask == UNSAFE_ALL |
---|
1250 | || mask == UNSAFE_ALLOW_PLUS |
---|
1251 | || mask == UNSAFE_PATH |
---|
1252 | || mask == UNSAFE_HOST |
---|
1253 | || mask == UNSAFE_SLASHES, NULL); |
---|
1254 | |
---|
1255 | unacceptable = 0; |
---|
1256 | use_mask = mask; |
---|
1257 | for (p = string; *p != '\0'; p++) |
---|
1258 | { |
---|
1259 | c = (guchar) *p; |
---|
1260 | if (!ACCEPTABLE (c)) |
---|
1261 | unacceptable++; |
---|
1262 | } |
---|
1263 | |
---|
1264 | result = g_malloc (p - string + unacceptable * 2 + 1); |
---|
1265 | |
---|
1266 | use_mask = mask; |
---|
1267 | for (q = result, p = string; *p != '\0'; p++) |
---|
1268 | { |
---|
1269 | c = (guchar) *p; |
---|
1270 | |
---|
1271 | if (!ACCEPTABLE (c)) |
---|
1272 | { |
---|
1273 | *q++ = '%'; /* means hex coming */ |
---|
1274 | *q++ = hex[c >> 4]; |
---|
1275 | *q++ = hex[c & 15]; |
---|
1276 | } |
---|
1277 | else |
---|
1278 | *q++ = *p; |
---|
1279 | } |
---|
1280 | |
---|
1281 | *q = '\0'; |
---|
1282 | |
---|
1283 | return result; |
---|
1284 | } |
---|
1285 | |
---|
1286 | |
---|
1287 | static gchar * |
---|
1288 | g_escape_file_uri (const gchar *hostname, |
---|
1289 | const gchar *pathname) |
---|
1290 | { |
---|
1291 | char *escaped_hostname = NULL; |
---|
1292 | char *escaped_path; |
---|
1293 | char *res; |
---|
1294 | |
---|
1295 | #ifdef G_OS_WIN32 |
---|
1296 | char *p, *backslash; |
---|
1297 | |
---|
1298 | /* Turn backslashes into forward slashes. That's what Netscape |
---|
1299 | * does, and they are actually more or less equivalent in Windows. |
---|
1300 | */ |
---|
1301 | |
---|
1302 | pathname = g_strdup (pathname); |
---|
1303 | p = (char *) pathname; |
---|
1304 | |
---|
1305 | while ((backslash = strchr (p, '\\')) != NULL) |
---|
1306 | { |
---|
1307 | *backslash = '/'; |
---|
1308 | p = backslash + 1; |
---|
1309 | } |
---|
1310 | #endif |
---|
1311 | |
---|
1312 | if (hostname && *hostname != '\0') |
---|
1313 | { |
---|
1314 | escaped_hostname = g_escape_uri_string (hostname, UNSAFE_HOST); |
---|
1315 | } |
---|
1316 | |
---|
1317 | escaped_path = g_escape_uri_string (pathname, UNSAFE_PATH); |
---|
1318 | |
---|
1319 | res = g_strconcat ("file://", |
---|
1320 | (escaped_hostname) ? escaped_hostname : "", |
---|
1321 | (*escaped_path != '/') ? "/" : "", |
---|
1322 | escaped_path, |
---|
1323 | NULL); |
---|
1324 | |
---|
1325 | #ifdef G_OS_WIN32 |
---|
1326 | g_free ((char *) pathname); |
---|
1327 | #endif |
---|
1328 | |
---|
1329 | g_free (escaped_hostname); |
---|
1330 | g_free (escaped_path); |
---|
1331 | |
---|
1332 | return res; |
---|
1333 | } |
---|
1334 | |
---|
/* Decodes the two hexadecimal digits at scanner[0] and scanner[1].
 * Returns the byte value (0..255), or -1 if either character is not a
 * hex digit. scanner[1] is only read when scanner[0] is valid.
 */
static int
unescape_character (const char *scanner)
{
  int value = 0;
  int i;

  for (i = 0; i < 2; i++)
    {
      int digit = g_ascii_xdigit_value (scanner[i]);

      if (digit < 0)
        return -1;

      value = (value << 4) | digit;
    }

  return value;
}
---|
1351 | |
---|
1352 | static gchar * |
---|
1353 | g_unescape_uri_string (const char *escaped, |
---|
1354 | int len, |
---|
1355 | const char *illegal_escaped_characters, |
---|
1356 | gboolean ascii_must_not_be_escaped) |
---|
1357 | { |
---|
1358 | const gchar *in, *in_end; |
---|
1359 | gchar *out, *result; |
---|
1360 | int c; |
---|
1361 | |
---|
1362 | if (escaped == NULL) |
---|
1363 | return NULL; |
---|
1364 | |
---|
1365 | if (len < 0) |
---|
1366 | len = strlen (escaped); |
---|
1367 | |
---|
1368 | result = g_malloc (len + 1); |
---|
1369 | |
---|
1370 | out = result; |
---|
1371 | for (in = escaped, in_end = escaped + len; in < in_end; in++) |
---|
1372 | { |
---|
1373 | c = *in; |
---|
1374 | |
---|
1375 | if (c == '%') |
---|
1376 | { |
---|
1377 | /* catch partial escape sequences past the end of the substring */ |
---|
1378 | if (in + 3 > in_end) |
---|
1379 | break; |
---|
1380 | |
---|
1381 | c = unescape_character (in + 1); |
---|
1382 | |
---|
1383 | /* catch bad escape sequences and NUL characters */ |
---|
1384 | if (c <= 0) |
---|
1385 | break; |
---|
1386 | |
---|
1387 | /* catch escaped ASCII */ |
---|
1388 | if (ascii_must_not_be_escaped && c <= 0x7F) |
---|
1389 | break; |
---|
1390 | |
---|
1391 | /* catch other illegal escaped characters */ |
---|
1392 | if (strchr (illegal_escaped_characters, c) != NULL) |
---|
1393 | break; |
---|
1394 | |
---|
1395 | in += 2; |
---|
1396 | } |
---|
1397 | |
---|
1398 | *out++ = c; |
---|
1399 | } |
---|
1400 | |
---|
1401 | g_assert (out - result <= len); |
---|
1402 | *out = '\0'; |
---|
1403 | |
---|
1404 | if (in != in_end) |
---|
1405 | { |
---|
1406 | g_free (result); |
---|
1407 | return NULL; |
---|
1408 | } |
---|
1409 | |
---|
1410 | return result; |
---|
1411 | } |
---|
1412 | |
---|
1413 | static gboolean |
---|
1414 | is_asciialphanum (gunichar c) |
---|
1415 | { |
---|
1416 | return c <= 0x7F && g_ascii_isalnum (c); |
---|
1417 | } |
---|
1418 | |
---|
1419 | static gboolean |
---|
1420 | is_asciialpha (gunichar c) |
---|
1421 | { |
---|
1422 | return c <= 0x7F && g_ascii_isalpha (c); |
---|
1423 | } |
---|
1424 | |
---|
/* Checks that @hostname is a sequence of dot-separated labels.
 *
 * Each label must start with an ASCII letter or digit, continue with
 * ASCII letters, digits or '-', and must not end in '-'. The last label
 * must start with an ASCII letter. A single trailing '.' after the last
 * label is accepted.
 *
 * allows an empty string
 */
static gboolean
hostname_validate (const char *hostname)
{
  const char *p;
  gunichar c, first_char, last_char;

  p = hostname;
  if (*p == '\0')
    return TRUE;
  do
    {
      /* read in a label */
      c = g_utf8_get_char (p);
      p = g_utf8_next_char (p);
      if (!is_asciialphanum (c))
        return FALSE;
      first_char = c;
      /* consume the rest of the label; when the loop exits, c is the
       * character that ended it and last_char the one just before it */
      do
        {
          last_char = c;
          c = g_utf8_get_char (p);
          p = g_utf8_next_char (p);
        }
      while (is_asciialphanum (c) || c == '-');
      if (last_char == '-')
        return FALSE;

      /* if that was the last label, check that it was a toplabel */
      if (c == '\0' || (c == '.' && *p == '\0'))
        return is_asciialpha (first_char);
    }
  while (c == '.');
  /* the label was ended by something other than '.' or the end of string */
  return FALSE;
}
---|
1460 | |
---|
1461 | /** |
---|
1462 | * g_filename_from_uri: |
---|
1463 | * @uri: a uri describing a filename (escaped, encoded in ASCII). |
---|
1464 | * @hostname: Location to store hostname for the URI, or %NULL. |
---|
1465 | * If there is no hostname in the URI, %NULL will be |
---|
1466 | * stored in this location. |
---|
1467 | * @error: location to store the error occuring, or %NULL to ignore |
---|
1468 | * errors. Any of the errors in #GConvertError may occur. |
---|
1469 | * |
---|
1470 | * Converts an escaped ASCII-encoded URI to a local filename in the |
---|
1471 | * encoding used for filenames. |
---|
1472 | * |
---|
1473 | * Return value: a newly-allocated string holding the resulting |
---|
1474 | * filename, or %NULL on an error. |
---|
1475 | **/ |
---|
1476 | gchar * |
---|
1477 | g_filename_from_uri (const gchar *uri, |
---|
1478 | gchar **hostname, |
---|
1479 | GError **error) |
---|
1480 | { |
---|
1481 | const char *path_part; |
---|
1482 | const char *host_part; |
---|
1483 | char *unescaped_hostname; |
---|
1484 | char *result; |
---|
1485 | char *filename; |
---|
1486 | int offs; |
---|
1487 | #ifdef G_OS_WIN32 |
---|
1488 | char *p, *slash; |
---|
1489 | #endif |
---|
1490 | |
---|
1491 | if (hostname) |
---|
1492 | *hostname = NULL; |
---|
1493 | |
---|
1494 | if (!has_case_prefix (uri, "file:/")) |
---|
1495 | { |
---|
1496 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
---|
1497 | _("The URI '%s' is not an absolute URI using the file scheme"), |
---|
1498 | uri); |
---|
1499 | return NULL; |
---|
1500 | } |
---|
1501 | |
---|
1502 | path_part = uri + strlen ("file:"); |
---|
1503 | |
---|
1504 | if (strchr (path_part, '#') != NULL) |
---|
1505 | { |
---|
1506 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
---|
1507 | _("The local file URI '%s' may not include a '#'"), |
---|
1508 | uri); |
---|
1509 | return NULL; |
---|
1510 | } |
---|
1511 | |
---|
1512 | if (has_case_prefix (path_part, "///")) |
---|
1513 | path_part += 2; |
---|
1514 | else if (has_case_prefix (path_part, "//")) |
---|
1515 | { |
---|
1516 | path_part += 2; |
---|
1517 | host_part = path_part; |
---|
1518 | |
---|
1519 | path_part = strchr (path_part, '/'); |
---|
1520 | |
---|
1521 | if (path_part == NULL) |
---|
1522 | { |
---|
1523 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
---|
1524 | _("The URI '%s' is invalid"), |
---|
1525 | uri); |
---|
1526 | return NULL; |
---|
1527 | } |
---|
1528 | |
---|
1529 | unescaped_hostname = g_unescape_uri_string (host_part, path_part - host_part, "", TRUE); |
---|
1530 | |
---|
1531 | if (unescaped_hostname == NULL || |
---|
1532 | !hostname_validate (unescaped_hostname)) |
---|
1533 | { |
---|
1534 | g_free (unescaped_hostname); |
---|
1535 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
---|
1536 | _("The hostname of the URI '%s' is invalid"), |
---|
1537 | uri); |
---|
1538 | return NULL; |
---|
1539 | } |
---|
1540 | |
---|
1541 | if (hostname) |
---|
1542 | *hostname = unescaped_hostname; |
---|
1543 | else |
---|
1544 | g_free (unescaped_hostname); |
---|
1545 | } |
---|
1546 | |
---|
1547 | filename = g_unescape_uri_string (path_part, -1, "/", FALSE); |
---|
1548 | |
---|
1549 | if (filename == NULL) |
---|
1550 | { |
---|
1551 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_BAD_URI, |
---|
1552 | _("The URI '%s' contains invalidly escaped characters"), |
---|
1553 | uri); |
---|
1554 | return NULL; |
---|
1555 | } |
---|
1556 | |
---|
1557 | offs = 0; |
---|
1558 | #ifdef G_OS_WIN32 |
---|
1559 | /* Drop localhost */ |
---|
1560 | if (hostname && *hostname != NULL && |
---|
1561 | g_ascii_strcasecmp (*hostname, "localhost") == 0) |
---|
1562 | { |
---|
1563 | g_free (*hostname); |
---|
1564 | *hostname = NULL; |
---|
1565 | } |
---|
1566 | |
---|
1567 | /* Turn slashes into backslashes, because that's the canonical spelling */ |
---|
1568 | p = filename; |
---|
1569 | while ((slash = strchr (p, '/')) != NULL) |
---|
1570 | { |
---|
1571 | *slash = '\\'; |
---|
1572 | p = slash + 1; |
---|
1573 | } |
---|
1574 | |
---|
1575 | /* Windows URIs with a drive letter can be like "file://host/c:/foo" |
---|
1576 | * or "file://host/c|/foo" (some Netscape versions). In those cases, start |
---|
1577 | * the filename from the drive letter. |
---|
1578 | */ |
---|
1579 | if (g_ascii_isalpha (filename[1])) |
---|
1580 | { |
---|
1581 | if (filename[2] == ':') |
---|
1582 | offs = 1; |
---|
1583 | else if (filename[2] == '|') |
---|
1584 | { |
---|
1585 | filename[2] = ':'; |
---|
1586 | offs = 1; |
---|
1587 | } |
---|
1588 | } |
---|
1589 | #endif |
---|
1590 | |
---|
1591 | result = g_strdup (filename + offs); |
---|
1592 | g_free (filename); |
---|
1593 | |
---|
1594 | return result; |
---|
1595 | } |
---|
1596 | |
---|
1597 | /** |
---|
1598 | * g_filename_to_uri: |
---|
1599 | * @filename: an absolute filename specified in the encoding |
---|
1600 | * used for filenames by the operating system. |
---|
1601 | * @hostname: A UTF-8 encoded hostname, or %NULL for none. |
---|
1602 | * @error: location to store the error occuring, or %NULL to ignore |
---|
1603 | * errors. Any of the errors in #GConvertError may occur. |
---|
1604 | * |
---|
1605 | * Converts an absolute filename to an escaped ASCII-encoded URI. |
---|
1606 | * |
---|
1607 | * Return value: a newly-allocated string holding the resulting |
---|
1608 | * URI, or %NULL on an error. |
---|
1609 | **/ |
---|
1610 | gchar * |
---|
1611 | g_filename_to_uri (const gchar *filename, |
---|
1612 | const gchar *hostname, |
---|
1613 | GError **error) |
---|
1614 | { |
---|
1615 | char *escaped_uri; |
---|
1616 | |
---|
1617 | g_return_val_if_fail (filename != NULL, NULL); |
---|
1618 | |
---|
1619 | if (!g_path_is_absolute (filename)) |
---|
1620 | { |
---|
1621 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_NOT_ABSOLUTE_PATH, |
---|
1622 | _("The pathname '%s' is not an absolute path"), |
---|
1623 | filename); |
---|
1624 | return NULL; |
---|
1625 | } |
---|
1626 | |
---|
1627 | if (hostname && |
---|
1628 | !(g_utf8_validate (hostname, -1, NULL) |
---|
1629 | && hostname_validate (hostname))) |
---|
1630 | { |
---|
1631 | g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, |
---|
1632 | _("Invalid hostname")); |
---|
1633 | return NULL; |
---|
1634 | } |
---|
1635 | |
---|
1636 | #ifdef G_OS_WIN32 |
---|
1637 | /* Don't use localhost unnecessarily */ |
---|
1638 | if (hostname && g_ascii_strcasecmp (hostname, "localhost") == 0) |
---|
1639 | hostname = NULL; |
---|
1640 | #endif |
---|
1641 | |
---|
1642 | escaped_uri = g_escape_file_uri (hostname, filename); |
---|
1643 | |
---|
1644 | return escaped_uri; |
---|
1645 | } |
---|