source: trunk/third/glib2/tests/unicode-encoding.c @ 18159

Revision 18159, 8.8 KB checked in by ghudson, 22 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r18158, which included commits to RCS files with non-trunk default branches.
Line 
1#undef G_DISABLE_ASSERT
2#undef G_LOG_DOMAIN
3
4#include <stdarg.h>
5#include <stdio.h>
6#include <stdlib.h>
7#include <string.h>
8#include <glib.h>
9
/* Accumulated test result; fail() ORs in 1 each time a check does not hold. */
static gint exit_status = 0;
11
/*
 * croak:
 * @format: printf-style format string, followed by its arguments
 *
 * Prints the formatted message to stderr and terminates the process
 * immediately with exit code 1.  Used for errors that make it pointless
 * to continue (unreadable or malformed test data).  Does not return.
 */
static void
croak (const char *format, ...)
{
  va_list va;

  va_start (va, format);
  vfprintf (stderr, format, va);
  va_end (va);

  exit (1);
}
23
24static void
25fail (char *format, ...)
26{
27  va_list va;
28 
29  va_start (va, format);
30  vfprintf (stderr, format, va);
31  va_end (va);
32
33  exit_status |= 1;
34}
35
/* Expected classification of a test string, as named in the status line
 * of each record in utf8.txt.
 */
typedef enum
{
  VALID      = 0,  /* must validate; full round-trip checks are run      */
  INCOMPLETE = 1,  /* must not validate; partial-input detection is run  */
  NOTUNICODE = 2,  /* must not validate; UCS-4 round-trip is still run   */
  OVERLONG   = 3,  /* must not validate                                  */
  MALFORMED  = 4   /* must not validate                                  */
} Status;
44
45static gboolean
46ucs4_equal (gunichar *a, gunichar *b)
47{
48  while (*a && *b && (*a == *b))
49    {
50      a++;
51      b++;
52    }
53
54  return (*a == *b);
55}
56
57static gboolean
58utf16_equal (gunichar2 *a, gunichar2 *b)
59{
60  while (*a && *b && (*a == *b))
61    {
62      a++;
63      b++;
64    }
65
66  return (*a == *b);
67}
68
69static gint
70utf16_count (gunichar2 *a)
71{
72  gint result = 0;
73 
74  while (a[result])
75    result++;
76
77  return result;
78}
79
80static void
81process (gint      line,
82         gchar    *utf8,
83         Status    status,
84         gunichar *ucs4,
85         gint      ucs4_len)
86{
87  const gchar *end;
88  gboolean is_valid = g_utf8_validate (utf8, -1, &end);
89  GError *error = NULL;
90  glong items_read, items_written;
91
92  switch (status)
93    {
94    case VALID:
95      if (!is_valid)
96        {
97          fail ("line %d: valid but g_utf8_validate returned FALSE\n", line);
98          return;
99        }
100      break;
101    case NOTUNICODE:
102    case INCOMPLETE:
103    case OVERLONG:
104    case MALFORMED:
105      if (is_valid)
106        {
107          fail ("line %d: invalid but g_utf8_validate returned TRUE\n", line);
108          return;
109        }
110      break;
111    }
112
113  if (status == INCOMPLETE)
114    {
115      gunichar *ucs4_result;     
116
117      ucs4_result = g_utf8_to_ucs4 (utf8, -1, NULL, NULL, &error);
118
119      if (!error || !g_error_matches (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT))
120        {
121          fail ("line %d: incomplete input not properly detected\n", line);
122          return;
123        }
124      g_clear_error (&error);
125
126      ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, NULL, &error);
127
128      if (!ucs4_result || items_read == strlen (utf8))
129        {
130          fail ("line %d: incomplete input not properly detected\n", line);
131          return;
132        }
133
134      g_free (ucs4_result);
135    }
136
137  if (status == VALID || status == NOTUNICODE)
138    {
139      gunichar *ucs4_result;
140      gchar *utf8_result;
141
142      ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, &items_written, &error);
143      if (!ucs4_result)
144        {
145          fail ("line %d: conversion to ucs4 failed: %s\n", line, error->message);
146          return;
147        }
148     
149      if (!ucs4_equal (ucs4_result, ucs4) ||
150          items_read != strlen (utf8) ||
151          items_written != ucs4_len)
152        {
153          fail ("line %d: results of conversion to ucs4 do not match expected.\n", line);
154          return;
155        }
156
157      g_free (ucs4_result);
158
159      ucs4_result = g_utf8_to_ucs4_fast (utf8, -1, &items_written);
160     
161      if (!ucs4_equal (ucs4_result, ucs4) ||
162          items_written != ucs4_len)
163        {
164          fail ("line %d: results of conversion to ucs4 do not match expected.\n", line);
165          return;
166        }
167
168      utf8_result = g_ucs4_to_utf8 (ucs4_result, -1, &items_read, &items_written, &error);
169      if (!utf8_result)
170        {
171          fail ("line %d: conversion back to utf8 failed: %s", line, error->message);
172          return;
173        }
174
175      if (strcmp (utf8_result, utf8) != 0 ||
176          items_read != ucs4_len ||
177          items_written != strlen (utf8))
178        {
179          fail ("line %d: conversion back to utf8 did not match original\n", line);
180          return;
181        }
182
183      g_free (utf8_result);
184      g_free (ucs4_result);
185    }
186
187  if (status == VALID)
188    {
189      gunichar2 *utf16_expected_tmp;
190      gunichar2 *utf16_expected;
191      gunichar2 *utf16_from_utf8;
192      gunichar2 *utf16_from_ucs4;
193      gunichar *ucs4_result;
194      gsize bytes_written;
195      gint n_chars;
196      gchar *utf8_result;
197
198#ifdef G_OS_WIN32
199#define TARGET "UTF-16LE"
200#else
201#define TARGET "UTF-16"
202#endif
203
204      if (!(utf16_expected_tmp = (gunichar2 *)g_convert (utf8, -1, TARGET, "UTF-8",
205                                                         NULL, &bytes_written, NULL)))
206        {
207          fail ("line %d: could not convert to UTF-16 via g_convert\n", line);
208          return;
209        }
210
211      /* zero-terminate and remove BOM
212       */
213      n_chars = bytes_written / 2;
214      if (utf16_expected_tmp[0] == 0xfeff) /* BOM */
215        {
216          n_chars--;
217          utf16_expected = g_new (gunichar2, n_chars + 1);
218          memcpy (utf16_expected, utf16_expected_tmp + 1, sizeof(gunichar2) * n_chars);
219        }
220      else if (utf16_expected_tmp[0] == 0xfffe) /* ANTI-BOM */
221        {
222          fail ("line %d: conversion via iconv to \"UTF-16\" is not native-endian\n", line);
223          return;
224        }
225      else
226        {
227          utf16_expected = g_new (gunichar2, n_chars + 1);
228          memcpy (utf16_expected, utf16_expected_tmp, sizeof(gunichar2) * n_chars);
229        }
230
231      utf16_expected[n_chars] = '\0';
232     
233      if (!(utf16_from_utf8 = g_utf8_to_utf16 (utf8, -1, &items_read, &items_written, &error)))
234        {
235          fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message);
236          return;
237        }
238
239      if (items_read != strlen (utf8) ||
240          utf16_count (utf16_from_utf8) != items_written)
241        {
242          fail ("line %d: length error in conversion to ucs16\n", line);
243          return;
244        }
245
246      if (!(utf16_from_ucs4 = g_ucs4_to_utf16 (ucs4, -1, &items_read, &items_written, &error)))
247        {
248          fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message);
249          return;
250        }
251
252      if (items_read != ucs4_len ||
253          utf16_count (utf16_from_ucs4) != items_written)
254        {
255          fail ("line %d: length error in conversion to ucs16\n", line);
256          return;
257        }
258
259      if (!utf16_equal (utf16_from_utf8, utf16_expected) ||
260          !utf16_equal (utf16_from_ucs4, utf16_expected))
261        {
262          fail ("line %d: results of conversion to ucs16 do not match\n", line);
263          return;
264        }
265
266      if (!(utf8_result = g_utf16_to_utf8 (utf16_from_utf8, -1, &items_read, &items_written, &error)))
267        {
268          fail ("line %d: conversion back to utf8 failed: %s\n", line, error->message);
269          return;
270        }
271
272      if (items_read != utf16_count (utf16_from_utf8) ||
273          items_written != strlen (utf8))
274        {
275          fail ("line %d: length error in conversion from ucs16 to utf8\n", line);
276          return;
277        }
278
279      if (!(ucs4_result = g_utf16_to_ucs4 (utf16_from_ucs4, -1, &items_read, &items_written, &error)))
280        {
281          fail ("line %d: conversion back to utf8/ucs4 failed\n", line);
282          return;
283        }
284
285      if (items_read != utf16_count (utf16_from_utf8) ||
286          items_written != ucs4_len)
287        {
288          fail ("line %d: length error in conversion from ucs16 to ucs4\n", line);
289          return;
290        }
291
292      if (strcmp (utf8, utf8_result) != 0 ||
293          !ucs4_equal (ucs4, ucs4_result))
294        {
295          fail ("line %d: conversion back to utf8/ucs4 did not match original\n", line);
296          return;
297        }
298     
299      g_free (utf16_expected_tmp);
300      g_free (utf16_expected);
301      g_free (utf16_from_utf8);
302      g_free (utf16_from_ucs4);
303      g_free (utf8_result);
304      g_free (ucs4_result);
305    }
306}
307
308int
309main (int argc, char **argv)
310{
311  gchar *srcdir = getenv ("srcdir");
312  gchar *testfile;
313  gchar *contents;
314  GError *error = NULL;
315  gchar *p, *end;
316  char *tmp;
317  gint state = 0;
318  gint line = 1;
319  gint start_line = 0;          /* Quiet GCC */
320  gchar *utf8 = NULL;           /* Quiet GCC */
321  GArray *ucs4;
322  Status status = VALID;        /* Quiet GCC */
323
324  if (!srcdir)
325    srcdir = ".";
326 
327  testfile = g_strconcat (srcdir, G_DIR_SEPARATOR_S "utf8.txt", NULL);
328 
329  g_file_get_contents (testfile, &contents, NULL, &error);
330  if (error)
331    croak ("Cannot open utf8.txt: %s", error->message);
332
333  ucs4 = g_array_new (TRUE, FALSE, sizeof(gunichar));
334
335  p = contents;
336
337  /* Loop over lines */
338  while (*p)
339    {
340      while (*p && (*p == ' ' || *p == '\t'))
341        p++;
342
343      end = p;
344      while (*end && (*end != '\r' && *end != '\n'))
345        end++;
346     
347      if (!*p || *p == '#' || *p == '\r' || *p == '\n')
348        goto next_line;
349
350      tmp = g_strstrip (g_strndup (p, end - p));
351     
352      switch (state)
353        {
354        case 0:
355          /* UTF-8 string */
356          start_line = line;
357          utf8 = tmp;
358          tmp = NULL;
359          break;
360         
361        case 1:
362          /* Status */
363          if (!strcmp (tmp, "VALID"))
364            status = VALID;
365          else if (!strcmp (tmp, "INCOMPLETE"))
366            status = INCOMPLETE;
367          else if (!strcmp (tmp, "NOTUNICODE"))
368            status = NOTUNICODE;
369          else if (!strcmp (tmp, "OVERLONG"))
370            status = OVERLONG;
371          else if (!strcmp (tmp, "MALFORMED"))
372            status = MALFORMED;
373          else
374            croak ("Invalid status on line %d\n", line);
375
376          if (status != VALID && status != NOTUNICODE)
377            state++;            /* No UCS-4 data */
378         
379          break;
380         
381        case 2:
382          /* UCS-4 version */
383
384          p = strtok (tmp, " \t");
385          while (p)
386            {
387              gchar *endptr;
388             
389              gunichar ch = strtoul (p, &endptr, 16);
390              if (*endptr != '\0')
391                croak ("Invalid UCS-4 character on line %d\n", line);
392
393              g_array_append_val (ucs4, ch);
394             
395              p = strtok (NULL, " \t");
396            }
397
398          break;
399        }
400
401      g_free (tmp);
402      state = (state + 1) % 3;
403
404      if (state == 0)
405        {
406          process (start_line, utf8, status, (gunichar *)ucs4->data, ucs4->len);
407          g_array_set_size (ucs4, 0);
408          g_free (utf8);
409        }
410     
411    next_line:
412      p = end;
413      if (*p && *p == '\r')
414        p++;
415      if (*p && *p == '\n')
416        p++;
417     
418      line++;
419    }
420
421  return 0;
422}
Note: See TracBrowser for help on using the repository browser.