1 | #undef G_DISABLE_ASSERT |
---|
2 | #undef G_LOG_DOMAIN |
---|
3 | |
---|
4 | #include <stdarg.h> |
---|
5 | #include <stdio.h> |
---|
6 | #include <stdlib.h> |
---|
7 | #include <string.h> |
---|
8 | #include <glib.h> |
---|
9 | |
---|
/* Accumulated test result: fail() ORs 1 into this each time a check fails. */
static gint exit_status = 0;
---|
11 | |
---|
/*
 * croak:
 * @format: printf-style format string, followed by its arguments
 *
 * Prints the formatted message to stderr and terminates the program
 * with exit code 1.  Does not return.
 */
static void
croak (char *format, ...)
{
  va_list args;

  va_start (args, format);
  vfprintf (stderr, format, args);
  va_end (args);

  exit (1);
}
---|
23 | |
---|
24 | static void |
---|
25 | fail (char *format, ...) |
---|
26 | { |
---|
27 | va_list va; |
---|
28 | |
---|
29 | va_start (va, format); |
---|
30 | vfprintf (stderr, format, va); |
---|
31 | va_end (va); |
---|
32 | |
---|
33 | exit_status |= 1; |
---|
34 | } |
---|
35 | |
---|
/*
 * Status:
 *
 * Expected classification of one test string, as named in the test
 * data file and interpreted by process().
 */
typedef enum
{
  VALID,       /* must validate and round-trip through UCS-4 and UTF-16 */
  INCOMPLETE,  /* must fail validation; conversion must report partial input */
  NOTUNICODE,  /* must fail validation, but still round-trips through UCS-4 */
  OVERLONG,    /* must fail validation; no conversion checks are run */
  MALFORMED    /* must fail validation; no conversion checks are run */
} Status;
---|
44 | |
---|
45 | static gboolean |
---|
46 | ucs4_equal (gunichar *a, gunichar *b) |
---|
47 | { |
---|
48 | while (*a && *b && (*a == *b)) |
---|
49 | { |
---|
50 | a++; |
---|
51 | b++; |
---|
52 | } |
---|
53 | |
---|
54 | return (*a == *b); |
---|
55 | } |
---|
56 | |
---|
57 | static gboolean |
---|
58 | utf16_equal (gunichar2 *a, gunichar2 *b) |
---|
59 | { |
---|
60 | while (*a && *b && (*a == *b)) |
---|
61 | { |
---|
62 | a++; |
---|
63 | b++; |
---|
64 | } |
---|
65 | |
---|
66 | return (*a == *b); |
---|
67 | } |
---|
68 | |
---|
69 | static gint |
---|
70 | utf16_count (gunichar2 *a) |
---|
71 | { |
---|
72 | gint result = 0; |
---|
73 | |
---|
74 | while (a[result]) |
---|
75 | result++; |
---|
76 | |
---|
77 | return result; |
---|
78 | } |
---|
79 | |
---|
80 | static void |
---|
81 | process (gint line, |
---|
82 | gchar *utf8, |
---|
83 | Status status, |
---|
84 | gunichar *ucs4, |
---|
85 | gint ucs4_len) |
---|
86 | { |
---|
87 | const gchar *end; |
---|
88 | gboolean is_valid = g_utf8_validate (utf8, -1, &end); |
---|
89 | GError *error = NULL; |
---|
90 | glong items_read, items_written; |
---|
91 | |
---|
92 | switch (status) |
---|
93 | { |
---|
94 | case VALID: |
---|
95 | if (!is_valid) |
---|
96 | { |
---|
97 | fail ("line %d: valid but g_utf8_validate returned FALSE\n", line); |
---|
98 | return; |
---|
99 | } |
---|
100 | break; |
---|
101 | case NOTUNICODE: |
---|
102 | case INCOMPLETE: |
---|
103 | case OVERLONG: |
---|
104 | case MALFORMED: |
---|
105 | if (is_valid) |
---|
106 | { |
---|
107 | fail ("line %d: invalid but g_utf8_validate returned TRUE\n", line); |
---|
108 | return; |
---|
109 | } |
---|
110 | break; |
---|
111 | } |
---|
112 | |
---|
113 | if (status == INCOMPLETE) |
---|
114 | { |
---|
115 | gunichar *ucs4_result; |
---|
116 | |
---|
117 | ucs4_result = g_utf8_to_ucs4 (utf8, -1, NULL, NULL, &error); |
---|
118 | |
---|
119 | if (!error || !g_error_matches (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT)) |
---|
120 | { |
---|
121 | fail ("line %d: incomplete input not properly detected\n", line); |
---|
122 | return; |
---|
123 | } |
---|
124 | g_clear_error (&error); |
---|
125 | |
---|
126 | ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, NULL, &error); |
---|
127 | |
---|
128 | if (!ucs4_result || items_read == strlen (utf8)) |
---|
129 | { |
---|
130 | fail ("line %d: incomplete input not properly detected\n", line); |
---|
131 | return; |
---|
132 | } |
---|
133 | |
---|
134 | g_free (ucs4_result); |
---|
135 | } |
---|
136 | |
---|
137 | if (status == VALID || status == NOTUNICODE) |
---|
138 | { |
---|
139 | gunichar *ucs4_result; |
---|
140 | gchar *utf8_result; |
---|
141 | |
---|
142 | ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, &items_written, &error); |
---|
143 | if (!ucs4_result) |
---|
144 | { |
---|
145 | fail ("line %d: conversion to ucs4 failed: %s\n", line, error->message); |
---|
146 | return; |
---|
147 | } |
---|
148 | |
---|
149 | if (!ucs4_equal (ucs4_result, ucs4) || |
---|
150 | items_read != strlen (utf8) || |
---|
151 | items_written != ucs4_len) |
---|
152 | { |
---|
153 | fail ("line %d: results of conversion to ucs4 do not match expected.\n", line); |
---|
154 | return; |
---|
155 | } |
---|
156 | |
---|
157 | g_free (ucs4_result); |
---|
158 | |
---|
159 | ucs4_result = g_utf8_to_ucs4_fast (utf8, -1, &items_written); |
---|
160 | |
---|
161 | if (!ucs4_equal (ucs4_result, ucs4) || |
---|
162 | items_written != ucs4_len) |
---|
163 | { |
---|
164 | fail ("line %d: results of conversion to ucs4 do not match expected.\n", line); |
---|
165 | return; |
---|
166 | } |
---|
167 | |
---|
168 | utf8_result = g_ucs4_to_utf8 (ucs4_result, -1, &items_read, &items_written, &error); |
---|
169 | if (!utf8_result) |
---|
170 | { |
---|
171 | fail ("line %d: conversion back to utf8 failed: %s", line, error->message); |
---|
172 | return; |
---|
173 | } |
---|
174 | |
---|
175 | if (strcmp (utf8_result, utf8) != 0 || |
---|
176 | items_read != ucs4_len || |
---|
177 | items_written != strlen (utf8)) |
---|
178 | { |
---|
179 | fail ("line %d: conversion back to utf8 did not match original\n", line); |
---|
180 | return; |
---|
181 | } |
---|
182 | |
---|
183 | g_free (utf8_result); |
---|
184 | g_free (ucs4_result); |
---|
185 | } |
---|
186 | |
---|
187 | if (status == VALID) |
---|
188 | { |
---|
189 | gunichar2 *utf16_expected_tmp; |
---|
190 | gunichar2 *utf16_expected; |
---|
191 | gunichar2 *utf16_from_utf8; |
---|
192 | gunichar2 *utf16_from_ucs4; |
---|
193 | gunichar *ucs4_result; |
---|
194 | gsize bytes_written; |
---|
195 | gint n_chars; |
---|
196 | gchar *utf8_result; |
---|
197 | |
---|
198 | #ifdef G_OS_WIN32 |
---|
199 | #define TARGET "UTF-16LE" |
---|
200 | #else |
---|
201 | #define TARGET "UTF-16" |
---|
202 | #endif |
---|
203 | |
---|
204 | if (!(utf16_expected_tmp = (gunichar2 *)g_convert (utf8, -1, TARGET, "UTF-8", |
---|
205 | NULL, &bytes_written, NULL))) |
---|
206 | { |
---|
207 | fail ("line %d: could not convert to UTF-16 via g_convert\n", line); |
---|
208 | return; |
---|
209 | } |
---|
210 | |
---|
211 | /* zero-terminate and remove BOM |
---|
212 | */ |
---|
213 | n_chars = bytes_written / 2; |
---|
214 | if (utf16_expected_tmp[0] == 0xfeff) /* BOM */ |
---|
215 | { |
---|
216 | n_chars--; |
---|
217 | utf16_expected = g_new (gunichar2, n_chars + 1); |
---|
218 | memcpy (utf16_expected, utf16_expected_tmp + 1, sizeof(gunichar2) * n_chars); |
---|
219 | } |
---|
220 | else if (utf16_expected_tmp[0] == 0xfffe) /* ANTI-BOM */ |
---|
221 | { |
---|
222 | fail ("line %d: conversion via iconv to \"UTF-16\" is not native-endian\n", line); |
---|
223 | return; |
---|
224 | } |
---|
225 | else |
---|
226 | { |
---|
227 | utf16_expected = g_new (gunichar2, n_chars + 1); |
---|
228 | memcpy (utf16_expected, utf16_expected_tmp, sizeof(gunichar2) * n_chars); |
---|
229 | } |
---|
230 | |
---|
231 | utf16_expected[n_chars] = '\0'; |
---|
232 | |
---|
233 | if (!(utf16_from_utf8 = g_utf8_to_utf16 (utf8, -1, &items_read, &items_written, &error))) |
---|
234 | { |
---|
235 | fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message); |
---|
236 | return; |
---|
237 | } |
---|
238 | |
---|
239 | if (items_read != strlen (utf8) || |
---|
240 | utf16_count (utf16_from_utf8) != items_written) |
---|
241 | { |
---|
242 | fail ("line %d: length error in conversion to ucs16\n", line); |
---|
243 | return; |
---|
244 | } |
---|
245 | |
---|
246 | if (!(utf16_from_ucs4 = g_ucs4_to_utf16 (ucs4, -1, &items_read, &items_written, &error))) |
---|
247 | { |
---|
248 | fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message); |
---|
249 | return; |
---|
250 | } |
---|
251 | |
---|
252 | if (items_read != ucs4_len || |
---|
253 | utf16_count (utf16_from_ucs4) != items_written) |
---|
254 | { |
---|
255 | fail ("line %d: length error in conversion to ucs16\n", line); |
---|
256 | return; |
---|
257 | } |
---|
258 | |
---|
259 | if (!utf16_equal (utf16_from_utf8, utf16_expected) || |
---|
260 | !utf16_equal (utf16_from_ucs4, utf16_expected)) |
---|
261 | { |
---|
262 | fail ("line %d: results of conversion to ucs16 do not match\n", line); |
---|
263 | return; |
---|
264 | } |
---|
265 | |
---|
266 | if (!(utf8_result = g_utf16_to_utf8 (utf16_from_utf8, -1, &items_read, &items_written, &error))) |
---|
267 | { |
---|
268 | fail ("line %d: conversion back to utf8 failed: %s\n", line, error->message); |
---|
269 | return; |
---|
270 | } |
---|
271 | |
---|
272 | if (items_read != utf16_count (utf16_from_utf8) || |
---|
273 | items_written != strlen (utf8)) |
---|
274 | { |
---|
275 | fail ("line %d: length error in conversion from ucs16 to utf8\n", line); |
---|
276 | return; |
---|
277 | } |
---|
278 | |
---|
279 | if (!(ucs4_result = g_utf16_to_ucs4 (utf16_from_ucs4, -1, &items_read, &items_written, &error))) |
---|
280 | { |
---|
281 | fail ("line %d: conversion back to utf8/ucs4 failed\n", line); |
---|
282 | return; |
---|
283 | } |
---|
284 | |
---|
285 | if (items_read != utf16_count (utf16_from_utf8) || |
---|
286 | items_written != ucs4_len) |
---|
287 | { |
---|
288 | fail ("line %d: length error in conversion from ucs16 to ucs4\n", line); |
---|
289 | return; |
---|
290 | } |
---|
291 | |
---|
292 | if (strcmp (utf8, utf8_result) != 0 || |
---|
293 | !ucs4_equal (ucs4, ucs4_result)) |
---|
294 | { |
---|
295 | fail ("line %d: conversion back to utf8/ucs4 did not match original\n", line); |
---|
296 | return; |
---|
297 | } |
---|
298 | |
---|
299 | g_free (utf16_expected_tmp); |
---|
300 | g_free (utf16_expected); |
---|
301 | g_free (utf16_from_utf8); |
---|
302 | g_free (utf16_from_ucs4); |
---|
303 | g_free (utf8_result); |
---|
304 | g_free (ucs4_result); |
---|
305 | } |
---|
306 | } |
---|
307 | |
---|
308 | int |
---|
309 | main (int argc, char **argv) |
---|
310 | { |
---|
311 | gchar *srcdir = getenv ("srcdir"); |
---|
312 | gchar *testfile; |
---|
313 | gchar *contents; |
---|
314 | GError *error = NULL; |
---|
315 | gchar *p, *end; |
---|
316 | char *tmp; |
---|
317 | gint state = 0; |
---|
318 | gint line = 1; |
---|
319 | gint start_line = 0; /* Quiet GCC */ |
---|
320 | gchar *utf8 = NULL; /* Quiet GCC */ |
---|
321 | GArray *ucs4; |
---|
322 | Status status = VALID; /* Quiet GCC */ |
---|
323 | |
---|
324 | if (!srcdir) |
---|
325 | srcdir = "."; |
---|
326 | |
---|
327 | testfile = g_strconcat (srcdir, G_DIR_SEPARATOR_S "utf8.txt", NULL); |
---|
328 | |
---|
329 | g_file_get_contents (testfile, &contents, NULL, &error); |
---|
330 | if (error) |
---|
331 | croak ("Cannot open utf8.txt: %s", error->message); |
---|
332 | |
---|
333 | ucs4 = g_array_new (TRUE, FALSE, sizeof(gunichar)); |
---|
334 | |
---|
335 | p = contents; |
---|
336 | |
---|
337 | /* Loop over lines */ |
---|
338 | while (*p) |
---|
339 | { |
---|
340 | while (*p && (*p == ' ' || *p == '\t')) |
---|
341 | p++; |
---|
342 | |
---|
343 | end = p; |
---|
344 | while (*end && (*end != '\r' && *end != '\n')) |
---|
345 | end++; |
---|
346 | |
---|
347 | if (!*p || *p == '#' || *p == '\r' || *p == '\n') |
---|
348 | goto next_line; |
---|
349 | |
---|
350 | tmp = g_strstrip (g_strndup (p, end - p)); |
---|
351 | |
---|
352 | switch (state) |
---|
353 | { |
---|
354 | case 0: |
---|
355 | /* UTF-8 string */ |
---|
356 | start_line = line; |
---|
357 | utf8 = tmp; |
---|
358 | tmp = NULL; |
---|
359 | break; |
---|
360 | |
---|
361 | case 1: |
---|
362 | /* Status */ |
---|
363 | if (!strcmp (tmp, "VALID")) |
---|
364 | status = VALID; |
---|
365 | else if (!strcmp (tmp, "INCOMPLETE")) |
---|
366 | status = INCOMPLETE; |
---|
367 | else if (!strcmp (tmp, "NOTUNICODE")) |
---|
368 | status = NOTUNICODE; |
---|
369 | else if (!strcmp (tmp, "OVERLONG")) |
---|
370 | status = OVERLONG; |
---|
371 | else if (!strcmp (tmp, "MALFORMED")) |
---|
372 | status = MALFORMED; |
---|
373 | else |
---|
374 | croak ("Invalid status on line %d\n", line); |
---|
375 | |
---|
376 | if (status != VALID && status != NOTUNICODE) |
---|
377 | state++; /* No UCS-4 data */ |
---|
378 | |
---|
379 | break; |
---|
380 | |
---|
381 | case 2: |
---|
382 | /* UCS-4 version */ |
---|
383 | |
---|
384 | p = strtok (tmp, " \t"); |
---|
385 | while (p) |
---|
386 | { |
---|
387 | gchar *endptr; |
---|
388 | |
---|
389 | gunichar ch = strtoul (p, &endptr, 16); |
---|
390 | if (*endptr != '\0') |
---|
391 | croak ("Invalid UCS-4 character on line %d\n", line); |
---|
392 | |
---|
393 | g_array_append_val (ucs4, ch); |
---|
394 | |
---|
395 | p = strtok (NULL, " \t"); |
---|
396 | } |
---|
397 | |
---|
398 | break; |
---|
399 | } |
---|
400 | |
---|
401 | g_free (tmp); |
---|
402 | state = (state + 1) % 3; |
---|
403 | |
---|
404 | if (state == 0) |
---|
405 | { |
---|
406 | process (start_line, utf8, status, (gunichar *)ucs4->data, ucs4->len); |
---|
407 | g_array_set_size (ucs4, 0); |
---|
408 | g_free (utf8); |
---|
409 | } |
---|
410 | |
---|
411 | next_line: |
---|
412 | p = end; |
---|
413 | if (*p && *p == '\r') |
---|
414 | p++; |
---|
415 | if (*p && *p == '\n') |
---|
416 | p++; |
---|
417 | |
---|
418 | line++; |
---|
419 | } |
---|
420 | |
---|
421 | return 0; |
---|
422 | } |
---|