1 | /* guniprop.c - Unicode character properties. |
---|
2 | * |
---|
3 | * Copyright (C) 1999 Tom Tromey |
---|
4 | * Copyright (C) 2000 Red Hat, Inc. |
---|
5 | * |
---|
6 | * This library is free software; you can redistribute it and/or |
---|
7 | * modify it under the terms of the GNU Lesser General Public |
---|
8 | * License as published by the Free Software Foundation; either |
---|
9 | * version 2 of the License, or (at your option) any later version. |
---|
10 | * |
---|
11 | * This library is distributed in the hope that it will be useful, |
---|
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
---|
14 | * Lesser General Public License for more details. |
---|
15 | * |
---|
16 | * You should have received a copy of the GNU Lesser General Public |
---|
17 | * License along with this library; if not, write to the |
---|
18 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
---|
19 | * Boston, MA 02111-1307, USA. |
---|
20 | */ |
---|
21 | |
---|
22 | #include "config.h" |
---|
23 | |
---|
24 | #include <stddef.h> |
---|
25 | #include <string.h> |
---|
26 | #include <locale.h> |
---|
27 | |
---|
28 | #include "glib.h" |
---|
29 | #include "gunichartables.h" |
---|
30 | #include "gunicodeprivate.h" |
---|
31 | |
---|
/* Index into attr_data for a 256-character page.  Pages up to
 * G_UNICODE_LAST_PAGE_PART1 are looked up in attr_table_part1; any
 * other page is looked up in attr_table_part2, relative to page 0xe00.
 */
#define ATTR_TABLE(Page) \
  (((Page) > G_UNICODE_LAST_PAGE_PART1) \
   ? attr_table_part2[(Page) - 0xe00] \
   : attr_table_part1[Page])

/* Attribute value of character Char within page Page; yields 0 when
 * the page's index is G_UNICODE_MAX_TABLE_INDEX.
 */
#define ATTTABLE(Page, Char) \
  ((ATTR_TABLE (Page) != G_UNICODE_MAX_TABLE_INDEX) \
   ? (attr_data[ATTR_TABLE (Page)][Char]) \
   : 0)

/* Type lookup in the first table.  An entry at or above
 * G_UNICODE_MAX_TABLE_INDEX encodes the type directly (entry minus
 * G_UNICODE_MAX_TABLE_INDEX); a smaller entry selects a row of type_data.
 */
#define TTYPE_PART1(Page, Char) \
  ((type_table_part1[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_data[type_table_part1[Page]][Char]) \
   : (type_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Same as TTYPE_PART1, but for the second table. */
#define TTYPE_PART2(Page, Char) \
  ((type_table_part2[Page] < G_UNICODE_MAX_TABLE_INDEX) \
   ? (type_data[type_table_part2[Page]][Char]) \
   : (type_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX))

/* Type of a character: part 1 covers everything up to
 * G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000 through
 * G_UNICODE_LAST_CHAR, and anything else is reported as unassigned.
 */
#define TYPE(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? TTYPE_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? TTYPE_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : G_UNICODE_UNASSIGNED))


/* Any of the three number types (decimal, letter, other). */
#define ISDIGIT(Type) \
  ((Type) == G_UNICODE_DECIMAL_NUMBER || \
   (Type) == G_UNICODE_LETTER_NUMBER || \
   (Type) == G_UNICODE_OTHER_NUMBER)

/* Any of the five letter types. */
#define ISALPHA(Type) \
  ((Type) == G_UNICODE_LOWERCASE_LETTER || \
   (Type) == G_UNICODE_UPPERCASE_LETTER || \
   (Type) == G_UNICODE_TITLECASE_LETTER || \
   (Type) == G_UNICODE_MODIFIER_LETTER || \
   (Type) == G_UNICODE_OTHER_LETTER)

/* Any of the three mark types. */
#define ISMARK(Type) \
  ((Type) == G_UNICODE_NON_SPACING_MARK || \
   (Type) == G_UNICODE_COMBINING_MARK || \
   (Type) == G_UNICODE_ENCLOSING_MARK)
---|
70 | |
---|
71 | |
---|
72 | /** |
---|
73 | * g_unichar_isalnum: |
---|
74 | * @c: a Unicode character |
---|
75 | * |
---|
76 | * Determines whether a character is alphanumeric. |
---|
77 | * Given some UTF-8 text, obtain a character value |
---|
78 | * with g_utf8_get_char(). |
---|
79 | * |
---|
80 | * Return value: %TRUE if @c is an alphanumeric character |
---|
81 | **/ |
---|
82 | gboolean |
---|
83 | g_unichar_isalnum (gunichar c) |
---|
84 | { |
---|
85 | int t = TYPE (c); |
---|
86 | return ISDIGIT (t) || ISALPHA (t); |
---|
87 | } |
---|
88 | |
---|
89 | /** |
---|
90 | * g_unichar_isalpha: |
---|
91 | * @c: a Unicode character |
---|
92 | * |
---|
93 | * Determines whether a character is alphabetic (i.e. a letter). |
---|
94 | * Given some UTF-8 text, obtain a character value with |
---|
95 | * g_utf8_get_char(). |
---|
96 | * |
---|
97 | * Return value: %TRUE if @c is an alphabetic character |
---|
98 | **/ |
---|
99 | gboolean |
---|
100 | g_unichar_isalpha (gunichar c) |
---|
101 | { |
---|
102 | int t = TYPE (c); |
---|
103 | return ISALPHA (t); |
---|
104 | } |
---|
105 | |
---|
106 | |
---|
107 | /** |
---|
108 | * g_unichar_iscntrl: |
---|
109 | * @c: a Unicode character |
---|
110 | * |
---|
111 | * Determines whether a character is a control character. |
---|
112 | * Given some UTF-8 text, obtain a character value with |
---|
113 | * g_utf8_get_char(). |
---|
114 | * |
---|
115 | * Return value: %TRUE if @c is a control character |
---|
116 | **/ |
---|
117 | gboolean |
---|
118 | g_unichar_iscntrl (gunichar c) |
---|
119 | { |
---|
120 | return TYPE (c) == G_UNICODE_CONTROL; |
---|
121 | } |
---|
122 | |
---|
123 | /** |
---|
124 | * g_unichar_isdigit: |
---|
125 | * @c: a Unicode character |
---|
126 | * |
---|
127 | * Determines whether a character is numeric (i.e. a digit). This |
---|
128 | * covers ASCII 0-9 and also digits in other languages/scripts. Given |
---|
129 | * some UTF-8 text, obtain a character value with g_utf8_get_char(). |
---|
130 | * |
---|
131 | * Return value: %TRUE if @c is a digit |
---|
132 | **/ |
---|
133 | gboolean |
---|
134 | g_unichar_isdigit (gunichar c) |
---|
135 | { |
---|
136 | return TYPE (c) == G_UNICODE_DECIMAL_NUMBER; |
---|
137 | } |
---|
138 | |
---|
139 | |
---|
140 | /** |
---|
141 | * g_unichar_isgraph: |
---|
142 | * @c: a Unicode character |
---|
143 | * |
---|
144 | * Determines whether a character is printable and not a space |
---|
145 | * (returns %FALSE for control characters, format characters, and |
---|
146 | * spaces). g_unichar_isprint() is similar, but returns %TRUE for |
---|
147 | * spaces. Given some UTF-8 text, obtain a character value with |
---|
148 | * g_utf8_get_char(). |
---|
149 | * |
---|
150 | * Return value: %TRUE if @c is printable unless it's a space |
---|
151 | **/ |
---|
152 | gboolean |
---|
153 | g_unichar_isgraph (gunichar c) |
---|
154 | { |
---|
155 | int t = TYPE (c); |
---|
156 | return (t != G_UNICODE_CONTROL |
---|
157 | && t != G_UNICODE_FORMAT |
---|
158 | && t != G_UNICODE_UNASSIGNED |
---|
159 | && t != G_UNICODE_PRIVATE_USE |
---|
160 | && t != G_UNICODE_SURROGATE |
---|
161 | && t != G_UNICODE_SPACE_SEPARATOR); |
---|
162 | } |
---|
163 | |
---|
164 | /** |
---|
165 | * g_unichar_islower: |
---|
166 | * @c: a Unicode character |
---|
167 | * |
---|
168 | * Determines whether a character is a lowercase letter. |
---|
169 | * Given some UTF-8 text, obtain a character value with |
---|
170 | * g_utf8_get_char(). |
---|
171 | * |
---|
172 | * Return value: %TRUE if @c is a lowercase letter |
---|
173 | **/ |
---|
174 | gboolean |
---|
175 | g_unichar_islower (gunichar c) |
---|
176 | { |
---|
177 | return TYPE (c) == G_UNICODE_LOWERCASE_LETTER; |
---|
178 | } |
---|
179 | |
---|
180 | |
---|
181 | /** |
---|
182 | * g_unichar_isprint: |
---|
183 | * @c: a Unicode character |
---|
184 | * |
---|
185 | * Determines whether a character is printable. |
---|
186 | * Unlike g_unichar_isgraph(), returns %TRUE for spaces. |
---|
187 | * Given some UTF-8 text, obtain a character value with |
---|
188 | * g_utf8_get_char(). |
---|
189 | * |
---|
190 | * Return value: %TRUE if @c is printable |
---|
191 | **/ |
---|
192 | gboolean |
---|
193 | g_unichar_isprint (gunichar c) |
---|
194 | { |
---|
195 | int t = TYPE (c); |
---|
196 | return (t != G_UNICODE_CONTROL |
---|
197 | && t != G_UNICODE_FORMAT |
---|
198 | && t != G_UNICODE_UNASSIGNED |
---|
199 | && t != G_UNICODE_PRIVATE_USE |
---|
200 | && t != G_UNICODE_SURROGATE); |
---|
201 | } |
---|
202 | |
---|
203 | /** |
---|
204 | * g_unichar_ispunct: |
---|
205 | * @c: a Unicode character |
---|
206 | * |
---|
207 | * Determines whether a character is punctuation or a symbol. |
---|
208 | * Given some UTF-8 text, obtain a character value with |
---|
209 | * g_utf8_get_char(). |
---|
210 | * |
---|
211 | * Return value: %TRUE if @c is a punctuation or symbol character |
---|
212 | **/ |
---|
213 | gboolean |
---|
214 | g_unichar_ispunct (gunichar c) |
---|
215 | { |
---|
216 | int t = TYPE (c); |
---|
217 | return (t == G_UNICODE_CONNECT_PUNCTUATION || t == G_UNICODE_DASH_PUNCTUATION |
---|
218 | || t == G_UNICODE_CLOSE_PUNCTUATION || t == G_UNICODE_FINAL_PUNCTUATION |
---|
219 | || t == G_UNICODE_INITIAL_PUNCTUATION || t == G_UNICODE_OTHER_PUNCTUATION |
---|
220 | || t == G_UNICODE_OPEN_PUNCTUATION || t == G_UNICODE_CURRENCY_SYMBOL |
---|
221 | || t == G_UNICODE_MODIFIER_SYMBOL || t == G_UNICODE_MATH_SYMBOL |
---|
222 | || t == G_UNICODE_OTHER_SYMBOL); |
---|
223 | } |
---|
224 | |
---|
225 | /** |
---|
226 | * g_unichar_isspace: |
---|
227 | * @c: a Unicode character |
---|
228 | * |
---|
229 | * Determines whether a character is a space, tab, or line separator |
---|
230 | * (newline, carriage return, etc.). Given some UTF-8 text, obtain a |
---|
231 | * character value with g_utf8_get_char(). |
---|
232 | * |
---|
233 | * (Note: don't use this to do word breaking; you have to use |
---|
234 | * Pango or equivalent to get word breaking right, the algorithm |
---|
235 | * is fairly complex.) |
---|
236 | * |
---|
237 | * Return value: %TRUE if @c is a punctuation character |
---|
238 | **/ |
---|
239 | gboolean |
---|
240 | g_unichar_isspace (gunichar c) |
---|
241 | { |
---|
242 | switch (c) |
---|
243 | { |
---|
244 | /* special-case these since Unicode thinks they are not spaces */ |
---|
245 | case '\t': |
---|
246 | case '\n': |
---|
247 | case '\r': |
---|
248 | case '\f': |
---|
249 | return TRUE; |
---|
250 | break; |
---|
251 | |
---|
252 | default: |
---|
253 | { |
---|
254 | int t = TYPE (c); |
---|
255 | return (t == G_UNICODE_SPACE_SEPARATOR || t == G_UNICODE_LINE_SEPARATOR |
---|
256 | || t == G_UNICODE_PARAGRAPH_SEPARATOR); |
---|
257 | } |
---|
258 | break; |
---|
259 | } |
---|
260 | } |
---|
261 | |
---|
262 | /** |
---|
263 | * g_unichar_isupper: |
---|
264 | * @c: a Unicode character |
---|
265 | * |
---|
266 | * Determines if a character is uppercase. |
---|
267 | * |
---|
268 | * Return value: %TRUE if @c is an uppercase character |
---|
269 | **/ |
---|
270 | gboolean |
---|
271 | g_unichar_isupper (gunichar c) |
---|
272 | { |
---|
273 | return TYPE (c) == G_UNICODE_UPPERCASE_LETTER; |
---|
274 | } |
---|
275 | |
---|
276 | /** |
---|
277 | * g_unichar_istitle: |
---|
278 | * @c: a Unicode character |
---|
279 | * |
---|
280 | * Determines if a character is titlecase. Some characters in |
---|
281 | * Unicode which are composites, such as the DZ digraph |
---|
282 | * have three case variants instead of just two. The titlecase |
---|
283 | * form is used at the beginning of a word where only the |
---|
284 | * first letter is capitalized. The titlecase form of the DZ |
---|
285 | * digraph is U+01F2 LATIN CAPITAL LETTTER D WITH SMALL LETTER Z. |
---|
286 | * |
---|
287 | * Return value: %TRUE if the character is titlecase |
---|
288 | **/ |
---|
289 | gboolean |
---|
290 | g_unichar_istitle (gunichar c) |
---|
291 | { |
---|
292 | unsigned int i; |
---|
293 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
---|
294 | if (title_table[i][0] == c) |
---|
295 | return 1; |
---|
296 | return 0; |
---|
297 | } |
---|
298 | |
---|
299 | /** |
---|
300 | * g_unichar_isxdigit: |
---|
301 | * @c: a Unicode character. |
---|
302 | * |
---|
303 | * Determines if a character is a hexidecimal digit. |
---|
304 | * |
---|
305 | * Return value: %TRUE if the character is a hexadecimal digit |
---|
306 | **/ |
---|
307 | gboolean |
---|
308 | g_unichar_isxdigit (gunichar c) |
---|
309 | { |
---|
310 | int t = TYPE (c); |
---|
311 | return ((c >= 'a' && c <= 'f') |
---|
312 | || (c >= 'A' && c <= 'F') |
---|
313 | || ISDIGIT (t)); |
---|
314 | } |
---|
315 | |
---|
316 | /** |
---|
317 | * g_unichar_isdefined: |
---|
318 | * @c: a Unicode character |
---|
319 | * |
---|
320 | * Determines if a given character is assigned in the Unicode |
---|
321 | * standard. |
---|
322 | * |
---|
323 | * Return value: %TRUE if the character has an assigned value |
---|
324 | **/ |
---|
325 | gboolean |
---|
326 | g_unichar_isdefined (gunichar c) |
---|
327 | { |
---|
328 | int t = TYPE (c); |
---|
329 | return t != G_UNICODE_UNASSIGNED; |
---|
330 | } |
---|
331 | |
---|
332 | /** |
---|
333 | * g_unichar_iswide: |
---|
334 | * @c: a Unicode character |
---|
335 | * |
---|
336 | * Determines if a character is typically rendered in a double-width |
---|
337 | * cell. |
---|
338 | * |
---|
339 | * Return value: %TRUE if the character is wide |
---|
340 | **/ |
---|
341 | /* This function stolen from Markus Kuhn <Markus.Kuhn@cl.cam.ac.uk>. */ |
---|
342 | gboolean |
---|
343 | g_unichar_iswide (gunichar c) |
---|
344 | { |
---|
345 | if (c < 0x1100) |
---|
346 | return FALSE; |
---|
347 | |
---|
348 | return (c <= 0x115f /* Hangul Jamo init. consonants */ |
---|
349 | || c == 0x2329 || c == 0x232a /* angle brackets */ |
---|
350 | || (c >= 0x2e80 && c <= 0xa4cf && (c < 0x302a || c > 0x302f) |
---|
351 | && c != 0x303f && c != 0x3099 && c!= 0x309a) /* CJK ... Yi */ |
---|
352 | || (c >= 0xac00 && c <= 0xd7a3) /* Hangul Syllables */ |
---|
353 | || (c >= 0xf900 && c <= 0xfaff) /* CJK Compatibility Ideographs */ |
---|
354 | || (c >= 0xfe30 && c <= 0xfe6f) /* CJK Compatibility Forms */ |
---|
355 | || (c >= 0xff00 && c <= 0xff60) /* Fullwidth Forms */ |
---|
356 | || (c >= 0xffe0 && c <= 0xffe6) /* Fullwidth Forms */ |
---|
357 | || (c >= 0x20000 && c <= 0x2fffd) /* CJK extra stuff */ |
---|
358 | || (c >= 0x30000 && c <= 0x3fffd)); |
---|
359 | } |
---|
360 | |
---|
361 | /** |
---|
362 | * g_unichar_toupper: |
---|
363 | * @c: a Unicode character |
---|
364 | * |
---|
365 | * Converts a character to uppercase. |
---|
366 | * |
---|
367 | * Return value: the result of converting @c to uppercase. |
---|
368 | * If @c is not an lowercase or titlecase character, |
---|
369 | * or has no upper case equivalent @c is returned unchanged. |
---|
370 | **/ |
---|
371 | gunichar |
---|
372 | g_unichar_toupper (gunichar c) |
---|
373 | { |
---|
374 | int t = TYPE (c); |
---|
375 | if (t == G_UNICODE_LOWERCASE_LETTER) |
---|
376 | { |
---|
377 | gunichar val = ATTTABLE (c >> 8, c & 0xff); |
---|
378 | if (val >= 0x1000000) |
---|
379 | { |
---|
380 | const gchar *p = special_case_table + val - 0x1000000; |
---|
381 | return g_utf8_get_char (p); |
---|
382 | } |
---|
383 | else |
---|
384 | return val ? val : c; |
---|
385 | } |
---|
386 | else if (t == G_UNICODE_TITLECASE_LETTER) |
---|
387 | { |
---|
388 | unsigned int i; |
---|
389 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
---|
390 | { |
---|
391 | if (title_table[i][0] == c) |
---|
392 | return title_table[i][1]; |
---|
393 | } |
---|
394 | } |
---|
395 | return c; |
---|
396 | } |
---|
397 | |
---|
398 | /** |
---|
399 | * g_unichar_tolower: |
---|
400 | * @c: a Unicode character. |
---|
401 | * |
---|
402 | * Converts a character to lower case. |
---|
403 | * |
---|
404 | * Return value: the result of converting @c to lower case. |
---|
405 | * If @c is not an upperlower or titlecase character, |
---|
406 | * or has no lowercase equivalent @c is returned unchanged. |
---|
407 | **/ |
---|
408 | gunichar |
---|
409 | g_unichar_tolower (gunichar c) |
---|
410 | { |
---|
411 | int t = TYPE (c); |
---|
412 | if (t == G_UNICODE_UPPERCASE_LETTER) |
---|
413 | { |
---|
414 | gunichar val = ATTTABLE (c >> 8, c & 0xff); |
---|
415 | if (val >= 0x1000000) |
---|
416 | { |
---|
417 | const gchar *p = special_case_table + val - 0x1000000; |
---|
418 | return g_utf8_get_char (p); |
---|
419 | } |
---|
420 | else |
---|
421 | return val ? val : c; |
---|
422 | } |
---|
423 | else if (t == G_UNICODE_TITLECASE_LETTER) |
---|
424 | { |
---|
425 | unsigned int i; |
---|
426 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
---|
427 | { |
---|
428 | if (title_table[i][0] == c) |
---|
429 | return title_table[i][2]; |
---|
430 | } |
---|
431 | } |
---|
432 | return c; |
---|
433 | } |
---|
434 | |
---|
435 | /** |
---|
436 | * g_unichar_totitle: |
---|
437 | * @c: a Unicode character |
---|
438 | * |
---|
439 | * Converts a character to the titlecase. |
---|
440 | * |
---|
441 | * Return value: the result of converting @c to titlecase. |
---|
442 | * If @c is not an uppercase or lowercase character, |
---|
443 | * @c is returned unchanged. |
---|
444 | **/ |
---|
445 | gunichar |
---|
446 | g_unichar_totitle (gunichar c) |
---|
447 | { |
---|
448 | unsigned int i; |
---|
449 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
---|
450 | { |
---|
451 | if (title_table[i][0] == c || title_table[i][1] == c |
---|
452 | || title_table[i][2] == c) |
---|
453 | return title_table[i][0]; |
---|
454 | } |
---|
455 | return (TYPE (c) == G_UNICODE_LOWERCASE_LETTER |
---|
456 | ? ATTTABLE (c >> 8, c & 0xff) |
---|
457 | : c); |
---|
458 | } |
---|
459 | |
---|
460 | /** |
---|
461 | * g_unichar_digit_value: |
---|
462 | * @c: a Unicode character |
---|
463 | * |
---|
464 | * Determines the numeric value of a character as a decimal |
---|
465 | * digit. |
---|
466 | * |
---|
467 | * Return value: If @c is a decimal digit (according to |
---|
468 | * g_unichar_isdigit()), its numeric value. Otherwise, -1. |
---|
469 | **/ |
---|
470 | int |
---|
471 | g_unichar_digit_value (gunichar c) |
---|
472 | { |
---|
473 | if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) |
---|
474 | return ATTTABLE (c >> 8, c & 0xff); |
---|
475 | return -1; |
---|
476 | } |
---|
477 | |
---|
478 | /** |
---|
479 | * g_unichar_xdigit_value: |
---|
480 | * @c: a Unicode character |
---|
481 | * |
---|
482 | * Determines the numeric value of a character as a hexidecimal |
---|
483 | * digit. |
---|
484 | * |
---|
485 | * Return value: If @c is a hex digit (according to |
---|
486 | * g_unichar_isxdigit()), its numeric value. Otherwise, -1. |
---|
487 | **/ |
---|
488 | int |
---|
489 | g_unichar_xdigit_value (gunichar c) |
---|
490 | { |
---|
491 | if (c >= 'A' && c <= 'F') |
---|
492 | return c - 'A' + 10; |
---|
493 | if (c >= 'a' && c <= 'f') |
---|
494 | return c - 'a' + 10; |
---|
495 | if (TYPE (c) == G_UNICODE_DECIMAL_NUMBER) |
---|
496 | return ATTTABLE (c >> 8, c & 0xff); |
---|
497 | return -1; |
---|
498 | } |
---|
499 | |
---|
500 | /** |
---|
501 | * g_unichar_type: |
---|
502 | * @c: a Unicode character |
---|
503 | * |
---|
504 | * Classifies a Unicode character by type. |
---|
505 | * |
---|
506 | * Return value: the type of the character. |
---|
507 | **/ |
---|
508 | GUnicodeType |
---|
509 | g_unichar_type (gunichar c) |
---|
510 | { |
---|
511 | return TYPE (c); |
---|
512 | } |
---|
513 | |
---|
514 | /* |
---|
515 | * Case mapping functions |
---|
516 | */ |
---|
517 | |
---|
/* Locale families whose case mapping gets special treatment. */
typedef enum
{
  LOCALE_NORMAL,     /* no locale-specific case rules */
  LOCALE_TURKIC,     /* locale name starts with "az" or "tr" */
  LOCALE_LITHUANIAN  /* locale name starts with "lt" */
} LocaleType;
---|
523 | |
---|
524 | static LocaleType |
---|
525 | get_locale_type (void) |
---|
526 | { |
---|
527 | const char *locale = setlocale (LC_CTYPE, NULL); |
---|
528 | |
---|
529 | switch (locale[0]) |
---|
530 | { |
---|
531 | case 'a': |
---|
532 | if (locale[1] == 'z') |
---|
533 | return LOCALE_TURKIC; |
---|
534 | break; |
---|
535 | case 'l': |
---|
536 | if (locale[1] == 't') |
---|
537 | return LOCALE_LITHUANIAN; |
---|
538 | break; |
---|
539 | case 't': |
---|
540 | if (locale[1] == 'r') |
---|
541 | return LOCALE_TURKIC; |
---|
542 | break; |
---|
543 | } |
---|
544 | |
---|
545 | return LOCALE_NORMAL; |
---|
546 | } |
---|
547 | |
---|
548 | static gint |
---|
549 | output_marks (const char **p_inout, |
---|
550 | char *out_buffer, |
---|
551 | gboolean remove_dot) |
---|
552 | { |
---|
553 | const char *p = *p_inout; |
---|
554 | gint len = 0; |
---|
555 | |
---|
556 | while (*p) |
---|
557 | { |
---|
558 | gunichar c = g_utf8_get_char (p); |
---|
559 | int t = TYPE(c); |
---|
560 | |
---|
561 | if (ISMARK(t)) |
---|
562 | { |
---|
563 | if (!remove_dot || c != 0x307 /* COMBINING DOT ABOVE */) |
---|
564 | len += g_unichar_to_utf8 (c, out_buffer ? out_buffer + len : NULL); |
---|
565 | p = g_utf8_next_char (p); |
---|
566 | } |
---|
567 | else |
---|
568 | break; |
---|
569 | } |
---|
570 | |
---|
571 | *p_inout = p; |
---|
572 | return len; |
---|
573 | } |
---|
574 | |
---|
575 | static gint |
---|
576 | output_special_case (gchar *out_buffer, |
---|
577 | int offset, |
---|
578 | int type, |
---|
579 | int which) |
---|
580 | { |
---|
581 | const gchar *p = special_case_table + offset; |
---|
582 | gint len; |
---|
583 | |
---|
584 | if (type != G_UNICODE_TITLECASE_LETTER) |
---|
585 | p = g_utf8_next_char (p); |
---|
586 | |
---|
587 | if (which == 1) |
---|
588 | p += strlen (p) + 1; |
---|
589 | |
---|
590 | len = strlen (p); |
---|
591 | if (out_buffer) |
---|
592 | memcpy (out_buffer, p, len); |
---|
593 | |
---|
594 | return len; |
---|
595 | } |
---|
596 | |
---|
597 | static gsize |
---|
598 | real_toupper (const gchar *str, |
---|
599 | gssize max_len, |
---|
600 | gchar *out_buffer, |
---|
601 | LocaleType locale_type) |
---|
602 | { |
---|
603 | const gchar *p = str; |
---|
604 | const char *last = NULL; |
---|
605 | gsize len = 0; |
---|
606 | gboolean last_was_i = FALSE; |
---|
607 | |
---|
608 | while ((max_len < 0 || p < str + max_len) && *p) |
---|
609 | { |
---|
610 | gunichar c = g_utf8_get_char (p); |
---|
611 | int t = TYPE (c); |
---|
612 | gunichar val; |
---|
613 | |
---|
614 | last = p; |
---|
615 | p = g_utf8_next_char (p); |
---|
616 | |
---|
617 | if (locale_type == LOCALE_LITHUANIAN) |
---|
618 | { |
---|
619 | if (c == 'i') |
---|
620 | last_was_i = TRUE; |
---|
621 | else |
---|
622 | { |
---|
623 | if (last_was_i) |
---|
624 | { |
---|
625 | /* Nasty, need to remove any dot above. Though |
---|
626 | * I think only E WITH DOT ABOVE occurs in practice |
---|
627 | * which could simplify this considerably. |
---|
628 | */ |
---|
629 | gsize decomp_len, i; |
---|
630 | gunichar *decomp; |
---|
631 | |
---|
632 | decomp = g_unicode_canonical_decomposition (c, &decomp_len); |
---|
633 | for (i=0; i < decomp_len; i++) |
---|
634 | { |
---|
635 | if (decomp[i] != 0x307 /* COMBINING DOT ABOVE */) |
---|
636 | len += g_unichar_to_utf8 (g_unichar_toupper (decomp[i]), out_buffer ? out_buffer + len : NULL); |
---|
637 | } |
---|
638 | g_free (decomp); |
---|
639 | |
---|
640 | len += output_marks (&p, out_buffer ? out_buffer + len : NULL, TRUE); |
---|
641 | |
---|
642 | continue; |
---|
643 | } |
---|
644 | |
---|
645 | if (!ISMARK(t)) |
---|
646 | last_was_i = FALSE; |
---|
647 | } |
---|
648 | } |
---|
649 | |
---|
650 | if (locale_type == LOCALE_TURKIC && c == 'i') |
---|
651 | { |
---|
652 | /* i => LATIN CAPITAL LETTER I WITH DOT ABOVE */ |
---|
653 | len += g_unichar_to_utf8 (0x130, out_buffer ? out_buffer + len : NULL); |
---|
654 | } |
---|
655 | else if (c == 0x0345) /* COMBINING GREEK YPOGEGRAMMENI */ |
---|
656 | { |
---|
657 | /* Nasty, need to move it after other combining marks .. this would go away if |
---|
658 | * we normalized first. |
---|
659 | */ |
---|
660 | len += output_marks (&p, out_buffer ? out_buffer + len : NULL, FALSE); |
---|
661 | |
---|
662 | /* And output as GREEK CAPITAL LETTER IOTA */ |
---|
663 | len += g_unichar_to_utf8 (0x399, out_buffer ? out_buffer + len : NULL); |
---|
664 | } |
---|
665 | else if (t == G_UNICODE_LOWERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER) |
---|
666 | { |
---|
667 | val = ATTTABLE (c >> 8, c & 0xff); |
---|
668 | |
---|
669 | if (val >= 0x1000000) |
---|
670 | { |
---|
671 | len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, |
---|
672 | t == G_UNICODE_LOWERCASE_LETTER ? 0 : 1); |
---|
673 | } |
---|
674 | else |
---|
675 | { |
---|
676 | if (t == G_UNICODE_TITLECASE_LETTER) |
---|
677 | { |
---|
678 | unsigned int i; |
---|
679 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
---|
680 | { |
---|
681 | if (title_table[i][0] == c) |
---|
682 | val = title_table[i][1]; |
---|
683 | } |
---|
684 | } |
---|
685 | |
---|
686 | len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL); |
---|
687 | } |
---|
688 | } |
---|
689 | else |
---|
690 | { |
---|
691 | gsize char_len = g_utf8_skip[*(guchar *)last]; |
---|
692 | |
---|
693 | if (out_buffer) |
---|
694 | memcpy (out_buffer + len, last, char_len); |
---|
695 | |
---|
696 | len += char_len; |
---|
697 | } |
---|
698 | |
---|
699 | } |
---|
700 | |
---|
701 | return len; |
---|
702 | } |
---|
703 | |
---|
704 | /** |
---|
705 | * g_utf8_strup: |
---|
706 | * @str: a UTF-8 encoded string |
---|
707 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
---|
708 | * |
---|
709 | * Converts all Unicode characters in the string that have a case |
---|
710 | * to uppercase. The exact manner that this is done depends |
---|
711 | * on the current locale, and may result in the number of |
---|
712 | * characters in the string increasing. (For instance, the |
---|
713 | * German ess-zet will be changed to SS.) |
---|
714 | * |
---|
715 | * Return value: a newly allocated string, with all characters |
---|
716 | * converted to uppercase. |
---|
717 | **/ |
---|
718 | gchar * |
---|
719 | g_utf8_strup (const gchar *str, |
---|
720 | gssize len) |
---|
721 | { |
---|
722 | gsize result_len; |
---|
723 | LocaleType locale_type; |
---|
724 | gchar *result; |
---|
725 | |
---|
726 | g_return_val_if_fail (str != NULL, NULL); |
---|
727 | |
---|
728 | locale_type = get_locale_type (); |
---|
729 | |
---|
730 | /* |
---|
731 | * We use a two pass approach to keep memory management simple |
---|
732 | */ |
---|
733 | result_len = real_toupper (str, len, NULL, locale_type); |
---|
734 | result = g_malloc (result_len + 1); |
---|
735 | real_toupper (str, len, result, locale_type); |
---|
736 | result[result_len] = '\0'; |
---|
737 | |
---|
738 | return result; |
---|
739 | } |
---|
740 | |
---|
741 | /* traverses the string checking for characters with combining class == 230 |
---|
742 | * until a base character is found */ |
---|
743 | static gboolean |
---|
744 | has_more_above (const gchar *str) |
---|
745 | { |
---|
746 | const gchar *p = str; |
---|
747 | gint combining_class; |
---|
748 | |
---|
749 | while (*p) |
---|
750 | { |
---|
751 | combining_class = _g_unichar_combining_class (g_utf8_get_char (p)); |
---|
752 | if (combining_class == 230) |
---|
753 | return TRUE; |
---|
754 | else if (combining_class == 0) |
---|
755 | break; |
---|
756 | |
---|
757 | p = g_utf8_next_char (p); |
---|
758 | } |
---|
759 | |
---|
760 | return FALSE; |
---|
761 | } |
---|
762 | |
---|
763 | static gsize |
---|
764 | real_tolower (const gchar *str, |
---|
765 | gssize max_len, |
---|
766 | gchar *out_buffer, |
---|
767 | LocaleType locale_type) |
---|
768 | { |
---|
769 | const gchar *p = str; |
---|
770 | const char *last = NULL; |
---|
771 | gsize len = 0; |
---|
772 | |
---|
773 | while ((max_len < 0 || p < str + max_len) && *p) |
---|
774 | { |
---|
775 | gunichar c = g_utf8_get_char (p); |
---|
776 | int t = TYPE (c); |
---|
777 | gunichar val; |
---|
778 | |
---|
779 | last = p; |
---|
780 | p = g_utf8_next_char (p); |
---|
781 | |
---|
782 | if (locale_type == LOCALE_TURKIC && c == 'I') |
---|
783 | { |
---|
784 | if (g_utf8_get_char (p) == 0x0307) |
---|
785 | { |
---|
786 | /* I + COMBINING DOT ABOVE => i (U+0069) */ |
---|
787 | len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); |
---|
788 | p = g_utf8_next_char (p); |
---|
789 | } |
---|
790 | else |
---|
791 | { |
---|
792 | /* I => LATIN SMALL LETTER DOTLESS I */ |
---|
793 | len += g_unichar_to_utf8 (0x131, out_buffer ? out_buffer + len : NULL); |
---|
794 | } |
---|
795 | } |
---|
796 | /* Introduce an explicit dot above when lowercasing capital I's and J's |
---|
797 | * whenever there are more accents above. [SpecialCasing.txt] */ |
---|
798 | else if (locale_type == LOCALE_LITHUANIAN && |
---|
799 | (c == 0x00cc || c == 0x00cd || c == 0x0128)) |
---|
800 | { |
---|
801 | len += g_unichar_to_utf8 (0x0069, out_buffer ? out_buffer + len : NULL); |
---|
802 | len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); |
---|
803 | |
---|
804 | switch (c) |
---|
805 | { |
---|
806 | case 0x00cc: |
---|
807 | len += g_unichar_to_utf8 (0x0300, out_buffer ? out_buffer + len : NULL); |
---|
808 | break; |
---|
809 | case 0x00cd: |
---|
810 | len += g_unichar_to_utf8 (0x0301, out_buffer ? out_buffer + len : NULL); |
---|
811 | break; |
---|
812 | case 0x0128: |
---|
813 | len += g_unichar_to_utf8 (0x0303, out_buffer ? out_buffer + len : NULL); |
---|
814 | break; |
---|
815 | } |
---|
816 | } |
---|
817 | else if (locale_type == LOCALE_LITHUANIAN && |
---|
818 | (c == 'I' || c == 'J' || c == 0x012e) && |
---|
819 | has_more_above (p)) |
---|
820 | { |
---|
821 | len += g_unichar_to_utf8 (g_unichar_tolower (c), out_buffer ? out_buffer + len : NULL); |
---|
822 | len += g_unichar_to_utf8 (0x0307, out_buffer ? out_buffer + len : NULL); |
---|
823 | } |
---|
824 | else if (c == 0x03A3) /* GREEK CAPITAL LETTER SIGMA */ |
---|
825 | { |
---|
826 | if ((max_len < 0 || p < str + max_len) && *p) |
---|
827 | { |
---|
828 | gunichar next_c = g_utf8_get_char (p); |
---|
829 | int next_type = TYPE(next_c); |
---|
830 | |
---|
831 | /* SIGMA mapps differently depending on whether it is |
---|
832 | * final or not. The following simplified test would |
---|
833 | * fail in the case of combining marks following the |
---|
834 | * sigma, but I don't think that occurs in real text. |
---|
835 | * The test here matches that in ICU. |
---|
836 | */ |
---|
837 | if (ISALPHA(next_type)) /* Lu,Ll,Lt,Lm,Lo */ |
---|
838 | val = 0x3c3; /* GREEK SMALL SIGMA */ |
---|
839 | else |
---|
840 | val = 0x3c2; /* GREEK SMALL FINAL SIGMA */ |
---|
841 | } |
---|
842 | else |
---|
843 | val = 0x3c2; /* GREEK SMALL FINAL SIGMA */ |
---|
844 | |
---|
845 | len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL); |
---|
846 | } |
---|
847 | else if (t == G_UNICODE_UPPERCASE_LETTER || t == G_UNICODE_TITLECASE_LETTER) |
---|
848 | { |
---|
849 | val = ATTTABLE (c >> 8, c & 0xff); |
---|
850 | |
---|
851 | if (val >= 0x1000000) |
---|
852 | { |
---|
853 | len += output_special_case (out_buffer ? out_buffer + len : NULL, val - 0x1000000, t, 0); |
---|
854 | } |
---|
855 | else |
---|
856 | { |
---|
857 | if (t == G_UNICODE_TITLECASE_LETTER) |
---|
858 | { |
---|
859 | unsigned int i; |
---|
860 | for (i = 0; i < G_N_ELEMENTS (title_table); ++i) |
---|
861 | { |
---|
862 | if (title_table[i][0] == c) |
---|
863 | val = title_table[i][2]; |
---|
864 | } |
---|
865 | } |
---|
866 | |
---|
867 | len += g_unichar_to_utf8 (val, out_buffer ? out_buffer + len : NULL); |
---|
868 | } |
---|
869 | } |
---|
870 | else |
---|
871 | { |
---|
872 | gsize char_len = g_utf8_skip[*(guchar *)last]; |
---|
873 | |
---|
874 | if (out_buffer) |
---|
875 | memcpy (out_buffer + len, last, char_len); |
---|
876 | |
---|
877 | len += char_len; |
---|
878 | } |
---|
879 | |
---|
880 | } |
---|
881 | |
---|
882 | return len; |
---|
883 | } |
---|
884 | |
---|
885 | /** |
---|
886 | * g_utf8_strdown: |
---|
887 | * @str: a UTF-8 encoded string |
---|
888 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
---|
889 | * |
---|
890 | * Converts all Unicode characters in the string that have a case |
---|
891 | * to lowercase. The exact manner that this is done depends |
---|
892 | * on the current locale, and may result in the number of |
---|
893 | * characters in the string changing. |
---|
894 | * |
---|
895 | * Return value: a newly allocated string, with all characters |
---|
896 | * converted to lowercase. |
---|
897 | **/ |
---|
898 | gchar * |
---|
899 | g_utf8_strdown (const gchar *str, |
---|
900 | gssize len) |
---|
901 | { |
---|
902 | gsize result_len; |
---|
903 | LocaleType locale_type; |
---|
904 | gchar *result; |
---|
905 | |
---|
906 | g_return_val_if_fail (str != NULL, NULL); |
---|
907 | |
---|
908 | locale_type = get_locale_type (); |
---|
909 | |
---|
910 | /* |
---|
911 | * We use a two pass approach to keep memory management simple |
---|
912 | */ |
---|
913 | result_len = real_tolower (str, len, NULL, locale_type); |
---|
914 | result = g_malloc (result_len + 1); |
---|
915 | real_tolower (str, len, result, locale_type); |
---|
916 | result[result_len] = '\0'; |
---|
917 | |
---|
918 | return result; |
---|
919 | } |
---|
920 | |
---|
921 | /** |
---|
922 | * g_utf8_casefold: |
---|
923 | * @str: a UTF-8 encoded string |
---|
924 | * @len: length of @str, in bytes, or -1 if @str is nul-terminated. |
---|
925 | * |
---|
926 | * Converts a string into a form that is independent of case. The |
---|
927 | * result will not correspond to any particular case, but can be |
---|
928 | * compared for equality or ordered with the results of calling |
---|
929 | * g_utf8_casefold() on other strings. |
---|
930 | * |
---|
931 | * Note that calling g_utf8_casefold() followed by g_utf8_collate() is |
---|
932 | * only an approximation to the correct linguistic case insensitive |
---|
933 | * ordering, though it is a fairly good one. Getting this exactly |
---|
934 | * right would require a more sophisticated collation function that |
---|
935 | * takes case sensitivity into account. GLib does not currently |
---|
936 | * provide such a function. |
---|
937 | * |
---|
938 | * Return value: a newly allocated string, that is a |
---|
939 | * case independent form of @str. |
---|
940 | **/ |
---|
941 | gchar * |
---|
942 | g_utf8_casefold (const gchar *str, |
---|
943 | gssize len) |
---|
944 | { |
---|
945 | GString *result; |
---|
946 | const char *p; |
---|
947 | |
---|
948 | g_return_val_if_fail (str != NULL, NULL); |
---|
949 | |
---|
950 | result = g_string_new (NULL); |
---|
951 | p = str; |
---|
952 | while ((len < 0 || p < str + len) && *p) |
---|
953 | { |
---|
954 | gunichar ch = g_utf8_get_char (p); |
---|
955 | |
---|
956 | int start = 0; |
---|
957 | int end = G_N_ELEMENTS (casefold_table); |
---|
958 | |
---|
959 | if (ch >= casefold_table[start].ch && |
---|
960 | ch <= casefold_table[end - 1].ch) |
---|
961 | { |
---|
962 | while (TRUE) |
---|
963 | { |
---|
964 | int half = (start + end) / 2; |
---|
965 | if (ch == casefold_table[half].ch) |
---|
966 | { |
---|
967 | g_string_append (result, casefold_table[half].data); |
---|
968 | goto next; |
---|
969 | } |
---|
970 | else if (half == start) |
---|
971 | break; |
---|
972 | else if (ch > casefold_table[half].ch) |
---|
973 | start = half; |
---|
974 | else |
---|
975 | end = half; |
---|
976 | } |
---|
977 | } |
---|
978 | |
---|
979 | g_string_append_unichar (result, g_unichar_tolower (ch)); |
---|
980 | |
---|
981 | next: |
---|
982 | p = g_utf8_next_char (p); |
---|
983 | } |
---|
984 | |
---|
985 | return g_string_free (result, FALSE); |
---|
986 | } |
---|
987 | |
---|
988 | /** |
---|
989 | * g_unichar_get_mirror_char: |
---|
990 | * @ch: a unicode character |
---|
991 | * @mirrored_ch: location to store the mirrored character |
---|
992 | * |
---|
993 | * In Unicode, some characters are <firstterm>mirrored</firstterm>. This |
---|
994 | * means that their images are mirrored horizontally in text that is laid |
---|
995 | * out from right to left. For instance, "(" would become its mirror image, |
---|
996 | * ")", in right-to-left text. |
---|
997 | * |
---|
998 | * If @ch has the Unicode mirrored property and there is another unicode |
---|
999 | * character that typically has a glyph that is the mirror image of @ch's |
---|
1000 | * glyph, puts that character in the address pointed to by @mirrored_ch. |
---|
1001 | * |
---|
1002 | * Return value: %TRUE if @ch has a mirrored character and @mirrored_ch is |
---|
1003 | * filled in, %FALSE otherwise |
---|
1004 | * |
---|
1005 | * Since: 2.4 |
---|
1006 | **/ |
---|
1007 | /* This code is adapted from FriBidi (http://fribidi.sourceforge.net/). |
---|
1008 | * FriBidi is: Copyright (C) 1999,2000 Dov Grobgeld, and |
---|
1009 | * Copyright (C) 2001,2002 Behdad Esfahbod. |
---|
1010 | */ |
---|
1011 | gboolean |
---|
1012 | g_unichar_get_mirror_char (gunichar ch, |
---|
1013 | gunichar *mirrored_ch) |
---|
1014 | { |
---|
1015 | gint pos, step, size; |
---|
1016 | gboolean found; |
---|
1017 | |
---|
1018 | size = G_N_ELEMENTS (bidi_mirroring_table); |
---|
1019 | pos = step = (size / 2) + 1; |
---|
1020 | |
---|
1021 | while (step > 1) |
---|
1022 | { |
---|
1023 | gunichar cmp_ch = bidi_mirroring_table[pos].ch; |
---|
1024 | step = (step + 1) / 2; |
---|
1025 | |
---|
1026 | if (cmp_ch < ch) |
---|
1027 | { |
---|
1028 | pos += step; |
---|
1029 | if (pos > size - 1) |
---|
1030 | pos = size - 1; |
---|
1031 | } |
---|
1032 | else if (cmp_ch > ch) |
---|
1033 | { |
---|
1034 | pos -= step; |
---|
1035 | if (pos < 0) |
---|
1036 | pos = 0; |
---|
1037 | } |
---|
1038 | else |
---|
1039 | break; |
---|
1040 | } |
---|
1041 | found = bidi_mirroring_table[pos].ch == ch; |
---|
1042 | if (mirrored_ch) |
---|
1043 | *mirrored_ch = found ? bidi_mirroring_table[pos].mirrored_ch : ch; |
---|
1044 | |
---|
1045 | return found; |
---|
1046 | |
---|
1047 | } |
---|