/* gunicode.h - Unicode manipulation functions
 *
 *  Copyright (C) 1999, 2000 Tom Tromey
 *  Copyright 2000 Red Hat, Inc.
 *
 * The Gnome Library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * The Gnome Library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 *   Boston, MA 02111-1307, USA.
 */

#ifndef __G_UNICODE_H__
#define __G_UNICODE_H__

#include <glib/gerror.h>
#include <glib/gtypes.h>

G_BEGIN_DECLS

30 | typedef guint32 gunichar; |
---|
31 | typedef guint16 gunichar2; |
---|
32 | |
---|
/* These are the possible character classifications.
 * See http://www.unicode.org/Public/UNIDATA/UnicodeData.html
 *
 * The enumerators carry no explicit values, so they are numbered
 * consecutively from 0 in the order listed here.
 * NOTE(review): other code presumably depends on these numeric values;
 * confirm before reordering or inserting entries.
 */
typedef enum
{
  /* Other / special code points */
  G_UNICODE_CONTROL,
  G_UNICODE_FORMAT,
  G_UNICODE_UNASSIGNED,
  G_UNICODE_PRIVATE_USE,
  G_UNICODE_SURROGATE,

  /* Letters */
  G_UNICODE_LOWERCASE_LETTER,
  G_UNICODE_MODIFIER_LETTER,
  G_UNICODE_OTHER_LETTER,
  G_UNICODE_TITLECASE_LETTER,
  G_UNICODE_UPPERCASE_LETTER,

  /* Marks */
  G_UNICODE_COMBINING_MARK,
  G_UNICODE_ENCLOSING_MARK,
  G_UNICODE_NON_SPACING_MARK,

  /* Numbers */
  G_UNICODE_DECIMAL_NUMBER,
  G_UNICODE_LETTER_NUMBER,
  G_UNICODE_OTHER_NUMBER,

  /* Punctuation */
  G_UNICODE_CONNECT_PUNCTUATION,
  G_UNICODE_DASH_PUNCTUATION,
  G_UNICODE_CLOSE_PUNCTUATION,
  G_UNICODE_FINAL_PUNCTUATION,
  G_UNICODE_INITIAL_PUNCTUATION,
  G_UNICODE_OTHER_PUNCTUATION,
  G_UNICODE_OPEN_PUNCTUATION,

  /* Symbols */
  G_UNICODE_CURRENCY_SYMBOL,
  G_UNICODE_MODIFIER_SYMBOL,
  G_UNICODE_MATH_SYMBOL,
  G_UNICODE_OTHER_SYMBOL,

  /* Separators */
  G_UNICODE_LINE_SEPARATOR,
  G_UNICODE_PARAGRAPH_SEPARATOR,
  G_UNICODE_SPACE_SEPARATOR
} GUnicodeType;

/* These are the possible line break classifications.
 * See http://www.unicode.org/unicode/reports/tr14/
 *
 * The enumerators carry no explicit values, so they are numbered
 * consecutively from 0 in the order listed here.
 * NOTE(review): other code presumably depends on these numeric values;
 * confirm before reordering or inserting entries.
 */
typedef enum
{
  G_UNICODE_BREAK_MANDATORY,
  G_UNICODE_BREAK_CARRIAGE_RETURN,
  G_UNICODE_BREAK_LINE_FEED,
  G_UNICODE_BREAK_COMBINING_MARK,
  G_UNICODE_BREAK_SURROGATE,
  G_UNICODE_BREAK_ZERO_WIDTH_SPACE,
  G_UNICODE_BREAK_INSEPARABLE,
  G_UNICODE_BREAK_NON_BREAKING_GLUE,
  G_UNICODE_BREAK_CONTINGENT,
  G_UNICODE_BREAK_SPACE,
  G_UNICODE_BREAK_AFTER,
  G_UNICODE_BREAK_BEFORE,
  G_UNICODE_BREAK_BEFORE_AND_AFTER,
  G_UNICODE_BREAK_HYPHEN,
  G_UNICODE_BREAK_NON_STARTER,
  G_UNICODE_BREAK_OPEN_PUNCTUATION,
  G_UNICODE_BREAK_CLOSE_PUNCTUATION,
  G_UNICODE_BREAK_QUOTATION,
  G_UNICODE_BREAK_EXCLAMATION,
  G_UNICODE_BREAK_IDEOGRAPHIC,
  G_UNICODE_BREAK_NUMERIC,
  G_UNICODE_BREAK_INFIX_SEPARATOR,
  G_UNICODE_BREAK_SYMBOL,
  G_UNICODE_BREAK_ALPHABETIC,
  G_UNICODE_BREAK_PREFIX,
  G_UNICODE_BREAK_POSTFIX,
  G_UNICODE_BREAK_COMPLEX_CONTEXT,
  G_UNICODE_BREAK_AMBIGUOUS,
  G_UNICODE_BREAK_UNKNOWN,
  G_UNICODE_BREAK_NEXT_LINE,
  G_UNICODE_BREAK_WORD_JOINER
} GUnicodeBreakType;

108 | /* Returns TRUE if current locale uses UTF-8 charset. If CHARSET is |
---|
109 | * not null, sets *CHARSET to the name of the current locale's |
---|
110 | * charset. This value is statically allocated, and should be copied |
---|
111 | * in case the locale's charset will be changed later using setlocale() |
---|
112 | * or in some other way. |
---|
113 | */ |
---|
114 | gboolean g_get_charset (G_CONST_RETURN char **charset); |
---|
115 | |
---|
116 | /* These are all analogs of the <ctype.h> functions. |
---|
117 | */ |
---|
118 | gboolean g_unichar_isalnum (gunichar c) G_GNUC_CONST; |
---|
119 | gboolean g_unichar_isalpha (gunichar c) G_GNUC_CONST; |
---|
120 | gboolean g_unichar_iscntrl (gunichar c) G_GNUC_CONST; |
---|
121 | gboolean g_unichar_isdigit (gunichar c) G_GNUC_CONST; |
---|
122 | gboolean g_unichar_isgraph (gunichar c) G_GNUC_CONST; |
---|
123 | gboolean g_unichar_islower (gunichar c) G_GNUC_CONST; |
---|
124 | gboolean g_unichar_isprint (gunichar c) G_GNUC_CONST; |
---|
125 | gboolean g_unichar_ispunct (gunichar c) G_GNUC_CONST; |
---|
126 | gboolean g_unichar_isspace (gunichar c) G_GNUC_CONST; |
---|
127 | gboolean g_unichar_isupper (gunichar c) G_GNUC_CONST; |
---|
128 | gboolean g_unichar_isxdigit (gunichar c) G_GNUC_CONST; |
---|
129 | gboolean g_unichar_istitle (gunichar c) G_GNUC_CONST; |
---|
130 | gboolean g_unichar_isdefined (gunichar c) G_GNUC_CONST; |
---|
131 | gboolean g_unichar_iswide (gunichar c) G_GNUC_CONST; |
---|
132 | |
---|
133 | /* More <ctype.h> functions. These convert between the three cases. |
---|
134 | * See the Unicode book to understand title case. */ |
---|
135 | gunichar g_unichar_toupper (gunichar c) G_GNUC_CONST; |
---|
136 | gunichar g_unichar_tolower (gunichar c) G_GNUC_CONST; |
---|
137 | gunichar g_unichar_totitle (gunichar c) G_GNUC_CONST; |
---|
138 | |
---|
139 | /* If C is a digit (according to `g_unichar_isdigit'), then return its |
---|
140 | numeric value. Otherwise return -1. */ |
---|
141 | gint g_unichar_digit_value (gunichar c) G_GNUC_CONST; |
---|
142 | |
---|
143 | gint g_unichar_xdigit_value (gunichar c) G_GNUC_CONST; |
---|
144 | |
---|
145 | /* Return the Unicode character type of a given character. */ |
---|
146 | GUnicodeType g_unichar_type (gunichar c) G_GNUC_CONST; |
---|
147 | |
---|
148 | /* Return the line break property for a given character */ |
---|
149 | GUnicodeBreakType g_unichar_break_type (gunichar c) G_GNUC_CONST; |
---|
150 | |
---|
151 | |
---|
152 | /* Compute canonical ordering of a string in-place. This rearranges |
---|
153 | decomposed characters in the string according to their combining |
---|
154 | classes. See the Unicode manual for more information. */ |
---|
155 | void g_unicode_canonical_ordering (gunichar *string, |
---|
156 | gsize len); |
---|
157 | |
---|
158 | /* Compute canonical decomposition of a character. Returns g_malloc()d |
---|
159 | string of Unicode characters. RESULT_LEN is set to the resulting |
---|
160 | length of the string. */ |
---|
161 | gunichar *g_unicode_canonical_decomposition (gunichar ch, |
---|
162 | gsize *result_len); |
---|
163 | |
---|
164 | /* Array of skip-bytes-per-initial character. |
---|
165 | */ |
---|
166 | GLIB_VAR const gchar * const g_utf8_skip; |
---|
167 | |
---|
/* Yield a pointer just past the character starting at P, by adding the
 * g_utf8_skip entry for the byte at P to P.
 *
 * The whole expansion is parenthesized so that postfix operators bind
 * to the result (e.g. g_utf8_next_char (p)[0]) rather than to the
 * operand of the cast.  The byte is read through a const-qualified
 * pointer so the macro does not drop const from P's target.  P is
 * evaluated twice; do not pass an expression with side effects.
 */
#define g_utf8_next_char(p) ((char *)((p) + g_utf8_skip[*(const guchar *)(p)]))

170 | gunichar g_utf8_get_char (const gchar *p); |
---|
171 | gunichar g_utf8_get_char_validated (const gchar *p, |
---|
172 | gssize max_len); |
---|
173 | |
---|
174 | gchar* g_utf8_offset_to_pointer (const gchar *str, |
---|
175 | glong offset); |
---|
176 | glong g_utf8_pointer_to_offset (const gchar *str, |
---|
177 | const gchar *pos); |
---|
178 | gchar* g_utf8_prev_char (const gchar *p); |
---|
179 | gchar* g_utf8_find_next_char (const gchar *p, |
---|
180 | const gchar *end); |
---|
181 | gchar* g_utf8_find_prev_char (const gchar *str, |
---|
182 | const gchar *p); |
---|
183 | |
---|
184 | glong g_utf8_strlen (const gchar *p, |
---|
185 | gssize max); |
---|
186 | |
---|
187 | /* Copies n characters from src to dest */ |
---|
188 | gchar* g_utf8_strncpy (gchar *dest, |
---|
189 | const gchar *src, |
---|
190 | gsize n); |
---|
191 | |
---|
192 | /* Find the UTF-8 character corresponding to ch, in string p. These |
---|
193 | functions are equivalants to strchr and strrchr */ |
---|
194 | gchar* g_utf8_strchr (const gchar *p, |
---|
195 | gssize len, |
---|
196 | gunichar c); |
---|
197 | gchar* g_utf8_strrchr (const gchar *p, |
---|
198 | gssize len, |
---|
199 | gunichar c); |
---|
200 | gchar* g_utf8_strreverse (const gchar *str, |
---|
201 | gssize len); |
---|
202 | |
---|
203 | gunichar2 *g_utf8_to_utf16 (const gchar *str, |
---|
204 | glong len, |
---|
205 | glong *items_read, |
---|
206 | glong *items_written, |
---|
207 | GError **error); |
---|
208 | gunichar * g_utf8_to_ucs4 (const gchar *str, |
---|
209 | glong len, |
---|
210 | glong *items_read, |
---|
211 | glong *items_written, |
---|
212 | GError **error); |
---|
213 | gunichar * g_utf8_to_ucs4_fast (const gchar *str, |
---|
214 | glong len, |
---|
215 | glong *items_written); |
---|
216 | gunichar * g_utf16_to_ucs4 (const gunichar2 *str, |
---|
217 | glong len, |
---|
218 | glong *items_read, |
---|
219 | glong *items_written, |
---|
220 | GError **error); |
---|
221 | gchar* g_utf16_to_utf8 (const gunichar2 *str, |
---|
222 | glong len, |
---|
223 | glong *items_read, |
---|
224 | glong *items_written, |
---|
225 | GError **error); |
---|
226 | gunichar2 *g_ucs4_to_utf16 (const gunichar *str, |
---|
227 | glong len, |
---|
228 | glong *items_read, |
---|
229 | glong *items_written, |
---|
230 | GError **error); |
---|
231 | gchar* g_ucs4_to_utf8 (const gunichar *str, |
---|
232 | glong len, |
---|
233 | glong *items_read, |
---|
234 | glong *items_written, |
---|
235 | GError **error); |
---|
236 | |
---|
237 | /* Convert a single character into UTF-8. outbuf must have at |
---|
238 | * least 6 bytes of space. Returns the number of bytes in the |
---|
239 | * result. |
---|
240 | */ |
---|
241 | gint g_unichar_to_utf8 (gunichar c, |
---|
242 | gchar *outbuf); |
---|
243 | |
---|
244 | /* Validate a UTF8 string, return TRUE if valid, put pointer to |
---|
245 | * first invalid char in **end |
---|
246 | */ |
---|
247 | |
---|
248 | gboolean g_utf8_validate (const gchar *str, |
---|
249 | gssize max_len, |
---|
250 | const gchar **end); |
---|
251 | |
---|
252 | /* Validate a Unicode character */ |
---|
253 | gboolean g_unichar_validate (gunichar ch); |
---|
254 | |
---|
255 | gchar *g_utf8_strup (const gchar *str, |
---|
256 | gssize len); |
---|
257 | gchar *g_utf8_strdown (const gchar *str, |
---|
258 | gssize len); |
---|
259 | gchar *g_utf8_casefold (const gchar *str, |
---|
260 | gssize len); |
---|
261 | |
---|
/* Normalization modes accepted by g_utf8_normalize().  Every mode is
 * available under two names with the same value: a descriptive name
 * and an NF* alias.
 */
typedef enum
{
  G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,

  G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,

  G_NORMALIZE_ALL,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,

  G_NORMALIZE_ALL_COMPOSE,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
} GNormalizeMode;

273 | gchar *g_utf8_normalize (const gchar *str, |
---|
274 | gssize len, |
---|
275 | GNormalizeMode mode); |
---|
276 | |
---|
277 | gint g_utf8_collate (const gchar *str1, |
---|
278 | const gchar *str2); |
---|
279 | gchar *g_utf8_collate_key (const gchar *str, |
---|
280 | gssize len); |
---|
281 | |
---|
282 | gboolean g_unichar_get_mirror_char (gunichar ch, |
---|
283 | gunichar *mirrored_ch); |
---|
284 | |
---|
G_END_DECLS

#endif /* __G_UNICODE_H__ */