source: trunk/third/glib2/glib/gunicode.h @ 20721

Revision 20721, 9.5 KB checked in by ghudson, 20 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r20720, which included commits to RCS files with non-trunk default branches.
Line 
1/* gunicode.h - Unicode manipulation functions
2 *
3 *  Copyright (C) 1999, 2000 Tom Tromey
4 *  Copyright 2000 Red Hat, Inc.
5 *
6 * The Gnome Library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
10 *
11 * The Gnome Library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
18 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 *   Boston, MA 02111-1307, USA.
20 */
21
22#ifndef __G_UNICODE_H__
23#define __G_UNICODE_H__
24
25#include <glib/gerror.h>
26#include <glib/gtypes.h>
27
28G_BEGIN_DECLS
29
/* gunichar is an alias for guint32 and is the type the g_unichar_*
 * functions in this header take for a single character.  gunichar2 is an
 * alias for guint16; g_utf8_to_utf16() and g_ucs4_to_utf16() return
 * arrays of it.
 */
30typedef guint32 gunichar;
31typedef guint16 gunichar2;
32
33/* These are the possible character classifications.
34 * See http://www.unicode.org/Public/UNIDATA/UnicodeData.html
35 */
36typedef enum
37{
  /* Result type of g_unichar_type().  No enumerator has an explicit
   * initializer, so the values are 0..29 in declaration order; inserting
   * or reordering entries renumbers every later constant.
   */
38  G_UNICODE_CONTROL,
39  G_UNICODE_FORMAT,
40  G_UNICODE_UNASSIGNED,
41  G_UNICODE_PRIVATE_USE,
42  G_UNICODE_SURROGATE,
  /* letters */
43  G_UNICODE_LOWERCASE_LETTER,
44  G_UNICODE_MODIFIER_LETTER,
45  G_UNICODE_OTHER_LETTER,
46  G_UNICODE_TITLECASE_LETTER,
47  G_UNICODE_UPPERCASE_LETTER,
  /* marks */
48  G_UNICODE_COMBINING_MARK,
49  G_UNICODE_ENCLOSING_MARK,
50  G_UNICODE_NON_SPACING_MARK,
  /* numbers */
51  G_UNICODE_DECIMAL_NUMBER,
52  G_UNICODE_LETTER_NUMBER,
53  G_UNICODE_OTHER_NUMBER,
  /* punctuation */
54  G_UNICODE_CONNECT_PUNCTUATION,
55  G_UNICODE_DASH_PUNCTUATION,
56  G_UNICODE_CLOSE_PUNCTUATION,
57  G_UNICODE_FINAL_PUNCTUATION,
58  G_UNICODE_INITIAL_PUNCTUATION,
59  G_UNICODE_OTHER_PUNCTUATION,
60  G_UNICODE_OPEN_PUNCTUATION,
  /* symbols */
61  G_UNICODE_CURRENCY_SYMBOL,
62  G_UNICODE_MODIFIER_SYMBOL,
63  G_UNICODE_MATH_SYMBOL,
64  G_UNICODE_OTHER_SYMBOL,
  /* separators */
65  G_UNICODE_LINE_SEPARATOR,
66  G_UNICODE_PARAGRAPH_SEPARATOR,
67  G_UNICODE_SPACE_SEPARATOR
68} GUnicodeType;
69
70/* These are the possible line break classifications.
71 * See http://www.unicode.org/unicode/reports/tr14/
72 */
73typedef enum
74{
  /* Result type of g_unichar_break_type().  No enumerator has an explicit
   * initializer, so the values are 0..30 in declaration order; inserting
   * or reordering entries renumbers every later constant.
   */
75  G_UNICODE_BREAK_MANDATORY,
76  G_UNICODE_BREAK_CARRIAGE_RETURN,
77  G_UNICODE_BREAK_LINE_FEED,
78  G_UNICODE_BREAK_COMBINING_MARK,
79  G_UNICODE_BREAK_SURROGATE,
80  G_UNICODE_BREAK_ZERO_WIDTH_SPACE,
81  G_UNICODE_BREAK_INSEPARABLE,
82  G_UNICODE_BREAK_NON_BREAKING_GLUE,
83  G_UNICODE_BREAK_CONTINGENT,
84  G_UNICODE_BREAK_SPACE,
85  G_UNICODE_BREAK_AFTER,
86  G_UNICODE_BREAK_BEFORE,
87  G_UNICODE_BREAK_BEFORE_AND_AFTER,
88  G_UNICODE_BREAK_HYPHEN,
89  G_UNICODE_BREAK_NON_STARTER,
90  G_UNICODE_BREAK_OPEN_PUNCTUATION,
91  G_UNICODE_BREAK_CLOSE_PUNCTUATION,
92  G_UNICODE_BREAK_QUOTATION,
93  G_UNICODE_BREAK_EXCLAMATION,
94  G_UNICODE_BREAK_IDEOGRAPHIC,
95  G_UNICODE_BREAK_NUMERIC,
96  G_UNICODE_BREAK_INFIX_SEPARATOR,
97  G_UNICODE_BREAK_SYMBOL,
98  G_UNICODE_BREAK_ALPHABETIC,
99  G_UNICODE_BREAK_PREFIX,
100  G_UNICODE_BREAK_POSTFIX,
101  G_UNICODE_BREAK_COMPLEX_CONTEXT,
102  G_UNICODE_BREAK_AMBIGUOUS,
103  G_UNICODE_BREAK_UNKNOWN,
104  G_UNICODE_BREAK_NEXT_LINE,
105  G_UNICODE_BREAK_WORD_JOINER
106} GUnicodeBreakType;
107
108/* Returns TRUE if current locale uses UTF-8 charset.  If CHARSET is
109 * not null, sets *CHARSET to the name of the current locale's
110 * charset.  This value is statically allocated, and should be copied
111 * in case the locale's charset will be changed later using setlocale()
112 * or in some other way.
 *
 * NOTE(review): G_CONST_RETURN is defined outside this header; it
 * presumably expands to `const', making the stored name read-only for
 * the caller -- confirm in the GLib macro headers.
113 */
114gboolean g_get_charset (G_CONST_RETURN char **charset);
115
116/* These are all analogs of the <ctype.h> functions.
 * Each predicate takes one gunichar and returns a gboolean.
 * g_unichar_istitle, g_unichar_isdefined and g_unichar_iswide have no
 * direct <ctype.h> counterpart.
 * NOTE(review): G_GNUC_CONST is defined outside this header; presumably
 * it tells the compiler the result depends only on the argument --
 * confirm in the GLib macro headers.
117 */
118gboolean g_unichar_isalnum   (gunichar c) G_GNUC_CONST;
119gboolean g_unichar_isalpha   (gunichar c) G_GNUC_CONST;
120gboolean g_unichar_iscntrl   (gunichar c) G_GNUC_CONST;
121gboolean g_unichar_isdigit   (gunichar c) G_GNUC_CONST;
122gboolean g_unichar_isgraph   (gunichar c) G_GNUC_CONST;
123gboolean g_unichar_islower   (gunichar c) G_GNUC_CONST;
124gboolean g_unichar_isprint   (gunichar c) G_GNUC_CONST;
125gboolean g_unichar_ispunct   (gunichar c) G_GNUC_CONST;
126gboolean g_unichar_isspace   (gunichar c) G_GNUC_CONST;
127gboolean g_unichar_isupper   (gunichar c) G_GNUC_CONST;
128gboolean g_unichar_isxdigit  (gunichar c) G_GNUC_CONST;
129gboolean g_unichar_istitle   (gunichar c) G_GNUC_CONST;
130gboolean g_unichar_isdefined (gunichar c) G_GNUC_CONST;
131gboolean g_unichar_iswide    (gunichar c) G_GNUC_CONST;
132
133/* More <ctype.h> functions.  These convert between the three cases.
134 * See the Unicode book to understand title case.  */
/* Each function takes one gunichar and returns one gunichar.
 * NOTE(review): what is returned for a character without a mapping in the
 * requested case is not stated here; presumably C itself, as with
 * toupper()/tolower() -- confirm against the implementation.  */
135gunichar g_unichar_toupper (gunichar c) G_GNUC_CONST;
136gunichar g_unichar_tolower (gunichar c) G_GNUC_CONST;
137gunichar g_unichar_totitle (gunichar c) G_GNUC_CONST;
138
139/* If C is a digit (according to `g_unichar_isdigit'), then return its
140   numeric value.  Otherwise return -1.  */
141gint g_unichar_digit_value (gunichar c) G_GNUC_CONST;
142
/* Hexadecimal counterpart of g_unichar_digit_value.
   NOTE(review): presumably returns the value of C as a hexadecimal digit
   when `g_unichar_isxdigit' accepts C, and -1 otherwise -- confirm
   against the implementation.  */
143gint g_unichar_xdigit_value (gunichar c) G_GNUC_CONST;
144
145/* Return the Unicode character type of a given character.  The result
   is one of the GUnicodeType enumerators.  */
146GUnicodeType g_unichar_type (gunichar c) G_GNUC_CONST;
147
148/* Return the line break property for a given character.  The result is
   one of the GUnicodeBreakType enumerators.  */
149GUnicodeBreakType g_unichar_break_type (gunichar c) G_GNUC_CONST;
150
151
152/* Compute canonical ordering of a string in-place.  This rearranges
153   decomposed characters in the string according to their combining
154   classes.  See the Unicode manual for more information.
   STRING is an array of gunichar values and is the buffer that gets
   rearranged; LEN is its length (presumably counted in gunichar
   elements rather than bytes -- confirm).  */
155void g_unicode_canonical_ordering (gunichar *string,
156                                   gsize     len);
157
158/* Compute canonical decomposition of a character.  Returns g_malloc()d
159   string of Unicode characters.  RESULT_LEN is set to the resulting
160   length of the string.
   NOTE(review): since the result is g_malloc()d, the caller presumably
   owns it and releases it with g_free() -- confirm.  */
161gunichar *g_unicode_canonical_decomposition (gunichar  ch,
162                                             gsize    *result_len);
163
164/* Array of skip-bytes-per-initial character.
 * Indexed by the first byte of a character, read as an unsigned value;
 * g_utf8_next_char() adds the looked-up entry to its pointer argument.
 * Both the pointer and the entries are const, so this table is read-only
 * for users of the header.
165 */
166GLIB_VAR const gchar * const g_utf8_skip;
167
/* Evaluates to a char * that points just past the character starting at P,
 * computed by adding the g_utf8_skip entry for the byte at P to P.
 *
 * P is evaluated twice, so it must not have side effects.
 *
 * The whole expansion is parenthesized so that a postfix operator applied
 * to the result, e.g. g_utf8_next_char (p)[0], binds to the resulting
 * pointer instead of to the operand of the cast.  The byte at P is read
 * through a const-qualified pointer so that passing a const gchar * does
 * not cast away const merely to index the table.
 *
 * NOTE(review): the result is presumably only meaningful when P points at
 * the first byte of a valid UTF-8 character -- confirm with callers.
 */
#define g_utf8_next_char(p) ((char *)((p) + g_utf8_skip[*(const guchar *)(p)]))
169
/* Return, as a gunichar, the character found at P in a UTF-8 string.
 * NOTE(review): g_utf8_get_char presumably assumes P points at valid
 * UTF-8, while g_utf8_get_char_validated presumably examines at most
 * MAX_LEN bytes and reports malformed or truncated input through its
 * return value -- confirm against the implementation.
 */
170gunichar g_utf8_get_char           (const gchar  *p);
171gunichar g_utf8_get_char_validated (const  gchar *p,
172                                    gssize        max_len);
173
/* Moving between character offsets and byte pointers in a UTF-8 string.
 * Every function here takes const gchar * input; all but
 * g_utf8_pointer_to_offset return a non-const gchar *.
 * NOTE(review): the meanings below are inferred from the names and
 * parameter lists only -- confirm against the implementation:
 *   g_utf8_offset_to_pointer  pointer to the character OFFSET characters
 *                             into STR
 *   g_utf8_pointer_to_offset  number of characters between STR and POS
 *   g_utf8_prev_char          start of the character before P
 *   g_utf8_find_next_char     start of the next character after P,
 *                             bounded by END
 *   g_utf8_find_prev_char     start of the previous character before P,
 *                             not going before STR
 */
174gchar*   g_utf8_offset_to_pointer (const gchar *str,
175                                   glong        offset); 
176glong    g_utf8_pointer_to_offset (const gchar *str,     
177                                   const gchar *pos);
178gchar*   g_utf8_prev_char         (const gchar *p);
179gchar*   g_utf8_find_next_char    (const gchar *p,
180                                   const gchar *end);
181gchar*   g_utf8_find_prev_char    (const gchar *str,
182                                   const gchar *p);
183
/* NOTE(review): presumably returns the length of P in characters rather
 * than bytes, examining at most MAX bytes, with a negative MAX meaning
 * "up to the NUL terminator" -- confirm against the implementation.
 */
184glong g_utf8_strlen (const gchar *p, 
185                     gssize       max);       
186
187/* Copies n characters from src to dest.
   N counts characters, so the number of bytes written can exceed N.
   NOTE(review): DEST must presumably be large enough for the copied
   bytes, and the return value is presumably DEST -- confirm against the
   implementation.  */
188gchar* g_utf8_strncpy (gchar       *dest,
189                       const gchar *src,
190                       gsize        n);
191
192/* Find the UTF-8 encoding of the character c in string p.  These
193   functions are equivalents of strchr and strrchr.
   NOTE(review): LEN presumably limits the search to the first LEN bytes
   of P, with a negative value meaning "NUL-terminated" -- confirm.  */
194gchar* g_utf8_strchr  (const gchar *p,
195                       gssize       len,
196                       gunichar     c);
197gchar* g_utf8_strrchr (const gchar *p,
198                       gssize       len,
199                       gunichar     c);
/* NOTE(review): presumably returns a newly allocated copy of STR with its
   characters in reverse order -- confirm against the implementation.  */
200gchar* g_utf8_strreverse (const gchar *str,
201                          gssize len);
202
/* Conversions between the three string representations used in this
 * header: UTF-8 (gchar), UTF-16 (gunichar2) and UCS-4 (gunichar).
 * Every function takes the source string STR and a length LEN and returns
 * a pointer to the converted string.  Apart from g_utf8_to_ucs4_fast,
 * each also takes ITEMS_READ, ITEMS_WRITTEN and a GError ** through which
 * failures can be reported; g_utf8_to_ucs4_fast takes only ITEMS_WRITTEN
 * and has no error output.
 * NOTE(review): presumably LEN may be negative for a NUL-terminated
 * source, the out-parameters may be NULL, and the result is newly
 * allocated and owned by the caller -- confirm against the GLib
 * reference manual.
 */
203gunichar2 *g_utf8_to_utf16     (const gchar      *str,
204                                glong             len,           
205                                glong            *items_read,     
206                                glong            *items_written, 
207                                GError          **error);
208gunichar * g_utf8_to_ucs4      (const gchar      *str,
209                                glong             len,           
210                                glong            *items_read,     
211                                glong            *items_written, 
212                                GError          **error);
213gunichar * g_utf8_to_ucs4_fast (const gchar      *str,
214                                glong             len,           
215                                glong            *items_written);
216gunichar * g_utf16_to_ucs4     (const gunichar2  *str,
217                                glong             len,           
218                                glong            *items_read,     
219                                glong            *items_written, 
220                                GError          **error);
221gchar*     g_utf16_to_utf8     (const gunichar2  *str,
222                                glong             len,           
223                                glong            *items_read,     
224                                glong            *items_written, 
225                                GError          **error);
226gunichar2 *g_ucs4_to_utf16     (const gunichar   *str,
227                                glong             len,           
228                                glong            *items_read,     
229                                glong            *items_written, 
230                                GError          **error);
231gchar*     g_ucs4_to_utf8      (const gunichar   *str,
232                                glong             len,           
233                                glong            *items_read,     
234                                glong            *items_written, 
235                                GError          **error);
236
237/* Convert a single character into UTF-8. outbuf must have at
238 * least 6 bytes of space. Returns the number of bytes in the
239 * result.
 * NOTE(review): nothing here says whether OUTBUF is NUL-terminated;
 * presumably it is not, since only the byte count is returned --
 * confirm against the implementation.
240 */
241gint      g_unichar_to_utf8 (gunichar    c,
242                             gchar      *outbuf);
243
244/* Validate a UTF-8 string, return TRUE if valid, put pointer to
245 * first invalid char in *end.
 * NOTE(review): MAX_LEN presumably bounds the number of bytes examined
 * (negative meaning "NUL-terminated") and END may presumably be NULL --
 * confirm against the implementation.
246 */
247
248gboolean g_utf8_validate (const gchar  *str,
249                          gssize        max_len, 
250                          const gchar **end);
251
252/* Validate a Unicode character.
   Unlike most g_unichar_* declarations in this header, this one is not
   marked G_GNUC_CONST.
   NOTE(review): presumably returns TRUE when CH is a valid Unicode code
   point -- confirm against the implementation.  */
253gboolean g_unichar_validate (gunichar ch);
254
/* Case transformations applied to a whole UTF-8 string.  Each function
 * takes STR and LEN and returns a gchar *.
 * NOTE(review): presumably LEN is a byte length (negative meaning
 * "NUL-terminated"), the result is newly allocated and owned by the
 * caller, and g_utf8_casefold yields a form intended for caseless
 * comparison rather than for display -- confirm against the GLib
 * reference manual.
 */
255gchar *g_utf8_strup   (const gchar *str,
256                       gssize       len);
257gchar *g_utf8_strdown (const gchar *str,
258                       gssize       len);
259gchar *g_utf8_casefold (const gchar *str,
260                        gssize       len);
261
262typedef enum {
  /* Mode argument of g_utf8_normalize().  There are four distinct values,
   * 0..3; each is declared under a GLib-style name and then aliased to a
   * second name (NFD, NFC, NFKD, NFKC) with the same value, e.g.
   * G_NORMALIZE_NFD == G_NORMALIZE_DEFAULT == 0.
   */
263  G_NORMALIZE_DEFAULT,
264  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
265  G_NORMALIZE_DEFAULT_COMPOSE,
266  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
267  G_NORMALIZE_ALL,
268  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
269  G_NORMALIZE_ALL_COMPOSE,
270  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
271} GNormalizeMode;
272
/* Normalize the UTF-8 string STR according to MODE, a GNormalizeMode
 * value.
 * NOTE(review): presumably LEN is a byte length (negative meaning
 * "NUL-terminated") and the result is a newly allocated string owned by
 * the caller -- confirm against the GLib reference manual.
 */
273gchar *g_utf8_normalize (const gchar   *str,
274                         gssize         len,
275                         GNormalizeMode mode);
276
/* Collation of UTF-8 strings.
 * NOTE(review): g_utf8_collate presumably returns a value less than,
 * equal to or greater than zero in the manner of strcmp(), ordered by
 * the current locale's rules; g_utf8_collate_key presumably returns a
 * newly allocated key whose byte-wise comparison gives the same order --
 * confirm against the GLib reference manual.
 */
277gint   g_utf8_collate     (const gchar *str1,
278                           const gchar *str2);
279gchar *g_utf8_collate_key (const gchar *str,
280                           gssize       len);
281
/* NOTE(review): presumably returns TRUE and stores the mirrored
 * counterpart of CH in *MIRRORED_CH when CH has one, and returns FALSE
 * otherwise -- confirm against the implementation, including whether
 * MIRRORED_CH may be NULL.
 */
282gboolean g_unichar_get_mirror_char (gunichar ch,
283                                    gunichar *mirrored_ch);
284
285G_END_DECLS
286
287#endif /* __G_UNICODE_H__ */
Note: See TracBrowser for help on using the repository browser.