source: trunk/third/glib2/glib/gunicode.h @ 18159

Revision 18159, 9.3 KB checked in by ghudson, 22 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r18158, which included commits to RCS files with non-trunk default branches.
Line 
1/* gunicode.h - Unicode manipulation functions
2 *
3 *  Copyright (C) 1999, 2000 Tom Tromey
4 *  Copyright 2000 Red Hat, Inc.
5 *
6 * The Gnome Library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
10 *
11 * The Gnome Library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with the Gnome Library; see the file COPYING.LIB.  If not,
18 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 *   Boston, MA 02111-1307, USA.
20 */
21
22#ifndef __G_UNICODE_H__
23#define __G_UNICODE_H__
24
25#include <glib/gerror.h>
26#include <glib/gtypes.h>
27
28G_BEGIN_DECLS
29
30typedef guint32 gunichar;
31typedef guint16 gunichar2;
32
33/* Character classifications: one value per Unicode general category (the two-letter category code is noted beside each value).
34 * See http://www.unicode.org/Public/UNIDATA/UnicodeData.html
35 */
36typedef enum
37{
38  G_UNICODE_CONTROL,              /* Cc: other, control */
39  G_UNICODE_FORMAT,               /* Cf: other, format */
40  G_UNICODE_UNASSIGNED,           /* Cn: other, not assigned */
41  G_UNICODE_PRIVATE_USE,          /* Co: other, private use */
42  G_UNICODE_SURROGATE,            /* Cs: other, surrogate */
43  G_UNICODE_LOWERCASE_LETTER,     /* Ll: letter, lowercase */
44  G_UNICODE_MODIFIER_LETTER,      /* Lm: letter, modifier */
45  G_UNICODE_OTHER_LETTER,         /* Lo: letter, other */
46  G_UNICODE_TITLECASE_LETTER,     /* Lt: letter, titlecase */
47  G_UNICODE_UPPERCASE_LETTER,     /* Lu: letter, uppercase */
48  G_UNICODE_COMBINING_MARK,       /* Mc: mark, spacing combining */
49  G_UNICODE_ENCLOSING_MARK,       /* Me: mark, enclosing */
50  G_UNICODE_NON_SPACING_MARK,     /* Mn: mark, non-spacing */
51  G_UNICODE_DECIMAL_NUMBER,       /* Nd: number, decimal digit */
52  G_UNICODE_LETTER_NUMBER,        /* Nl: number, letter */
53  G_UNICODE_OTHER_NUMBER,         /* No: number, other */
54  G_UNICODE_CONNECT_PUNCTUATION,  /* Pc: punctuation, connector */
55  G_UNICODE_DASH_PUNCTUATION,     /* Pd: punctuation, dash */
56  G_UNICODE_CLOSE_PUNCTUATION,    /* Pe: punctuation, close */
57  G_UNICODE_FINAL_PUNCTUATION,    /* Pf: punctuation, final quote */
58  G_UNICODE_INITIAL_PUNCTUATION,  /* Pi: punctuation, initial quote */
59  G_UNICODE_OTHER_PUNCTUATION,    /* Po: punctuation, other */
60  G_UNICODE_OPEN_PUNCTUATION,     /* Ps: punctuation, open */
61  G_UNICODE_CURRENCY_SYMBOL,      /* Sc: symbol, currency */
62  G_UNICODE_MODIFIER_SYMBOL,      /* Sk: symbol, modifier */
63  G_UNICODE_MATH_SYMBOL,          /* Sm: symbol, math */
64  G_UNICODE_OTHER_SYMBOL,         /* So: symbol, other */
65  G_UNICODE_LINE_SEPARATOR,       /* Zl: separator, line */
66  G_UNICODE_PARAGRAPH_SEPARATOR,  /* Zp: separator, paragraph */
67  G_UNICODE_SPACE_SEPARATOR       /* Zs: separator, space */
68} GUnicodeType;
69
70/* Line break classifications, as returned by g_unichar_break_type(); the two-letter line-break class code is noted beside each value.
71 * See http://www.unicode.org/unicode/reports/tr14/
72 */
73typedef enum
74{
75  G_UNICODE_BREAK_MANDATORY,          /* BK */
76  G_UNICODE_BREAK_CARRIAGE_RETURN,    /* CR */
77  G_UNICODE_BREAK_LINE_FEED,          /* LF */
78  G_UNICODE_BREAK_COMBINING_MARK,     /* CM */
79  G_UNICODE_BREAK_SURROGATE,          /* SG */
80  G_UNICODE_BREAK_ZERO_WIDTH_SPACE,   /* ZW */
81  G_UNICODE_BREAK_INSEPARABLE,        /* IN */
82  G_UNICODE_BREAK_NON_BREAKING_GLUE,  /* GL */
83  G_UNICODE_BREAK_CONTINGENT,         /* CB */
84  G_UNICODE_BREAK_SPACE,              /* SP */
85  G_UNICODE_BREAK_AFTER,              /* BA */
86  G_UNICODE_BREAK_BEFORE,             /* BB */
87  G_UNICODE_BREAK_BEFORE_AND_AFTER,   /* B2 */
88  G_UNICODE_BREAK_HYPHEN,             /* HY */
89  G_UNICODE_BREAK_NON_STARTER,        /* NS */
90  G_UNICODE_BREAK_OPEN_PUNCTUATION,   /* OP */
91  G_UNICODE_BREAK_CLOSE_PUNCTUATION,  /* CL */
92  G_UNICODE_BREAK_QUOTATION,          /* QU */
93  G_UNICODE_BREAK_EXCLAMATION,        /* EX */
94  G_UNICODE_BREAK_IDEOGRAPHIC,        /* ID */
95  G_UNICODE_BREAK_NUMERIC,            /* NU */
96  G_UNICODE_BREAK_INFIX_SEPARATOR,    /* IS */
97  G_UNICODE_BREAK_SYMBOL,             /* SY */
98  G_UNICODE_BREAK_ALPHABETIC,         /* AL */
99  G_UNICODE_BREAK_PREFIX,             /* PR */
100  G_UNICODE_BREAK_POSTFIX,            /* PO */
101  G_UNICODE_BREAK_COMPLEX_CONTEXT,    /* SA */
102  G_UNICODE_BREAK_AMBIGUOUS,          /* AI */
103  G_UNICODE_BREAK_UNKNOWN             /* XX */
104} GUnicodeBreakType;
105
106/* Returns TRUE if current locale uses UTF-8 charset.  If CHARSET is
107 * not null, sets *CHARSET to the name of the current locale's
108 * charset.  This value is statically allocated, and should be copied
109 * in case the locale's charset will be changed later using setlocale()
110 * or in some other way.
111 */
112gboolean g_get_charset (G_CONST_RETURN char **charset);
113
114/* These are all analogs of the <ctype.h> functions.
115 */
116gboolean g_unichar_isalnum   (gunichar c) G_GNUC_CONST;
117gboolean g_unichar_isalpha   (gunichar c) G_GNUC_CONST;
118gboolean g_unichar_iscntrl   (gunichar c) G_GNUC_CONST;
119gboolean g_unichar_isdigit   (gunichar c) G_GNUC_CONST;
120gboolean g_unichar_isgraph   (gunichar c) G_GNUC_CONST;
121gboolean g_unichar_islower   (gunichar c) G_GNUC_CONST;
122gboolean g_unichar_isprint   (gunichar c) G_GNUC_CONST;
123gboolean g_unichar_ispunct   (gunichar c) G_GNUC_CONST;
124gboolean g_unichar_isspace   (gunichar c) G_GNUC_CONST;
125gboolean g_unichar_isupper   (gunichar c) G_GNUC_CONST;
126gboolean g_unichar_isxdigit  (gunichar c) G_GNUC_CONST;
127gboolean g_unichar_istitle   (gunichar c) G_GNUC_CONST;
128gboolean g_unichar_isdefined (gunichar c) G_GNUC_CONST;
129gboolean g_unichar_iswide    (gunichar c) G_GNUC_CONST;
130
131/* More <ctype.h> functions.  These convert between the three cases.
132 * See the Unicode book to understand title case.  */
133gunichar g_unichar_toupper (gunichar c) G_GNUC_CONST;
134gunichar g_unichar_tolower (gunichar c) G_GNUC_CONST;
135gunichar g_unichar_totitle (gunichar c) G_GNUC_CONST;
136
137/* If C is a digit (according to `g_unichar_isdigit'), then return its
138   numeric value.  Otherwise return -1.  */
139gint g_unichar_digit_value (gunichar c) G_GNUC_CONST;
140
141gint g_unichar_xdigit_value (gunichar c) G_GNUC_CONST;
142
143/* Return the Unicode character type of a given character.  */
144GUnicodeType g_unichar_type (gunichar c) G_GNUC_CONST;
145
146/* Return the line break property for a given character */
147GUnicodeBreakType g_unichar_break_type (gunichar c) G_GNUC_CONST;
148
149
150/* Compute canonical ordering of a string in-place.  This rearranges
151   decomposed characters in the string according to their combining
152   classes.  See the Unicode manual for more information.  */
153void g_unicode_canonical_ordering (gunichar *string,
154                                   gsize     len);
155
156/* Compute canonical decomposition of a character.  Returns g_malloc()d
157   string of Unicode characters.  RESULT_LEN is set to the resulting
158   length of the string.  */
159gunichar *g_unicode_canonical_decomposition (gunichar  ch,
160                                             gsize    *result_len);
161
162/* Array of skip-bytes-per-initial character.
163 */
164GLIB_VAR const gchar * const g_utf8_skip;
165
/* Advance to the character following P in a UTF-8 string by adding the
 * skip count that g_utf8_skip records for P's initial byte.  The string
 * is not validated here.  P is evaluated twice, so it must not have side
 * effects (no `p++').  The expansion is fully parenthesized so that the
 * result can be subscripted or dereferenced directly, and the initial
 * byte is read through a const pointer so that const strings do not
 * have their qualifier cast away for the lookup.
 */
#define g_utf8_next_char(p) ((char *)((p) + g_utf8_skip[*(const guchar *)(p)]))
167
168gunichar g_utf8_get_char           (const gchar  *p);
169gunichar g_utf8_get_char_validated (const  gchar *p,
170                                    gssize        max_len);
171
172gchar*   g_utf8_offset_to_pointer (const gchar *str,
173                                   glong        offset); 
174glong    g_utf8_pointer_to_offset (const gchar *str,     
175                                   const gchar *pos);
176gchar*   g_utf8_prev_char         (const gchar *p);
177gchar*   g_utf8_find_next_char    (const gchar *p,
178                                   const gchar *end);
179gchar*   g_utf8_find_prev_char    (const gchar *str,
180                                   const gchar *p);
181
182glong g_utf8_strlen (const gchar *p, 
183                     gssize       max);       
184
185/* Copies n characters from src to dest */
186gchar* g_utf8_strncpy (gchar       *dest,
187                       const gchar *src,
188                       gsize        n);
189
190/* Find the UTF-8 character corresponding to c, in string p. These
191   functions are equivalents of strchr and strrchr */
192gchar* g_utf8_strchr  (const gchar *p,
193                       gssize       len,
194                       gunichar     c);
195gchar* g_utf8_strrchr (const gchar *p,
196                       gssize       len,
197                       gunichar     c);
198gchar* g_utf8_strreverse (const gchar *str,
199                          gssize len);
200
201gunichar2 *g_utf8_to_utf16     (const gchar      *str,
202                                glong             len,           
203                                glong            *items_read,     
204                                glong            *items_written, 
205                                GError          **error);
206gunichar * g_utf8_to_ucs4      (const gchar      *str,
207                                glong             len,           
208                                glong            *items_read,     
209                                glong            *items_written, 
210                                GError          **error);
211gunichar * g_utf8_to_ucs4_fast (const gchar      *str,
212                                glong             len,           
213                                glong            *items_written);
214gunichar * g_utf16_to_ucs4     (const gunichar2  *str,
215                                glong             len,           
216                                glong            *items_read,     
217                                glong            *items_written, 
218                                GError          **error);
219gchar*     g_utf16_to_utf8     (const gunichar2  *str,
220                                glong             len,           
221                                glong            *items_read,     
222                                glong            *items_written, 
223                                GError          **error);
224gunichar2 *g_ucs4_to_utf16     (const gunichar   *str,
225                                glong             len,           
226                                glong            *items_read,     
227                                glong            *items_written, 
228                                GError          **error);
229gchar*     g_ucs4_to_utf8      (const gunichar   *str,
230                                glong             len,           
231                                glong            *items_read,     
232                                glong            *items_written, 
233                                GError          **error);
234
235/* Convert a single character into UTF-8. outbuf must have at
236 * least 6 bytes of space. Returns the number of bytes in the
237 * result.
238 */
239gint      g_unichar_to_utf8 (gunichar    c,
240                             gchar      *outbuf);
241
242/* Validate a UTF-8 string; return TRUE if valid. Store a pointer to
243 * the first invalid char in *end.
244 */
245
246gboolean g_utf8_validate (const gchar  *str,
247                          gssize        max_len, 
248                          const gchar **end);
249
250/* Validate a Unicode character */
251gboolean g_unichar_validate (gunichar ch);
252
253gchar *g_utf8_strup   (const gchar *str,
254                       gssize       len);
255gchar *g_utf8_strdown (const gchar *str,
256                       gssize       len);
257gchar *g_utf8_casefold (const gchar *str,
258                        gssize       len);
259
260typedef enum {                                  /* Normalization modes accepted by g_utf8_normalize(); each *_NF* name is an alias with the same value as the enumerator before it. */
261  G_NORMALIZE_DEFAULT,
262  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,          /* alias; NFD is Unicode Normalization Form D (canonical decomposition) */
263  G_NORMALIZE_DEFAULT_COMPOSE,
264  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,  /* alias; NFC is Form C (canonical decomposition, then composition) */
265  G_NORMALIZE_ALL,
266  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,             /* alias; NFKD is Form KD (compatibility decomposition) */
267  G_NORMALIZE_ALL_COMPOSE,
268  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE      /* alias; NFKC is Form KC (compatibility decomposition, then composition) */
269} GNormalizeMode;
270
271gchar *g_utf8_normalize (const gchar   *str,
272                         gssize         len,
273                         GNormalizeMode mode);
274
275gint   g_utf8_collate     (const gchar *str1,
276                           const gchar *str2);
277gchar *g_utf8_collate_key (const gchar *str,
278                           gssize       len);
279
280G_END_DECLS
281
282#endif /* __G_UNICODE_H__ */
Note: See TracBrowser for help on using the repository browser.