source: trunk/third/libxml2/xmlstring.c @ 21532

Revision 21532, 24.3 KB checked in by ghudson, 20 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r21531, which included commits to RCS files with non-trunk default branches.
Line 
1/*
2 * string.c : an XML string utilities module
3 *
4 * This module provides various utility functions for manipulating
5 * the xmlChar* type. All functions named xmlStr* have been moved here
6 * from the parser.c file (their original home).
7 *
8 * See Copyright for the status of this software.
9 *
10 * UTF8 string routines from:
11 * William Brack <wbrack@mmm.com.hk>
12 *
13 * daniel@veillard.com
14 */
15
16#define IN_LIBXML
17#include "libxml.h"
18
19#include <stdlib.h>
20#include <string.h>
21#include <libxml/xmlmemory.h>
22#include <libxml/parserInternals.h>
23#include <libxml/xmlstring.h>
24
25/************************************************************************
26 *                                                                      *
27 *                Commodity functions to handle xmlChars                *
28 *                                                                      *
29 ************************************************************************/
30
31/**
32 * xmlStrndup:
33 * @cur:  the input xmlChar *
34 * @len:  the len of @cur
35 *
36 * a strndup for array of xmlChar's
37 *
38 * Returns a new xmlChar * or NULL
39 */
40xmlChar *
41xmlStrndup(const xmlChar *cur, int len) {
42    xmlChar *ret;
43   
44    if ((cur == NULL) || (len < 0)) return(NULL);
45    ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46    if (ret == NULL) {
47        xmlErrMemory(NULL, NULL);
48        return(NULL);
49    }
50    memcpy(ret, cur, len * sizeof(xmlChar));
51    ret[len] = 0;
52    return(ret);
53}
54
55/**
56 * xmlStrdup:
57 * @cur:  the input xmlChar *
58 *
59 * a strdup for array of xmlChar's. Since they are supposed to be
60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61 * a termination mark of '0'.
62 *
63 * Returns a new xmlChar * or NULL
64 */
65xmlChar *
66xmlStrdup(const xmlChar *cur) {
67    const xmlChar *p = cur;
68
69    if (cur == NULL) return(NULL);
70    while (*p != 0) p++; /* non input consuming */
71    return(xmlStrndup(cur, p - cur));
72}
73
74/**
75 * xmlCharStrndup:
76 * @cur:  the input char *
77 * @len:  the len of @cur
78 *
79 * a strndup for char's to xmlChar's
80 *
81 * Returns a new xmlChar * or NULL
82 */
83
84xmlChar *
85xmlCharStrndup(const char *cur, int len) {
86    int i;
87    xmlChar *ret;
88   
89    if ((cur == NULL) || (len < 0)) return(NULL);
90    ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91    if (ret == NULL) {
92        xmlErrMemory(NULL, NULL);
93        return(NULL);
94    }
95    for (i = 0;i < len;i++) {
96        ret[i] = (xmlChar) cur[i];
97        if (ret[i] == 0) return(ret);
98    }
99    ret[len] = 0;
100    return(ret);
101}
102
103/**
104 * xmlCharStrdup:
105 * @cur:  the input char *
106 *
107 * a strdup for char's to xmlChar's
108 *
109 * Returns a new xmlChar * or NULL
110 */
111
112xmlChar *
113xmlCharStrdup(const char *cur) {
114    const char *p = cur;
115
116    if (cur == NULL) return(NULL);
117    while (*p != '\0') p++; /* non input consuming */
118    return(xmlCharStrndup(cur, p - cur));
119}
120
121/**
122 * xmlStrcmp:
123 * @str1:  the first xmlChar *
124 * @str2:  the second xmlChar *
125 *
126 * a strcmp for xmlChar's
127 *
128 * Returns the integer result of the comparison
129 */
130
131int
132xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
133    register int tmp;
134
135    if (str1 == str2) return(0);
136    if (str1 == NULL) return(-1);
137    if (str2 == NULL) return(1);
138    do {
139        tmp = *str1++ - *str2;
140        if (tmp != 0) return(tmp);
141    } while (*str2++ != 0);
142    return 0;
143}
144
145/**
146 * xmlStrEqual:
147 * @str1:  the first xmlChar *
148 * @str2:  the second xmlChar *
149 *
150 * Check if both string are equal of have same content
151 * Should be a bit more readable and faster than xmlStrEqual()
152 *
153 * Returns 1 if they are equal, 0 if they are different
154 */
155
156int
157xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
158    if (str1 == str2) return(1);
159    if (str1 == NULL) return(0);
160    if (str2 == NULL) return(0);
161    do {
162        if (*str1++ != *str2) return(0);
163    } while (*str2++);
164    return(1);
165}
166
167/**
168 * xmlStrQEqual:
169 * @pref:  the prefix of the QName
170 * @name:  the localname of the QName
171 * @str:  the second xmlChar *
172 *
173 * Check if a QName is Equal to a given string
174 *
175 * Returns 1 if they are equal, 0 if they are different
176 */
177
178int
179xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
180    if (pref == NULL) return(xmlStrEqual(name, str));
181    if (name == NULL) return(0);
182    if (str == NULL) return(0);
183
184    do {
185        if (*pref++ != *str) return(0);
186    } while ((*str++) && (*pref));
187    if (*str++ != ':') return(0);
188    do {
189        if (*name++ != *str) return(0);
190    } while (*str++);
191    return(1);
192}
193
194/**
195 * xmlStrncmp:
196 * @str1:  the first xmlChar *
197 * @str2:  the second xmlChar *
198 * @len:  the max comparison length
199 *
200 * a strncmp for xmlChar's
201 *
202 * Returns the integer result of the comparison
203 */
204
205int
206xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
207    register int tmp;
208
209    if (len <= 0) return(0);
210    if (str1 == str2) return(0);
211    if (str1 == NULL) return(-1);
212    if (str2 == NULL) return(1);
213#ifdef __GNUC__
214    tmp = strncmp((const char *)str1, (const char *)str2, len);
215    return tmp;
216#else
217    do {
218        tmp = *str1++ - *str2;
219        if (tmp != 0 || --len == 0) return(tmp);
220    } while (*str2++ != 0);
221    return 0;
222#endif
223}
224
225static const xmlChar casemap[256] = {
226    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
227    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
228    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
229    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
230    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
231    0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
232    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
233    0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
234    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
235    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
236    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
237    0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
238    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
239    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
240    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
241    0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
242    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
243    0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
244    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
245    0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
246    0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
247    0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
248    0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
249    0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
250    0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
251    0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
252    0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
253    0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
254    0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
255    0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
256    0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
257    0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
258};
259
260/**
261 * xmlStrcasecmp:
262 * @str1:  the first xmlChar *
263 * @str2:  the second xmlChar *
264 *
265 * a strcasecmp for xmlChar's
266 *
267 * Returns the integer result of the comparison
268 */
269
270int
271xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
272    register int tmp;
273
274    if (str1 == str2) return(0);
275    if (str1 == NULL) return(-1);
276    if (str2 == NULL) return(1);
277    do {
278        tmp = casemap[*str1++] - casemap[*str2];
279        if (tmp != 0) return(tmp);
280    } while (*str2++ != 0);
281    return 0;
282}
283
284/**
285 * xmlStrncasecmp:
286 * @str1:  the first xmlChar *
287 * @str2:  the second xmlChar *
288 * @len:  the max comparison length
289 *
290 * a strncasecmp for xmlChar's
291 *
292 * Returns the integer result of the comparison
293 */
294
295int
296xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
297    register int tmp;
298
299    if (len <= 0) return(0);
300    if (str1 == str2) return(0);
301    if (str1 == NULL) return(-1);
302    if (str2 == NULL) return(1);
303    do {
304        tmp = casemap[*str1++] - casemap[*str2];
305        if (tmp != 0 || --len == 0) return(tmp);
306    } while (*str2++ != 0);
307    return 0;
308}
309
310/**
311 * xmlStrchr:
312 * @str:  the xmlChar * array
313 * @val:  the xmlChar to search
314 *
315 * a strchr for xmlChar's
316 *
317 * Returns the xmlChar * for the first occurrence or NULL.
318 */
319
320const xmlChar *
321xmlStrchr(const xmlChar *str, xmlChar val) {
322    if (str == NULL) return(NULL);
323    while (*str != 0) { /* non input consuming */
324        if (*str == val) return((xmlChar *) str);
325        str++;
326    }
327    return(NULL);
328}
329
330/**
331 * xmlStrstr:
332 * @str:  the xmlChar * array (haystack)
333 * @val:  the xmlChar to search (needle)
334 *
335 * a strstr for xmlChar's
336 *
337 * Returns the xmlChar * for the first occurrence or NULL.
338 */
339
340const xmlChar *
341xmlStrstr(const xmlChar *str, const xmlChar *val) {
342    int n;
343   
344    if (str == NULL) return(NULL);
345    if (val == NULL) return(NULL);
346    n = xmlStrlen(val);
347
348    if (n == 0) return(str);
349    while (*str != 0) { /* non input consuming */
350        if (*str == *val) {
351            if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
352        }
353        str++;
354    }
355    return(NULL);
356}
357
358/**
359 * xmlStrcasestr:
360 * @str:  the xmlChar * array (haystack)
361 * @val:  the xmlChar to search (needle)
362 *
363 * a case-ignoring strstr for xmlChar's
364 *
365 * Returns the xmlChar * for the first occurrence or NULL.
366 */
367
368const xmlChar *
369xmlStrcasestr(const xmlChar *str, xmlChar *val) {
370    int n;
371   
372    if (str == NULL) return(NULL);
373    if (val == NULL) return(NULL);
374    n = xmlStrlen(val);
375
376    if (n == 0) return(str);
377    while (*str != 0) { /* non input consuming */
378        if (casemap[*str] == casemap[*val])
379            if (!xmlStrncasecmp(str, val, n)) return(str);
380        str++;
381    }
382    return(NULL);
383}
384
385/**
386 * xmlStrsub:
387 * @str:  the xmlChar * array (haystack)
388 * @start:  the index of the first char (zero based)
389 * @len:  the length of the substring
390 *
391 * Extract a substring of a given string
392 *
393 * Returns the xmlChar * for the first occurrence or NULL.
394 */
395
396xmlChar *
397xmlStrsub(const xmlChar *str, int start, int len) {
398    int i;
399   
400    if (str == NULL) return(NULL);
401    if (start < 0) return(NULL);
402    if (len < 0) return(NULL);
403
404    for (i = 0;i < start;i++) {
405        if (*str == 0) return(NULL);
406        str++;
407    }
408    if (*str == 0) return(NULL);
409    return(xmlStrndup(str, len));
410}
411
412/**
413 * xmlStrlen:
414 * @str:  the xmlChar * array
415 *
416 * length of a xmlChar's string
417 *
418 * Returns the number of xmlChar contained in the ARRAY.
419 */
420
421int
422xmlStrlen(const xmlChar *str) {
423    int len = 0;
424
425    if (str == NULL) return(0);
426    while (*str != 0) { /* non input consuming */
427        str++;
428        len++;
429    }
430    return(len);
431}
432
433/**
434 * xmlStrncat:
435 * @cur:  the original xmlChar * array
436 * @add:  the xmlChar * array added
437 * @len:  the length of @add
438 *
439 * a strncat for array of xmlChar's, it will extend @cur with the len
440 * first bytes of @add.
441 *
442 * Returns a new xmlChar *, the original @cur is reallocated if needed
443 * and should not be freed
444 */
445
446xmlChar *
447xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
448    int size;
449    xmlChar *ret;
450
451    if ((add == NULL) || (len == 0))
452        return(cur);
453    if (cur == NULL)
454        return(xmlStrndup(add, len));
455
456    size = xmlStrlen(cur);
457    ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
458    if (ret == NULL) {
459        xmlErrMemory(NULL, NULL);
460        return(cur);
461    }
462    memcpy(&ret[size], add, len * sizeof(xmlChar));
463    ret[size + len] = 0;
464    return(ret);
465}
466
467/**
468 * xmlStrncatNew:
469 * @str1:  first xmlChar string
470 * @str2:  second xmlChar string
471 * @len:  the len of @str2
472 *
473 * same as xmlStrncat, but creates a new string.  The original
474 * two strings are not freed.
475 *
476 * Returns a new xmlChar * or NULL
477 */
478xmlChar *
479xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
480    int size;
481    xmlChar *ret;
482
483    if (len < 0)
484        len = xmlStrlen(str2);
485    if ((str2 == NULL) || (len == 0))
486        return(xmlStrdup(str1));
487    if (str1 == NULL)
488        return(xmlStrndup(str2, len));
489
490    size = xmlStrlen(str1);
491    ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
492    if (ret == NULL) {
493        xmlErrMemory(NULL, NULL);
494        return(xmlStrndup(str1, size));
495    }
496    memcpy(ret, str1, size * sizeof(xmlChar));
497    memcpy(&ret[size], str2, len * sizeof(xmlChar));
498    ret[size + len] = 0;
499    return(ret);
500}
501
502/**
503 * xmlStrcat:
504 * @cur:  the original xmlChar * array
505 * @add:  the xmlChar * array added
506 *
507 * a strcat for array of xmlChar's. Since they are supposed to be
508 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
509 * a termination mark of '0'.
510 *
511 * Returns a new xmlChar * containing the concatenated string.
512 */
513xmlChar *
514xmlStrcat(xmlChar *cur, const xmlChar *add) {
515    const xmlChar *p = add;
516
517    if (add == NULL) return(cur);
518    if (cur == NULL)
519        return(xmlStrdup(add));
520
521    while (*p != 0) p++; /* non input consuming */
522    return(xmlStrncat(cur, add, p - add));
523}
524
525/**
526 * xmlStrPrintf:
527 * @buf:   the result buffer.
528 * @len:   the result buffer length.
529 * @msg:   the message with printf formatting.
530 * @...:   extra parameters for the message.
531 *
532 * Formats @msg and places result into @buf.
533 *
534 * Returns the number of characters written to @buf or -1 if an error occurs.
535 */
536int
537xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
538    va_list args;
539    int ret;
540   
541    if((buf == NULL) || (msg == NULL)) {
542        return(-1);
543    }
544   
545    va_start(args, msg);
546    ret = vsnprintf((char *) buf, len, (const char *) msg, args);
547    va_end(args);
548    buf[len - 1] = 0; /* be safe ! */
549   
550    return(ret);
551}
552
553/**
554 * xmlStrVPrintf:
555 * @buf:   the result buffer.
556 * @len:   the result buffer length.
557 * @msg:   the message with printf formatting.
558 * @ap:    extra parameters for the message.
559 *
560 * Formats @msg and places result into @buf.
561 *
562 * Returns the number of characters written to @buf or -1 if an error occurs.
563 */
564int
565xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
566    int ret;
567   
568    if((buf == NULL) || (msg == NULL)) {
569        return(-1);
570    }
571   
572    ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
573    buf[len - 1] = 0; /* be safe ! */
574   
575    return(ret);
576}
577
578/************************************************************************
579 *                                                                      *
580 *              Generic UTF8 handling routines                          *
581 *                                                                      *
582 * From rfc2044: encoding of the Unicode values on UTF-8:               *
583 *                                                                      *
584 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
585 * 0000 0000-0000 007F   0xxxxxxx                                       *
586 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
587 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
588 *                                                                      *
589 * I hope we won't use values > 0xFFFF anytime soon !                   *
590 *                                                                      *
591 ************************************************************************/
592
593
594/**
595 * xmlUTF8Size:
596 * @utf: pointer to the UTF8 character
597 *
598 * calculates the internal size of a UTF8 character
599 *
600 * returns the numbers of bytes in the character, -1 on format error
601 */
602int
603xmlUTF8Size(const xmlChar *utf) {
604    xmlChar mask;
605    int len;
606
607    if (utf == NULL)
608        return -1;
609    if (*utf < 0x80)
610        return 1;
611    /* check valid UTF8 character */
612    if (!(*utf & 0x40))
613        return -1;
614    /* determine number of bytes in char */
615    len = 2;
616    for (mask=0x20; mask != 0; mask>>=1) {
617        if (!(*utf & mask))
618            return len;
619        len++;
620    }
621    return -1;
622}
623
624/**
625 * xmlUTF8Charcmp:
626 * @utf1: pointer to first UTF8 char
627 * @utf2: pointer to second UTF8 char
628 *
629 * compares the two UCS4 values
630 *
631 * returns result of the compare as with xmlStrncmp
632 */
633int
634xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
635
636    if (utf1 == NULL ) {
637        if (utf2 == NULL)
638            return 0;
639        return -1;
640    }
641    return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
642}
643
644/**
645 * xmlUTF8Strlen:
646 * @utf:  a sequence of UTF-8 encoded bytes
647 *
648 * compute the length of an UTF8 string, it doesn't do a full UTF8
649 * checking of the content of the string.
650 *
651 * Returns the number of characters in the string or -1 in case of error
652 */
653int
654xmlUTF8Strlen(const xmlChar *utf) {
655    int ret = 0;
656
657    if (utf == NULL)
658        return(-1);
659
660    while (*utf != 0) {
661        if (utf[0] & 0x80) {
662            if ((utf[1] & 0xc0) != 0x80)
663                return(-1);
664            if ((utf[0] & 0xe0) == 0xe0) {
665                if ((utf[2] & 0xc0) != 0x80)
666                    return(-1);
667                if ((utf[0] & 0xf0) == 0xf0) {
668                    if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
669                        return(-1);
670                    utf += 4;
671                } else {
672                    utf += 3;
673                }
674            } else {
675                utf += 2;
676            }
677        } else {
678            utf++;
679        }
680        ret++;
681    }
682    return(ret);
683}
684
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. Sequences of 1 to 4 bytes
 * are decoded; the continuation bytes are verified.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int val;
    int avail;

    /* without @len there is nothing to report the size into */
    if (len == NULL)
        return(-1);
    if (utf == NULL)
        goto fail;
    avail = *len;
    if (avail < 1)
        goto fail;

    val = utf[0];
    if ((val & 0x80) == 0) {
        /* 1-byte code */
        *len = 1;
        return(val);
    }

    if ((avail < 2) || ((utf[1] & 0xc0) != 0x80))
        goto fail;
    if ((val & 0xe0) != 0xe0) {
        /* 2-byte code */
        *len = 2;
        val = (utf[0] & 0x1f) << 6;
        val |= utf[1] & 0x3f;
        return(val);
    }

    if ((avail < 3) || ((utf[2] & 0xc0) != 0x80))
        goto fail;
    if ((val & 0xf0) != 0xf0) {
        /* 3-byte code */
        *len = 3;
        val = (utf[0] & 0xf) << 12;
        val |= (utf[1] & 0x3f) << 6;
        val |= utf[2] & 0x3f;
        return(val);
    }

    if (avail < 4)
        goto fail;
    /* the lead byte must be 11110xxx, longer forms are rejected */
    if (((val & 0xf8) != 0xf0) || ((utf[3] & 0xc0) != 0x80))
        goto fail;
    /* 4-byte code */
    *len = 4;
    val = (utf[0] & 0x7) << 18;
    val |= (utf[1] & 0x3f) << 12;
    val |= (utf[2] & 0x3f) << 6;
    val |= utf[3] & 0x3f;
    return(val);

fail:
    *len = 0;
    return(-1);
}
754
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. Only the structure of the sequences is verified
 * (a lead byte followed by the right number of continuation bytes),
 * so encodings longer than necessary are accepted. Lead bytes
 * announcing more than 4 bytes are rejected, but the 0x10ffff
 * maximum value is not checked.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *p;
    int trail;

    if (utf == NULL)
        return(0);
    /*
     * The accepted sequences are (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    p = utf;
    while (*p != 0) {                   /* string is 0-terminated */
        unsigned char lead = *p++;

        /* the lead byte tells how many continuation bytes follow */
        if ((lead & 0x80) == 0x00)
            trail = 0;
        else if ((lead & 0xe0) == 0xc0)
            trail = 1;
        else if ((lead & 0xf0) == 0xe0)
            trail = 2;
        else if ((lead & 0xf8) == 0xf0)
            trail = 3;
        else                            /* unknown encoding */
            return 0;

        /* a 0 terminator is not a continuation byte, so this stops */
        for (; trail > 0; trail--) {
            if ((*p & 0xc0) != 0x80)
                return 0;
            p++;
        }
    }
    return(1);
}
807
808/**
809 * xmlUTF8Strsize:
810 * @utf:  a sequence of UTF-8 encoded bytes
811 * @len:  the number of characters in the array
812 *
813 * storage size of an UTF8 string
814 * the behaviour is not garanteed if the input string is not UTF-8
815 *
816 * Returns the storage size of
817 * the first 'len' characters of ARRAY
818 */
819
820int
821xmlUTF8Strsize(const xmlChar *utf, int len) {
822    const xmlChar   *ptr=utf;
823    xmlChar         ch;
824
825    if (utf == NULL)
826        return(0);
827
828    if (len <= 0)
829        return(0);
830
831    while ( len-- > 0) {
832        if ( !*ptr )
833            break;
834        if ( (ch = *ptr++) & 0x80)
835            while ((ch<<=1) & 0x80 ) {
836                ptr++;
837                if (*ptr == 0) break;
838            }
839    }
840    return (ptr - utf);
841}
842
843
844/**
845 * xmlUTF8Strndup:
846 * @utf:  the input UTF8 *
847 * @len:  the len of @utf (in chars)
848 *
849 * a strndup for array of UTF8's
850 *
851 * Returns a new UTF8 * or NULL
852 */
853xmlChar *
854xmlUTF8Strndup(const xmlChar *utf, int len) {
855    xmlChar *ret;
856    int i;
857   
858    if ((utf == NULL) || (len < 0)) return(NULL);
859    i = xmlUTF8Strsize(utf, len);
860    ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
861    if (ret == NULL) {
862        xmlGenericError(xmlGenericErrorContext,
863                "malloc of %ld byte failed\n",
864                (len + 1) * (long)sizeof(xmlChar));
865        return(NULL);
866    }
867    memcpy(ret, utf, i * sizeof(xmlChar));
868    ret[i] = 0;
869    return(ret);
870}
871
872/**
873 * xmlUTF8Strpos:
874 * @utf:  the input UTF8 *
875 * @pos:  the position of the desired UTF8 char (in chars)
876 *
877 * a function to provide the equivalent of fetching a
878 * character from a string array
879 *
880 * Returns a pointer to the UTF8 character or NULL
881 */
882const xmlChar *
883xmlUTF8Strpos(const xmlChar *utf, int pos) {
884    xmlChar ch;
885
886    if (utf == NULL) return(NULL);
887    if (pos < 0)
888        return(NULL);
889    while (pos--) {
890        if ((ch=*utf++) == 0) return(NULL);
891        if ( ch & 0x80 ) {
892            /* if not simple ascii, verify proper format */
893            if ( (ch & 0xc0) != 0xc0 )
894                return(NULL);
895            /* then skip over remaining bytes for this char */
896            while ( (ch <<= 1) & 0x80 )
897                if ( (*utf++ & 0xc0) != 0x80 )
898                    return(NULL);
899        }
900    }
901    return((xmlChar *)utf);
902}
903
904/**
905 * xmlUTF8Strloc:
906 * @utf:  the input UTF8 *
907 * @utfchar:  the UTF8 character to be found
908 *
909 * a function to provide the relative location of a UTF8 char
910 *
911 * Returns the relative character position of the desired char
912 * or -1 if not found
913 */
914int
915xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
916    int i, size;
917    xmlChar ch;
918
919    if (utf==NULL || utfchar==NULL) return -1;
920    size = xmlUTF8Strsize(utfchar, 1);
921        for(i=0; (ch=*utf) != 0; i++) {
922            if (xmlStrncmp(utf, utfchar, size)==0)
923                return(i);
924            utf++;
925            if ( ch & 0x80 ) {
926                /* if not simple ascii, verify proper format */
927                if ( (ch & 0xc0) != 0xc0 )
928                    return(-1);
929                /* then skip over remaining bytes for this char */
930                while ( (ch <<= 1) & 0x80 )
931                    if ( (*utf++ & 0xc0) != 0x80 )
932                        return(-1);
933            }
934        }
935
936    return(-1);
937}
938/**
939 * xmlUTF8Strsub:
940 * @utf:  a sequence of UTF-8 encoded bytes
941 * @start: relative pos of first char
942 * @len:   total number to copy
943 *
944 * Create a substring from a given UTF-8 string
945 * Note:  positions are given in units of UTF-8 chars
946 *
947 * Returns a pointer to a newly created string
948 * or NULL if any problem
949 */
950
951xmlChar *
952xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
953    int            i;
954    xmlChar ch;
955
956    if (utf == NULL) return(NULL);
957    if (start < 0) return(NULL);
958    if (len < 0) return(NULL);
959
960    /*
961     * Skip over any leading chars
962     */
963    for (i = 0;i < start;i++) {
964        if ((ch=*utf++) == 0) return(NULL);
965        if ( ch & 0x80 ) {
966            /* if not simple ascii, verify proper format */
967            if ( (ch & 0xc0) != 0xc0 )
968                return(NULL);
969            /* then skip over remaining bytes for this char */
970            while ( (ch <<= 1) & 0x80 )
971                if ( (*utf++ & 0xc0) != 0x80 )
972                    return(NULL);
973        }
974    }
975
976    return(xmlUTF8Strndup(utf, len));
977}
Note: See TracBrowser for help on using the repository browser.