source: trunk/third/libxml2/encoding.c @ 17096

Revision 17096, 64.0 KB checked in by ghudson, 23 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r17095, which included commits to RCS files with non-trunk default branches.
Line 
1/*
2 * encoding.c : implements the encoding conversion functions needed for XML
3 *
4 * Related specs:
5 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
6 * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
7 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
8 * [ISO-8859-1]   ISO Latin-1 characters codes.
9 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
10 *                Worldwide Character Encoding -- Version 1.0", Addison-
11 *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
12 *                described in Unicode Technical Report #4.
13 * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
14 *                Information Interchange, ANSI X3.4-1986.
15 *
16 * See Copyright for the status of this software.
17 *
18 * daniel@veillard.com
19 *
20 * UTF8 string routines from:
21 * "William M. Brack" <wbrack@mmm.com.hk>
22 *
23 * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
24 */
25
26#include "libxml.h"
27
28#include <string.h>
29
30#ifdef HAVE_CTYPE_H
31#include <ctype.h>
32#endif
33#ifdef HAVE_STDLIB_H
34#include <stdlib.h>
35#endif
36#ifdef LIBXML_ICONV_ENABLED
37#ifdef HAVE_ERRNO_H
38#include <errno.h>
39#endif
40#endif
41#include <libxml/xmlmemory.h>
42#include <libxml/encoding.h>
43#ifdef LIBXML_HTML_ENABLED
44#include <libxml/HTMLparser.h>
45#endif
46#include <libxml/globals.h>
47#include <libxml/xmlerror.h>
48
/*
 * NOTE(review): presumably the cached handlers for the UTF-16LE and
 * UTF-16BE encodings; nothing in this part of the file assigns or reads
 * them, so confirm against the handler registration code.
 */
49static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
50static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
51
/*
 * One entry of the encoding alias table: a pair of C strings.
 * NOTE(review): presumably @alias is the alternate spelling and @name the
 * encoding it stands for - confirm against the alias lookup code.
 */
52typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
53typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
54struct _xmlCharEncodingAlias {
55    const char *name;
56    const char *alias;
57};
58
/*
 * Alias table storage, initially empty.
 * NOTE(review): the Nb/Max pair looks like "entries in use" and
 * "entries allocated" - confirm where the table is grown.
 */
59static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
60static int xmlCharEncodingAliasesNb = 0;
61static int xmlCharEncodingAliasesMax = 0;
62
/*
 * Compile-time switch for encoding traces. The "#if 0" keeps
 * DEBUG_ENCODING undefined even when iconv support is enabled.
 */
63#ifdef LIBXML_ICONV_ENABLED
64#if 0
65#define DEBUG_ENCODING  /* Define this to get encoding traces */
66#endif
67#endif
68
/*
 * Byte-order flag, non-zero when the host is treated as little-endian.
 * It defaults to 1.
 * NOTE(review): presumably corrected at initialization time elsewhere in
 * the file - confirm.
 */
69static int xmlLittleEndian = 1;
70
71/************************************************************************
72 *                                                                      *
73 *                      Generic UTF8 handling routines                  *
74 *                                                                      *
75 * From rfc2044: encoding of the Unicode values on UTF-8:               *
76 *                                                                      *
77 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
78 * 0000 0000-0000 007F   0xxxxxxx                                       *
79 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
80 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
81 *                                                                      *
82 * I hope we won't use values > 0xFFFF anytime soon !                   *
83 *                                                                      *
84 ************************************************************************/
85
86/**
87 * xmlUTF8Strlen:
88 * @utf:  a sequence of UTF-8 encoded bytes
89 *
90 * compute the length of an UTF8 string, it doesn't do a full UTF8
91 * checking of the content of the string.
92 *
93 * Returns the number of characters in the string or -1 in case of error
94 */
95int
96xmlUTF8Strlen(const xmlChar *utf) {
97    int ret = 0;
98
99    if (utf == NULL)
100        return(-1);
101
102    while (*utf != 0) {
103        if (utf[0] & 0x80) {
104            if ((utf[1] & 0xc0) != 0x80)
105                return(-1);
106            if ((utf[0] & 0xe0) == 0xe0) {
107                if ((utf[2] & 0xc0) != 0x80)
108                    return(-1);
109                if ((utf[0] & 0xf0) == 0xf0) {
110                    if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
111                        return(-1);
112                    utf += 4;
113                } else {
114                    utf += 3;
115                }
116            } else {
117                utf += 2;
118            }
119        } else {
120            utf++;
121        }
122        ret++;
123    }
124    return(ret);
125}
126
127/**
128 * xmlGetUTF8Char:
129 * @utf:  a sequence of UTF-8 encoded bytes
130 * @len:  a pointer to @bytes len
131 *
132 * Read one UTF8 Char from @utf
133 *
134 * Returns the char value or -1 in case of error and update @len with the
135 *        number of bytes used
136 */
137static int
138xmlGetUTF8Char(const unsigned char *utf, int *len) {
139    unsigned int c;
140
141    if (utf == NULL)
142        goto error;
143    if (len == NULL)
144        goto error;
145    if (*len < 1)
146        goto error;
147
148    c = utf[0];
149    if (c & 0x80) {
150        if (*len < 2)
151            goto error;
152        if ((utf[1] & 0xc0) != 0x80)
153            goto error;
154        if ((c & 0xe0) == 0xe0) {
155            if (*len < 3)
156                goto error;
157            if ((utf[2] & 0xc0) != 0x80)
158                goto error;
159            if ((c & 0xf0) == 0xf0) {
160                if (*len < 4)
161                    goto error;
162                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
163                    goto error;
164                *len = 4;
165                /* 4-byte code */
166                c = (utf[0] & 0x7) << 18;
167                c |= (utf[1] & 0x3f) << 12;
168                c |= (utf[2] & 0x3f) << 6;
169                c |= utf[3] & 0x3f;
170            } else {
171              /* 3-byte code */
172                *len = 3;
173                c = (utf[0] & 0xf) << 12;
174                c |= (utf[1] & 0x3f) << 6;
175                c |= utf[2] & 0x3f;
176            }
177        } else {
178          /* 2-byte code */
179            *len = 2;
180            c = (utf[0] & 0x1f) << 6;
181            c |= utf[1] & 0x3f;
182        }
183    } else {
184        /* 1-byte code */
185        *len = 1;
186    }
187    return(c);
188
189error:
190    *len = 0;
191    return(-1);
192}
193
194/**
195 * xmlCheckUTF8: Check utf-8 string for legality.
196 * @utf: Pointer to putative utf-8 encoded string.
197 *
198 * Checks @utf for being valid utf-8. @utf is assumed to be
199 * null-terminated. This function is not super-strict, as it will
200 * allow longer utf-8 sequences than necessary. Note that Java is
201 * capable of producing these sequences if provoked. Also note, this
202 * routine checks for the 4-byte maximum size, but does not check for
203 * 0x10ffff maximum value.
204 *
205 * Return value: true if @utf is valid.
206 **/
207int
208xmlCheckUTF8(const unsigned char *utf)
209{
210    int ix;
211    unsigned char c;
212
213    for (ix = 0; (c = utf[ix]);) {
214        if (c & 0x80) {
215            if ((utf[ix + 1] & 0xc0) != 0x80)
216                return(0);
217            if ((c & 0xe0) == 0xe0) {
218                if ((utf[ix + 2] & 0xc0) != 0x80)
219                    return(0);
220                if ((c & 0xf0) == 0xf0) {
221                    if ((c & 0xf8) != 0xf0 || (utf[ix + 3] & 0xc0) != 0x80)
222                        return(0);
223                    ix += 4;
224                    /* 4-byte code */
225                } else
226                  /* 3-byte code */
227                    ix += 3;
228            } else
229              /* 2-byte code */
230                ix += 2;
231        } else
232            /* 1-byte code */
233            ix++;
234      }
235      return(1);
236}
237
238/**
239 * xmlUTF8Strsize:
240 * @utf:  a sequence of UTF-8 encoded bytes
241 * @len:  the number of characters in the array
242 *
243 * storage size of an UTF8 string
244 *
245 * Returns the storage size of
246 * the first 'len' characters of ARRAY
247 *
248 */
249
250int
251xmlUTF8Strsize(const xmlChar *utf, int len) {
252    const xmlChar       *ptr=utf;
253    xmlChar     ch;
254
255    if (len <= 0)
256        return(0);
257
258    while ( len-- > 0) {
259        if ( !*ptr )
260            break;
261        if ( (ch = *ptr++) & 0x80)
262            while ( (ch<<=1) & 0x80 )
263                ptr++;
264    }
265    return (ptr - utf);
266}
267
268
269/**
270 * xmlUTF8Strndup:
271 * @utf:  the input UTF8 *
272 * @len:  the len of @utf (in chars)
273 *
274 * a strndup for array of UTF8's
275 *
276 * Returns a new UTF8 * or NULL
277 */
278xmlChar *
279xmlUTF8Strndup(const xmlChar *utf, int len) {
280    xmlChar *ret;
281    int i;
282   
283    if ((utf == NULL) || (len < 0)) return(NULL);
284    i = xmlUTF8Strsize(utf, len);
285    ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
286    if (ret == NULL) {
287        xmlGenericError(xmlGenericErrorContext,
288                "malloc of %ld byte failed\n",
289                (len + 1) * (long)sizeof(xmlChar));
290        return(NULL);
291    }
292    memcpy(ret, utf, i * sizeof(xmlChar));
293    ret[i] = 0;
294    return(ret);
295}
296
297/**
298 * xmlUTF8Strpos:
299 * @utf:  the input UTF8 *
300 * @pos:  the position of the desired UTF8 char (in chars)
301 *
302 * a function to provide the equivalent of fetching a
303 * character from a string array
304 *
305 * Returns a pointer to the UTF8 character or NULL
306 */
307xmlChar *
308xmlUTF8Strpos(const xmlChar *utf, int pos) {
309    xmlChar ch;
310
311    if (utf == NULL) return(NULL);
312    if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
313        return(NULL);
314    while (pos--) {
315        if ((ch=*utf++) == 0) return(NULL);
316        if ( ch & 0x80 ) {
317            /* if not simple ascii, verify proper format */
318            if ( (ch & 0xc0) != 0xc0 )
319                return(NULL);
320            /* then skip over remaining bytes for this char */
321            while ( (ch <<= 1) & 0x80 )
322                if ( (*utf++ & 0xc0) != 0x80 )
323                    return(NULL);
324        }
325    }
326    return((xmlChar *)utf);
327}
328
329/**
330 * xmlUTF8Strloc:
331 * @utf:  the input UTF8 *
332 * @utfchar:  the UTF8 character to be found
333 *
334 * a function to provide relative location of a UTF8 char
335 *
336 * Returns the relative character position of the desired char
337 * or -1 if not found
338 */
339int
340xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
341    int i, size;
342    xmlChar ch;
343
344    if (utf==NULL || utfchar==NULL) return -1;
345    size = xmlUTF8Strsize(utfchar, 1);
346        for(i=0; (ch=*utf) != 0; i++) {
347            if (xmlStrncmp(utf, utfchar, size)==0)
348                return(i);
349            utf++;
350            if ( ch & 0x80 ) {
351                /* if not simple ascii, verify proper format */
352                if ( (ch & 0xc0) != 0xc0 )
353                    return(-1);
354                /* then skip over remaining bytes for this char */
355                while ( (ch <<= 1) & 0x80 )
356                    if ( (*utf++ & 0xc0) != 0x80 )
357                        return(-1);
358            }
359        }
360
361    return(-1);
362}
363/**
364 * xmlUTF8Strsub:
365 * @utf:  a sequence of UTF-8 encoded bytes
366 * @start: relative pos of first char
367 * @len:   total number to copy
368 *
369 * Note:  positions are given in units of UTF-8 chars
370 *
371 * Returns a pointer to a newly created string
372 * or NULL if any problem
373 */
374
375xmlChar *
376xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
377    int     i;
378    xmlChar ch;
379
380    if (utf == NULL) return(NULL);
381    if (start < 0) return(NULL);
382    if (len < 0) return(NULL);
383
384    /*
385     * Skip over any leading chars
386     */
387    for (i = 0;i < start;i++) {
388        if ((ch=*utf++) == 0) return(NULL);
389        if ( ch & 0x80 ) {
390            /* if not simple ascii, verify proper format */
391            if ( (ch & 0xc0) != 0xc0 )
392                return(NULL);
393            /* then skip over remaining bytes for this char */
394            while ( (ch <<= 1) & 0x80 )
395                if ( (*utf++ & 0xc0) != 0x80 )
396                    return(NULL);
397        }
398    }
399
400    return(xmlUTF8Strndup(utf, len));
401}
402
403/************************************************************************
404 *                                                                      *
405 *              Conversions To/From UTF8 encoding                       *
406 *                                                                      *
407 ************************************************************************/
408
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out. Since 7-bit ASCII is already valid UTF-8 this is a
 * plain copy that stops at the first byte >= 0x80. Conversion also stops
 * while more than five bytes of @out are still unused, the same margin
 * the other converters of this file keep for a full multi-byte character.
 *
 * Returns 0 if success, or -1 if a non-ASCII byte is met
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char *src = in;
    const unsigned char *srcEnd = in + (*inlen);
    unsigned char *dst = out;
    int status = 0;

    while ((src < srcEnd) && (dst - out + 5 < *outlen)) {
        unsigned int c = *src;

        if (c >= 0x80) {
            /* not ASCII: report what was converted so far and fail */
            status = -1;
            break;
        }
        *dst++ = (unsigned char) c;
        src++;
    }

    *outlen = dst - out;
    *inlen = src - in;
    return(status);
}
459
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out. A multi-byte sequence cut short by the end of @in
 * is left unconsumed so that the caller can supply the rest later. A
 * malformed sequence, or a character that does not fit in 7 bits, makes
 * the conversion fail.
 *
 * Returns 0 if success, -2 if the transcoding fails
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * When @in is NULL both are set to 0 and 0 is returned.
 */
static int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto failed;   /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto failed;   /* no chance for this in Ascii */

        /* incomplete sequence at the end of the input: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /*
             * a broken sequence is an error; emitting the partially
             * decoded value would put garbage in the output
             */
            if ((d & 0xC0) != 0x80)
                goto failed;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80)
            goto failed;                   /* no chance for this in Ascii */
        if (out >= outend)
            break;
        *out++ = c;
        processed = in;
    }
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(0);

failed:
    /* report only the characters that were fully converted */
    *outlen = out - outstart;
    *inlen = processed - instart;
    return(-2);
}
542
/**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ISO Latin 1 chars
 * @inlen:  the length of @in
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out. Bytes below 0x80 are copied as is, the others
 * expand to a two-byte sequence. Conversion stops, without error, as
 * soon as the next character would not fit entirely in @out; a
 * character is never written partially.
 *
 * Returns 0
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
int
isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    unsigned char* outend = out + *outlen;
    const unsigned char* inend;
    unsigned int c;

    inend = in + (*inlen);
    while (in < inend) {
        c = *in;

        if (c < 0x80) {
            if (out >= outend)
                break;
            *out++ = c;
        } else {
            /* both bytes must fit, or the character is left for later */
            if (outend - out < 2)
                break;
            *out++ = ((c >>  6) & 0x1F) | 0xC0;
            *out++ = (c & 0x3F) | 0x80;
        }
        /* the input byte is consumed only once fully converted */
        in++;
    }
    *outlen = out - outstart;
    *inlen = in - base;
    return(0);
}
590
591/**
592 * UTF8Toisolat1:
593 * @out:  a pointer to an array of bytes to store the result
594 * @outlen:  the length of @out
595 * @in:  a pointer to an array of UTF-8 chars
596 * @inlen:  the length of @in
597 *
598 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
599 * block of chars out.
600 *
601 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
602 * The value of @inlen after return is the number of octets consumed
603 *     as the return value is positive, else unpredictable.
604 * The value of @outlen after return is the number of ocetes consumed.
605 */
606int
607UTF8Toisolat1(unsigned char* out, int *outlen,
608              const unsigned char* in, int *inlen) {
609    const unsigned char* processed = in;
610    const unsigned char* outend;
611    const unsigned char* outstart = out;
612    const unsigned char* instart = in;
613    const unsigned char* inend;
614    unsigned int c, d;
615    int trailing;
616
617    if (in == NULL) {
618        /*
619         * initialization nothing to do
620         */
621        *outlen = 0;
622        *inlen = 0;
623        return(0);
624    }
625    inend = in + (*inlen);
626    outend = out + (*outlen);
627    while (in < inend) {
628        d = *in++;
629        if      (d < 0x80)  { c= d; trailing= 0; }
630        else if (d < 0xC0) {
631            /* trailing byte in leading position */
632            *outlen = out - outstart;
633            *inlen = processed - instart;
634            return(-2);
635        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
636        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
637        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
638        else {
639            /* no chance for this in IsoLat1 */
640            *outlen = out - outstart;
641            *inlen = processed - instart;
642            return(-2);
643        }
644
645        if (inend - in < trailing) {
646            break;
647        }
648
649        for ( ; trailing; trailing--) {
650            if (in >= inend)
651                break;
652            if (((d= *in++) & 0xC0) != 0x80) {
653                *outlen = out - outstart;
654                *inlen = processed - instart;
655                return(-2);
656            }
657            c <<= 6;
658            c |= d & 0x3F;
659        }
660
661        /* assertion: c is a single UTF-4 value */
662        if (c <= 0xFF) {
663            if (out >= outend)
664                break;
665            *out++ = c;
666        } else {
667            /* no chance for this in IsoLat1 */
668            *outlen = out - outstart;
669            *inlen = processed - instart;
670            return(-2);
671        }
672        processed = in;
673    }
674    *outlen = out - outstart;
675    *inlen = processed - instart;
676    return(0);
677}
678
/**
 * UTF16LEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16LE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16LE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is read byte by byte, low byte
 * first, so the result depends neither on the byte order of the host nor
 * on the alignment of @inb. A trailing odd byte is ignored, and a high
 * surrogate whose partner lies beyond the end of the input is left
 * unconsumed.
 *
 * Conversion stops while more than five bytes of @out are still unused,
 * so a character is never written partially.
 *
 * Returns 0 on success, or -2 if the transcoding fails (a high surrogate
 *     is not followed by a low surrogate)
 *     The value of *inlenb after return is the number of octets consumed.
 *     The value of *outlen after return is the number of octets produced.
 */
static int
UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* processed = inb;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits;

    /* only whole 16-bit units are converted */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    /* the "+ 5" margin leaves room for any character (at most 4 bytes) */
    while ((in < inend) && (out - outstart + 5 < *outlen)) {
        c = in[0] | ((unsigned int) in[1] << 8);
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* the second half is not available yet */
                break;
            }
            d = in[0] | ((unsigned int) in[1] << 8);
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80) {  *out++=  c;                         bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }

        for ( ; bits >= 0; bits-= 6)
            *out++= ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
766
/**
 * UTF8ToUTF16LE:
 * @outb:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @outb
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out. The output is written byte by byte, low byte
 * first, so the result depends neither on the byte order of the host nor
 * on the alignment of @outb.
 *
 * When @in is NULL the call is an initialization request: the byte
 * order mark FF FE is written if @outb can hold it.
 *
 * Conversion stops quietly when the next character does not fit in
 * @outb, when the input ends in the middle of a multi-byte sequence, or
 * on a value above 0x10FFFF, which is left unconsumed.
 *
 * Returns 0 on success, 2 when the byte order mark has been written, or
 *     -2 if the transcoding failed (malformed UTF-8 input).
 *     The value of *inlen after return is the number of octets consumed.
 *     The value of *outlen after return is the number of octets produced.
 */
static int
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
{
    unsigned char* out = outb;
    unsigned char* outend;
    const unsigned char* instart = in;
    const unsigned char* processed = in;
    const unsigned char* inend;
    unsigned int c, d;
    unsigned int high, low;
    int trailing;

    if (in == NULL) {
        /*
         * initialization, add the Byte Order Mark
         */
        if (*outlen >= 2) {
            outb[0] = 0xFF;
            outb[1] = 0xFE;
            *outlen = 2;
            *inlen = 0;
#ifdef DEBUG_ENCODING
            xmlGenericError(xmlGenericErrorContext,
                    "Added FFFE Byte Order Mark\n");
#endif
            return(2);
        }
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    /* only whole 16-bit units can be written */
    outend = outb + (*outlen / 2) * 2;
    while (in < inend) {
        d = *in++;
        if      (d < 0x80)  { c= d; trailing= 0; }
        else if (d < 0xC0)  goto failed;   /* trailing byte in leading position */
        else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
        else                goto failed;   /* no chance for this in UTF-16 */

        /* incomplete sequence at the end of the input: wait for more */
        if (inend - in < trailing)
            break;

        for ( ; trailing; trailing--) {
            d = *in++;
            /* a broken sequence must not leak a half-decoded value */
            if ((d & 0xC0) != 0x80)
                goto failed;
            c <<= 6;
            c |= d & 0x3F;
        }

        /* assertion: c is a single UCS-4 value */
        if (c < 0x10000) {
            if (outend - out < 2)
                break;
            *out++ = (unsigned char) (c & 0xFF);
            *out++ = (unsigned char) ((c >> 8) & 0xFF);
        }
        else if (c < 0x110000) {
            /* a surrogate pair needs two code units */
            if (outend - out < 4)
                break;
            c -= 0x10000;
            high = 0xD800 | (c >> 10);
            low = 0xDC00 | (c & 0x03FF);
            *out++ = (unsigned char) (high & 0xFF);
            *out++ = (unsigned char) (high >> 8);
            *out++ = (unsigned char) (low & 0xFF);
            *out++ = (unsigned char) (low >> 8);
        }
        else
            break;
        processed = in;
    }
    *outlen = out - outb;
    /* measured from the start of the input, not from the moving cursor */
    *inlen = processed - instart;
    return(0);

failed:
    *outlen = out - outb;
    *inlen = processed - instart;
    return(-2);
}
885
/**
 * UTF16BEToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @inb:  a pointer to an array of UTF-16BE passed as a byte array
 * @inlenb:  the length of @inb in bytes
 *
 * Take a block of UTF-16BE code units in and try to convert it to an
 * UTF-8 block of chars out. The input is read byte by byte, high byte
 * first, so the result depends neither on the byte order of the host nor
 * on the alignment of @inb. A trailing odd byte is ignored.
 *
 * Conversion stops quietly when the next character does not fit
 * entirely in @out, or when a high surrogate is the last unit of the
 * input; in both cases that character is left unconsumed and nothing of
 * it is written.
 *
 * Returns 0 on success, or -2 if the transcoding fails (a high surrogate
 *     is not followed by a low surrogate)
 * The value of *inlenb after return is the number of octets consumed.
 * The value of *outlen after return is the number of octets produced.
 */
static int
UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
{
    unsigned char* outstart = out;
    const unsigned char* processed = inb;
    unsigned char* outend = out + *outlen;
    const unsigned char* in = inb;
    const unsigned char* inend;
    unsigned int c, d;
    int bits, needed;

    /* only whole 16-bit units are converted */
    if ((*inlenb % 2) == 1)
        (*inlenb)--;
    inend = inb + *inlenb;

    while (in < inend) {
        c = ((unsigned int) in[0] << 8) | in[1];
        in += 2;
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {
                /* pair split by the end of the input: wait for more */
                break;
            }
            d = ((unsigned int) in[0] << 8) | in[1];
            in += 2;
            if ((d & 0xFC00) != 0xDC00) {
                *outlen = out - outstart;
                *inlenb = processed - inb;
                return(-2);
            }
            c &= 0x03FF;
            c <<= 10;
            c |= d & 0x03FF;
            c += 0x10000;
        }

        /* assertion: c is a single UCS-4 value */
        if      (c <    0x80)  needed = 1;
        else if (c <   0x800)  needed = 2;
        else if (c < 0x10000)  needed = 3;
        else                   needed = 4;

        /* all or nothing: never leave a partial sequence in the output */
        if (outend - out < needed)
            break;

        if      (c <    0x80) {  *out++=  c;                         bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
        else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }

        for ( ; bits >= 0; bits-= 6)
            *out++= ((c >> bits) & 0x3F) | 0x80;

        processed = in;
    }
    *outlen = out - outstart;
    *inlenb = processed - inb;
    return(0);
}
977
978/**
979 * UTF8ToUTF16BE:
980 * @outb:  a pointer to an array of bytes to store the result
981 * @outlen:  the length of @outb
982 * @in:  a pointer to an array of UTF-8 chars
983 * @inlen:  the length of @in
984 *
985 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
986 * block of chars out.
987 *
988 * Returns the number of byte written, or -1 by lack of space, or -2
989 *     if the transcoding failed.
990 */
991static int
992UTF8ToUTF16BE(unsigned char* outb, int *outlen,
993            const unsigned char* in, int *inlen)
994{
995    unsigned short* out = (unsigned short*) outb;
996    const unsigned char* processed = in;
997    unsigned short* outstart= out;
998    unsigned short* outend;
999    const unsigned char* inend= in+*inlen;
1000    unsigned int c, d;
1001    int trailing;
1002    unsigned char *tmp;
1003    unsigned short tmp1, tmp2;
1004
1005    if (in == NULL) {
1006        /*
1007         * initialization, add the Byte Order Mark
1008         */
1009        if (*outlen >= 2) {
1010            outb[0] = 0xFE;
1011            outb[1] = 0xFF;
1012            *outlen = 2;
1013            *inlen = 0;
1014#ifdef DEBUG_ENCODING
1015            xmlGenericError(xmlGenericErrorContext,
1016                    "Added FEFF Byte Order Mark\n");
1017#endif
1018            return(2);
1019        }
1020        *outlen = 0;
1021        *inlen = 0;
1022        return(0);
1023    }
1024    outend = out + (*outlen / 2);
1025    while (in < inend) {
1026      d= *in++;
1027      if      (d < 0x80)  { c= d; trailing= 0; }
1028      else if (d < 0xC0)  {
1029          /* trailing byte in leading position */
1030          *outlen = out - outstart;
1031          *inlen = processed - in;
1032          return(-2);
1033      } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1034      else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1035      else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1036      else {
1037          /* no chance for this in UTF-16 */
1038          *outlen = out - outstart;
1039          *inlen = processed - in;
1040          return(-2);
1041      }
1042
1043      if (inend - in < trailing) {
1044          break;
1045      }
1046
1047      for ( ; trailing; trailing--) {
1048          if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))  break;
1049          c <<= 6;
1050          c |= d & 0x3F;
1051      }
1052
1053      /* assertion: c is a single UTF-4 value */
1054        if (c < 0x10000) {
1055            if (out >= outend)  break;
1056            if (xmlLittleEndian) {
1057                tmp = (unsigned char *) out;
1058                *tmp = c >> 8;
1059                *(tmp + 1) = c;
1060                out++;
1061            } else {
1062                *out++ = c;
1063            }
1064        }
1065        else if (c < 0x110000) {
1066            if (out+1 >= outend)  break;
1067            c -= 0x10000;
1068            if (xmlLittleEndian) {
1069                tmp1 = 0xD800 | (c >> 10);
1070                tmp = (unsigned char *) out;
1071                *tmp = tmp1 >> 8;
1072                *(tmp + 1) = (unsigned char) tmp1;
1073                out++;
1074
1075                tmp2 = 0xDC00 | (c & 0x03FF);
1076                tmp = (unsigned char *) out;
1077                *tmp = tmp2 >> 8;
1078                *(tmp + 1) = (unsigned char) tmp2;
1079                out++;
1080            } else {
1081                *out++ = 0xD800 | (c >> 10);
1082                *out++ = 0xDC00 | (c & 0x03FF);
1083            }
1084        }
1085        else
1086            break;
1087        processed = in;
1088    }
1089    *outlen = (out - outstart) * 2;
1090    *inlen = processed - in;
1091    return(0);
1092}
1093
1094/************************************************************************
1095 *                                                                      *
1096 *              Generic encoding handling routines                      *
1097 *                                                                      *
1098 ************************************************************************/
1099
1100/**
1101 * xmlDetectCharEncoding:
1102 * @in:  a pointer to the first bytes of the XML entity, must be at least
1103 *       4 bytes long.
1104 * @len:  pointer to the length of the buffer
1105 *
1106 * Guess the encoding of the entity using the first bytes of the entity content
1107 * accordingly of the non-normative appendix F of the XML-1.0 recommendation.
1108 *
1109 * Returns one of the XML_CHAR_ENCODING_... values.
1110 */
1111xmlCharEncoding
1112xmlDetectCharEncoding(const unsigned char* in, int len)
1113{
1114    if (len >= 4) {
1115        if ((in[0] == 0x00) && (in[1] == 0x00) &&
1116            (in[2] == 0x00) && (in[3] == 0x3C))
1117            return(XML_CHAR_ENCODING_UCS4BE);
1118        if ((in[0] == 0x3C) && (in[1] == 0x00) &&
1119            (in[2] == 0x00) && (in[3] == 0x00))
1120            return(XML_CHAR_ENCODING_UCS4LE);
1121        if ((in[0] == 0x00) && (in[1] == 0x00) &&
1122            (in[2] == 0x3C) && (in[3] == 0x00))
1123            return(XML_CHAR_ENCODING_UCS4_2143);
1124        if ((in[0] == 0x00) && (in[1] == 0x3C) &&
1125            (in[2] == 0x00) && (in[3] == 0x00))
1126            return(XML_CHAR_ENCODING_UCS4_3412);
1127        if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
1128            (in[2] == 0xA7) && (in[3] == 0x94))
1129            return(XML_CHAR_ENCODING_EBCDIC);
1130        if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
1131            (in[2] == 0x78) && (in[3] == 0x6D))
1132            return(XML_CHAR_ENCODING_UTF8);
1133    }
1134    if (len >= 3) {
1135        /*
1136         * Errata on XML-1.0 June 20 2001
1137         * We now allow an UTF8 encoded BOM
1138         */
1139        if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
1140            (in[2] == 0xBF))
1141            return(XML_CHAR_ENCODING_UTF8);
1142    }
1143    if (len >= 2) {
1144        if ((in[0] == 0xFE) && (in[1] == 0xFF))
1145            return(XML_CHAR_ENCODING_UTF16BE);
1146        if ((in[0] == 0xFF) && (in[1] == 0xFE))
1147            return(XML_CHAR_ENCODING_UTF16LE);
1148    }
1149    return(XML_CHAR_ENCODING_NONE);
1150}
1151
1152/**
1153 * xmlCleanupEncodingAliases:
1154 *
1155 * Unregisters all aliases
1156 */
1157void
1158xmlCleanupEncodingAliases(void) {
1159    int i;
1160
1161    if (xmlCharEncodingAliases == NULL)
1162        return;
1163
1164    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1165        if (xmlCharEncodingAliases[i].name != NULL)
1166            xmlFree((char *) xmlCharEncodingAliases[i].name);
1167        if (xmlCharEncodingAliases[i].alias != NULL)
1168            xmlFree((char *) xmlCharEncodingAliases[i].alias);
1169    }
1170    xmlCharEncodingAliasesNb = 0;
1171    xmlCharEncodingAliasesMax = 0;
1172    xmlFree(xmlCharEncodingAliases);
1173    xmlCharEncodingAliases = NULL;
1174}
1175
1176/**
1177 * xmlGetEncodingAlias:
1178 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
1179 *
1180 * Lookup an encoding name for the given alias.
1181 *
1182 * Returns NULL if not found the original name otherwise
1183 */
1184const char *
1185xmlGetEncodingAlias(const char *alias) {
1186    int i;
1187    char upper[100];
1188
1189    if (alias == NULL)
1190        return(NULL);
1191
1192    if (xmlCharEncodingAliases == NULL)
1193        return(NULL);
1194
1195    for (i = 0;i < 99;i++) {
1196        upper[i] = toupper(alias[i]);
1197        if (upper[i] == 0) break;
1198    }
1199    upper[i] = 0;
1200
1201    /*
1202     * Walk down the list looking for a definition of the alias
1203     */
1204    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1205        if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1206            return(xmlCharEncodingAliases[i].name);
1207        }
1208    }
1209    return(NULL);
1210}
1211
1212/**
1213 * xmlAddEncodingAlias:
1214 * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
1215 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
1216 *
1217 * Registers and alias @alias for an encoding named @name. Existing alias
1218 * will be overwritten.
1219 *
1220 * Returns 0 in case of success, -1 in case of error
1221 */
1222int
1223xmlAddEncodingAlias(const char *name, const char *alias) {
1224    int i;
1225    char upper[100];
1226
1227    if ((name == NULL) || (alias == NULL))
1228        return(-1);
1229
1230    for (i = 0;i < 99;i++) {
1231        upper[i] = toupper(alias[i]);
1232        if (upper[i] == 0) break;
1233    }
1234    upper[i] = 0;
1235
1236    if (xmlCharEncodingAliases == NULL) {
1237        xmlCharEncodingAliasesNb = 0;
1238        xmlCharEncodingAliasesMax = 20;
1239        xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1240              xmlMalloc(xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1241        if (xmlCharEncodingAliases == NULL)
1242            return(-1);
1243    } else if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) {
1244        xmlCharEncodingAliasesMax *= 2;
1245        xmlCharEncodingAliases = (xmlCharEncodingAliasPtr)
1246              xmlRealloc(xmlCharEncodingAliases,
1247                         xmlCharEncodingAliasesMax * sizeof(xmlCharEncodingAlias));
1248    }
1249    /*
1250     * Walk down the list looking for a definition of the alias
1251     */
1252    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1253        if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) {
1254            /*
1255             * Replace the definition.
1256             */
1257            xmlFree((char *) xmlCharEncodingAliases[i].name);
1258            xmlCharEncodingAliases[i].name = xmlMemStrdup(name);
1259            return(0);
1260        }
1261    }
1262    /*
1263     * Add the definition
1264     */
1265    xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = xmlMemStrdup(name);
1266    xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = xmlMemStrdup(upper);
1267    xmlCharEncodingAliasesNb++;
1268    return(0);
1269}
1270
1271/**
1272 * xmlDelEncodingAlias:
1273 * @alias:  the alias name as parsed, in UTF-8 format (ASCII actually)
1274 *
1275 * Unregisters an encoding alias @alias
1276 *
1277 * Returns 0 in case of success, -1 in case of error
1278 */
1279int
1280xmlDelEncodingAlias(const char *alias) {
1281    int i;
1282
1283    if (alias == NULL)
1284        return(-1);
1285
1286    if (xmlCharEncodingAliases == NULL)
1287        return(-1);
1288    /*
1289     * Walk down the list looking for a definition of the alias
1290     */
1291    for (i = 0;i < xmlCharEncodingAliasesNb;i++) {
1292        if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) {
1293            xmlFree((char *) xmlCharEncodingAliases[i].name);
1294            xmlFree((char *) xmlCharEncodingAliases[i].alias);
1295            xmlCharEncodingAliasesNb--;
1296            memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1],
1297                    sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i));
1298            return(0);
1299        }
1300    }
1301    return(-1);
1302}
1303
1304/**
1305 * xmlParseCharEncoding:
1306 * @name:  the encoding name as parsed, in UTF-8 format (ASCII actually)
1307 *
1308 * Compare the string to the known encoding schemes already known. Note
1309 * that the comparison is case insensitive accordingly to the section
1310 * [XML] 4.3.3 Character Encoding in Entities.
1311 *
1312 * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
1313 * if not recognized.
1314 */
1315xmlCharEncoding
1316xmlParseCharEncoding(const char* name)
1317{
1318    const char *alias;
1319    char upper[500];
1320    int i;
1321
1322    if (name == NULL)
1323        return(XML_CHAR_ENCODING_NONE);
1324
1325    /*
1326     * Do the alias resolution
1327     */
1328    alias = xmlGetEncodingAlias(name);
1329    if (alias != NULL)
1330        name = alias;
1331
1332    for (i = 0;i < 499;i++) {
1333        upper[i] = toupper(name[i]);
1334        if (upper[i] == 0) break;
1335    }
1336    upper[i] = 0;
1337
1338    if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
1339    if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
1340    if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
1341
1342    /*
1343     * NOTE: if we were able to parse this, the endianness of UTF16 is
1344     *       already found and in use
1345     */
1346    if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
1347    if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
1348   
1349    if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1350    if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
1351    if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
1352
1353    /*
1354     * NOTE: if we were able to parse this, the endianness of UCS4 is
1355     *       already found and in use
1356     */
1357    if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1358    if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
1359    if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
1360
1361   
1362    if (!strcmp(upper,  "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
1363    if (!strcmp(upper,  "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
1364    if (!strcmp(upper,  "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
1365
1366    if (!strcmp(upper,  "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
1367    if (!strcmp(upper,  "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
1368    if (!strcmp(upper,  "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
1369
1370    if (!strcmp(upper,  "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
1371    if (!strcmp(upper,  "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
1372    if (!strcmp(upper,  "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
1373    if (!strcmp(upper,  "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
1374    if (!strcmp(upper,  "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
1375    if (!strcmp(upper,  "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
1376    if (!strcmp(upper,  "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
1377
1378    if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
1379    if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
1380    if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
1381
1382#ifdef DEBUG_ENCODING
1383    xmlGenericError(xmlGenericErrorContext, "Unknown encoding %s\n", name);
1384#endif
1385    return(XML_CHAR_ENCODING_ERROR);
1386}
1387
1388/**
1389 * xmlGetCharEncodingName:
1390 * @enc:  the encoding
1391 *
1392 * The "canonical" name for XML encoding.
1393 * C.f. http://www.w3.org/TR/REC-xml#charencoding
1394 * Section 4.3.3  Character Encoding in Entities
1395 *
1396 * Returns the canonical name for the given encoding
1397 */
1398
1399const char*
1400xmlGetCharEncodingName(xmlCharEncoding enc) {
1401    switch (enc) {
1402        case XML_CHAR_ENCODING_ERROR:
1403            return(NULL);
1404        case XML_CHAR_ENCODING_NONE:
1405            return(NULL);
1406        case XML_CHAR_ENCODING_UTF8:
1407            return("UTF-8");
1408        case XML_CHAR_ENCODING_UTF16LE:
1409            return("UTF-16");
1410        case XML_CHAR_ENCODING_UTF16BE:
1411            return("UTF-16");
1412        case XML_CHAR_ENCODING_EBCDIC:
1413            return("EBCDIC");
1414        case XML_CHAR_ENCODING_UCS4LE:
1415            return("ISO-10646-UCS-4");
1416        case XML_CHAR_ENCODING_UCS4BE:
1417            return("ISO-10646-UCS-4");
1418        case XML_CHAR_ENCODING_UCS4_2143:
1419            return("ISO-10646-UCS-4");
1420        case XML_CHAR_ENCODING_UCS4_3412:
1421            return("ISO-10646-UCS-4");
1422        case XML_CHAR_ENCODING_UCS2:
1423            return("ISO-10646-UCS-2");
1424        case XML_CHAR_ENCODING_8859_1:
1425            return("ISO-8859-1");
1426        case XML_CHAR_ENCODING_8859_2:
1427            return("ISO-8859-2");
1428        case XML_CHAR_ENCODING_8859_3:
1429            return("ISO-8859-3");
1430        case XML_CHAR_ENCODING_8859_4:
1431            return("ISO-8859-4");
1432        case XML_CHAR_ENCODING_8859_5:
1433            return("ISO-8859-5");
1434        case XML_CHAR_ENCODING_8859_6:
1435            return("ISO-8859-6");
1436        case XML_CHAR_ENCODING_8859_7:
1437            return("ISO-8859-7");
1438        case XML_CHAR_ENCODING_8859_8:
1439            return("ISO-8859-8");
1440        case XML_CHAR_ENCODING_8859_9:
1441            return("ISO-8859-9");
1442        case XML_CHAR_ENCODING_2022_JP:
1443            return("ISO-2022-JP");
1444        case XML_CHAR_ENCODING_SHIFT_JIS:
1445            return("Shift-JIS");
1446        case XML_CHAR_ENCODING_EUC_JP:
1447            return("EUC-JP");
1448        case XML_CHAR_ENCODING_ASCII:
1449            return(NULL);
1450    }
1451    return(NULL);
1452}
1453
1454/************************************************************************
1455 *                                                                      *
1456 *                      Char encoding handlers                          *
1457 *                                                                      *
1458 ************************************************************************/
1459
1460
/*
 * Registry of the char encoding handlers.
 * The table is allocated with MAX_ENCODING_HANDLERS slots by
 * xmlInitCharEncodingHandlers() and nbCharEncodingHandler counts the
 * slots in use; xmlRegisterCharEncodingHandler() refuses registrations
 * once the limit is reached.
 * The size should be growable, but it's not a big deal ...
 */
#define MAX_ENCODING_HANDLERS 50
static xmlCharEncodingHandlerPtr *handlers = NULL;
static int nbCharEncodingHandler = 0;

/*
 * The default is UTF-8 for XML, that's also the default used for the
 * parser internals, so the default encoding handler is NULL.
 * This is the value xmlFindCharEncodingHandler() returns for a NULL or
 * empty encoding name.
 */

static xmlCharEncodingHandlerPtr xmlDefaultCharEncodingHandler = NULL;
1472
1473/**
1474 * xmlNewCharEncodingHandler:
1475 * @name:  the encoding name, in UTF-8 format (ASCII actually)
1476 * @input:  the xmlCharEncodingInputFunc to read that encoding
1477 * @output:  the xmlCharEncodingOutputFunc to write that encoding
1478 *
1479 * Create and registers an xmlCharEncodingHandler.
1480 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
1481 */
1482static xmlCharEncodingHandlerPtr
1483xmlNewCharEncodingHandler(const char *name,
1484                          xmlCharEncodingInputFunc input,
1485                          xmlCharEncodingOutputFunc output) {
1486    xmlCharEncodingHandlerPtr handler;
1487    const char *alias;
1488    char upper[500];
1489    int i;
1490    char *up = 0;
1491
1492    /*
1493     * Do the alias resolution
1494     */
1495    alias = xmlGetEncodingAlias(name);
1496    if (alias != NULL)
1497        name = alias;
1498
1499    /*
1500     * Keep only the uppercase version of the encoding.
1501     */
1502    if (name == NULL) {
1503        xmlGenericError(xmlGenericErrorContext,
1504                "xmlNewCharEncodingHandler : no name !\n");
1505        return(NULL);
1506    }
1507    for (i = 0;i < 499;i++) {
1508        upper[i] = toupper(name[i]);
1509        if (upper[i] == 0) break;
1510    }
1511    upper[i] = 0;
1512    up = xmlMemStrdup(upper);
1513    if (up == NULL) {
1514        xmlGenericError(xmlGenericErrorContext,
1515                "xmlNewCharEncodingHandler : out of memory !\n");
1516        return(NULL);
1517    }
1518
1519    /*
1520     * allocate and fill-up an handler block.
1521     */
1522    handler = (xmlCharEncodingHandlerPtr)
1523              xmlMalloc(sizeof(xmlCharEncodingHandler));
1524    if (handler == NULL) {
1525        xmlGenericError(xmlGenericErrorContext,
1526                "xmlNewCharEncodingHandler : out of memory !\n");
1527        return(NULL);
1528    }
1529    handler->input = input;
1530    handler->output = output;
1531    handler->name = up;
1532
1533#ifdef LIBXML_ICONV_ENABLED
1534    handler->iconv_in = NULL;
1535    handler->iconv_out = NULL;
1536#endif /* LIBXML_ICONV_ENABLED */
1537
1538    /*
1539     * registers and returns the handler.
1540     */
1541    xmlRegisterCharEncodingHandler(handler);
1542#ifdef DEBUG_ENCODING
1543    xmlGenericError(xmlGenericErrorContext,
1544            "Registered encoding handler for %s\n", name);
1545#endif
1546    return(handler);
1547}
1548
1549/**
1550 * xmlInitCharEncodingHandlers:
1551 *
1552 * Initialize the char encoding support, it registers the default
1553 * encoding supported.
1554 * NOTE: while public, this function usually doesn't need to be called
1555 *       in normal processing.
1556 */
1557void
1558xmlInitCharEncodingHandlers(void) {
1559    unsigned short int tst = 0x1234;
1560    unsigned char *ptr = (unsigned char *) &tst;
1561
1562    if (handlers != NULL) return;
1563
1564    handlers = (xmlCharEncodingHandlerPtr *)
1565        xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
1566
1567    if (*ptr == 0x12) xmlLittleEndian = 0;
1568    else if (*ptr == 0x34) xmlLittleEndian = 1;
1569    else xmlGenericError(xmlGenericErrorContext,
1570            "Odd problem at endianness detection\n");
1571
1572    if (handlers == NULL) {
1573        xmlGenericError(xmlGenericErrorContext,
1574                "xmlInitCharEncodingHandlers : out of memory !\n");
1575        return;
1576    }
1577    xmlNewCharEncodingHandler("UTF-8", NULL, NULL);
1578    xmlUTF16LEHandler =
1579          xmlNewCharEncodingHandler("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE);
1580    xmlUTF16BEHandler =
1581          xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
1582    xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
1583    xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
1584    xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, UTF8Toascii);
1585#ifdef LIBXML_HTML_ENABLED
1586    xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
1587#endif
1588}
1589
1590/**
1591 * xmlCleanupCharEncodingHandlers:
1592 *
1593 * Cleanup the memory allocated for the char encoding support, it
1594 * unregisters all the encoding handlers and the aliases.
1595 */
1596void
1597xmlCleanupCharEncodingHandlers(void) {
1598    xmlCleanupEncodingAliases();
1599
1600    if (handlers == NULL) return;
1601
1602    for (;nbCharEncodingHandler > 0;) {
1603        nbCharEncodingHandler--;
1604        if (handlers[nbCharEncodingHandler] != NULL) {
1605            if (handlers[nbCharEncodingHandler]->name != NULL)
1606                xmlFree(handlers[nbCharEncodingHandler]->name);
1607            xmlFree(handlers[nbCharEncodingHandler]);
1608        }
1609    }
1610    xmlFree(handlers);
1611    handlers = NULL;
1612    nbCharEncodingHandler = 0;
1613    xmlDefaultCharEncodingHandler = NULL;
1614}
1615
1616/**
1617 * xmlRegisterCharEncodingHandler:
1618 * @handler:  the xmlCharEncodingHandlerPtr handler block
1619 *
1620 * Register the char encoding handler, surprising, isn't it ?
1621 */
1622void
1623xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
1624    if (handlers == NULL) xmlInitCharEncodingHandlers();
1625    if (handler == NULL) {
1626        xmlGenericError(xmlGenericErrorContext,
1627                "xmlRegisterCharEncodingHandler: NULL handler !\n");
1628        return;
1629    }
1630
1631    if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) {
1632        xmlGenericError(xmlGenericErrorContext,
1633        "xmlRegisterCharEncodingHandler: Too many handler registered\n");
1634        xmlGenericError(xmlGenericErrorContext,
1635                "\tincrease MAX_ENCODING_HANDLERS : %s\n", __FILE__);
1636        return;
1637    }
1638    handlers[nbCharEncodingHandler++] = handler;
1639}
1640
1641/**
1642 * xmlGetCharEncodingHandler:
1643 * @enc:  an xmlCharEncoding value.
1644 *
1645 * Search in the registered set the handler able to read/write that encoding.
1646 *
1647 * Returns the handler or NULL if not found
1648 */
1649xmlCharEncodingHandlerPtr
1650xmlGetCharEncodingHandler(xmlCharEncoding enc) {
1651    xmlCharEncodingHandlerPtr handler;
1652
1653    if (handlers == NULL) xmlInitCharEncodingHandlers();
1654    switch (enc) {
1655        case XML_CHAR_ENCODING_ERROR:
1656            return(NULL);
1657        case XML_CHAR_ENCODING_NONE:
1658            return(NULL);
1659        case XML_CHAR_ENCODING_UTF8:
1660            return(NULL);
1661        case XML_CHAR_ENCODING_UTF16LE:
1662            return(xmlUTF16LEHandler);
1663        case XML_CHAR_ENCODING_UTF16BE:
1664            return(xmlUTF16BEHandler);
1665        case XML_CHAR_ENCODING_EBCDIC:
1666            handler = xmlFindCharEncodingHandler("EBCDIC");
1667            if (handler != NULL) return(handler);
1668            handler = xmlFindCharEncodingHandler("ebcdic");
1669            if (handler != NULL) return(handler);
1670            break;
1671        case XML_CHAR_ENCODING_UCS4BE:
1672            handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1673            if (handler != NULL) return(handler);
1674            handler = xmlFindCharEncodingHandler("UCS-4");
1675            if (handler != NULL) return(handler);
1676            handler = xmlFindCharEncodingHandler("UCS4");
1677            if (handler != NULL) return(handler);
1678            break;
1679        case XML_CHAR_ENCODING_UCS4LE:
1680            handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
1681            if (handler != NULL) return(handler);
1682            handler = xmlFindCharEncodingHandler("UCS-4");
1683            if (handler != NULL) return(handler);
1684            handler = xmlFindCharEncodingHandler("UCS4");
1685            if (handler != NULL) return(handler);
1686            break;
1687        case XML_CHAR_ENCODING_UCS4_2143:
1688            break;
1689        case XML_CHAR_ENCODING_UCS4_3412:
1690            break;
1691        case XML_CHAR_ENCODING_UCS2:
1692            handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
1693            if (handler != NULL) return(handler);
1694            handler = xmlFindCharEncodingHandler("UCS-2");
1695            if (handler != NULL) return(handler);
1696            handler = xmlFindCharEncodingHandler("UCS2");
1697            if (handler != NULL) return(handler);
1698            break;
1699
1700            /*
1701             * We used to keep ISO Latin encodings native in the
1702             * generated data. This led to so many problems that
1703             * this has been removed. One can still change this
1704             * back by registering no-ops encoders for those
1705             */
1706        case XML_CHAR_ENCODING_8859_1:
1707            handler = xmlFindCharEncodingHandler("ISO-8859-1");
1708            if (handler != NULL) return(handler);
1709            break;
1710        case XML_CHAR_ENCODING_8859_2:
1711            handler = xmlFindCharEncodingHandler("ISO-8859-2");
1712            if (handler != NULL) return(handler);
1713            break;
1714        case XML_CHAR_ENCODING_8859_3:
1715            handler = xmlFindCharEncodingHandler("ISO-8859-3");
1716            if (handler != NULL) return(handler);
1717            break;
1718        case XML_CHAR_ENCODING_8859_4:
1719            handler = xmlFindCharEncodingHandler("ISO-8859-4");
1720            if (handler != NULL) return(handler);
1721            break;
1722        case XML_CHAR_ENCODING_8859_5:
1723            handler = xmlFindCharEncodingHandler("ISO-8859-5");
1724            if (handler != NULL) return(handler);
1725            break;
1726        case XML_CHAR_ENCODING_8859_6:
1727            handler = xmlFindCharEncodingHandler("ISO-8859-6");
1728            if (handler != NULL) return(handler);
1729            break;
1730        case XML_CHAR_ENCODING_8859_7:
1731            handler = xmlFindCharEncodingHandler("ISO-8859-7");
1732            if (handler != NULL) return(handler);
1733            break;
1734        case XML_CHAR_ENCODING_8859_8:
1735            handler = xmlFindCharEncodingHandler("ISO-8859-8");
1736            if (handler != NULL) return(handler);
1737            break;
1738        case XML_CHAR_ENCODING_8859_9:
1739            handler = xmlFindCharEncodingHandler("ISO-8859-9");
1740            if (handler != NULL) return(handler);
1741            break;
1742
1743
1744        case XML_CHAR_ENCODING_2022_JP:
1745            handler = xmlFindCharEncodingHandler("ISO-2022-JP");
1746            if (handler != NULL) return(handler);
1747            break;
1748        case XML_CHAR_ENCODING_SHIFT_JIS:
1749            handler = xmlFindCharEncodingHandler("SHIFT-JIS");
1750            if (handler != NULL) return(handler);
1751            handler = xmlFindCharEncodingHandler("SHIFT_JIS");
1752            if (handler != NULL) return(handler);
1753            handler = xmlFindCharEncodingHandler("Shift_JIS");
1754            if (handler != NULL) return(handler);
1755            break;
1756        case XML_CHAR_ENCODING_EUC_JP:
1757            handler = xmlFindCharEncodingHandler("EUC-JP");
1758            if (handler != NULL) return(handler);
1759            break;
1760        default:
1761            break;
1762    }
1763   
1764#ifdef DEBUG_ENCODING
1765    xmlGenericError(xmlGenericErrorContext,
1766            "No handler found for encoding %d\n", enc);
1767#endif
1768    return(NULL);
1769}
1770
1771/**
1772 * xmlFindCharEncodingHandler:
1773 * @name:  a string describing the char encoding.
1774 *
1775 * Search in the registered set the handler able to read/write that encoding.
1776 *
1777 * Returns the handler or NULL if not found
1778 */
1779xmlCharEncodingHandlerPtr
1780xmlFindCharEncodingHandler(const char *name) {
1781    const char *nalias;
1782    const char *norig;
1783    xmlCharEncoding alias;
1784#ifdef LIBXML_ICONV_ENABLED
1785    xmlCharEncodingHandlerPtr enc;
1786    iconv_t icv_in, icv_out;
1787#endif /* LIBXML_ICONV_ENABLED */
1788    char upper[100];
1789    int i;
1790
1791    if (handlers == NULL) xmlInitCharEncodingHandlers();
1792    if (name == NULL) return(xmlDefaultCharEncodingHandler);
1793    if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
1794
1795    /*
1796     * Do the alias resolution
1797     */
1798    norig = name;
1799    nalias = xmlGetEncodingAlias(name);
1800    if (nalias != NULL)
1801        name = nalias;
1802
1803    /*
1804     * Check first for directly registered encoding names
1805     */
1806    for (i = 0;i < 99;i++) {
1807        upper[i] = toupper(name[i]);
1808        if (upper[i] == 0) break;
1809    }
1810    upper[i] = 0;
1811
1812    for (i = 0;i < nbCharEncodingHandler; i++)
1813        if (!strcmp(upper, handlers[i]->name)) {
1814#ifdef DEBUG_ENCODING
1815            xmlGenericError(xmlGenericErrorContext,
1816                    "Found registered handler for encoding %s\n", name);
1817#endif
1818            return(handlers[i]);
1819        }
1820
1821#ifdef LIBXML_ICONV_ENABLED
1822    /* check whether iconv can handle this */
1823    icv_in = iconv_open("UTF-8", name);
1824    icv_out = iconv_open(name, "UTF-8");
1825    if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
1826            enc = (xmlCharEncodingHandlerPtr)
1827                  xmlMalloc(sizeof(xmlCharEncodingHandler));
1828            if (enc == NULL) {
1829                iconv_close(icv_in);
1830                iconv_close(icv_out);
1831                return(NULL);
1832            }
1833            enc->name = xmlMemStrdup(name);
1834            enc->input = NULL;
1835            enc->output = NULL;
1836            enc->iconv_in = icv_in;
1837            enc->iconv_out = icv_out;
1838#ifdef DEBUG_ENCODING
1839            xmlGenericError(xmlGenericErrorContext,
1840                    "Found iconv handler for encoding %s\n", name);
1841#endif
1842            return enc;
1843    } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
1844            xmlGenericError(xmlGenericErrorContext,
1845                    "iconv : problems with filters for '%s'\n", name);
1846    }
1847#endif /* LIBXML_ICONV_ENABLED */
1848
1849#ifdef DEBUG_ENCODING
1850    xmlGenericError(xmlGenericErrorContext,
1851            "No handler found for encoding %s\n", name);
1852#endif
1853
1854    /*
1855     * Fallback using the canonical names
1856     */
1857    alias = xmlParseCharEncoding(norig);
1858    if (alias != XML_CHAR_ENCODING_ERROR) {
1859        const char* canon;
1860        canon = xmlGetCharEncodingName(alias);
1861        if ((canon != NULL) && (strcmp(name, canon))) {
1862            return(xmlFindCharEncodingHandler(canon));
1863        }
1864    }
1865
1866    return(NULL);
1867}
1868
1869/************************************************************************
1870 *                                                                      *
1871 *              ICONV based generic conversion functions                *
1872 *                                                                      *
1873 ************************************************************************/
1874
1875#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:         iconv converter data structure
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes in the source encoding of @cd,
 *       or NULL (the pointer is then handed to iconv() as is)
 * @inlen:  the length of @in
 *
 * Thin wrapper around iconv() translating its errno based error
 * reporting into the return codes used by the encoding handlers.
 *
 * Returns 0 if success, or
 *     -1 by lack of space in @out (E2BIG), or
 *     -2 if the transcoding fails (EILSEQ, i.e. *in is not a valid
 *        sequence or the result of the transformation can't fit into
 *        the encoding we want), or
 *     -3 if the last bytes of @in can't form a complete character
 *        (EINVAL) or for any other failure.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 * Both are set to 0 when @in is NULL.
 */
static int
xmlIconvWrapper(iconv_t cd,
    unsigned char *out, int *outlen,
    const unsigned char *in, int *inlen) {

    size_t icv_inlen = *inlen, icv_outlen = *outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;
    int err;

    /*
     * iconv() returns a size_t and only sets errno on failure: clear it
     * first and save it right after the call so that a stale value can
     * never select the error code returned below.
     */
    errno = 0;
    ret = iconv(cd, (char **) &icv_in, &icv_inlen, &icv_out, &icv_outlen);
    err = errno;

    if (in != NULL) {
        *inlen -= icv_inlen;
        *outlen -= icv_outlen;
    } else {
        *inlen = 0;
        *outlen = 0;
    }

    if ((icv_inlen != 0) || (ret == (size_t) -1)) {
#ifdef EILSEQ
        if (err == EILSEQ)
            return -2;
#endif
#ifdef E2BIG
        if (err == E2BIG)
            return -1;
#endif
        /* EINVAL (truncated input) and anything unexpected */
        return -3;
    }
    return 0;
}
1934#endif /* LIBXML_ICONV_ENABLED */
1935
1936/************************************************************************
1937 *                                                                      *
1938 *              The real API used by libxml for on-the-fly conversion   *
1939 *                                                                      *
1940 ************************************************************************/
1941
1942/**
1943 * xmlCharEncFirstLine:
1944 * @handler:    char enconding transformation data structure
1945 * @out:  an xmlBuffer for the output.
1946 * @in:  an xmlBuffer for the input
1947 *     
1948 * Front-end for the encoding handler input function, but handle only
1949 * the very first line, i.e. limit itself to 45 chars.
1950 *     
1951 * Returns the number of byte written if success, or
1952 *     -1 general error
1953 *     -2 if the transcoding fails (for *in is not valid utf8 string or
1954 *        the result of transformation can't fit into the encoding we want), or
1955 */
1956int
1957xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
1958                 xmlBufferPtr in) {
1959    int ret = -2;
1960    int written;
1961    int toconv;
1962
1963    if (handler == NULL) return(-1);
1964    if (out == NULL) return(-1);
1965    if (in == NULL) return(-1);
1966
1967    written = out->size - out->use;
1968    toconv = in->use;
1969    if (toconv * 2 >= written) {
1970        xmlBufferGrow(out, toconv);
1971        written = out->size - out->use - 1;
1972    }
1973
1974    /*
1975     * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
1976     * 45 chars should be sufficient to reach the end of the encoding
1977     * declaration without going too far inside the document content.
1978     */
1979    written = 45;
1980
1981    if (handler->input != NULL) {
1982        ret = handler->input(&out->content[out->use], &written,
1983                             in->content, &toconv);
1984        xmlBufferShrink(in, toconv);
1985        out->use += written;
1986        out->content[out->use] = 0;
1987    }
1988#ifdef LIBXML_ICONV_ENABLED
1989    else if (handler->iconv_in != NULL) {
1990        ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
1991                              &written, in->content, &toconv);
1992        xmlBufferShrink(in, toconv);
1993        out->use += written;
1994        out->content[out->use] = 0;
1995        if (ret == -1) ret = -3;
1996    }
1997#endif /* LIBXML_ICONV_ENABLED */
1998#ifdef DEBUG_ENCODING
1999    switch (ret) {
2000        case 0:
2001            xmlGenericError(xmlGenericErrorContext,
2002                    "converted %d bytes to %d bytes of input\n",
2003                    toconv, written);
2004            break;
2005        case -1:
2006            xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2007                    toconv, written, in->use);
2008            break;
2009        case -2:
2010            xmlGenericError(xmlGenericErrorContext,
2011                    "input conversion failed due to input error\n");
2012            break;
2013        case -3:
2014            xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
2015                    toconv, written, in->use);
2016            break;
2017        default:
2018            xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
2019    }
2020#endif /* DEBUG_ENCODING */
2021    /*
2022     * Ignore when input buffer is not on a boundary
2023     */
2024    if (ret == -3) ret = 0;
2025    if (ret == -1) ret = 0;
2026    return(ret);
2027}
2028
2029/**
2030 * xmlCharEncInFunc:
2031 * @handler:    char encoding transformation data structure
2032 * @out:  an xmlBuffer for the output.
2033 * @in:  an xmlBuffer for the input
2034 *     
2035 * Generic front-end for the encoding handler input function
2036 *     
2037 * Returns the number of byte written if success, or
2038 *     -1 general error
2039 *     -2 if the transcoding fails (for *in is not valid utf8 string or
2040 *        the result of transformation can't fit into the encoding we want), or
2041 */
2042int
2043xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
2044                 xmlBufferPtr in)
2045{
2046    int ret = -2;
2047    int written;
2048    int toconv;
2049
2050    if (handler == NULL)
2051        return (-1);
2052    if (out == NULL)
2053        return (-1);
2054    if (in == NULL)
2055        return (-1);
2056
2057    toconv = in->use;
2058    if (toconv == 0)
2059        return (0);
2060    written = out->size - out->use;
2061    if (toconv * 2 >= written) {
2062        xmlBufferGrow(out, out->size + toconv * 2);
2063        written = out->size - out->use - 1;
2064    }
2065    if (handler->input != NULL) {
2066        ret = handler->input(&out->content[out->use], &written,
2067                             in->content, &toconv);
2068        xmlBufferShrink(in, toconv);
2069        out->use += written;
2070        out->content[out->use] = 0;
2071    }
2072#ifdef LIBXML_ICONV_ENABLED
2073    else if (handler->iconv_in != NULL) {
2074        ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
2075                              &written, in->content, &toconv);
2076        xmlBufferShrink(in, toconv);
2077        out->use += written;
2078        out->content[out->use] = 0;
2079        if (ret == -1)
2080            ret = -3;
2081    }
2082#endif /* LIBXML_ICONV_ENABLED */
2083    switch (ret) {
2084        case 0:
2085#ifdef DEBUG_ENCODING
2086            xmlGenericError(xmlGenericErrorContext,
2087                            "converted %d bytes to %d bytes of input\n",
2088                            toconv, written);
2089#endif
2090            break;
2091        case -1:
2092#ifdef DEBUG_ENCODING
2093            xmlGenericError(xmlGenericErrorContext,
2094                         "converted %d bytes to %d bytes of input, %d left\n",
2095                            toconv, written, in->use);
2096#endif
2097            break;
2098        case -3:
2099#ifdef DEBUG_ENCODING
2100            xmlGenericError(xmlGenericErrorContext,
2101                        "converted %d bytes to %d bytes of input, %d left\n",
2102                            toconv, written, in->use);
2103#endif
2104            break;
2105        case -2:
2106            xmlGenericError(xmlGenericErrorContext,
2107                            "input conversion failed due to input error\n");
2108            xmlGenericError(xmlGenericErrorContext,
2109                            "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2110                            in->content[0], in->content[1],
2111                            in->content[2], in->content[3]);
2112    }
2113    /*
2114     * Ignore when input buffer is not on a boundary
2115     */
2116    if (ret == -3)
2117        ret = 0;
2118    return (ret);
2119}
2120
2121/**
2122 * xmlCharEncOutFunc:
2123 * @handler:    char enconding transformation data structure
2124 * @out:  an xmlBuffer for the output.
2125 * @in:  an xmlBuffer for the input
2126 *     
2127 * Generic front-end for the encoding handler output function
2128 * a first call with @in == NULL has to be made firs to initiate the
2129 * output in case of non-stateless encoding needing to initiate their
2130 * state or the output (like the BOM in UTF16).
2131 * In case of UTF8 sequence conversion errors for the given encoder,
2132 * the content will be automatically remapped to a CharRef sequence.
2133 *     
2134 * Returns the number of byte written if success, or
2135 *     -1 general error
2136 *     -2 if the transcoding fails (for *in is not valid utf8 string or
2137 *        the result of transformation can't fit into the encoding we want), or
2138 */
2139int
2140xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
2141                  xmlBufferPtr in) {
2142    int ret = -2;
2143    int written;
2144    int writtentot = 0;
2145    int toconv;
2146    int output = 0;
2147
2148    if (handler == NULL) return(-1);
2149    if (out == NULL) return(-1);
2150
2151retry:
2152   
2153    written = out->size - out->use;
2154
2155    /*
2156     * First specific handling of in = NULL, i.e. the initialization call
2157     */
2158    if (in == NULL) {
2159        toconv = 0;
2160        if (handler->output != NULL) {
2161            ret = handler->output(&out->content[out->use], &written,
2162                                  NULL, &toconv);
2163            out->use += written;
2164            out->content[out->use] = 0;
2165        }
2166#ifdef LIBXML_ICONV_ENABLED
2167        else if (handler->iconv_out != NULL) {
2168            ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2169                                  &written, NULL, &toconv);
2170            out->use += written;
2171            out->content[out->use] = 0;
2172        }
2173#endif /* LIBXML_ICONV_ENABLED */
2174#ifdef DEBUG_ENCODING
2175        xmlGenericError(xmlGenericErrorContext,
2176                "initialized encoder\n");
2177#endif
2178        return(0);
2179    }
2180
2181    /*
2182     * Conversion itself.
2183     */
2184    toconv = in->use;
2185    if (toconv == 0)
2186        return(0);
2187    if (toconv * 2 >= written) {
2188        xmlBufferGrow(out, toconv * 2);
2189        written = out->size - out->use - 1;
2190    }
2191    if (handler->output != NULL) {
2192        ret = handler->output(&out->content[out->use], &written,
2193                              in->content, &toconv);
2194        xmlBufferShrink(in, toconv);
2195        out->use += written;
2196        writtentot += written;
2197        out->content[out->use] = 0;
2198    }
2199#ifdef LIBXML_ICONV_ENABLED
2200    else if (handler->iconv_out != NULL) {
2201        ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
2202                              &written, in->content, &toconv);
2203        xmlBufferShrink(in, toconv);
2204        out->use += written;
2205        writtentot += written;
2206        out->content[out->use] = 0;
2207        if (ret == -1) {
2208            if (written > 0) {
2209                /*
2210                 * Can be a limitation of iconv
2211                 */
2212                goto retry;
2213            }
2214            ret = -3;
2215        }
2216    }
2217#endif /* LIBXML_ICONV_ENABLED */
2218    else {
2219        xmlGenericError(xmlGenericErrorContext,
2220                "xmlCharEncOutFunc: no output function !\n");
2221        return(-1);
2222    }
2223
2224    if (ret >= 0) output += ret;
2225
2226    /*
2227     * Attempt to handle error cases
2228     */
2229    switch (ret) {
2230        case 0:
2231#ifdef DEBUG_ENCODING
2232            xmlGenericError(xmlGenericErrorContext,
2233                    "converted %d bytes to %d bytes of output\n",
2234                    toconv, written);
2235#endif
2236            break;
2237        case -1:
2238#ifdef DEBUG_ENCODING
2239            xmlGenericError(xmlGenericErrorContext,
2240                    "output conversion failed by lack of space\n");
2241#endif
2242            break;
2243        case -3:
2244            xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
2245                    toconv, written, in->use);
2246            break;
2247        case -2: {
2248            int len = in->use;
2249            const xmlChar *utf = (const xmlChar *) in->content;
2250            int cur;
2251
2252            cur = xmlGetUTF8Char(utf, &len);
2253            if (cur > 0) {
2254                xmlChar charref[20];
2255
2256#ifdef DEBUG_ENCODING
2257                xmlGenericError(xmlGenericErrorContext,
2258                        "handling output conversion error\n");
2259                xmlGenericError(xmlGenericErrorContext,
2260                        "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2261                        in->content[0], in->content[1],
2262                        in->content[2], in->content[3]);
2263#endif
2264                /*
2265                 * Removes the UTF8 sequence, and replace it by a charref
2266                 * and continue the transcoding phase, hoping the error
2267                 * did not mangle the encoder state.
2268                 */
2269                sprintf((char *) charref, "&#%d;", cur);
2270                xmlBufferShrink(in, len);
2271                xmlBufferAddHead(in, charref, -1);
2272
2273                goto retry;
2274            } else {
2275                xmlGenericError(xmlGenericErrorContext,
2276                        "output conversion failed due to conv error\n");
2277                xmlGenericError(xmlGenericErrorContext,
2278                        "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
2279                        in->content[0], in->content[1],
2280                        in->content[2], in->content[3]);
2281                in->content[0] = ' ';
2282            }
2283            break;
2284        }
2285    }
2286    return(ret);
2287}
2288
2289/**
2290 * xmlCharEncCloseFunc:
2291 * @handler:    char enconding transformation data structure
2292 *     
2293 * Generic front-end for encoding handler close function
2294 *
2295 * Returns 0 if success, or -1 in case of error
2296 */
2297int
2298xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
2299    int ret = 0;
2300    if (handler == NULL) return(-1);
2301    if (handler->name == NULL) return(-1);
2302#ifdef LIBXML_ICONV_ENABLED
2303    /*
2304     * Iconv handlers can be used only once, free the whole block.
2305     * and the associated icon resources.
2306     */
2307    if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
2308        if (handler->name != NULL)
2309            xmlFree(handler->name);
2310        handler->name = NULL;
2311        if (handler->iconv_out != NULL) {
2312            if (iconv_close(handler->iconv_out))
2313                ret = -1;
2314            handler->iconv_out = NULL;
2315        }
2316        if (handler->iconv_in != NULL) {
2317            if (iconv_close(handler->iconv_in))
2318                ret = -1;
2319            handler->iconv_in = NULL;
2320        }
2321        xmlFree(handler);
2322    }
2323#endif /* LIBXML_ICONV_ENABLED */
2324#ifdef DEBUG_ENCODING
2325    if (ret)
2326        xmlGenericError(xmlGenericErrorContext,
2327                "failed to close the encoding handler\n");
2328    else
2329        xmlGenericError(xmlGenericErrorContext,
2330                "closed the encoding handler\n");
2331#endif
2332
2333    return(ret);
2334}
2335
Note: See TracBrowser for help on using the repository browser.