source: trunk/third/jwgc/lib/libxode/xmltok.c @ 22406

Revision 22406, 37.1 KB checked in by ghudson, 19 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r22405, which included commits to RCS files with non-trunk default branches.
Line 
1/*
2Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
3See the file COPYING for copying permission.
4*/
5
6
7#ifdef COMPILED_FROM_DSP
8#  include "winconfig.h"
9#else
10#  include <config.h>
11#endif /* ndef COMPILED_FROM_DSP */
12
13#include "xmltok.h"
14#include "nametab.h"
15
16#ifdef XML_DTD
17#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
18#else
19#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
20#endif
21
22#define VTABLE1 \
23  { PREFIX(prologTok), PREFIX(contentTok), \
24    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
25  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
26  PREFIX(sameName), \
27  PREFIX(nameMatchesAscii), \
28  PREFIX(nameLength), \
29  PREFIX(skipS), \
30  PREFIX(getAtts), \
31  PREFIX(charRefNumber), \
32  PREFIX(predefinedEntityName), \
33  PREFIX(updatePosition), \
34  PREFIX(isPublicId)
35
36#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
37
/* Test whether the UCS-2 character with high byte `hi` and low byte `lo`
   is flagged in a naming table.  pages[hi] selects a group of eight words
   in namingBitmap, the top 3 bits of `lo` pick the word within the group
   and the low 5 bits of `lo` pick the bit.  Yields non-zero when the bit
   is set.  The mask is built from an unsigned constant so that selecting
   bit 31 never shifts a 1 into the sign bit of an int. */
#define UCS2_GET_NAMING(pages, hi, lo) \
   (namingBitmap[((pages)[hi] << 3) + ((lo) >> 5)] & (1u << ((lo) & 0x1F)))
40
/* A 2 byte UTF-8 representation splits the character's 11 bits
between the bottom 5 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask.  The mask uses an unsigned 1 so that
bit 31 can be selected without shifting into the sign bit. */
#define UTF8_GET_NAMING2(pages, byte) \
    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
                      + ((((byte)[0]) & 3) << 1) \
                      + ((((byte)[1]) >> 5) & 1)] \
         & (1u << (((byte)[1]) & 0x1F)))

/* A 3 byte UTF-8 representation splits the character's 16 bits
between the bottom 4, 6 and 6 bits of the bytes.
We need 8 bits to index into pages, 3 bits to add to that index and
5 bits to generate the mask. */
#define UTF8_GET_NAMING3(pages, byte) \
  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
                             + ((((byte)[1]) >> 2) & 0xF)] \
                       << 3) \
                      + ((((byte)[1]) & 3) << 1) \
                      + ((((byte)[2]) >> 5) & 1)] \
         & (1u << (((byte)[2]) & 0x1F)))

/* Dispatch on the sequence length n: 2- and 3-byte sequences are looked
   up in the naming bitmap, any other length yields 0. */
#define UTF8_GET_NAMING(pages, p, n) \
  ((n) == 2 \
  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
  : ((n) == 3 \
     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
     : 0))
69
/* Non-zero when the 3-byte sequence at p encodes a value this tokenizer
   rejects: lead byte 0xED with bit 0x20 set in the second byte (the
   UTF-16 surrogate range), or the sequences EF BF BE / EF BF BF
   (U+FFFE / U+FFFF).  p must point to unsigned char.  Every use of the
   argument is parenthesised so an expression such as `s + 1` is safe. */
#define UTF8_INVALID3(p) \
  (*(p) == 0xED \
  ? (((p)[1] & 0x20) != 0) \
  : (*(p) == 0xEF \
     ? ((p)[1] == 0xBF && ((p)[2] == 0xBF || (p)[2] == 0xBE)) \
     : 0))

/* Non-zero when the 4-byte sequence at p starts with 0xF4 and its second
   byte has either of the bits 0x30 set, i.e. the value exceeds U+10FFFF. */
#define UTF8_INVALID4(p) (*(p) == 0xF4 && ((p)[1] & 0x30) != 0)
78
79static
80int isNever(const ENCODING *enc, const char *p)
81{
82  return 0;
83}
84
85static
86int utf8_isName2(const ENCODING *enc, const char *p)
87{
88  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
89}
90
91static
92int utf8_isName3(const ENCODING *enc, const char *p)
93{
94  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
95}
96
97#define utf8_isName4 isNever
98
99static
100int utf8_isNmstrt2(const ENCODING *enc, const char *p)
101{
102  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
103}
104
105static
106int utf8_isNmstrt3(const ENCODING *enc, const char *p)
107{
108  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
109}
110
111#define utf8_isNmstrt4 isNever
112
113#define utf8_isInvalid2 isNever
114
115static
116int utf8_isInvalid3(const ENCODING *enc, const char *p)
117{
118  return UTF8_INVALID3((const unsigned char *)p);
119}
120
121static
122int utf8_isInvalid4(const ENCODING *enc, const char *p)
123{
124  return UTF8_INVALID4((const unsigned char *)p);
125}
126
/* An ENCODING extended with a per-byte classification table and the
   character predicates that the tokenizer macros in this file dispatch
   through.  `enc` is the first member: the macros cast an ENCODING
   pointer to a normal_encoding pointer to reach the other members.
   Member order matters because the encoding tables below are written
   as positional initializers. */
127struct normal_encoding {
128  ENCODING enc;
  /* Byte type for each possible byte value; indexed by (unsigned char). */
129  unsigned char type[256];
130#ifdef XML_MIN_SIZE
  /* With XML_MIN_SIZE the per-encoding primitives are reached through
     these pointers instead of being expanded inline as macros. */
131  int (*byteType)(const ENCODING *, const char *);
132  int (*isNameMin)(const ENCODING *, const char *);
133  int (*isNmstrtMin)(const ENCODING *, const char *);
134  int (*byteToAscii)(const ENCODING *, const char *);
135  int (*charMatches)(const ENCODING *, const char *, int);
136#endif /* XML_MIN_SIZE */
  /* Predicates for multi-byte characters; the digit in each name is the
     suffix pasted on by IS_NAME_CHAR / IS_NMSTRT_CHAR / IS_INVALID_CHAR. */
137  int (*isName2)(const ENCODING *, const char *);
138  int (*isName3)(const ENCODING *, const char *);
139  int (*isName4)(const ENCODING *, const char *);
140  int (*isNmstrt2)(const ENCODING *, const char *);
141  int (*isNmstrt3)(const ENCODING *, const char *);
142  int (*isNmstrt4)(const ENCODING *, const char *);
143  int (*isInvalid2)(const ENCODING *, const char *);
144  int (*isInvalid3)(const ENCODING *, const char *);
145  int (*isInvalid4)(const ENCODING *, const char *);
146};
147
148#ifdef XML_MIN_SIZE
149
150#define STANDARD_VTABLE(E) \
151 E ## byteType, \
152 E ## isNameMin, \
153 E ## isNmstrtMin, \
154 E ## byteToAscii, \
155 E ## charMatches,
156
157#else
158
159#define STANDARD_VTABLE(E) /* as nothing */
160
161#endif
162
163#define NORMAL_VTABLE(E) \
164 E ## isName2, \
165 E ## isName3, \
166 E ## isName4, \
167 E ## isNmstrt2, \
168 E ## isNmstrt3, \
169 E ## isNmstrt4, \
170 E ## isInvalid2, \
171 E ## isInvalid3, \
172 E ## isInvalid4
173
174static int checkCharRefNumber(int);
175
176#include "xmltok_impl.h"
177#include "ascii.h"
178
179#ifdef XML_MIN_SIZE
180#define sb_isNameMin isNever
181#define sb_isNmstrtMin isNever
182#endif
183
184#ifdef XML_MIN_SIZE
185#define MINBPC(enc) ((enc)->minBytesPerChar)
186#else
187/* minimum bytes per character */
188#define MINBPC(enc) 1
189#endif
190
/* Byte type of the single byte at p, read from the encoding's type table.
   The cast keeps the const qualifier of the ENCODING pointer it is given,
   like the other accessor macros in this file. */
#define SB_BYTE_TYPE(enc, p) \
  (((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)])

#ifdef XML_MIN_SIZE
/* Function form of SB_BYTE_TYPE, installed in the byteType slot. */
static
int sb_byteType(const ENCODING *enc, const char *p)
{
  return SB_BYTE_TYPE(enc, p);
}
#define BYTE_TYPE(enc, p) \
 (((const struct normal_encoding *)(enc))->byteType(enc, p))
#else
#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
#endif
205
206#ifdef XML_MIN_SIZE
207#define BYTE_TO_ASCII(enc, p) \
208 (((const struct normal_encoding *)(enc))->byteToAscii(enc, p))
209static
210int sb_byteToAscii(const ENCODING *enc, const char *p)
211{
212  return *p;
213}
214#else
215#define BYTE_TO_ASCII(enc, p) (*(p))
216#endif
217
218#define IS_NAME_CHAR(enc, p, n) \
219 (((const struct normal_encoding *)(enc))->isName ## n(enc, p))
220#define IS_NMSTRT_CHAR(enc, p, n) \
221 (((const struct normal_encoding *)(enc))->isNmstrt ## n(enc, p))
222#define IS_INVALID_CHAR(enc, p, n) \
223 (((const struct normal_encoding *)(enc))->isInvalid ## n(enc, p))
224
225#ifdef XML_MIN_SIZE
226#define IS_NAME_CHAR_MINBPC(enc, p) \
227 (((const struct normal_encoding *)(enc))->isNameMin(enc, p))
228#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
229 (((const struct normal_encoding *)(enc))->isNmstrtMin(enc, p))
230#else
231#define IS_NAME_CHAR_MINBPC(enc, p) (0)
232#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
233#endif
234
#ifdef XML_MIN_SIZE
#define CHAR_MATCHES(enc, p, c) \
 (((const struct normal_encoding *)(enc))->charMatches(enc, p, c))
/* Single-byte comparison installed in the charMatches slot. */
static
int sb_charMatches(const ENCODING *enc, const char *p, int c)
{
  return *p == c;
}
#else
/* c is an ASCII character.  It is parenthesised so that an expression
   argument (for example a conditional) compares as a whole. */
#define CHAR_MATCHES(enc, p, c) (*(p) == (c))
#endif
247
248#define PREFIX(ident) normal_ ## ident
249#include "xmltok_impl.c"
250
251#undef MINBPC
252#undef BYTE_TYPE
253#undef BYTE_TO_ASCII
254#undef CHAR_MATCHES
255#undef IS_NAME_CHAR
256#undef IS_NAME_CHAR_MINBPC
257#undef IS_NMSTRT_CHAR
258#undef IS_NMSTRT_CHAR_MINBPC
259#undef IS_INVALID_CHAR
260
261enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
262  UTF8_cval1 = 0x00,
263  UTF8_cval2 = 0xc0,
264  UTF8_cval3 = 0xe0,
265  UTF8_cval4 = 0xf0
266};
267
268static
269void utf8_toUtf8(const ENCODING *enc,
270                 const char **fromP, const char *fromLim,
271                 char **toP, const char *toLim)
272{
273  char *to;
274  const char *from;
275  if (fromLim - *fromP > toLim - *toP) {
276    /* Avoid copying partial characters. */
277    for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
278      if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
279        break;
280  }
281  for (to = *toP, from = *fromP; from != fromLim; from++, to++)
282    *to = *from;
283  *fromP = from;
284  *toP = to;
285}
286
287static
288void utf8_toUtf16(const ENCODING *enc,
289                  const char **fromP, const char *fromLim,
290                  unsigned short **toP, const unsigned short *toLim)
291{
292  unsigned short *to = *toP;
293  const char *from = *fromP;
294  while (from != fromLim && to != toLim) {
295    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
296    case BT_LEAD2:
297      *to++ = ((from[0] & 0x1f) << 6) | (from[1] & 0x3f);
298      from += 2;
299      break;
300    case BT_LEAD3:
301      *to++ = ((from[0] & 0xf) << 12) | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f);
302      from += 3;
303      break;
304    case BT_LEAD4:
305      {
306        unsigned long n;
307        if (to + 1 == toLim)
308          break;
309        n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12) | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
310        n -= 0x10000;
311        to[0] = (unsigned short)((n >> 10) | 0xD800);
312        to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
313        to += 2;
314        from += 4;
315      }
316      break;
317    default:
318      *to++ = *from++;
319      break;
320    }
321  }
322  *fromP = from;
323  *toP = to;
324}
325
/* Encoding descriptors for UTF-8 input.  Each one bundles the tokenizer
   entry points (VTABLE1), the two converters, three small integers
   (the first is presumably minBytesPerChar; the other two look like
   "is UTF-8" / "is UTF-16" flags -- confirm against xmltok.h), the
   256-entry byte type table and the character predicates.
   The *_ns variants include the ASCII table unchanged; the plain variants
   define BT_COLON as BT_NMSTRT while including it, so a colon is typed as
   a name-start character.  The internal_* variants differ only in using
   iasciitab.h instead of asciitab.h. */
326#ifdef XML_NS
327static const struct normal_encoding utf8_encoding_ns = {
328  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
329  {
330#include "asciitab.h"
331#include "utf8tab.h"
332  },
333  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
334};
335#endif
336
337static const struct normal_encoding utf8_encoding = {
338  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
339  {
340#define BT_COLON BT_NMSTRT
341#include "asciitab.h"
342#undef BT_COLON
343#include "utf8tab.h"
344  },
345  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
346};
347
348#ifdef XML_NS
349
350static const struct normal_encoding internal_utf8_encoding_ns = {
351  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
352  {
353#include "iasciitab.h"
354#include "utf8tab.h"
355  },
356  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
357};
358
359#endif
360
361static const struct normal_encoding internal_utf8_encoding = {
362  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
363  {
364#define BT_COLON BT_NMSTRT
365#include "iasciitab.h"
366#undef BT_COLON
367#include "utf8tab.h"
368  },
369  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
370};
371
372static
373void latin1_toUtf8(const ENCODING *enc,
374                   const char **fromP, const char *fromLim,
375                   char **toP, const char *toLim)
376{
377  for (;;) {
378    unsigned char c;
379    if (*fromP == fromLim)
380      break;
381    c = (unsigned char)**fromP;
382    if (c & 0x80) {
383      if (toLim - *toP < 2)
384        break;
385      *(*toP)++ = ((c >> 6) | UTF8_cval2);
386      *(*toP)++ = ((c & 0x3f) | 0x80);
387      (*fromP)++;
388    }
389    else {
390      if (*toP == toLim)
391        break;
392      *(*toP)++ = *(*fromP)++;
393    }
394  }
395}
396
397static
398void latin1_toUtf16(const ENCODING *enc,
399                    const char **fromP, const char *fromLim,
400                    unsigned short **toP, const unsigned short *toLim)
401{
402  while (*fromP != fromLim && *toP != toLim)
403    *(*toP)++ = (unsigned char)*(*fromP)++;
404}
405
/* Encoding descriptors for Latin-1 input.  The byte type table is the
   ASCII table followed by latin1tab.h.  Only the STANDARD_VTABLE slots
   are initialized; the multi-byte predicate members are left to the
   implicit zero-initialization of the remaining struct members.
   The _ns variant includes the ASCII table unchanged; the plain variant
   defines BT_COLON as BT_NMSTRT while including it. */
406#ifdef XML_NS
407
408static const struct normal_encoding latin1_encoding_ns = {
409  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
410  {
411#include "asciitab.h"
412#include "latin1tab.h"
413  },
414  STANDARD_VTABLE(sb_)
415};
416
417#endif
418
419static const struct normal_encoding latin1_encoding = {
420  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
421  {
422#define BT_COLON BT_NMSTRT
423#include "asciitab.h"
424#undef BT_COLON
425#include "latin1tab.h"
426  },
427  STANDARD_VTABLE(sb_)
428};
429
430static
431void ascii_toUtf8(const ENCODING *enc,
432                  const char **fromP, const char *fromLim,
433                  char **toP, const char *toLim)
434{
435  while (*fromP != fromLim && *toP != toLim)
436    *(*toP)++ = *(*fromP)++;
437}
438
/* Encoding descriptors for US-ASCII input.  Only the ASCII table is
   supplied for the byte type table, so the remaining entries are
   implicitly zero, which per the note inside the initializer is
   BT_NONXML.  UTF-16 conversion reuses latin1_toUtf16.
   The _ns variant includes the ASCII table unchanged; the plain variant
   defines BT_COLON as BT_NMSTRT while including it. */
439#ifdef XML_NS
440
441static const struct normal_encoding ascii_encoding_ns = {
442  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
443  {
444#include "asciitab.h"
445/* BT_NONXML == 0 */
446  },
447  STANDARD_VTABLE(sb_)
448};
449
450#endif
451
452static const struct normal_encoding ascii_encoding = {
453  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
454  {
455#define BT_COLON BT_NMSTRT
456#include "asciitab.h"
457#undef BT_COLON
458/* BT_NONXML == 0 */
459  },
460  STANDARD_VTABLE(sb_)
461};
462
463static int unicode_byte_type(char hi, char lo)
464{
465  switch ((unsigned char)hi) {
466  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
467    return BT_LEAD4;
468  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
469    return BT_TRAIL;
470  case 0xFF:
471    switch ((unsigned char)lo) {
472    case 0xFF:
473    case 0xFE:
474      return BT_NONXML;
475    }
476    break;
477  }
478  return BT_NONASCII;
479}
480
/* Define E##toUtf8: convert UTF-16 code units from [*fromP, fromLim) to
   UTF-8 at [*toP, toLim).  Byte order comes from the GET_LO / GET_HI
   macros in effect where this macro is instantiated.

   The high byte of each unit picks the output length:
     0x00 with low byte < 0x80  -> 1 byte
     0x00..0x07 otherwise       -> 2 bytes
     0xD8..0xDB (lead surrogate, combined with the following unit)
                                -> 4 bytes
     anything else              -> 3 bytes

   The function returns with *fromP at the first unit not converted when
   the output cannot hold the next character.  It never reads outside
   [*fromP, fromLim): a trailing odd byte is left unread, and a lead
   surrogate whose partner is not inside the input is left unconverted. */
#define DEFINE_UTF16_TO_UTF8(E) \
static \
void E ## toUtf8(const ENCODING *enc, \
                 const char **fromP, const char *fromLim, \
                 char **toP, const char *toLim) \
{ \
  const char *from; \
  /* Only whole 2-byte units are consumed. */ \
  for (from = *fromP; fromLim - from >= 2; from += 2) { \
    int plane; \
    unsigned char lo2; \
    unsigned char lo = GET_LO(from); \
    unsigned char hi = GET_HI(from); \
    switch (hi) { \
    case 0: \
      if (lo < 0x80) { \
        if (*toP == toLim) { \
          *fromP = from; \
          return; \
        } \
        *(*toP)++ = lo; \
        break; \
      } \
      /* fall through */ \
    case 0x1: case 0x2: case 0x3: \
    case 0x4: case 0x5: case 0x6: case 0x7: \
      if (toLim -  *toP < 2) { \
        *fromP = from; \
        return; \
      } \
      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
      /* Both halves of the pair must be present in the input and the \
         four output bytes must fit. */ \
      if (fromLim - from < 4 || toLim -  *toP < 4) { \
        *fromP = from; \
        return; \
      } \
      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
      from += 2; \
      lo2 = GET_LO(from); \
      *(*toP)++ = (((lo & 0x3) << 4) \
                   | ((GET_HI(from) & 0x3) << 2) \
                   | (lo2 >> 6) \
                   | 0x80); \
      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
      break; \
    default: \
      if (toLim -  *toP < 3)  { \
        *fromP = from; \
        return; \
      } \
      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
      *(*toP)++ = ((lo & 0x3f) | 0x80); \
      break; \
    } \
  } \
  *fromP = from; \
}
543
/* Define E##toUtf16: copy UTF-16 code units from the byte range
   [*fromP, fromLim) into native unsigned shorts at [*toP, toLim), using
   the GET_LO / GET_HI macros in effect at the point of instantiation.
   If the input is longer than the output space and its last unit has a
   high byte in 0xD8..0xDF, that unit is dropped from this call so that
   the first half of a surrogate pair is not copied alone. */
#define DEFINE_UTF16_TO_UTF16(E) \
static \
void E ## toUtf16(const ENCODING *enc, \
                  const char **fromP, const char *fromLim, \
                  unsigned short **toP, const unsigned short *toLim) \
{ \
  const char *src = *fromP; \
  unsigned short *dst = *toP; \
  if (fromLim - src > ((toLim - dst) << 1) \
      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
    fromLim -= 2; \
  while (src != fromLim && dst != toLim) { \
    *dst++ = (GET_HI(src) << 8) | GET_LO(src); \
    src += 2; \
  } \
  *fromP = src; \
  *toP = dst; \
}
557
558#define SET2(ptr, ch) \
559  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
560#define GET_LO(ptr) ((unsigned char)(ptr)[0])
561#define GET_HI(ptr) ((unsigned char)(ptr)[1])
562
563DEFINE_UTF16_TO_UTF8(little2_)
564DEFINE_UTF16_TO_UTF16(little2_)
565
566#undef SET2
567#undef GET_LO
568#undef GET_HI
569
570#define SET2(ptr, ch) \
571  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
572#define GET_LO(ptr) ((unsigned char)(ptr)[1])
573#define GET_HI(ptr) ((unsigned char)(ptr)[0])
574
575DEFINE_UTF16_TO_UTF8(big2_)
576DEFINE_UTF16_TO_UTF16(big2_)
577
578#undef SET2
579#undef GET_LO
580#undef GET_HI
581
/* Primitives for little-endian 2-byte characters: (p)[0] is the low byte
   and (p)[1] the high byte.  Characters whose high byte is zero use the
   encoding's byte type table; all others go through unicode_byte_type.
   Every macro argument is parenthesised so that pointer expressions and
   conditional expressions can be passed safely. */
#define LITTLE2_BYTE_TYPE(enc, p) \
 ((p)[1] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
  : unicode_byte_type((p)[1], (p)[0]))
/* The character as an ASCII-range value, or -1 if its high byte is set. */
#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
/* c is an ASCII character. */
#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == (c))
#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[1], (unsigned char)(p)[0])
#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[1], (unsigned char)(p)[0])
592
593#ifdef XML_MIN_SIZE
594
595static
596int little2_byteType(const ENCODING *enc, const char *p)
597{
598  return LITTLE2_BYTE_TYPE(enc, p);
599}
600
601static
602int little2_byteToAscii(const ENCODING *enc, const char *p)
603{
604  return LITTLE2_BYTE_TO_ASCII(enc, p);
605}
606
607static
608int little2_charMatches(const ENCODING *enc, const char *p, int c)
609{
610  return LITTLE2_CHAR_MATCHES(enc, p, c);
611}
612
613static
614int little2_isNameMin(const ENCODING *enc, const char *p)
615{
616  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
617}
618
619static
620int little2_isNmstrtMin(const ENCODING *enc, const char *p)
621{
622  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
623}
624
625#undef VTABLE
626#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
627
628#else /* not XML_MIN_SIZE */
629
630#undef PREFIX
631#define PREFIX(ident) little2_ ## ident
632#define MINBPC(enc) 2
633/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
634#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
635#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
636#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
637#define IS_NAME_CHAR(enc, p, n) 0
638#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
639#define IS_NMSTRT_CHAR(enc, p, n) (0)
640#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
641
642#include "xmltok_impl.c"
643
644#undef MINBPC
645#undef BYTE_TYPE
646#undef BYTE_TO_ASCII
647#undef CHAR_MATCHES
648#undef IS_NAME_CHAR
649#undef IS_NAME_CHAR_MINBPC
650#undef IS_NMSTRT_CHAR
651#undef IS_NMSTRT_CHAR_MINBPC
652#undef IS_INVALID_CHAR
653
654#endif /* not XML_MIN_SIZE */
655
656#ifdef XML_NS
657
658static const struct normal_encoding little2_encoding_ns = {
659  { VTABLE, 2, 0,
660#if XML_BYTE_ORDER == 12
661    1
662#else
663    0
664#endif
665  },
666  {
667#include "asciitab.h"
668#include "latin1tab.h"
669  },
670  STANDARD_VTABLE(little2_)
671};
672
673#endif
674
675static const struct normal_encoding little2_encoding = {
676  { VTABLE, 2, 0,
677#if XML_BYTE_ORDER == 12
678    1
679#else
680    0
681#endif
682  },
683  {
684#define BT_COLON BT_NMSTRT
685#include "asciitab.h"
686#undef BT_COLON
687#include "latin1tab.h"
688  },
689  STANDARD_VTABLE(little2_)
690};
691
692#if XML_BYTE_ORDER != 21
693
694#ifdef XML_NS
695
696static const struct normal_encoding internal_little2_encoding_ns = {
697  { VTABLE, 2, 0, 1 },
698  {
699#include "iasciitab.h"
700#include "latin1tab.h"
701  },
702  STANDARD_VTABLE(little2_)
703};
704
705#endif
706
707static const struct normal_encoding internal_little2_encoding = {
708  { VTABLE, 2, 0, 1 },
709  {
710#define BT_COLON BT_NMSTRT
711#include "iasciitab.h"
712#undef BT_COLON
713#include "latin1tab.h"
714  },
715  STANDARD_VTABLE(little2_)
716};
717
718#endif
719
720
/* Primitives for big-endian 2-byte characters: (p)[0] is the high byte
   and (p)[1] the low byte.  Characters whose high byte is zero use the
   encoding's byte type table; all others go through unicode_byte_type.
   Every macro argument is parenthesised so that pointer expressions and
   conditional expressions can be passed safely. */
#define BIG2_BYTE_TYPE(enc, p) \
 ((p)[0] == 0 \
  ? ((const struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
  : unicode_byte_type((p)[0], (p)[1]))
/* The character as an ASCII-range value, or -1 if its high byte is set. */
#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
/* c is an ASCII character. */
#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == (c))
#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(namePages, (unsigned char)(p)[0], (unsigned char)(p)[1])
#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
  UCS2_GET_NAMING(nmstrtPages, (unsigned char)(p)[0], (unsigned char)(p)[1])
731
732#ifdef XML_MIN_SIZE
733
734static
735int big2_byteType(const ENCODING *enc, const char *p)
736{
737  return BIG2_BYTE_TYPE(enc, p);
738}
739
740static
741int big2_byteToAscii(const ENCODING *enc, const char *p)
742{
743  return BIG2_BYTE_TO_ASCII(enc, p);
744}
745
746static
747int big2_charMatches(const ENCODING *enc, const char *p, int c)
748{
749  return BIG2_CHAR_MATCHES(enc, p, c);
750}
751
752static
753int big2_isNameMin(const ENCODING *enc, const char *p)
754{
755  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
756}
757
758static
759int big2_isNmstrtMin(const ENCODING *enc, const char *p)
760{
761  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
762}
763
764#undef VTABLE
765#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
766
767#else /* not XML_MIN_SIZE */
768
769#undef PREFIX
770#define PREFIX(ident) big2_ ## ident
771#define MINBPC(enc) 2
772/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
773#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
774#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
775#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
776#define IS_NAME_CHAR(enc, p, n) 0
777#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
778#define IS_NMSTRT_CHAR(enc, p, n) (0)
779#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
780
781#include "xmltok_impl.c"
782
783#undef MINBPC
784#undef BYTE_TYPE
785#undef BYTE_TO_ASCII
786#undef CHAR_MATCHES
787#undef IS_NAME_CHAR
788#undef IS_NAME_CHAR_MINBPC
789#undef IS_NMSTRT_CHAR
790#undef IS_NMSTRT_CHAR_MINBPC
791#undef IS_INVALID_CHAR
792
793#endif /* not XML_MIN_SIZE */
794
795#ifdef XML_NS
796
797static const struct normal_encoding big2_encoding_ns = {
798  { VTABLE, 2, 0,
799#if XML_BYTE_ORDER == 21
800  1
801#else
802  0
803#endif
804  },
805  {
806#include "asciitab.h"
807#include "latin1tab.h"
808  },
809  STANDARD_VTABLE(big2_)
810};
811
812#endif
813
814static const struct normal_encoding big2_encoding = {
815  { VTABLE, 2, 0,
816#if XML_BYTE_ORDER == 21
817  1
818#else
819  0
820#endif
821  },
822  {
823#define BT_COLON BT_NMSTRT
824#include "asciitab.h"
825#undef BT_COLON
826#include "latin1tab.h"
827  },
828  STANDARD_VTABLE(big2_)
829};
830
831#if XML_BYTE_ORDER != 12
832
833#ifdef XML_NS
834
835static const struct normal_encoding internal_big2_encoding_ns = {
836  { VTABLE, 2, 0, 1 },
837  {
838#include "iasciitab.h"
839#include "latin1tab.h"
840  },
841  STANDARD_VTABLE(big2_)
842};
843
844#endif
845
846static const struct normal_encoding internal_big2_encoding = {
847  { VTABLE, 2, 0, 1 },
848  {
849#define BT_COLON BT_NMSTRT
850#include "iasciitab.h"
851#undef BT_COLON
852#include "latin1tab.h"
853  },
854  STANDARD_VTABLE(big2_)
855};
856
857#endif
858
859#undef PREFIX
860
861static
862int streqci(const char *s1, const char *s2)
863{
864  for (;;) {
865    char c1 = *s1++;
866    char c2 = *s2++;
867    if (ASCII_a <= c1 && c1 <= ASCII_z)
868      c1 += ASCII_A - ASCII_a;
869    if (ASCII_a <= c2 && c2 <= ASCII_z)
870      c2 += ASCII_A - ASCII_a;
871    if (c1 != c2)
872      return 0;
873    if (!c1)
874      break;
875  }
876  return 1;
877}
878
879static
880void initUpdatePosition(const ENCODING *enc, const char *ptr,
881                        const char *end, POSITION *pos)
882{
883  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
884}
885
886static
887int toAscii(const ENCODING *enc, const char *ptr, const char *end)
888{
889  char buf[1];
890  char *p = buf;
891  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
892  if (p == buf)
893    return -1;
894  else
895    return buf[0];
896}
897
/* Returns 1 if c is space (0x20), carriage return (0xD), line feed (0xA)
   or tab (0x9); 0 for every other value. */
static
int isSpace(int c)
{
  return c == 0x20 || c == 0xD || c == 0xA || c == 0x9;
}
910
911/* Return 1 if there's just optional white space
912or there's an S followed by name=val. */
913static
914int parsePseudoAttribute(const ENCODING *enc,
915                         const char *ptr,
916                         const char *end,
917                         const char **namePtr,
918                         const char **nameEndPtr,
919                         const char **valPtr,
920                         const char **nextTokPtr)
921{
922  int c;
923  char open;
924  if (ptr == end) {
925    *namePtr = 0;
926    return 1;
927  }
928  if (!isSpace(toAscii(enc, ptr, end))) {
929    *nextTokPtr = ptr;
930    return 0;
931  }
932  do {
933    ptr += enc->minBytesPerChar;
934  } while (isSpace(toAscii(enc, ptr, end)));
935  if (ptr == end) {
936    *namePtr = 0;
937    return 1;
938  }
939  *namePtr = ptr;
940  for (;;) {
941    c = toAscii(enc, ptr, end);
942    if (c == -1) {
943      *nextTokPtr = ptr;
944      return 0;
945    }
946    if (c == ASCII_EQUALS) {
947      *nameEndPtr = ptr;
948      break;
949    }
950    if (isSpace(c)) {
951      *nameEndPtr = ptr;
952      do {
953        ptr += enc->minBytesPerChar;
954      } while (isSpace(c = toAscii(enc, ptr, end)));
955      if (c != ASCII_EQUALS) {
956        *nextTokPtr = ptr;
957        return 0;
958      }
959      break;
960    }
961    ptr += enc->minBytesPerChar;
962  }
963  if (ptr == *namePtr) {
964    *nextTokPtr = ptr;
965    return 0;
966  }
967  ptr += enc->minBytesPerChar;
968  c = toAscii(enc, ptr, end);
969  while (isSpace(c)) {
970    ptr += enc->minBytesPerChar;
971    c = toAscii(enc, ptr, end);
972  }
973  if (c != ASCII_QUOT && c != ASCII_APOS) {
974    *nextTokPtr = ptr;
975    return 0;
976  }
977  open = c;
978  ptr += enc->minBytesPerChar;
979  *valPtr = ptr;
980  for (;; ptr += enc->minBytesPerChar) {
981    c = toAscii(enc, ptr, end);
982    if (c == open)
983      break;
984    if (!(ASCII_a <= c && c <= ASCII_z)
985        && !(ASCII_A <= c && c <= ASCII_Z)
986        && !(ASCII_0 <= c && c <= ASCII_9)
987        && c != ASCII_PERIOD
988        && c != ASCII_MINUS
989        && c != ASCII_UNDERSCORE) {
990      *nextTokPtr = ptr;
991      return 0;
992    }
993  }
994  *nextTokPtr = ptr + enc->minBytesPerChar;
995  return 1;
996}
997
998static const char KW_version[] = {
999  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1000};
1001
1002static const char KW_encoding[] = {
1003  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1004};
1005
1006static const char KW_standalone[] = {
1007  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o, ASCII_n, ASCII_e, '\0'
1008};
1009
1010static const char KW_yes[] = {
1011  ASCII_y, ASCII_e, ASCII_s,  '\0'
1012};
1013
1014static const char KW_no[] = {
1015  ASCII_n, ASCII_o,  '\0'
1016};
1017
1018static
1019int doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1020                                                     const char *,
1021                                                     const char *),
1022                   int isGeneralTextEntity,
1023                   const ENCODING *enc,
1024                   const char *ptr,
1025                   const char *end,
1026                   const char **badPtr,
1027                   const char **versionPtr,
1028                   const char **versionEndPtr,
1029                   const char **encodingName,
1030                   const ENCODING **encoding,
1031                   int *standalone)
1032{
1033  const char *val = 0;
1034  const char *name = 0;
1035  const char *nameEnd = 0;
1036  ptr += 5 * enc->minBytesPerChar;
1037  end -= 2 * enc->minBytesPerChar;
1038  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr) || !name) {
1039    *badPtr = ptr;
1040    return 0;
1041  }
1042  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1043    if (!isGeneralTextEntity) {
1044      *badPtr = name;
1045      return 0;
1046    }
1047  }
1048  else {
1049    if (versionPtr)
1050      *versionPtr = val;
1051    if (versionEndPtr)
1052      *versionEndPtr = ptr;
1053    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1054      *badPtr = ptr;
1055      return 0;
1056    }
1057    if (!name) {
1058      if (isGeneralTextEntity) {
1059        /* a TextDecl must have an EncodingDecl */
1060        *badPtr = ptr;
1061        return 0;
1062      }
1063      return 1;
1064    }
1065  }
1066  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1067    int c = toAscii(enc, val, end);
1068    if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1069      *badPtr = val;
1070      return 0;
1071    }
1072    if (encodingName)
1073      *encodingName = val;
1074    if (encoding)
1075      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1076    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1077      *badPtr = ptr;
1078      return 0;
1079    }
1080    if (!name)
1081      return 1;
1082  }
1083  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone) || isGeneralTextEntity) {
1084    *badPtr = name;
1085    return 0;
1086  }
1087  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1088    if (standalone)
1089      *standalone = 1;
1090  }
1091  else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1092    if (standalone)
1093      *standalone = 0;
1094  }
1095  else {
1096    *badPtr = val;
1097    return 0;
1098  }
1099  while (isSpace(toAscii(enc, ptr, end)))
1100    ptr += enc->minBytesPerChar;
1101  if (ptr != end) {
1102    *badPtr = ptr;
1103    return 0;
1104  }
1105  return 1;
1106}
1107
1108static
1109int checkCharRefNumber(int result)
1110{
1111  switch (result >> 8) {
1112  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1113  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1114    return -1;
1115  case 0:
1116    if (latin1_encoding.type[result] == BT_NONXML)
1117      return -1;
1118    break;
1119  case 0xFF:
1120    if (result == 0xFFFE || result == 0xFFFF)
1121      return -1;
1122    break;
1123  }
1124  return result;
1125}
1126
1127int XmlUtf8Encode(int c, char *buf)
1128{
1129  enum {
1130    /* minN is minimum legal resulting value for N byte sequence */
1131    min2 = 0x80,
1132    min3 = 0x800,
1133    min4 = 0x10000
1134  };
1135
1136  if (c < 0)
1137    return 0;
1138  if (c < min2) {
1139    buf[0] = (c | UTF8_cval1);
1140    return 1;
1141  }
1142  if (c < min3) {
1143    buf[0] = ((c >> 6) | UTF8_cval2);
1144    buf[1] = ((c & 0x3f) | 0x80);
1145    return 2;
1146  }
1147  if (c < min4) {
1148    buf[0] = ((c >> 12) | UTF8_cval3);
1149    buf[1] = (((c >> 6) & 0x3f) | 0x80);
1150    buf[2] = ((c & 0x3f) | 0x80);
1151    return 3;
1152  }
1153  if (c < 0x110000) {
1154    buf[0] = ((c >> 18) | UTF8_cval4);
1155    buf[1] = (((c >> 12) & 0x3f) | 0x80);
1156    buf[2] = (((c >> 6) & 0x3f) | 0x80);
1157    buf[3] = ((c & 0x3f) | 0x80);
1158    return 4;
1159  }
1160  return 0;
1161}
1162
/* Encode the character number charNum as UTF-16 into buf.  Returns the
   number of code units written: 1 for values below 0x10000, 2 (a
   surrogate pair) for values below 0x110000, and 0 if charNum is
   negative or too large. */
int XmlUtf16Encode(int charNum, unsigned short *buf)
{
  if (charNum < 0 || charNum >= 0x110000)
    return 0;

  if (charNum < 0x10000) {
    buf[0] = charNum;
    return 1;
  }

  {
    /* 20-bit offset split into two 10-bit halves */
    const int offset = charNum - 0x10000;
    buf[0] = (offset >> 10) + 0xD800;
    buf[1] = (offset & 0x3FF) + 0xDC00;
  }
  return 2;
}
1179
/* Encoding whose character mapping is supplied by the caller.
   `normal` is the first member, so the functions below cast an ENCODING
   pointer to an unknown_encoding pointer to reach the other members. */
1180struct unknown_encoding {
1181  struct normal_encoding normal;
  /* Callback called as convert(userData, p); its int result is used as a
     character number by the unknown_* functions. */
1182  int (*convert)(void *userData, const char *p);
1183  void *userData;
  /* Per-byte UTF-16 table; presumably read by unknown_toUtf16 -- confirm. */
1184  unsigned short utf16[256];
  /* Per-byte UTF-8 table as read by unknown_toUtf8: utf8[b][0] is a byte
     count and the following entries are the bytes to emit; a count of 0
     sends the byte through `convert` instead. */
1185  char utf8[256][4];
1186};
1187
1188int XmlSizeOfUnknownEncoding(void)
1189{
1190  return sizeof(struct unknown_encoding);
1191}
1192
1193static
1194int unknown_isName(const ENCODING *enc, const char *p)
1195{
1196  int c = ((const struct unknown_encoding *)enc)
1197          ->convert(((const struct unknown_encoding *)enc)->userData, p);
1198  if (c & ~0xFFFF)
1199    return 0;
1200  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1201}
1202
1203static
1204int unknown_isNmstrt(const ENCODING *enc, const char *p)
1205{
1206  int c = ((const struct unknown_encoding *)enc)
1207          ->convert(((const struct unknown_encoding *)enc)->userData, p);
1208  if (c & ~0xFFFF)
1209    return 0;
1210  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1211}
1212
1213static
1214int unknown_isInvalid(const ENCODING *enc, const char *p)
1215{
1216  int c = ((const struct unknown_encoding *)enc)
1217           ->convert(((const struct unknown_encoding *)enc)->userData, p);
1218  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1219}
1220
1221static
1222void unknown_toUtf8(const ENCODING *enc,
1223                    const char **fromP, const char *fromLim,
1224                    char **toP, const char *toLim)
1225{
1226  char buf[XML_UTF8_ENCODE_MAX];
1227  for (;;) {
1228    const char *utf8;
1229    int n;
1230    if (*fromP == fromLim)
1231      break;
1232    utf8 = ((const struct unknown_encoding *)enc)->utf8[(unsigned char)**fromP];
1233    n = *utf8++;
1234    if (n == 0) {
1235      int c = ((const struct unknown_encoding *)enc)
1236              ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1237      n = XmlUtf8Encode(c, buf);
1238      if (n > toLim - *toP)
1239        break;
1240      utf8 = buf;
1241      *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1242                 - (BT_LEAD2 - 2);
1243    }
1244    else {
1245      if (n > toLim - *toP)
1246        break;
1247      (*fromP)++;
1248    }
1249    do {
1250      *(*toP)++ = *utf8++;
1251    } while (--n != 0);
1252  }
1253}
1254
1255static
1256void unknown_toUtf16(const ENCODING *enc,
1257                     const char **fromP, const char *fromLim,
1258                     unsigned short **toP, const unsigned short *toLim)
1259{
1260  while (*fromP != fromLim && *toP != toLim) {
1261    unsigned short c
1262      = ((const struct unknown_encoding *)enc)->utf16[(unsigned char)**fromP];
1263    if (c == 0) {
1264      c = (unsigned short)((const struct unknown_encoding *)enc)
1265           ->convert(((const struct unknown_encoding *)enc)->userData, *fromP);
1266      *fromP += ((const struct normal_encoding *)enc)->type[(unsigned char)**fromP]
1267                 - (BT_LEAD2 - 2);
1268    }
1269    else
1270      (*fromP)++;
1271    *(*toP)++ = c;
1272  }
1273}
1274
1275ENCODING *
1276XmlInitUnknownEncoding(void *mem,
1277                       int *table,
1278                       int (*convert)(void *userData, const char *p),
1279                       void *userData)
1280{
1281  int i;
1282  struct unknown_encoding *e = mem;
1283  for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1284    ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1285  for (i = 0; i < 128; i++)
1286    if (latin1_encoding.type[i] != BT_OTHER
1287        && latin1_encoding.type[i] != BT_NONXML
1288        && table[i] != i)
1289      return 0;
1290  for (i = 0; i < 256; i++) {
1291    int c = table[i];
1292    if (c == -1) {
1293      e->normal.type[i] = BT_MALFORM;
1294      /* This shouldn't really get used. */
1295      e->utf16[i] = 0xFFFF;
1296      e->utf8[i][0] = 1;
1297      e->utf8[i][1] = 0;
1298    }
1299    else if (c < 0) {
1300      if (c < -4)
1301        return 0;
1302      e->normal.type[i] = BT_LEAD2 - (c + 2);
1303      e->utf8[i][0] = 0;
1304      e->utf16[i] = 0;
1305    }
1306    else if (c < 0x80) {
1307      if (latin1_encoding.type[c] != BT_OTHER
1308          && latin1_encoding.type[c] != BT_NONXML
1309          && c != i)
1310        return 0;
1311      e->normal.type[i] = latin1_encoding.type[c];
1312      e->utf8[i][0] = 1;
1313      e->utf8[i][1] = (char)c;
1314      e->utf16[i] = c == 0 ? 0xFFFF : c;
1315    }
1316    else if (checkCharRefNumber(c) < 0) {
1317      e->normal.type[i] = BT_NONXML;
1318      /* This shouldn't really get used. */
1319      e->utf16[i] = 0xFFFF;
1320      e->utf8[i][0] = 1;
1321      e->utf8[i][1] = 0;
1322    }
1323    else {
1324      if (c > 0xFFFF)
1325        return 0;
1326      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1327        e->normal.type[i] = BT_NMSTRT;
1328      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1329        e->normal.type[i] = BT_NAME;
1330      else
1331        e->normal.type[i] = BT_OTHER;
1332      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1333      e->utf16[i] = c;
1334    }
1335  }
1336  e->userData = userData;
1337  e->convert = convert;
1338  if (convert) {
1339    e->normal.isName2 = unknown_isName;
1340    e->normal.isName3 = unknown_isName;
1341    e->normal.isName4 = unknown_isName;
1342    e->normal.isNmstrt2 = unknown_isNmstrt;
1343    e->normal.isNmstrt3 = unknown_isNmstrt;
1344    e->normal.isNmstrt4 = unknown_isNmstrt;
1345    e->normal.isInvalid2 = unknown_isInvalid;
1346    e->normal.isInvalid3 = unknown_isInvalid;
1347    e->normal.isInvalid4 = unknown_isInvalid;
1348  }
1349  e->normal.enc.utf8Convert = unknown_toUtf8;
1350  e->normal.enc.utf16Convert = unknown_toUtf16;
1351  return &(e->normal.enc);
1352}
1353
/* Indices of the built-in encodings.
If this enumeration is changed, getEncodingIndex and encodings
must also be changed. */
enum {
  UNKNOWN_ENC    = -1,  /* name not recognized */
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  /* must match encodingNames up to here */
  NO_ENC         = 6    /* no encoding name was given */
};
1367
1368static const char KW_ISO_8859_1[] = {
1369  ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9, ASCII_MINUS, ASCII_1, '\0'
1370};
1371static const char KW_US_ASCII[] = {
1372  ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I, '\0'
1373};
1374static const char KW_UTF_8[] =  {
1375  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
1376};
1377static const char KW_UTF_16[] = {
1378  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
1379};
1380static const char KW_UTF_16BE[] = {
1381  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E, '\0'
1382};
1383static const char KW_UTF_16LE[] = {
1384  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E, '\0'
1385};
1386
1387static
1388int getEncodingIndex(const char *name)
1389{
1390  static const char *encodingNames[] = {
1391    KW_ISO_8859_1,
1392    KW_US_ASCII,
1393    KW_UTF_8,
1394    KW_UTF_16,
1395    KW_UTF_16BE,
1396    KW_UTF_16LE,
1397  };
1398  int i;
1399  if (name == 0)
1400    return NO_ENC;
1401  for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1402    if (streqci(name, encodingNames[i]))
1403      return i;
1404  return UNKNOWN_ENC;
1405}
1406
/* For binary compatibility, we store the index of the encoding specified
at initialization in the isUtf16 member.
INIT_ENC_INDEX reads that index back as an int; SET_INIT_ENC_INDEX stores
i, converted to char.  The argument i is parenthesized so that the cast
applies to the whole expression passed in, not just its first operand. */

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1412
1413/* This is what detects the encoding.
1414encodingTable maps from encoding indices to encodings;
1415INIT_ENC_INDEX(enc) is the index of the external (protocol) specified encoding;
1416state is XML_CONTENT_STATE if we're parsing an external text entity,
1417and XML_PROLOG_STATE otherwise.
1418*/
1419
1420
1421static
1422int initScan(const ENCODING **encodingTable,
1423             const INIT_ENCODING *enc,
1424             int state,
1425             const char *ptr,
1426             const char *end,
1427             const char **nextTokPtr)
1428{
1429  const ENCODING **encPtr;
1430
1431  if (ptr == end)
1432    return XML_TOK_NONE;
1433  encPtr = enc->encPtr;
1434  if (ptr + 1 == end) {
1435    /* only a single byte available for auto-detection */
1436#ifndef XML_DTD /* FIXME */
1437    /* a well-formed document entity must have more than one byte */
1438    if (state != XML_CONTENT_STATE)
1439      return XML_TOK_PARTIAL;
1440#endif
1441    /* so we're parsing an external text entity... */
1442    /* if UTF-16 was externally specified, then we need at least 2 bytes */
1443    switch (INIT_ENC_INDEX(enc)) {
1444    case UTF_16_ENC:
1445    case UTF_16LE_ENC:
1446    case UTF_16BE_ENC:
1447      return XML_TOK_PARTIAL;
1448    }
1449    switch ((unsigned char)*ptr) {
1450    case 0xFE:
1451    case 0xFF:
1452    case 0xEF: /* possibly first byte of UTF-8 BOM */
1453      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1454          && state == XML_CONTENT_STATE)
1455        break;
1456      /* fall through */
1457    case 0x00:
1458    case 0x3C:
1459      return XML_TOK_PARTIAL;
1460    }
1461  }
1462  else {
1463    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1464    case 0xFEFF:
1465      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1466          && state == XML_CONTENT_STATE)
1467        break;
1468      *nextTokPtr = ptr + 2;
1469      *encPtr = encodingTable[UTF_16BE_ENC];
1470      return XML_TOK_BOM;
1471    /* 00 3C is handled in the default case */
1472    case 0x3C00:
1473      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1474           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1475          && state == XML_CONTENT_STATE)
1476        break;
1477      *encPtr = encodingTable[UTF_16LE_ENC];
1478      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1479    case 0xFFFE:
1480      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1481          && state == XML_CONTENT_STATE)
1482        break;
1483      *nextTokPtr = ptr + 2;
1484      *encPtr = encodingTable[UTF_16LE_ENC];
1485      return XML_TOK_BOM;
1486    case 0xEFBB:
1487      /* Maybe a UTF-8 BOM (EF BB BF) */
1488      /* If there's an explicitly specified (external) encoding
1489         of ISO-8859-1 or some flavour of UTF-16
1490         and this is an external text entity,
1491         don't look for the BOM,
1492         because it might be a legal data. */
1493      if (state == XML_CONTENT_STATE) {
1494        int e = INIT_ENC_INDEX(enc);
1495        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC || e == UTF_16LE_ENC || e == UTF_16_ENC)
1496          break;
1497      }
1498      if (ptr + 2 == end)
1499        return XML_TOK_PARTIAL;
1500      if ((unsigned char)ptr[2] == 0xBF) {
1501        *encPtr = encodingTable[UTF_8_ENC];
1502        return XML_TOK_BOM;
1503      }
1504      break;
1505    default:
1506      if (ptr[0] == '\0') {
1507        /* 0 isn't a legal data character. Furthermore a document entity can only
1508           start with ASCII characters.  So the only way this can fail to be big-endian
1509           UTF-16 if it it's an external parsed general entity that's labelled as
1510           UTF-16LE. */
1511        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1512          break;
1513        *encPtr = encodingTable[UTF_16BE_ENC];
1514        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1515      }
1516      else if (ptr[1] == '\0') {
1517        /* We could recover here in the case:
1518            - parsing an external entity
1519            - second byte is 0
1520            - no externally specified encoding
1521            - no encoding declaration
1522           by assuming UTF-16LE.  But we don't, because this would mean when
1523           presented just with a single byte, we couldn't reliably determine
1524           whether we needed further bytes. */
1525        if (state == XML_CONTENT_STATE)
1526          break;
1527        *encPtr = encodingTable[UTF_16LE_ENC];
1528        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1529      }
1530      break;
1531    }
1532  }
1533  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1534  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1535}
1536
1537
1538#define NS(x) x
1539#define ns(x) x
1540#include "xmltok_ns.c"
1541#undef NS
1542#undef ns
1543
1544#ifdef XML_NS
1545
1546#define NS(x) x ## NS
1547#define ns(x) x ## _ns
1548
1549#include "xmltok_ns.c"
1550
1551#undef NS
1552#undef ns
1553
1554ENCODING *
1555XmlInitUnknownEncodingNS(void *mem,
1556                         int *table,
1557                         int (*convert)(void *userData, const char *p),
1558                         void *userData)
1559{
1560  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1561  if (enc)
1562    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1563  return enc;
1564}
1565
1566#endif /* XML_NS */
Note: See TracBrowser for help on using the repository browser.