source: trunk/third/expat/lib/xmltok.c @ 18502

Revision 18502, 40.0 KB checked in by ghudson, 22 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r18501, which included commits to RCS files with non-trunk default branches.
Line 
1/* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
2   See the file COPYING for copying permission.
3*/
4
5#ifdef COMPILED_FROM_DSP
6#include "winconfig.h"
7#elif defined(MACOS_CLASSIC)
8#include "macconfig.h"
9#else
10#include <expat_config.h>
11#endif /* ndef COMPILED_FROM_DSP */
12
13#include "internal.h"
14#include "xmltok.h"
15#include "nametab.h"
16
/* When XML_DTD is defined, the scanner table built by VTABLE1 gets one
   extra trailing entry, PREFIX(ignoreSectionTok); otherwise the macro
   expands to nothing.  The leading comma is part of the expansion so it
   can be appended directly after the previous initializer. */
17#ifdef XML_DTD
18#define IGNORE_SECTION_TOK_VTABLE , PREFIX(ignoreSectionTok)
19#else
20#define IGNORE_SECTION_TOK_VTABLE /* as nothing */
21#endif
22
/* Initializer fragment naming the tokenizer functions of one encoding
   family.  Every name is produced by PREFIX(), so the expansion depends
   on how PREFIX is defined at the point where the macro is finally
   expanded (inside an encoding table initializer), not where it is
   written.  The order of entries must match the member order of ENCODING
   (declared in xmltok.h, not shown here). */
23#define VTABLE1 \
24  { PREFIX(prologTok), PREFIX(contentTok), \
25    PREFIX(cdataSectionTok) IGNORE_SECTION_TOK_VTABLE }, \
26  { PREFIX(attributeValueTok), PREFIX(entityValueTok) }, \
27  PREFIX(sameName), \
28  PREFIX(nameMatchesAscii), \
29  PREFIX(nameLength), \
30  PREFIX(skipS), \
31  PREFIX(getAtts), \
32  PREFIX(charRefNumber), \
33  PREFIX(predefinedEntityName), \
34  PREFIX(updatePosition), \
35  PREFIX(isPublicId)
36
/* VTABLE1 followed by the two converter entries, also named via PREFIX. */
37#define VTABLE VTABLE1, PREFIX(toUtf8), PREFIX(toUtf16)
38
/* Test one bit of namingBitmap for the 16-bit character whose high byte
   is hi and low byte is lo.  pages[hi] selects a group of 8 words, the
   top 3 bits of lo select the word within the group and the bottom 5
   bits of lo select the bit.  The result is zero or a non-zero mask, not
   normalized to 0/1.  Note that pages and hi are not parenthesized in
   the expansion. */
39#define UCS2_GET_NAMING(pages, hi, lo) \
40   (namingBitmap[(pages[hi] << 3) + ((lo) >> 5)] & (1 << ((lo) & 0x1F)))
41
42/* A 2 byte UTF-8 representation splits the character's 11 bits between
43   the bottom 5 and 6 bits of the bytes.  We need 8 bits to index into
44   pages, 3 bits to add to that index and 5 bits to generate the mask.
45*/
/* byte must point to unsigned char; it is evaluated several times. */
46#define UTF8_GET_NAMING2(pages, byte) \
47    (namingBitmap[((pages)[(((byte)[0]) >> 2) & 7] << 3) \
48                      + ((((byte)[0]) & 3) << 1) \
49                      + ((((byte)[1]) >> 5) & 1)] \
50         & (1 << (((byte)[1]) & 0x1F)))
51
52/* A 3 byte UTF-8 representation splits the character's 16 bits between
53   the bottom 4, 6 and 6 bits of the bytes.  We need 8 bits to index
54   into pages, 3 bits to add to that index and 5 bits to generate the
55   mask.
56*/
/* byte must point to unsigned char; it is evaluated several times. */
57#define UTF8_GET_NAMING3(pages, byte) \
58  (namingBitmap[((pages)[((((byte)[0]) & 0xF) << 4) \
59                             + ((((byte)[1]) >> 2) & 0xF)] \
60                       << 3) \
61                      + ((((byte)[1]) & 3) << 1) \
62                      + ((((byte)[2]) >> 5) & 1)] \
63         & (1 << (((byte)[2]) & 0x1F)))
64
/* Dispatch on the sequence length n: 2 and 3 use the lookups above, any
   other length yields 0.  p is cast to const unsigned char * here. */
65#define UTF8_GET_NAMING(pages, p, n) \
66  ((n) == 2 \
67  ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \
68  : ((n) == 3 \
69     ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) \
70     : 0))
71
72/* Detection of invalid UTF-8 sequences is based on Table 3.1B
73   of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/
74   with the additional restriction of not allowing the Unicode
75   code points 0xFFFF and 0xFFFE (sequences EF,BF,BF and EF,BF,BE).
76   Implementation details:
77     (A & 0x80) == 0     means A < 0x80
78   and
79     (A & 0xC0) == 0xC0  means A > 0xBF
80*/
81
/* All three macros expect p to point to unsigned char (the wrappers in
   this file cast before calling), evaluate p several times, and yield
   non-zero when the sequence starting at p is invalid.  The first byte is
   read as (*p) without parentheses around p, so pass a simple
   expression. */

/* 2-byte sequence: lead byte below 0xC2, or second byte outside
   0x80..0xBF. */
82#define UTF8_INVALID2(p) \
83  ((*p) < 0xC2 || ((p)[1] & 0x80) == 0 || ((p)[1] & 0xC0) == 0xC0)
84
/* 3-byte sequence: the third byte must be in 0x80..0xBF (0x80..0xBD after
   EF,BF); the second byte must be in 0xA0..0xBF after E0, in 0x80..0x9F
   after ED, and in 0x80..0xBF otherwise. */
85#define UTF8_INVALID3(p) \
86  (((p)[2] & 0x80) == 0 \
87  || \
88  ((*p) == 0xEF && (p)[1] == 0xBF \
89    ? \
90    (p)[2] > 0xBD \
91    : \
92    ((p)[2] & 0xC0) == 0xC0) \
93  || \
94  ((*p) == 0xE0 \
95    ? \
96    (p)[1] < 0xA0 || ((p)[1] & 0xC0) == 0xC0 \
97    : \
98    ((p)[1] & 0x80) == 0 \
99    || \
100    ((*p) == 0xED ? (p)[1] > 0x9F : ((p)[1] & 0xC0) == 0xC0)))
101
/* 4-byte sequence: third and fourth bytes must be in 0x80..0xBF; the
   second byte must be in 0x90..0xBF after F0, in 0x80..0x8F after F4,
   and in 0x80..0xBF otherwise.  The lead byte itself is not range-checked
   here. */
102#define UTF8_INVALID4(p) \
103  (((p)[3] & 0x80) == 0 || ((p)[3] & 0xC0) == 0xC0 \
104  || \
105  ((p)[2] & 0x80) == 0 || ((p)[2] & 0xC0) == 0xC0 \
106  || \
107  ((*p) == 0xF0 \
108    ? \
109    (p)[1] < 0x90 || ((p)[1] & 0xC0) == 0xC0 \
110    : \
111    ((p)[1] & 0x80) == 0 \
112    || \
113    ((*p) == 0xF4 ? (p)[1] > 0x8F : ((p)[1] & 0xC0) == 0xC0)))
114
115static int FASTCALL
116isNever(const ENCODING *enc, const char *p)
117{
118  return 0;
119}
120
121static int FASTCALL
122utf8_isName2(const ENCODING *enc, const char *p)
123{
124  return UTF8_GET_NAMING2(namePages, (const unsigned char *)p);
125}
126
127static int FASTCALL
128utf8_isName3(const ENCODING *enc, const char *p)
129{
130  return UTF8_GET_NAMING3(namePages, (const unsigned char *)p);
131}
132
133#define utf8_isName4 isNever
134
135static int FASTCALL
136utf8_isNmstrt2(const ENCODING *enc, const char *p)
137{
138  return UTF8_GET_NAMING2(nmstrtPages, (const unsigned char *)p);
139}
140
141static int FASTCALL
142utf8_isNmstrt3(const ENCODING *enc, const char *p)
143{
144  return UTF8_GET_NAMING3(nmstrtPages, (const unsigned char *)p);
145}
146
147#define utf8_isNmstrt4 isNever
148
149static int FASTCALL
150utf8_isInvalid2(const ENCODING *enc, const char *p)
151{
152  return UTF8_INVALID2((const unsigned char *)p);
153}
154
155static int FASTCALL
156utf8_isInvalid3(const ENCODING *enc, const char *p)
157{
158  return UTF8_INVALID3((const unsigned char *)p);
159}
160
161static int FASTCALL
162utf8_isInvalid4(const ENCODING *enc, const char *p)
163{
164  return UTF8_INVALID4((const unsigned char *)p);
165}
166
/* An ENCODING extended with a per-byte classification table and
   per-encoding character predicates.

   enc must remain the first member: code in this file converts an
   ENCODING pointer to a struct normal_encoding pointer by a plain cast
   (see AS_NORMAL_ENCODING and SB_BYTE_TYPE).

   The tables in this file initialize this struct positionally, so the
   member order below must stay in step with STANDARD_VTABLE and
   NORMAL_VTABLE. */
167struct normal_encoding {
168  ENCODING enc;
169  unsigned char type[256];  /* byte type (BT_*) indexed by byte value */
170#ifdef XML_MIN_SIZE
  /* Present only in XML_MIN_SIZE builds, where BYTE_TYPE, BYTE_TO_ASCII,
     CHAR_MATCHES and the *_MINBPC predicates dispatch through these
     pointers instead of being expanded inline. */
171  int (FASTCALL *byteType)(const ENCODING *, const char *);
172  int (FASTCALL *isNameMin)(const ENCODING *, const char *);
173  int (FASTCALL *isNmstrtMin)(const ENCODING *, const char *);
174  int (FASTCALL *byteToAscii)(const ENCODING *, const char *);
175  int (FASTCALL *charMatches)(const ENCODING *, const char *, int);
176#endif /* XML_MIN_SIZE */
  /* Predicates for 2-, 3- and 4-byte characters; reached through
     IS_NAME_CHAR, IS_NMSTRT_CHAR and IS_INVALID_CHAR. */
177  int (FASTCALL *isName2)(const ENCODING *, const char *);
178  int (FASTCALL *isName3)(const ENCODING *, const char *);
179  int (FASTCALL *isName4)(const ENCODING *, const char *);
180  int (FASTCALL *isNmstrt2)(const ENCODING *, const char *);
181  int (FASTCALL *isNmstrt3)(const ENCODING *, const char *);
182  int (FASTCALL *isNmstrt4)(const ENCODING *, const char *);
183  int (FASTCALL *isInvalid2)(const ENCODING *, const char *);
184  int (FASTCALL *isInvalid3)(const ENCODING *, const char *);
185  int (FASTCALL *isInvalid4)(const ENCODING *, const char *);
186};
187
/* View an ENCODING pointer as the struct normal_encoding that contains
   it.  Only valid for ENCODING objects that are the first member of a
   struct normal_encoding. */
188#define AS_NORMAL_ENCODING(enc)   ((const struct normal_encoding *) (enc))
189
190#ifdef XML_MIN_SIZE
191
/* Initializers for the five XML_MIN_SIZE-only members of struct
   normal_encoding, named by pasting the prefix E onto the member names.
   The expansion ends with a comma so that NORMAL_VTABLE (or the closing
   brace) can follow directly. */
192#define STANDARD_VTABLE(E) \
193 E ## byteType, \
194 E ## isNameMin, \
195 E ## isNmstrtMin, \
196 E ## byteToAscii, \
197 E ## charMatches,
198
199#else
200
201#define STANDARD_VTABLE(E) /* as nothing */
202
203#endif
204
/* Initializers for the nine multi-byte predicate members of struct
   normal_encoding, named by pasting the prefix E onto the member names.
   Tables that omit this macro leave those members zero-initialized. */
205#define NORMAL_VTABLE(E) \
206 E ## isName2, \
207 E ## isName3, \
208 E ## isName4, \
209 E ## isNmstrt2, \
210 E ## isNmstrt3, \
211 E ## isNmstrt4, \
212 E ## isInvalid2, \
213 E ## isInvalid3, \
214 E ## isInvalid4
215
/* Declared ahead of the xmltok_impl.c inclusions further down; the
   definition appears later in this file. */
216static int FASTCALL checkCharRefNumber(int);
217
218#include "xmltok_impl.h"
219#include "ascii.h"
220
/* In XML_MIN_SIZE builds the single-byte ("sb_") tables use isNever for
   the two minimum-width predicates, i.e. they always answer 0. */
221#ifdef XML_MIN_SIZE
222#define sb_isNameMin isNever
223#define sb_isNmstrtMin isNever
224#endif
225
/* MINBPC for the normal_ instantiation of xmltok_impl.c below: read from
   the encoding at run time in XML_MIN_SIZE builds, otherwise the
   constant 1. */
226#ifdef XML_MIN_SIZE
227#define MINBPC(enc) ((enc)->minBytesPerChar)
228#else
229/* minimum bytes per character */
230#define MINBPC(enc) 1
231#endif
232
/* Byte type of the single byte at p: the type[] entry of the enclosing
   struct normal_encoding, indexed by the byte taken as unsigned char. */
233#define SB_BYTE_TYPE(enc, p) \
234  (((struct normal_encoding *)(enc))->type[(unsigned char)*(p)])
235
/* BYTE_TYPE for the normal_ instantiation: an indirect call through the
   byteType member in XML_MIN_SIZE builds (sb_byteType is the single-byte
   implementation), a direct table lookup otherwise. */
236#ifdef XML_MIN_SIZE
237static int FASTCALL
238sb_byteType(const ENCODING *enc, const char *p)
239{
240  return SB_BYTE_TYPE(enc, p);
241}
242#define BYTE_TYPE(enc, p) \
243 (AS_NORMAL_ENCODING(enc)->byteType(enc, p))
244#else
245#define BYTE_TYPE(enc, p) SB_BYTE_TYPE(enc, p)
246#endif
247
/* BYTE_TO_ASCII for the normal_ instantiation: an indirect call through
   the byteToAscii member in XML_MIN_SIZE builds, otherwise the byte at p
   itself.  In both forms the byte is read as plain char, so the result
   for bytes >= 0x80 depends on the signedness of char. */
248#ifdef XML_MIN_SIZE
249#define BYTE_TO_ASCII(enc, p) \
250 (AS_NORMAL_ENCODING(enc)->byteToAscii(enc, p))
251static int FASTCALL
252sb_byteToAscii(const ENCODING *enc, const char *p)
253{
254  return *p;
255}
256#else
257#define BYTE_TO_ASCII(enc, p) (*(p))
258#endif
259
/* Multi-byte character predicates for the normal_ instantiation.  n is
   pasted onto the member name, so it must be a literal 2, 3 or 4; the
   call goes through the matching function pointer in struct
   normal_encoding. */
260#define IS_NAME_CHAR(enc, p, n) \
261 (AS_NORMAL_ENCODING(enc)->isName ## n(enc, p))
262#define IS_NMSTRT_CHAR(enc, p, n) \
263 (AS_NORMAL_ENCODING(enc)->isNmstrt ## n(enc, p))
264#define IS_INVALID_CHAR(enc, p, n) \
265 (AS_NORMAL_ENCODING(enc)->isInvalid ## n(enc, p))
266
/* Predicates for a character of minimum width: dispatched through the
   encoding in XML_MIN_SIZE builds, constant 0 otherwise. */
267#ifdef XML_MIN_SIZE
268#define IS_NAME_CHAR_MINBPC(enc, p) \
269 (AS_NORMAL_ENCODING(enc)->isNameMin(enc, p))
270#define IS_NMSTRT_CHAR_MINBPC(enc, p) \
271 (AS_NORMAL_ENCODING(enc)->isNmstrtMin(enc, p))
272#else
273#define IS_NAME_CHAR_MINBPC(enc, p) (0)
274#define IS_NMSTRT_CHAR_MINBPC(enc, p) (0)
275#endif
276
/* CHAR_MATCHES for the normal_ instantiation: true when the byte at p
   equals c.  Dispatched through the charMatches member in XML_MIN_SIZE
   builds, expanded inline otherwise.  In the inline form c is not
   parenthesized, so pass a simple expression. */
277#ifdef XML_MIN_SIZE
278#define CHAR_MATCHES(enc, p, c) \
279 (AS_NORMAL_ENCODING(enc)->charMatches(enc, p, c))
280static int FASTCALL
281sb_charMatches(const ENCODING *enc, const char *p, int c)
282{
283  return *p == c;
284}
285#else
286/* c is an ASCII character */
287#define CHAR_MATCHES(enc, p, c) (*(p) == c)
288#endif
289
/* Instantiate the generic tokenizer with every function named normal_*,
   using the macro definitions set up above. */
290#define PREFIX(ident) normal_ ## ident
291#include "xmltok_impl.c"
292
/* Drop the per-instantiation macros so later sections can redefine them.
   PREFIX itself stays defined as normal_ at this point. */
293#undef MINBPC
294#undef BYTE_TYPE
295#undef BYTE_TO_ASCII
296#undef CHAR_MATCHES
297#undef IS_NAME_CHAR
298#undef IS_NAME_CHAR_MINBPC
299#undef IS_NMSTRT_CHAR
300#undef IS_NMSTRT_CHAR_MINBPC
301#undef IS_INVALID_CHAR
302
/* Marker bits OR-ed into the first byte of an encoded UTF-8 sequence by
   the encoders in this file (latin1_toUtf8, the UTF-16 converters and
   XmlUtf8Encode). */
303enum {  /* UTF8_cvalN is value of masked first byte of N byte sequence */
304  UTF8_cval1 = 0x00,
305  UTF8_cval2 = 0xc0,
306  UTF8_cval3 = 0xe0,
307  UTF8_cval4 = 0xf0
308};
309
310static void FASTCALL
311utf8_toUtf8(const ENCODING *enc,
312            const char **fromP, const char *fromLim,
313            char **toP, const char *toLim)
314{
315  char *to;
316  const char *from;
317  if (fromLim - *fromP > toLim - *toP) {
318    /* Avoid copying partial characters. */
319    for (fromLim = *fromP + (toLim - *toP); fromLim > *fromP; fromLim--)
320      if (((unsigned char)fromLim[-1] & 0xc0) != 0x80)
321        break;
322  }
323  for (to = *toP, from = *fromP; from != fromLim; from++, to++)
324    *to = *from;
325  *fromP = from;
326  *toP = to;
327}
328
329static void FASTCALL
330utf8_toUtf16(const ENCODING *enc,
331             const char **fromP, const char *fromLim,
332             unsigned short **toP, const unsigned short *toLim)
333{
334  unsigned short *to = *toP;
335  const char *from = *fromP;
336  while (from != fromLim && to != toLim) {
337    switch (((struct normal_encoding *)enc)->type[(unsigned char)*from]) {
338    case BT_LEAD2:
339      *to++ = (unsigned short)(((from[0] & 0x1f) << 6) | (from[1] & 0x3f));
340      from += 2;
341      break;
342    case BT_LEAD3:
343      *to++ = (unsigned short)(((from[0] & 0xf) << 12)
344                               | ((from[1] & 0x3f) << 6) | (from[2] & 0x3f));
345      from += 3;
346      break;
347    case BT_LEAD4:
348      {
349        unsigned long n;
350        if (to + 1 == toLim)
351          goto after;
352        n = ((from[0] & 0x7) << 18) | ((from[1] & 0x3f) << 12)
353            | ((from[2] & 0x3f) << 6) | (from[3] & 0x3f);
354        n -= 0x10000;
355        to[0] = (unsigned short)((n >> 10) | 0xD800);
356        to[1] = (unsigned short)((n & 0x3FF) | 0xDC00);
357        to += 2;
358        from += 4;
359      }
360      break;
361    default:
362      *to++ = *from++;
363      break;
364    }
365  }
366after:
367  *fromP = from;
368  *toP = to;
369}
370
/* The four UTF-8 encoding tables.  Each initializes, in order: the
   embedded ENCODING (VTABLE1 expanded with the PREFIX in force here, the
   two UTF-8 converters, then three integers - presumably
   minBytesPerChar, isUtf8 and isUtf16 as declared in xmltok.h; confirm),
   the 256-entry type[] table assembled from two included headers, and the
   function pointers from STANDARD_VTABLE/NORMAL_VTABLE.

   The *_ns variants (XML_NS builds only) include the ASCII table as is.
   The other variants define BT_COLON as BT_NMSTRT while including it, so
   every entry written as BT_COLON in that header becomes BT_NMSTRT.
   The internal_* variants use iasciitab.h in place of asciitab.h. */
371#ifdef XML_NS
372static const struct normal_encoding utf8_encoding_ns = {
373  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
374  {
375#include "asciitab.h"
376#include "utf8tab.h"
377  },
378  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
379};
380#endif
381
382static const struct normal_encoding utf8_encoding = {
383  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
384  {
385#define BT_COLON BT_NMSTRT
386#include "asciitab.h"
387#undef BT_COLON
388#include "utf8tab.h"
389  },
390  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
391};
392
393#ifdef XML_NS
394
395static const struct normal_encoding internal_utf8_encoding_ns = {
396  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
397  {
398#include "iasciitab.h"
399#include "utf8tab.h"
400  },
401  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
402};
403
404#endif
405
406static const struct normal_encoding internal_utf8_encoding = {
407  { VTABLE1, utf8_toUtf8, utf8_toUtf16, 1, 1, 0 },
408  {
409#define BT_COLON BT_NMSTRT
410#include "iasciitab.h"
411#undef BT_COLON
412#include "utf8tab.h"
413  },
414  STANDARD_VTABLE(sb_) NORMAL_VTABLE(utf8_)
415};
416
417static void FASTCALL
418latin1_toUtf8(const ENCODING *enc,
419              const char **fromP, const char *fromLim,
420              char **toP, const char *toLim)
421{
422  for (;;) {
423    unsigned char c;
424    if (*fromP == fromLim)
425      break;
426    c = (unsigned char)**fromP;
427    if (c & 0x80) {
428      if (toLim - *toP < 2)
429        break;
430      *(*toP)++ = (char)((c >> 6) | UTF8_cval2);
431      *(*toP)++ = (char)((c & 0x3f) | 0x80);
432      (*fromP)++;
433    }
434    else {
435      if (*toP == toLim)
436        break;
437      *(*toP)++ = *(*fromP)++;
438    }
439  }
440}
441
442static void FASTCALL
443latin1_toUtf16(const ENCODING *enc,
444               const char **fromP, const char *fromLim,
445               unsigned short **toP, const unsigned short *toLim)
446{
447  while (*fromP != fromLim && *toP != toLim)
448    *(*toP)++ = (unsigned char)*(*fromP)++;
449}
450
/* Latin-1 encoding tables.  The type[] table is the ASCII table followed
   by latin1tab.h.  Only STANDARD_VTABLE is supplied, so the multi-byte
   predicate pointers of struct normal_encoding stay zero-initialized.
   The three integers after the converters are presumably
   minBytesPerChar, isUtf8 and isUtf16 (see xmltok.h; confirm).

   As with the UTF-8 tables, the _ns variant (XML_NS builds only)
   includes asciitab.h unmodified while the plain variant maps BT_COLON
   to BT_NMSTRT during the include. */
451#ifdef XML_NS
452
453static const struct normal_encoding latin1_encoding_ns = {
454  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
455  {
456#include "asciitab.h"
457#include "latin1tab.h"
458  },
459  STANDARD_VTABLE(sb_)
460};
461
462#endif
463
464static const struct normal_encoding latin1_encoding = {
465  { VTABLE1, latin1_toUtf8, latin1_toUtf16, 1, 0, 0 },
466  {
467#define BT_COLON BT_NMSTRT
468#include "asciitab.h"
469#undef BT_COLON
470#include "latin1tab.h"
471  },
472  STANDARD_VTABLE(sb_)
473};
474
475static void FASTCALL
476ascii_toUtf8(const ENCODING *enc,
477             const char **fromP, const char *fromLim,
478             char **toP, const char *toLim)
479{
480  while (*fromP != fromLim && *toP != toLim)
481    *(*toP)++ = *(*fromP)++;
482}
483
/* US-ASCII encoding tables.  Only the ASCII half of type[] is listed;
   the remaining entries are zero-initialized, which the existing comment
   below identifies with BT_NONXML.  The UTF-16 converter is shared with
   Latin-1.  Only STANDARD_VTABLE is supplied, so the multi-byte predicate
   pointers stay zero-initialized.

   The _ns variant (XML_NS builds only) includes asciitab.h unmodified;
   the plain variant maps BT_COLON to BT_NMSTRT during the include. */
484#ifdef XML_NS
485
486static const struct normal_encoding ascii_encoding_ns = {
487  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
488  {
489#include "asciitab.h"
490/* BT_NONXML == 0 */
491  },
492  STANDARD_VTABLE(sb_)
493};
494
495#endif
496
497static const struct normal_encoding ascii_encoding = {
498  { VTABLE1, ascii_toUtf8, latin1_toUtf16, 1, 1, 0 },
499  {
500#define BT_COLON BT_NMSTRT
501#include "asciitab.h"
502#undef BT_COLON
503/* BT_NONXML == 0 */
504  },
505  STANDARD_VTABLE(sb_)
506};
507
508static int FASTCALL
509unicode_byte_type(char hi, char lo)
510{
511  switch ((unsigned char)hi) {
512  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
513    return BT_LEAD4;
514  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
515    return BT_TRAIL;
516  case 0xFF:
517    switch ((unsigned char)lo) {
518    case 0xFF:
519    case 0xFE:
520      return BT_NONXML;
521    }
522    break;
523  }
524  return BT_NONASCII;
525}
526
/* Define E##toUtf8: convert 16-bit units from *fromP (up to fromLim) to
   UTF-8 at *toP (buffer ends at toLim).  GET_LO/GET_HI must be defined
   for the desired byte order where the macro is expanded.

   Units with high byte 0 and low byte < 0x80 produce 1 byte; other units
   below 0x800 produce 2 bytes; units with high byte 0xD8..0xDB are
   combined with the following unit into a 4-byte sequence; all remaining
   units produce 3 bytes.  When the next character does not fit in the
   output, *fromP is set to that character and the function returns;
   otherwise *fromP is set when the input is exhausted.

   NOTE(review): the loop steps by 2 and terminates on from == fromLim,
   and the surrogate case reads the following unit without comparing
   against fromLim, so the caller must pass an even number of bytes that
   does not end in the middle of a surrogate pair.
   NOTE(review): the second unit of a pair is not checked to be in
   0xDC..0xDF, and such a unit appearing on its own falls into the 3-byte
   default case. */
527#define DEFINE_UTF16_TO_UTF8(E) \
528static void  FASTCALL \
529E ## toUtf8(const ENCODING *enc, \
530            const char **fromP, const char *fromLim, \
531            char **toP, const char *toLim) \
532{ \
533  const char *from; \
534  for (from = *fromP; from != fromLim; from += 2) { \
535    int plane; \
536    unsigned char lo2; \
537    unsigned char lo = GET_LO(from); \
538    unsigned char hi = GET_HI(from); \
539    switch (hi) { \
540    case 0: \
541      if (lo < 0x80) { \
542        if (*toP == toLim) { \
543          *fromP = from; \
544          return; \
545        } \
546        *(*toP)++ = lo; \
547        break; \
548      } \
549      /* fall through */ \
550    case 0x1: case 0x2: case 0x3: \
551    case 0x4: case 0x5: case 0x6: case 0x7: \
552      if (toLim -  *toP < 2) { \
553        *fromP = from; \
554        return; \
555      } \
556      *(*toP)++ = ((lo >> 6) | (hi << 2) |  UTF8_cval2); \
557      *(*toP)++ = ((lo & 0x3f) | 0x80); \
558      break; \
559    default: \
560      if (toLim -  *toP < 3)  { \
561        *fromP = from; \
562        return; \
563      } \
564      /* 16 bits divided 4, 6, 6 amongst 3 bytes */ \
565      *(*toP)++ = ((hi >> 4) | UTF8_cval3); \
566      *(*toP)++ = (((hi & 0xf) << 2) | (lo >> 6) | 0x80); \
567      *(*toP)++ = ((lo & 0x3f) | 0x80); \
568      break; \
569    case 0xD8: case 0xD9: case 0xDA: case 0xDB: \
570      if (toLim -  *toP < 4) { \
571        *fromP = from; \
572        return; \
573      } \
574      plane = (((hi & 0x3) << 2) | ((lo >> 6) & 0x3)) + 1; \
575      *(*toP)++ = ((plane >> 2) | UTF8_cval4); \
576      *(*toP)++ = (((lo >> 2) & 0xF) | ((plane & 0x3) << 4) | 0x80); \
577      from += 2; \
578      lo2 = GET_LO(from); \
579      *(*toP)++ = (((lo & 0x3) << 4) \
580                   | ((GET_HI(from) & 0x3) << 2) \
581                   | (lo2 >> 6) \
582                   | 0x80); \
583      *(*toP)++ = ((lo2 & 0x3f) | 0x80); \
584      break; \
585    } \
586  } \
587  *fromP = from; \
588}
589
/* Define E##toUtf16: copy 16-bit units from the byte stream at *fromP
   (up to fromLim) into native unsigned short values at *toP (buffer ends
   at toLim), assembling each unit from GET_HI/GET_LO, which must be
   defined for the desired byte order where the macro is expanded.
   Copying stops when either the input or the output space runs out.

   If there is more input than output space and the final input unit has
   a high byte in 0xD8..0xDF, that unit is dropped from the range first.

   NOTE(review): that test inspects the last unit of the input, not the
   last unit that will fit in the output - confirm this is the intended
   way to avoid splitting a surrogate pair. */
590#define DEFINE_UTF16_TO_UTF16(E) \
591static void  FASTCALL \
592E ## toUtf16(const ENCODING *enc, \
593             const char **fromP, const char *fromLim, \
594             unsigned short **toP, const unsigned short *toLim) \
595{ \
596  /* Avoid copying first half only of surrogate */ \
597  if (fromLim - *fromP > ((toLim - *toP) << 1) \
598      && (GET_HI(fromLim - 2) & 0xF8) == 0xD8) \
599    fromLim -= 2; \
600  for (; *fromP != fromLim && *toP != toLim; *fromP += 2) \
601    *(*toP)++ = (GET_HI(*fromP) << 8) | GET_LO(*fromP); \
602}
603
/* Little-endian accessors: byte 0 is the low byte, byte 1 the high byte.
   They are in force only while the little2_ converters are generated.
   SET2 is defined and undefined here but not used by either DEFINE_
   macro. */
604#define SET2(ptr, ch) \
605  (((ptr)[0] = ((ch) & 0xff)), ((ptr)[1] = ((ch) >> 8)))
606#define GET_LO(ptr) ((unsigned char)(ptr)[0])
607#define GET_HI(ptr) ((unsigned char)(ptr)[1])
608
/* Generates little2_toUtf8 and little2_toUtf16. */
609DEFINE_UTF16_TO_UTF8(little2_)
610DEFINE_UTF16_TO_UTF16(little2_)
611
612#undef SET2
613#undef GET_LO
614#undef GET_HI
615
/* Big-endian accessors: byte 0 is the high byte, byte 1 the low byte. */
616#define SET2(ptr, ch) \
617  (((ptr)[0] = ((ch) >> 8)), ((ptr)[1] = ((ch) & 0xFF)))
618#define GET_LO(ptr) ((unsigned char)(ptr)[1])
619#define GET_HI(ptr) ((unsigned char)(ptr)[0])
620
/* Generates big2_toUtf8 and big2_toUtf16. */
621DEFINE_UTF16_TO_UTF8(big2_)
622DEFINE_UTF16_TO_UTF16(big2_)
623
624#undef SET2
625#undef GET_LO
626#undef GET_HI
627
/* Character primitives for 2-byte little-endian input: p[0] is the low
   byte and p[1] the high byte.

   BYTE_TYPE: when the high byte is 0 the type[] table is indexed by the
   low byte, otherwise unicode_byte_type classifies the unit.
   BYTE_TO_ASCII: the low byte when the high byte is 0, otherwise -1.
   CHAR_MATCHES: true when the high byte is 0 and the low byte equals c.
   IS_*_CHAR_MINBPC: naming-bitmap lookups on (high byte, low byte).

   In BYTE_TO_ASCII and CHAR_MATCHES the low byte is read as plain char;
   c is not parenthesized in CHAR_MATCHES and p is not parenthesized in
   the two IS_* macros. */
628#define LITTLE2_BYTE_TYPE(enc, p) \
629 ((p)[1] == 0 \
630  ? ((struct normal_encoding *)(enc))->type[(unsigned char)*(p)] \
631  : unicode_byte_type((p)[1], (p)[0]))
632#define LITTLE2_BYTE_TO_ASCII(enc, p) ((p)[1] == 0 ? (p)[0] : -1)
633#define LITTLE2_CHAR_MATCHES(enc, p, c) ((p)[1] == 0 && (p)[0] == c)
634#define LITTLE2_IS_NAME_CHAR_MINBPC(enc, p) \
635  UCS2_GET_NAMING(namePages, (unsigned char)p[1], (unsigned char)p[0])
636#define LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
637  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[1], (unsigned char)p[0])
638
/* Two ways of providing the little-endian tokenizer.

   XML_MIN_SIZE: xmltok_impl.c is not instantiated again.  The LITTLE2_*
   primitives are wrapped in functions that the little2 tables install
   via STANDARD_VTABLE(little2_), and VTABLE is redefined so that those
   tables use VTABLE1 (still expanded with the PREFIX in force, which is
   normal_ at this point) together with the little2_ converters.

   Otherwise: PREFIX becomes little2_, the primitives are bound to the
   LITTLE2_* macros with MINBPC fixed at 2, and xmltok_impl.c is included
   a second time to generate a dedicated little2_* tokenizer. */
639#ifdef XML_MIN_SIZE
640
641static int FASTCALL
642little2_byteType(const ENCODING *enc, const char *p)
643{
644  return LITTLE2_BYTE_TYPE(enc, p);
645}
646
647static int FASTCALL
648little2_byteToAscii(const ENCODING *enc, const char *p)
649{
650  return LITTLE2_BYTE_TO_ASCII(enc, p);
651}
652
653static int FASTCALL
654little2_charMatches(const ENCODING *enc, const char *p, int c)
655{
656  return LITTLE2_CHAR_MATCHES(enc, p, c);
657}
658
659static int FASTCALL
660little2_isNameMin(const ENCODING *enc, const char *p)
661{
662  return LITTLE2_IS_NAME_CHAR_MINBPC(enc, p);
663}
664
665static int FASTCALL
666little2_isNmstrtMin(const ENCODING *enc, const char *p)
667{
668  return LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p);
669}
670
671#undef VTABLE
672#define VTABLE VTABLE1, little2_toUtf8, little2_toUtf16
673
674#else /* not XML_MIN_SIZE */
675
676#undef PREFIX
677#define PREFIX(ident) little2_ ## ident
678#define MINBPC(enc) 2
679/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
680#define BYTE_TYPE(enc, p) LITTLE2_BYTE_TYPE(enc, p)
681#define BYTE_TO_ASCII(enc, p) LITTLE2_BYTE_TO_ASCII(enc, p)
682#define CHAR_MATCHES(enc, p, c) LITTLE2_CHAR_MATCHES(enc, p, c)
683#define IS_NAME_CHAR(enc, p, n) 0
684#define IS_NAME_CHAR_MINBPC(enc, p) LITTLE2_IS_NAME_CHAR_MINBPC(enc, p)
685#define IS_NMSTRT_CHAR(enc, p, n) (0)
686#define IS_NMSTRT_CHAR_MINBPC(enc, p) LITTLE2_IS_NMSTRT_CHAR_MINBPC(enc, p)
687
/* NOTE(review): IS_INVALID_CHAR is not defined in this section although
   it is #undef'd below; presumably xmltok_impl.c supplies a default when
   it is missing - confirm. */
688#include "xmltok_impl.c"
689
690#undef MINBPC
691#undef BYTE_TYPE
692#undef BYTE_TO_ASCII
693#undef CHAR_MATCHES
694#undef IS_NAME_CHAR
695#undef IS_NAME_CHAR_MINBPC
696#undef IS_NMSTRT_CHAR
697#undef IS_NMSTRT_CHAR_MINBPC
698#undef IS_INVALID_CHAR
699
700#endif /* not XML_MIN_SIZE */
701
/* Little-endian 2-byte encoding tables.  After VTABLE come three
   integers: 2, 0, and a flag that is 1 when BYTEORDER == 1234 and 0
   otherwise (presumably minBytesPerChar, isUtf8 and isUtf16 as declared
   in xmltok.h; confirm).  type[] is the ASCII table followed by
   latin1tab.h and is consulted only for units whose high byte is 0 (see
   LITTLE2_BYTE_TYPE).

   The _ns variants exist only in XML_NS builds and include the ASCII
   table unmodified; the others map BT_COLON to BT_NMSTRT during the
   include.  The internal_* variants use iasciitab.h, set the last flag to
   1 unconditionally, and are compiled only when BYTEORDER != 4321. */
702#ifdef XML_NS
703
704static const struct normal_encoding little2_encoding_ns = {
705  { VTABLE, 2, 0,
706#if BYTEORDER == 1234
707    1
708#else
709    0
710#endif
711  },
712  {
713#include "asciitab.h"
714#include "latin1tab.h"
715  },
716  STANDARD_VTABLE(little2_)
717};
718
719#endif
720
721static const struct normal_encoding little2_encoding = {
722  { VTABLE, 2, 0,
723#if BYTEORDER == 1234
724    1
725#else
726    0
727#endif
728  },
729  {
730#define BT_COLON BT_NMSTRT
731#include "asciitab.h"
732#undef BT_COLON
733#include "latin1tab.h"
734  },
735  STANDARD_VTABLE(little2_)
736};
737
738#if BYTEORDER != 4321
739
740#ifdef XML_NS
741
742static const struct normal_encoding internal_little2_encoding_ns = {
743  { VTABLE, 2, 0, 1 },
744  {
745#include "iasciitab.h"
746#include "latin1tab.h"
747  },
748  STANDARD_VTABLE(little2_)
749};
750
751#endif
752
753static const struct normal_encoding internal_little2_encoding = {
754  { VTABLE, 2, 0, 1 },
755  {
756#define BT_COLON BT_NMSTRT
757#include "iasciitab.h"
758#undef BT_COLON
759#include "latin1tab.h"
760  },
761  STANDARD_VTABLE(little2_)
762};
763
764#endif
765
766
/* Character primitives for 2-byte big-endian input: p[0] is the high
   byte and p[1] the low byte.

   BYTE_TYPE: when the high byte is 0 the type[] table is indexed by the
   low byte, otherwise unicode_byte_type classifies the unit.
   BYTE_TO_ASCII: the low byte when the high byte is 0, otherwise -1.
   CHAR_MATCHES: true when the high byte is 0 and the low byte equals c.
   IS_*_CHAR_MINBPC: naming-bitmap lookups on (high byte, low byte).

   In BYTE_TO_ASCII and CHAR_MATCHES the low byte is read as plain char;
   c is not parenthesized in CHAR_MATCHES and p is not parenthesized in
   the two IS_* macros. */
767#define BIG2_BYTE_TYPE(enc, p) \
768 ((p)[0] == 0 \
769  ? ((struct normal_encoding *)(enc))->type[(unsigned char)(p)[1]] \
770  : unicode_byte_type((p)[0], (p)[1]))
771#define BIG2_BYTE_TO_ASCII(enc, p) ((p)[0] == 0 ? (p)[1] : -1)
772#define BIG2_CHAR_MATCHES(enc, p, c) ((p)[0] == 0 && (p)[1] == c)
773#define BIG2_IS_NAME_CHAR_MINBPC(enc, p) \
774  UCS2_GET_NAMING(namePages, (unsigned char)p[0], (unsigned char)p[1])
775#define BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p) \
776  UCS2_GET_NAMING(nmstrtPages, (unsigned char)p[0], (unsigned char)p[1])
777
/* Two ways of providing the big-endian tokenizer.

   XML_MIN_SIZE: xmltok_impl.c is not instantiated again.  The BIG2_*
   primitives are wrapped in functions that the big2 tables install via
   STANDARD_VTABLE(big2_), and VTABLE is redefined so that those tables
   use VTABLE1 together with the big2_ converters.

   Otherwise: PREFIX becomes big2_, the primitives are bound to the BIG2_*
   macros with MINBPC fixed at 2, and xmltok_impl.c is included again to
   generate a dedicated big2_* tokenizer. */
778#ifdef XML_MIN_SIZE
779
780static int FASTCALL
781big2_byteType(const ENCODING *enc, const char *p)
782{
783  return BIG2_BYTE_TYPE(enc, p);
784}
785
786static int FASTCALL
787big2_byteToAscii(const ENCODING *enc, const char *p)
788{
789  return BIG2_BYTE_TO_ASCII(enc, p);
790}
791
792static int FASTCALL
793big2_charMatches(const ENCODING *enc, const char *p, int c)
794{
795  return BIG2_CHAR_MATCHES(enc, p, c);
796}
797
798static int FASTCALL
799big2_isNameMin(const ENCODING *enc, const char *p)
800{
801  return BIG2_IS_NAME_CHAR_MINBPC(enc, p);
802}
803
804static int FASTCALL
805big2_isNmstrtMin(const ENCODING *enc, const char *p)
806{
807  return BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p);
808}
809
810#undef VTABLE
811#define VTABLE VTABLE1, big2_toUtf8, big2_toUtf16
812
813#else /* not XML_MIN_SIZE */
814
815#undef PREFIX
816#define PREFIX(ident) big2_ ## ident
817#define MINBPC(enc) 2
818/* CHAR_MATCHES is guaranteed to have MINBPC bytes available. */
819#define BYTE_TYPE(enc, p) BIG2_BYTE_TYPE(enc, p)
820#define BYTE_TO_ASCII(enc, p) BIG2_BYTE_TO_ASCII(enc, p)
821#define CHAR_MATCHES(enc, p, c) BIG2_CHAR_MATCHES(enc, p, c)
822#define IS_NAME_CHAR(enc, p, n) 0
823#define IS_NAME_CHAR_MINBPC(enc, p) BIG2_IS_NAME_CHAR_MINBPC(enc, p)
824#define IS_NMSTRT_CHAR(enc, p, n) (0)
825#define IS_NMSTRT_CHAR_MINBPC(enc, p) BIG2_IS_NMSTRT_CHAR_MINBPC(enc, p)
826
/* NOTE(review): IS_INVALID_CHAR is not defined in this section although
   it is #undef'd below; presumably xmltok_impl.c supplies a default when
   it is missing - confirm. */
827#include "xmltok_impl.c"
828
829#undef MINBPC
830#undef BYTE_TYPE
831#undef BYTE_TO_ASCII
832#undef CHAR_MATCHES
833#undef IS_NAME_CHAR
834#undef IS_NAME_CHAR_MINBPC
835#undef IS_NMSTRT_CHAR
836#undef IS_NMSTRT_CHAR_MINBPC
837#undef IS_INVALID_CHAR
838
839#endif /* not XML_MIN_SIZE */
840
/* Big-endian 2-byte encoding tables.  After VTABLE come three integers:
   2, 0, and a flag that is 1 when BYTEORDER == 4321 and 0 otherwise
   (presumably minBytesPerChar, isUtf8 and isUtf16 as declared in
   xmltok.h; confirm).  type[] is the ASCII table followed by latin1tab.h
   and is consulted only for units whose high byte is 0 (see
   BIG2_BYTE_TYPE).

   The _ns variants exist only in XML_NS builds and include the ASCII
   table unmodified; the others map BT_COLON to BT_NMSTRT during the
   include.  The internal_* variants use iasciitab.h, set the last flag to
   1 unconditionally, and are compiled only when BYTEORDER != 1234. */
841#ifdef XML_NS
842
843static const struct normal_encoding big2_encoding_ns = {
844  { VTABLE, 2, 0,
845#if BYTEORDER == 4321
846  1
847#else
848  0
849#endif
850  },
851  {
852#include "asciitab.h"
853#include "latin1tab.h"
854  },
855  STANDARD_VTABLE(big2_)
856};
857
858#endif
859
860static const struct normal_encoding big2_encoding = {
861  { VTABLE, 2, 0,
862#if BYTEORDER == 4321
863  1
864#else
865  0
866#endif
867  },
868  {
869#define BT_COLON BT_NMSTRT
870#include "asciitab.h"
871#undef BT_COLON
872#include "latin1tab.h"
873  },
874  STANDARD_VTABLE(big2_)
875};
876
877#if BYTEORDER != 1234
878
879#ifdef XML_NS
880
881static const struct normal_encoding internal_big2_encoding_ns = {
882  { VTABLE, 2, 0, 1 },
883  {
884#include "iasciitab.h"
885#include "latin1tab.h"
886  },
887  STANDARD_VTABLE(big2_)
888};
889
890#endif
891
892static const struct normal_encoding internal_big2_encoding = {
893  { VTABLE, 2, 0, 1 },
894  {
895#define BT_COLON BT_NMSTRT
896#include "iasciitab.h"
897#undef BT_COLON
898#include "latin1tab.h"
899  },
900  STANDARD_VTABLE(big2_)
901};
902
903#endif
904
905#undef PREFIX
906
907static int FASTCALL
908streqci(const char *s1, const char *s2)
909{
910  for (;;) {
911    char c1 = *s1++;
912    char c2 = *s2++;
913    if (ASCII_a <= c1 && c1 <= ASCII_z)
914      c1 += ASCII_A - ASCII_a;
915    if (ASCII_a <= c2 && c2 <= ASCII_z)
916      c2 += ASCII_A - ASCII_a;
917    if (c1 != c2)
918      return 0;
919    if (!c1)
920      break;
921  }
922  return 1;
923}
924
925static void FASTCALL
926initUpdatePosition(const ENCODING *enc, const char *ptr,
927                   const char *end, POSITION *pos)
928{
929  normal_updatePosition(&utf8_encoding.enc, ptr, end, pos);
930}
931
932static int FASTCALL
933toAscii(const ENCODING *enc, const char *ptr, const char *end)
934{
935  char buf[1];
936  char *p = buf;
937  XmlUtf8Convert(enc, &ptr, end, &p, p + 1);
938  if (p == buf)
939    return -1;
940  else
941    return buf[0];
942}
943
944static int FASTCALL
945isSpace(int c)
946{
947  switch (c) {
948  case 0x20:
949  case 0xD:
950  case 0xA:
951  case 0x9:     
952    return 1;
953  }
954  return 0;
955}
956
957/* Return 1 if there's just optional white space or there's an S
958   followed by name=val.
959*/
960static int FASTCALL
961parsePseudoAttribute(const ENCODING *enc,
962                     const char *ptr,
963                     const char *end,
964                     const char **namePtr,
965                     const char **nameEndPtr,
966                     const char **valPtr,
967                     const char **nextTokPtr)
968{
969  int c;
970  char open;
971  if (ptr == end) {
972    *namePtr = NULL;
973    return 1;
974  }
975  if (!isSpace(toAscii(enc, ptr, end))) {
976    *nextTokPtr = ptr;
977    return 0;
978  }
979  do {
980    ptr += enc->minBytesPerChar;
981  } while (isSpace(toAscii(enc, ptr, end)));
982  if (ptr == end) {
983    *namePtr = NULL;
984    return 1;
985  }
986  *namePtr = ptr;
987  for (;;) {
988    c = toAscii(enc, ptr, end);
989    if (c == -1) {
990      *nextTokPtr = ptr;
991      return 0;
992    }
993    if (c == ASCII_EQUALS) {
994      *nameEndPtr = ptr;
995      break;
996    }
997    if (isSpace(c)) {
998      *nameEndPtr = ptr;
999      do {
1000        ptr += enc->minBytesPerChar;
1001      } while (isSpace(c = toAscii(enc, ptr, end)));
1002      if (c != ASCII_EQUALS) {
1003        *nextTokPtr = ptr;
1004        return 0;
1005      }
1006      break;
1007    }
1008    ptr += enc->minBytesPerChar;
1009  }
1010  if (ptr == *namePtr) {
1011    *nextTokPtr = ptr;
1012    return 0;
1013  }
1014  ptr += enc->minBytesPerChar;
1015  c = toAscii(enc, ptr, end);
1016  while (isSpace(c)) {
1017    ptr += enc->minBytesPerChar;
1018    c = toAscii(enc, ptr, end);
1019  }
1020  if (c != ASCII_QUOT && c != ASCII_APOS) {
1021    *nextTokPtr = ptr;
1022    return 0;
1023  }
1024  open = (char)c;
1025  ptr += enc->minBytesPerChar;
1026  *valPtr = ptr;
1027  for (;; ptr += enc->minBytesPerChar) {
1028    c = toAscii(enc, ptr, end);
1029    if (c == open)
1030      break;
1031    if (!(ASCII_a <= c && c <= ASCII_z)
1032        && !(ASCII_A <= c && c <= ASCII_Z)
1033        && !(ASCII_0 <= c && c <= ASCII_9)
1034        && c != ASCII_PERIOD
1035        && c != ASCII_MINUS
1036        && c != ASCII_UNDERSCORE) {
1037      *nextTokPtr = ptr;
1038      return 0;
1039    }
1040  }
1041  *nextTokPtr = ptr + enc->minBytesPerChar;
1042  return 1;
1043}
1044
/* Keyword strings compared against pseudo-attribute names and values in
   doParseXmlDecl.  They are spelled with the ASCII_* constants instead of
   string literals; NOTE(review): presumably so the stored bytes are
   ASCII regardless of the compiler's execution character set - confirm
   against ascii.h. */
1045static const char KW_version[] = {
1046  ASCII_v, ASCII_e, ASCII_r, ASCII_s, ASCII_i, ASCII_o, ASCII_n, '\0'
1047};
1048
1049static const char KW_encoding[] = {
1050  ASCII_e, ASCII_n, ASCII_c, ASCII_o, ASCII_d, ASCII_i, ASCII_n, ASCII_g, '\0'
1051};
1052
1053static const char KW_standalone[] = {
1054  ASCII_s, ASCII_t, ASCII_a, ASCII_n, ASCII_d, ASCII_a, ASCII_l, ASCII_o,
1055  ASCII_n, ASCII_e, '\0'
1056};
1057
1058static const char KW_yes[] = {
1059  ASCII_y, ASCII_e, ASCII_s,  '\0'
1060};
1061
1062static const char KW_no[] = {
1063  ASCII_n, ASCII_o,  '\0'
1064};
1065
1066static int
1067doParseXmlDecl(const ENCODING *(*encodingFinder)(const ENCODING *,
1068                                                 const char *,
1069                                                 const char *),
1070               int isGeneralTextEntity,
1071               const ENCODING *enc,
1072               const char *ptr,
1073               const char *end,
1074               const char **badPtr,
1075               const char **versionPtr,
1076               const char **versionEndPtr,
1077               const char **encodingName,
1078               const ENCODING **encoding,
1079               int *standalone)
1080{
1081  const char *val = NULL;
1082  const char *name = NULL;
1083  const char *nameEnd = NULL;
1084  ptr += 5 * enc->minBytesPerChar;
1085  end -= 2 * enc->minBytesPerChar;
1086  if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)
1087      || !name) {
1088    *badPtr = ptr;
1089    return 0;
1090  }
1091  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_version)) {
1092    if (!isGeneralTextEntity) {
1093      *badPtr = name;
1094      return 0;
1095    }
1096  }
1097  else {
1098    if (versionPtr)
1099      *versionPtr = val;
1100    if (versionEndPtr)
1101      *versionEndPtr = ptr;
1102    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1103      *badPtr = ptr;
1104      return 0;
1105    }
1106    if (!name) {
1107      if (isGeneralTextEntity) {
1108        /* a TextDecl must have an EncodingDecl */
1109        *badPtr = ptr;
1110        return 0;
1111      }
1112      return 1;
1113    }
1114  }
1115  if (XmlNameMatchesAscii(enc, name, nameEnd, KW_encoding)) {
1116    int c = toAscii(enc, val, end);
1117    if (!(ASCII_a <= c && c <= ASCII_z) && !(ASCII_A <= c && c <= ASCII_Z)) {
1118      *badPtr = val;
1119      return 0;
1120    }
1121    if (encodingName)
1122      *encodingName = val;
1123    if (encoding)
1124      *encoding = encodingFinder(enc, val, ptr - enc->minBytesPerChar);
1125    if (!parsePseudoAttribute(enc, ptr, end, &name, &nameEnd, &val, &ptr)) {
1126      *badPtr = ptr;
1127      return 0;
1128    }
1129    if (!name)
1130      return 1;
1131  }
1132  if (!XmlNameMatchesAscii(enc, name, nameEnd, KW_standalone)
1133      || isGeneralTextEntity) {
1134    *badPtr = name;
1135    return 0;
1136  }
1137  if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_yes)) {
1138    if (standalone)
1139      *standalone = 1;
1140  }
1141  else if (XmlNameMatchesAscii(enc, val, ptr - enc->minBytesPerChar, KW_no)) {
1142    if (standalone)
1143      *standalone = 0;
1144  }
1145  else {
1146    *badPtr = val;
1147    return 0;
1148  }
1149  while (isSpace(toAscii(enc, ptr, end)))
1150    ptr += enc->minBytesPerChar;
1151  if (ptr != end) {
1152    *badPtr = ptr;
1153    return 0;
1154  }
1155  return 1;
1156}
1157
1158static int FASTCALL
1159checkCharRefNumber(int result)
1160{
1161  switch (result >> 8) {
1162  case 0xD8: case 0xD9: case 0xDA: case 0xDB:
1163  case 0xDC: case 0xDD: case 0xDE: case 0xDF:
1164    return -1;
1165  case 0:
1166    if (latin1_encoding.type[result] == BT_NONXML)
1167      return -1;
1168    break;
1169  case 0xFF:
1170    if (result == 0xFFFE || result == 0xFFFF)
1171      return -1;
1172    break;
1173  }
1174  return result;
1175}
1176
1177int
1178XmlUtf8Encode(int c, char *buf)
1179{
1180  enum {
1181    /* minN is minimum legal resulting value for N byte sequence */
1182    min2 = 0x80,
1183    min3 = 0x800,
1184    min4 = 0x10000
1185  };
1186
1187  if (c < 0)
1188    return 0;
1189  if (c < min2) {
1190    buf[0] = (char)(c | UTF8_cval1);
1191    return 1;
1192  }
1193  if (c < min3) {
1194    buf[0] = (char)((c >> 6) | UTF8_cval2);
1195    buf[1] = (char)((c & 0x3f) | 0x80);
1196    return 2;
1197  }
1198  if (c < min4) {
1199    buf[0] = (char)((c >> 12) | UTF8_cval3);
1200    buf[1] = (char)(((c >> 6) & 0x3f) | 0x80);
1201    buf[2] = (char)((c & 0x3f) | 0x80);
1202    return 3;
1203  }
1204  if (c < 0x110000) {
1205    buf[0] = (char)((c >> 18) | UTF8_cval4);
1206    buf[1] = (char)(((c >> 12) & 0x3f) | 0x80);
1207    buf[2] = (char)(((c >> 6) & 0x3f) | 0x80);
1208    buf[3] = (char)((c & 0x3f) | 0x80);
1209    return 4;
1210  }
1211  return 0;
1212}
1213
/* Encode the character number charNum as UTF-16 code units into buf.
   Returns 1 when a single unit was stored, 2 when a surrogate pair was
   stored, and 0 (buf untouched) when charNum is negative or not below
   0x110000.
*/
int
XmlUtf16Encode(int charNum, unsigned short *buf)
{
  int offset;

  if (charNum < 0 || charNum >= 0x110000)
    return 0;
  if (charNum <= 0xFFFF) {
    buf[0] = (unsigned short)charNum;
    return 1;
  }
  /* Split the 20-bit offset above 0x10000 into two 10-bit halves. */
  offset = charNum - 0x10000;
  buf[0] = (unsigned short)(0xD800 + (offset >> 10));
  buf[1] = (unsigned short)(0xDC00 + (offset & 0x3FF));
  return 2;
}
1231
/* Encoding object for an encoding described by a caller-supplied table
   (see XmlInitUnknownEncoding).  The normal_encoding member comes first
   so that a pointer to the ENCODING can be cast to this structure.
*/
struct unknown_encoding {
  /* byte-type table and tokenizer hooks, seeded from latin1_encoding */
  struct normal_encoding normal;
  /* decodes the sequence starting at p into a character number; used
     for bytes whose utf16[]/utf8[] entries are marked with 0 */
  int (*convert)(void *userData, const char *p);
  /* opaque pointer handed back to convert */
  void *userData;
  /* per-byte UTF-16 code unit; 0 means "call convert" */
  unsigned short utf16[256];
  /* per-byte UTF-8 form: [0] is the byte count (0 means "call
     convert"), followed by the encoded bytes */
  char utf8[256][4];
};

/* View an ENCODING pointer as the unknown_encoding that contains it. */
#define AS_UNKNOWN_ENCODING(enc)  ((const struct unknown_encoding *) (enc))
1241
1242int
1243XmlSizeOfUnknownEncoding(void)
1244{
1245  return sizeof(struct unknown_encoding);
1246}
1247
1248static int FASTCALL
1249unknown_isName(const ENCODING *enc, const char *p)
1250{
1251  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1252  int c = uenc->convert(uenc->userData, p);
1253  if (c & ~0xFFFF)
1254    return 0;
1255  return UCS2_GET_NAMING(namePages, c >> 8, c & 0xFF);
1256}
1257
1258static int FASTCALL
1259unknown_isNmstrt(const ENCODING *enc, const char *p)
1260{
1261  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1262  int c = uenc->convert(uenc->userData, p);
1263  if (c & ~0xFFFF)
1264    return 0;
1265  return UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xFF);
1266}
1267
1268static int FASTCALL
1269unknown_isInvalid(const ENCODING *enc, const char *p)
1270{
1271  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1272  int c = uenc->convert(uenc->userData, p);
1273  return (c & ~0xFFFF) || checkCharRefNumber(c) < 0;
1274}
1275
1276static void FASTCALL
1277unknown_toUtf8(const ENCODING *enc,
1278               const char **fromP, const char *fromLim,
1279               char **toP, const char *toLim)
1280{
1281  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1282  char buf[XML_UTF8_ENCODE_MAX];
1283  for (;;) {
1284    const char *utf8;
1285    int n;
1286    if (*fromP == fromLim)
1287      break;
1288    utf8 = uenc->utf8[(unsigned char)**fromP];
1289    n = *utf8++;
1290    if (n == 0) {
1291      int c = uenc->convert(uenc->userData, *fromP);
1292      n = XmlUtf8Encode(c, buf);
1293      if (n > toLim - *toP)
1294        break;
1295      utf8 = buf;
1296      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1297                 - (BT_LEAD2 - 2));
1298    }
1299    else {
1300      if (n > toLim - *toP)
1301        break;
1302      (*fromP)++;
1303    }
1304    do {
1305      *(*toP)++ = *utf8++;
1306    } while (--n != 0);
1307  }
1308}
1309
1310static void FASTCALL
1311unknown_toUtf16(const ENCODING *enc,
1312                const char **fromP, const char *fromLim,
1313                unsigned short **toP, const unsigned short *toLim)
1314{
1315  const struct unknown_encoding *uenc = AS_UNKNOWN_ENCODING(enc);
1316  while (*fromP != fromLim && *toP != toLim) {
1317    unsigned short c = uenc->utf16[(unsigned char)**fromP];
1318    if (c == 0) {
1319      c = (unsigned short)
1320          uenc->convert(uenc->userData, *fromP);
1321      *fromP += (AS_NORMAL_ENCODING(enc)->type[(unsigned char)**fromP]
1322                 - (BT_LEAD2 - 2));
1323    }
1324    else
1325      (*fromP)++;
1326    *(*toP)++ = c;
1327  }
1328}
1329
1330ENCODING *
1331XmlInitUnknownEncoding(void *mem,
1332                       int *table,
1333                       int (*convert)(void *userData, const char *p),
1334                       void *userData)
1335{
1336  int i;
1337  struct unknown_encoding *e = mem;
1338  for (i = 0; i < (int)sizeof(struct normal_encoding); i++)
1339    ((char *)mem)[i] = ((char *)&latin1_encoding)[i];
1340  for (i = 0; i < 128; i++)
1341    if (latin1_encoding.type[i] != BT_OTHER
1342        && latin1_encoding.type[i] != BT_NONXML
1343        && table[i] != i)
1344      return 0;
1345  for (i = 0; i < 256; i++) {
1346    int c = table[i];
1347    if (c == -1) {
1348      e->normal.type[i] = BT_MALFORM;
1349      /* This shouldn't really get used. */
1350      e->utf16[i] = 0xFFFF;
1351      e->utf8[i][0] = 1;
1352      e->utf8[i][1] = 0;
1353    }
1354    else if (c < 0) {
1355      if (c < -4)
1356        return 0;
1357      e->normal.type[i] = (unsigned char)(BT_LEAD2 - (c + 2));
1358      e->utf8[i][0] = 0;
1359      e->utf16[i] = 0;
1360    }
1361    else if (c < 0x80) {
1362      if (latin1_encoding.type[c] != BT_OTHER
1363          && latin1_encoding.type[c] != BT_NONXML
1364          && c != i)
1365        return 0;
1366      e->normal.type[i] = latin1_encoding.type[c];
1367      e->utf8[i][0] = 1;
1368      e->utf8[i][1] = (char)c;
1369      e->utf16[i] = (unsigned short)(c == 0 ? 0xFFFF : c);
1370    }
1371    else if (checkCharRefNumber(c) < 0) {
1372      e->normal.type[i] = BT_NONXML;
1373      /* This shouldn't really get used. */
1374      e->utf16[i] = 0xFFFF;
1375      e->utf8[i][0] = 1;
1376      e->utf8[i][1] = 0;
1377    }
1378    else {
1379      if (c > 0xFFFF)
1380        return 0;
1381      if (UCS2_GET_NAMING(nmstrtPages, c >> 8, c & 0xff))
1382        e->normal.type[i] = BT_NMSTRT;
1383      else if (UCS2_GET_NAMING(namePages, c >> 8, c & 0xff))
1384        e->normal.type[i] = BT_NAME;
1385      else
1386        e->normal.type[i] = BT_OTHER;
1387      e->utf8[i][0] = (char)XmlUtf8Encode(c, e->utf8[i] + 1);
1388      e->utf16[i] = (unsigned short)c;
1389    }
1390  }
1391  e->userData = userData;
1392  e->convert = convert;
1393  if (convert) {
1394    e->normal.isName2 = unknown_isName;
1395    e->normal.isName3 = unknown_isName;
1396    e->normal.isName4 = unknown_isName;
1397    e->normal.isNmstrt2 = unknown_isNmstrt;
1398    e->normal.isNmstrt3 = unknown_isNmstrt;
1399    e->normal.isNmstrt4 = unknown_isNmstrt;
1400    e->normal.isInvalid2 = unknown_isInvalid;
1401    e->normal.isInvalid3 = unknown_isInvalid;
1402    e->normal.isInvalid4 = unknown_isInvalid;
1403  }
1404  e->normal.enc.utf8Convert = unknown_toUtf8;
1405  e->normal.enc.utf16Convert = unknown_toUtf16;
1406  return &(e->normal.enc);
1407}
1408
/* Indices of the built-in encodings.  The values from ISO_8859_1_ENC
   through UTF_16LE_ENC double as positions in the encodingNames table
   of getEncodingIndex; if this enumeration is changed, getEncodingIndex
   and encodings must also be changed. */
enum {
  UNKNOWN_ENC    = -1,
  ISO_8859_1_ENC = 0,
  US_ASCII_ENC   = 1,
  UTF_8_ENC      = 2,
  UTF_16_ENC     = 3,
  UTF_16BE_ENC   = 4,
  UTF_16LE_ENC   = 5,
  /* must match encodingNames up to here */
  NO_ENC         = 6
};
1422
/* NUL-terminated names of the built-in encodings, spelled with the
   ASCII_* constants rather than character literals.  getEncodingIndex
   compares candidate names against these tables. */
static const char KW_ISO_8859_1[] = {
  ASCII_I, ASCII_S, ASCII_O, ASCII_MINUS, ASCII_8, ASCII_8, ASCII_5, ASCII_9,
  ASCII_MINUS, ASCII_1, '\0'
};
static const char KW_US_ASCII[] = {
  ASCII_U, ASCII_S, ASCII_MINUS, ASCII_A, ASCII_S, ASCII_C, ASCII_I, ASCII_I,
  '\0'
};
static const char KW_UTF_8[] =  {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_8, '\0'
};
static const char KW_UTF_16[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, '\0'
};
static const char KW_UTF_16BE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_B, ASCII_E,
  '\0'
};
static const char KW_UTF_16LE[] = {
  ASCII_U, ASCII_T, ASCII_F, ASCII_MINUS, ASCII_1, ASCII_6, ASCII_L, ASCII_E,
  '\0'
};
1445
1446static int FASTCALL
1447getEncodingIndex(const char *name)
1448{
1449  static const char *encodingNames[] = {
1450    KW_ISO_8859_1,
1451    KW_US_ASCII,
1452    KW_UTF_8,
1453    KW_UTF_16,
1454    KW_UTF_16BE,
1455    KW_UTF_16LE,
1456  };
1457  int i;
1458  if (name == NULL)
1459    return NO_ENC;
1460  for (i = 0; i < (int)(sizeof(encodingNames)/sizeof(encodingNames[0])); i++)
1461    if (streqci(name, encodingNames[i]))
1462      return i;
1463  return UNKNOWN_ENC;
1464}
1465
/* For binary compatibility, we store the index of the encoding
   specified at initialization in the isUtf16 member.

   INIT_ENC_INDEX(enc) reads that index back as an int;
   SET_INIT_ENC_INDEX(enc, i) stores i, narrowed to char.  Both
   arguments are parenthesized so that compound expressions (for
   example a conditional expression) are converted as a whole.
*/

#define INIT_ENC_INDEX(enc) ((int)(enc)->initEnc.isUtf16)
#define SET_INIT_ENC_INDEX(enc, i) ((enc)->initEnc.isUtf16 = (char)(i))
1472
1473/* This is what detects the encoding.  encodingTable maps from
1474   encoding indices to encodings; INIT_ENC_INDEX(enc) is the index of
1475   the external (protocol) specified encoding; state is
1476   XML_CONTENT_STATE if we're parsing an external text entity, and
1477   XML_PROLOG_STATE otherwise.
1478*/
1479
1480
1481static int FASTCALL
1482initScan(const ENCODING **encodingTable,
1483         const INIT_ENCODING *enc,
1484         int state,
1485         const char *ptr,
1486         const char *end,
1487         const char **nextTokPtr)
1488{
1489  const ENCODING **encPtr;
1490
1491  if (ptr == end)
1492    return XML_TOK_NONE;
1493  encPtr = enc->encPtr;
1494  if (ptr + 1 == end) {
1495    /* only a single byte available for auto-detection */
1496#ifndef XML_DTD /* FIXME */
1497    /* a well-formed document entity must have more than one byte */
1498    if (state != XML_CONTENT_STATE)
1499      return XML_TOK_PARTIAL;
1500#endif
1501    /* so we're parsing an external text entity... */
1502    /* if UTF-16 was externally specified, then we need at least 2 bytes */
1503    switch (INIT_ENC_INDEX(enc)) {
1504    case UTF_16_ENC:
1505    case UTF_16LE_ENC:
1506    case UTF_16BE_ENC:
1507      return XML_TOK_PARTIAL;
1508    }
1509    switch ((unsigned char)*ptr) {
1510    case 0xFE:
1511    case 0xFF:
1512    case 0xEF: /* possibly first byte of UTF-8 BOM */
1513      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1514          && state == XML_CONTENT_STATE)
1515        break;
1516      /* fall through */
1517    case 0x00:
1518    case 0x3C:
1519      return XML_TOK_PARTIAL;
1520    }
1521  }
1522  else {
1523    switch (((unsigned char)ptr[0] << 8) | (unsigned char)ptr[1]) {
1524    case 0xFEFF:
1525      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1526          && state == XML_CONTENT_STATE)
1527        break;
1528      *nextTokPtr = ptr + 2;
1529      *encPtr = encodingTable[UTF_16BE_ENC];
1530      return XML_TOK_BOM;
1531    /* 00 3C is handled in the default case */
1532    case 0x3C00:
1533      if ((INIT_ENC_INDEX(enc) == UTF_16BE_ENC
1534           || INIT_ENC_INDEX(enc) == UTF_16_ENC)
1535          && state == XML_CONTENT_STATE)
1536        break;
1537      *encPtr = encodingTable[UTF_16LE_ENC];
1538      return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1539    case 0xFFFE:
1540      if (INIT_ENC_INDEX(enc) == ISO_8859_1_ENC
1541          && state == XML_CONTENT_STATE)
1542        break;
1543      *nextTokPtr = ptr + 2;
1544      *encPtr = encodingTable[UTF_16LE_ENC];
1545      return XML_TOK_BOM;
1546    case 0xEFBB:
1547      /* Maybe a UTF-8 BOM (EF BB BF) */
1548      /* If there's an explicitly specified (external) encoding
1549         of ISO-8859-1 or some flavour of UTF-16
1550         and this is an external text entity,
1551         don't look for the BOM,
1552         because it might be a legal data.
1553      */
1554      if (state == XML_CONTENT_STATE) {
1555        int e = INIT_ENC_INDEX(enc);
1556        if (e == ISO_8859_1_ENC || e == UTF_16BE_ENC
1557            || e == UTF_16LE_ENC || e == UTF_16_ENC)
1558          break;
1559      }
1560      if (ptr + 2 == end)
1561        return XML_TOK_PARTIAL;
1562      if ((unsigned char)ptr[2] == 0xBF) {
1563        *nextTokPtr = ptr + 3;
1564        *encPtr = encodingTable[UTF_8_ENC];
1565        return XML_TOK_BOM;
1566      }
1567      break;
1568    default:
1569      if (ptr[0] == '\0') {
1570        /* 0 isn't a legal data character. Furthermore a document
1571           entity can only start with ASCII characters.  So the only
1572           way this can fail to be big-endian UTF-16 if it it's an
1573           external parsed general entity that's labelled as
1574           UTF-16LE.
1575        */
1576        if (state == XML_CONTENT_STATE && INIT_ENC_INDEX(enc) == UTF_16LE_ENC)
1577          break;
1578        *encPtr = encodingTable[UTF_16BE_ENC];
1579        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1580      }
1581      else if (ptr[1] == '\0') {
1582        /* We could recover here in the case:
1583            - parsing an external entity
1584            - second byte is 0
1585            - no externally specified encoding
1586            - no encoding declaration
1587           by assuming UTF-16LE.  But we don't, because this would mean when
1588           presented just with a single byte, we couldn't reliably determine
1589           whether we needed further bytes.
1590        */
1591        if (state == XML_CONTENT_STATE)
1592          break;
1593        *encPtr = encodingTable[UTF_16LE_ENC];
1594        return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1595      }
1596      break;
1597    }
1598  }
1599  *encPtr = encodingTable[INIT_ENC_INDEX(enc)];
1600  return XmlTok(*encPtr, state, ptr, end, nextTokPtr);
1601}
1602
1603
1604#define NS(x) x
1605#define ns(x) x
1606#include "xmltok_ns.c"
1607#undef NS
1608#undef ns
1609
1610#ifdef XML_NS
1611
1612#define NS(x) x ## NS
1613#define ns(x) x ## _ns
1614
1615#include "xmltok_ns.c"
1616
1617#undef NS
1618#undef ns
1619
1620ENCODING *
1621XmlInitUnknownEncodingNS(void *mem,
1622                         int *table,
1623                         int (*convert)(void *userData, const char *p),
1624                         void *userData)
1625{
1626  ENCODING *enc = XmlInitUnknownEncoding(mem, table, convert, userData);
1627  if (enc)
1628    ((struct normal_encoding *)enc)->type[ASCII_COLON] = BT_COLON;
1629  return enc;
1630}
1631
1632#endif /* XML_NS */
Note: See TracBrowser for help on using the repository browser.