1 | /* utf8.c |
---|
2 | * |
---|
3 | * Copyright (c) 1998-2001, Larry Wall |
---|
4 | * |
---|
5 | * You may distribute under the terms of either the GNU General Public |
---|
6 | * License or the Artistic License, as specified in the README file. |
---|
7 | * |
---|
8 | */ |
---|
9 | |
---|
10 | /* |
---|
11 | * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever |
---|
12 | * heard of that we don't want to see any closer; and that's the one place |
---|
13 | * we're trying to get to! And that's just where we can't get, nohow.' |
---|
14 | * |
---|
15 | * 'Well do I understand your speech,' he answered in the same language; |
---|
16 | * 'yet few strangers do so. Why then do you not speak in the Common Tongue, |
---|
17 | * as is the custom in the West, if you wish to be answered?' |
---|
18 | * |
---|
19 | * ...the travellers perceived that the floor was paved with stones of many |
---|
20 | * hues; branching runes and strange devices intertwined beneath their feet. |
---|
21 | */ |
---|
22 | |
---|
23 | #include "EXTERN.h" |
---|
24 | #define PERL_IN_UTF8_C |
---|
25 | #include "perl.h" |
---|
26 | |
---|
27 | /* Unicode support */ |
---|
28 | |
---|
29 | /* |
---|
30 | =for apidoc A|U8*|uv_to_utf8|U8 *d|UV uv |
---|
31 | |
---|
32 | Adds the UTF8 representation of the Unicode codepoint C<uv> to the end |
---|
33 | of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free |
---|
34 | bytes available. The return value is the pointer to the byte after the |
---|
35 | end of the new character. In other words, |
---|
36 | |
---|
37 | d = uv_to_utf8(d, uv); |
---|
38 | |
---|
39 | is the recommended Unicode-aware way of saying |
---|
40 | |
---|
41 | *(d++) = uv; |
---|
42 | |
---|
43 | =cut |
---|
44 | */ |
---|
45 | |
---|
46 | U8 * |
---|
47 | Perl_uv_to_utf8(pTHX_ U8 *d, UV uv) |
---|
48 | { |
---|
49 | if (uv < 0x80) { |
---|
50 | *d++ = uv; |
---|
51 | return d; |
---|
52 | } |
---|
53 | if (uv < 0x800) { |
---|
54 | *d++ = (( uv >> 6) | 0xc0); |
---|
55 | *d++ = (( uv & 0x3f) | 0x80); |
---|
56 | return d; |
---|
57 | } |
---|
58 | if (uv < 0x10000) { |
---|
59 | *d++ = (( uv >> 12) | 0xe0); |
---|
60 | *d++ = (((uv >> 6) & 0x3f) | 0x80); |
---|
61 | *d++ = (( uv & 0x3f) | 0x80); |
---|
62 | return d; |
---|
63 | } |
---|
64 | if (uv < 0x200000) { |
---|
65 | *d++ = (( uv >> 18) | 0xf0); |
---|
66 | *d++ = (((uv >> 12) & 0x3f) | 0x80); |
---|
67 | *d++ = (((uv >> 6) & 0x3f) | 0x80); |
---|
68 | *d++ = (( uv & 0x3f) | 0x80); |
---|
69 | return d; |
---|
70 | } |
---|
71 | if (uv < 0x4000000) { |
---|
72 | *d++ = (( uv >> 24) | 0xf8); |
---|
73 | *d++ = (((uv >> 18) & 0x3f) | 0x80); |
---|
74 | *d++ = (((uv >> 12) & 0x3f) | 0x80); |
---|
75 | *d++ = (((uv >> 6) & 0x3f) | 0x80); |
---|
76 | *d++ = (( uv & 0x3f) | 0x80); |
---|
77 | return d; |
---|
78 | } |
---|
79 | if (uv < 0x80000000) { |
---|
80 | *d++ = (( uv >> 30) | 0xfc); |
---|
81 | *d++ = (((uv >> 24) & 0x3f) | 0x80); |
---|
82 | *d++ = (((uv >> 18) & 0x3f) | 0x80); |
---|
83 | *d++ = (((uv >> 12) & 0x3f) | 0x80); |
---|
84 | *d++ = (((uv >> 6) & 0x3f) | 0x80); |
---|
85 | *d++ = (( uv & 0x3f) | 0x80); |
---|
86 | return d; |
---|
87 | } |
---|
88 | #ifdef HAS_QUAD |
---|
89 | if (uv < UTF8_QUAD_MAX) |
---|
90 | #endif |
---|
91 | { |
---|
92 | *d++ = 0xfe; /* Can't match U+FEFF! */ |
---|
93 | *d++ = (((uv >> 30) & 0x3f) | 0x80); |
---|
94 | *d++ = (((uv >> 24) & 0x3f) | 0x80); |
---|
95 | *d++ = (((uv >> 18) & 0x3f) | 0x80); |
---|
96 | *d++ = (((uv >> 12) & 0x3f) | 0x80); |
---|
97 | *d++ = (((uv >> 6) & 0x3f) | 0x80); |
---|
98 | *d++ = (( uv & 0x3f) | 0x80); |
---|
99 | return d; |
---|
100 | } |
---|
101 | #ifdef HAS_QUAD |
---|
102 | { |
---|
103 | *d++ = 0xff; /* Can't match U+FFFE! */ |
---|
104 | *d++ = 0x80; /* 6 Reserved bits */ |
---|
105 | *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */ |
---|
106 | *d++ = (((uv >> 54) & 0x3f) | 0x80); |
---|
107 | *d++ = (((uv >> 48) & 0x3f) | 0x80); |
---|
108 | *d++ = (((uv >> 42) & 0x3f) | 0x80); |
---|
109 | *d++ = (((uv >> 36) & 0x3f) | 0x80); |
---|
110 | *d++ = (((uv >> 30) & 0x3f) | 0x80); |
---|
111 | *d++ = (((uv >> 24) & 0x3f) | 0x80); |
---|
112 | *d++ = (((uv >> 18) & 0x3f) | 0x80); |
---|
113 | *d++ = (((uv >> 12) & 0x3f) | 0x80); |
---|
114 | *d++ = (((uv >> 6) & 0x3f) | 0x80); |
---|
115 | *d++ = (( uv & 0x3f) | 0x80); |
---|
116 | return d; |
---|
117 | } |
---|
118 | #endif |
---|
119 | } |
---|
120 | |
---|
121 | /* |
---|
122 | =for apidoc A|STRLEN|is_utf8_char|U8 *s |
---|
123 | |
---|
124 | Tests if some arbitrary number of bytes begins in a valid UTF-8 character. |
---|
125 | The actual number of bytes in the UTF-8 character will be returned if it |
---|
126 | is valid, otherwise 0. |
---|
127 | |
---|
128 | =cut |
---|
129 | */ |
---|
130 | STRLEN |
---|
131 | Perl_is_utf8_char(pTHX_ U8 *s) |
---|
132 | { |
---|
133 | U8 u = *s; |
---|
134 | STRLEN slen, len; |
---|
135 | UV uv, ouv; |
---|
136 | |
---|
137 | if (UTF8_IS_ASCII(u)) |
---|
138 | return 1; |
---|
139 | |
---|
140 | if (!UTF8_IS_START(u)) |
---|
141 | return 0; |
---|
142 | |
---|
143 | len = UTF8SKIP(s); |
---|
144 | |
---|
145 | if (len < 2 || !UTF8_IS_CONTINUATION(s[1])) |
---|
146 | return 0; |
---|
147 | |
---|
148 | slen = len - 1; |
---|
149 | s++; |
---|
150 | uv = u; |
---|
151 | ouv = uv; |
---|
152 | while (slen--) { |
---|
153 | if (!UTF8_IS_CONTINUATION(*s)) |
---|
154 | return 0; |
---|
155 | uv = UTF8_ACCUMULATE(uv, *s); |
---|
156 | if (uv < ouv) |
---|
157 | return 0; |
---|
158 | ouv = uv; |
---|
159 | s++; |
---|
160 | } |
---|
161 | |
---|
162 | if (UNISKIP(uv) < len) |
---|
163 | return 0; |
---|
164 | |
---|
165 | return len; |
---|
166 | } |
---|
167 | |
---|
168 | /* |
---|
169 | =for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len |
---|
170 | |
---|
171 | Returns true if first C<len> bytes of the given string form valid a UTF8 |
---|
172 | string, false otherwise. |
---|
173 | |
---|
174 | =cut |
---|
175 | */ |
---|
176 | |
---|
177 | bool |
---|
178 | Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len) |
---|
179 | { |
---|
180 | U8* x = s; |
---|
181 | U8* send; |
---|
182 | STRLEN c; |
---|
183 | |
---|
184 | if (!len) |
---|
185 | len = strlen((char *)s); |
---|
186 | send = s + len; |
---|
187 | |
---|
188 | while (x < send) { |
---|
189 | c = is_utf8_char(x); |
---|
190 | if (!c) |
---|
191 | return FALSE; |
---|
192 | x += c; |
---|
193 | } |
---|
194 | if (x != send) |
---|
195 | return FALSE; |
---|
196 | |
---|
197 | return TRUE; |
---|
198 | } |
---|
199 | |
---|
200 | /* |
---|
201 | =for apidoc A|U8* s|utf8_to_uv|STRLEN curlen|STRLEN *retlen|U32 flags |
---|
202 | |
---|
203 | Returns the character value of the first character in the string C<s> |
---|
204 | which is assumed to be in UTF8 encoding and no longer than C<curlen>; |
---|
205 | C<retlen> will be set to the length, in bytes, of that character. |
---|
206 | |
---|
207 | If C<s> does not point to a well-formed UTF8 character, the behaviour |
---|
208 | is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY, |
---|
209 | it is assumed that the caller will raise a warning, and this function |
---|
210 | will silently just set C<retlen> to C<-1> and return zero. If the |
---|
211 | C<flags> does not contain UTF8_CHECK_ONLY, warnings about |
---|
212 | malformations will be given, C<retlen> will be set to the expected |
---|
213 | length of the UTF-8 character in bytes, and zero will be returned. |
---|
214 | |
---|
215 | The C<flags> can also contain various flags to allow deviations from |
---|
216 | the strict UTF-8 encoding (see F<utf8.h>). |
---|
217 | |
---|
218 | =cut */ |
---|
219 | |
---|
220 | UV |
---|
221 | Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags) |
---|
222 | { |
---|
223 | UV uv = *s, ouv; |
---|
224 | STRLEN len = 1; |
---|
225 | #ifdef EBCDIC |
---|
226 | bool dowarn = 0; |
---|
227 | #else |
---|
228 | bool dowarn = ckWARN_d(WARN_UTF8); |
---|
229 | #endif |
---|
230 | STRLEN expectlen = 0; |
---|
231 | U32 warning = 0; |
---|
232 | |
---|
233 | /* This list is a superset of the UTF8_ALLOW_XXX. */ |
---|
234 | |
---|
235 | #define UTF8_WARN_EMPTY 1 |
---|
236 | #define UTF8_WARN_CONTINUATION 2 |
---|
237 | #define UTF8_WARN_NON_CONTINUATION 3 |
---|
238 | #define UTF8_WARN_FE_FF 4 |
---|
239 | #define UTF8_WARN_SHORT 5 |
---|
240 | #define UTF8_WARN_OVERFLOW 6 |
---|
241 | #define UTF8_WARN_SURROGATE 7 |
---|
242 | #define UTF8_WARN_BOM 8 |
---|
243 | #define UTF8_WARN_LONG 9 |
---|
244 | #define UTF8_WARN_FFFF 10 |
---|
245 | |
---|
246 | if (curlen == 0 && |
---|
247 | !(flags & UTF8_ALLOW_EMPTY)) { |
---|
248 | warning = UTF8_WARN_EMPTY; |
---|
249 | goto malformed; |
---|
250 | } |
---|
251 | |
---|
252 | if (UTF8_IS_ASCII(uv)) { |
---|
253 | if (retlen) |
---|
254 | *retlen = 1; |
---|
255 | return *s; |
---|
256 | } |
---|
257 | |
---|
258 | if (UTF8_IS_CONTINUATION(uv) && |
---|
259 | !(flags & UTF8_ALLOW_CONTINUATION)) { |
---|
260 | warning = UTF8_WARN_CONTINUATION; |
---|
261 | goto malformed; |
---|
262 | } |
---|
263 | |
---|
264 | if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) && |
---|
265 | !(flags & UTF8_ALLOW_NON_CONTINUATION)) { |
---|
266 | warning = UTF8_WARN_NON_CONTINUATION; |
---|
267 | goto malformed; |
---|
268 | } |
---|
269 | |
---|
270 | if ((uv == 0xfe || uv == 0xff) && |
---|
271 | !(flags & UTF8_ALLOW_FE_FF)) { |
---|
272 | warning = UTF8_WARN_FE_FF; |
---|
273 | goto malformed; |
---|
274 | } |
---|
275 | |
---|
276 | if (!(uv & 0x20)) { len = 2; uv &= 0x1f; } |
---|
277 | else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; } |
---|
278 | else if (!(uv & 0x08)) { len = 4; uv &= 0x07; } |
---|
279 | else if (!(uv & 0x04)) { len = 5; uv &= 0x03; } |
---|
280 | else if (!(uv & 0x02)) { len = 6; uv &= 0x01; } |
---|
281 | else if (!(uv & 0x01)) { len = 7; uv = 0; } |
---|
282 | else { len = 13; uv = 0; } /* whoa! */ |
---|
283 | |
---|
284 | if (retlen) |
---|
285 | *retlen = len; |
---|
286 | |
---|
287 | expectlen = len; |
---|
288 | |
---|
289 | if ((curlen < expectlen) && |
---|
290 | !(flags & UTF8_ALLOW_SHORT)) { |
---|
291 | warning = UTF8_WARN_SHORT; |
---|
292 | goto malformed; |
---|
293 | } |
---|
294 | |
---|
295 | len--; |
---|
296 | s++; |
---|
297 | ouv = uv; |
---|
298 | |
---|
299 | while (len--) { |
---|
300 | if (!UTF8_IS_CONTINUATION(*s) && |
---|
301 | !(flags & UTF8_ALLOW_NON_CONTINUATION)) { |
---|
302 | s--; |
---|
303 | warning = UTF8_WARN_NON_CONTINUATION; |
---|
304 | goto malformed; |
---|
305 | } |
---|
306 | else |
---|
307 | uv = UTF8_ACCUMULATE(uv, *s); |
---|
308 | if (!(uv > ouv)) { |
---|
309 | /* These cannot be allowed. */ |
---|
310 | if (uv == ouv) { |
---|
311 | if (!(flags & UTF8_ALLOW_LONG)) { |
---|
312 | warning = UTF8_WARN_LONG; |
---|
313 | goto malformed; |
---|
314 | } |
---|
315 | } |
---|
316 | else { /* uv < ouv */ |
---|
317 | /* This cannot be allowed. */ |
---|
318 | warning = UTF8_WARN_OVERFLOW; |
---|
319 | goto malformed; |
---|
320 | } |
---|
321 | } |
---|
322 | s++; |
---|
323 | ouv = uv; |
---|
324 | } |
---|
325 | |
---|
326 | if (UNICODE_IS_SURROGATE(uv) && |
---|
327 | !(flags & UTF8_ALLOW_SURROGATE)) { |
---|
328 | warning = UTF8_WARN_SURROGATE; |
---|
329 | goto malformed; |
---|
330 | } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) && |
---|
331 | !(flags & UTF8_ALLOW_BOM)) { |
---|
332 | warning = UTF8_WARN_BOM; |
---|
333 | goto malformed; |
---|
334 | } else if ((expectlen > UNISKIP(uv)) && |
---|
335 | !(flags & UTF8_ALLOW_LONG)) { |
---|
336 | warning = UTF8_WARN_LONG; |
---|
337 | goto malformed; |
---|
338 | } else if (UNICODE_IS_ILLEGAL(uv) && |
---|
339 | !(flags & UTF8_ALLOW_FFFF)) { |
---|
340 | warning = UTF8_WARN_FFFF; |
---|
341 | goto malformed; |
---|
342 | } |
---|
343 | |
---|
344 | return uv; |
---|
345 | |
---|
346 | malformed: |
---|
347 | |
---|
348 | if (flags & UTF8_CHECK_ONLY) { |
---|
349 | if (retlen) |
---|
350 | *retlen = -1; |
---|
351 | return 0; |
---|
352 | } |
---|
353 | |
---|
354 | if (dowarn) { |
---|
355 | SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0)); |
---|
356 | |
---|
357 | switch (warning) { |
---|
358 | case 0: /* Intentionally empty. */ break; |
---|
359 | case UTF8_WARN_EMPTY: |
---|
360 | Perl_sv_catpvf(aTHX_ sv, "(empty string)"); |
---|
361 | break; |
---|
362 | case UTF8_WARN_CONTINUATION: |
---|
363 | Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv); |
---|
364 | break; |
---|
365 | case UTF8_WARN_NON_CONTINUATION: |
---|
366 | Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")", |
---|
367 | (UV)s[1], uv); |
---|
368 | break; |
---|
369 | case UTF8_WARN_FE_FF: |
---|
370 | Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv); |
---|
371 | break; |
---|
372 | case UTF8_WARN_SHORT: |
---|
373 | Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)", |
---|
374 | curlen, curlen == 1 ? "" : "s", expectlen); |
---|
375 | break; |
---|
376 | case UTF8_WARN_OVERFLOW: |
---|
377 | Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)", |
---|
378 | ouv, *s); |
---|
379 | break; |
---|
380 | case UTF8_WARN_SURROGATE: |
---|
381 | Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv); |
---|
382 | break; |
---|
383 | case UTF8_WARN_BOM: |
---|
384 | Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv); |
---|
385 | break; |
---|
386 | case UTF8_WARN_LONG: |
---|
387 | Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)", |
---|
388 | expectlen, expectlen == 1 ? "": "s", UNISKIP(uv)); |
---|
389 | break; |
---|
390 | case UTF8_WARN_FFFF: |
---|
391 | Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv); |
---|
392 | break; |
---|
393 | default: |
---|
394 | Perl_sv_catpvf(aTHX_ sv, "(unknown reason)"); |
---|
395 | break; |
---|
396 | } |
---|
397 | |
---|
398 | if (warning) { |
---|
399 | char *s = SvPVX(sv); |
---|
400 | |
---|
401 | if (PL_op) |
---|
402 | Perl_warner(aTHX_ WARN_UTF8, |
---|
403 | "%s in %s", s, PL_op_desc[PL_op->op_type]); |
---|
404 | else |
---|
405 | Perl_warner(aTHX_ WARN_UTF8, "%s", s); |
---|
406 | } |
---|
407 | } |
---|
408 | |
---|
409 | if (retlen) |
---|
410 | *retlen = expectlen ? expectlen : len; |
---|
411 | |
---|
412 | return 0; |
---|
413 | } |
---|
414 | |
---|
415 | /* |
---|
416 | =for apidoc A|U8* s|utf8_to_uv_simple|STRLEN *retlen |
---|
417 | |
---|
418 | Returns the character value of the first character in the string C<s> |
---|
419 | which is assumed to be in UTF8 encoding; C<retlen> will be set to the |
---|
420 | length, in bytes, of that character. |
---|
421 | |
---|
422 | If C<s> does not point to a well-formed UTF8 character, zero is |
---|
423 | returned and retlen is set, if possible, to -1. |
---|
424 | |
---|
425 | =cut |
---|
426 | */ |
---|
427 | |
---|
428 | UV |
---|
429 | Perl_utf8_to_uv_simple(pTHX_ U8* s, STRLEN* retlen) |
---|
430 | { |
---|
431 | return Perl_utf8_to_uv(aTHX_ s, UTF8_MAXLEN, retlen, 0); |
---|
432 | } |
---|
433 | |
---|
434 | /* |
---|
435 | =for apidoc A|STRLEN|utf8_length|U8* s|U8 *e |
---|
436 | |
---|
437 | Return the length of the UTF-8 char encoded string C<s> in characters. |
---|
438 | Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end |
---|
439 | up past C<e>, croaks. |
---|
440 | |
---|
441 | =cut |
---|
442 | */ |
---|
443 | |
---|
444 | STRLEN |
---|
445 | Perl_utf8_length(pTHX_ U8* s, U8* e) |
---|
446 | { |
---|
447 | STRLEN len = 0; |
---|
448 | |
---|
449 | /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. |
---|
450 | * the bitops (especially ~) can create illegal UTF-8. |
---|
451 | * In other words: in Perl UTF-8 is not just for Unicode. */ |
---|
452 | |
---|
453 | if (e < s) |
---|
454 | Perl_croak(aTHX_ "panic: utf8_length: unexpected end"); |
---|
455 | while (s < e) { |
---|
456 | U8 t = UTF8SKIP(s); |
---|
457 | |
---|
458 | if (e - s < t) |
---|
459 | Perl_croak(aTHX_ "panic: utf8_length: unaligned end"); |
---|
460 | s += t; |
---|
461 | len++; |
---|
462 | } |
---|
463 | |
---|
464 | return len; |
---|
465 | } |
---|
466 | |
---|
467 | /* |
---|
468 | =for apidoc A|IV|utf8_distance|U8 *a|U8 *b |
---|
469 | |
---|
470 | Returns the number of UTF8 characters between the UTF-8 pointers C<a> |
---|
471 | and C<b>. |
---|
472 | |
---|
473 | WARNING: use only if you *know* that the pointers point inside the |
---|
474 | same UTF-8 buffer. |
---|
475 | |
---|
476 | =cut */ |
---|
477 | |
---|
478 | IV |
---|
479 | Perl_utf8_distance(pTHX_ U8 *a, U8 *b) |
---|
480 | { |
---|
481 | IV off = 0; |
---|
482 | |
---|
483 | /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. |
---|
484 | * the bitops (especially ~) can create illegal UTF-8. |
---|
485 | * In other words: in Perl UTF-8 is not just for Unicode. */ |
---|
486 | |
---|
487 | if (a < b) { |
---|
488 | while (a < b) { |
---|
489 | U8 c = UTF8SKIP(a); |
---|
490 | |
---|
491 | if (b - a < c) |
---|
492 | Perl_croak(aTHX_ "panic: utf8_distance: unaligned end"); |
---|
493 | a += c; |
---|
494 | off--; |
---|
495 | } |
---|
496 | } |
---|
497 | else { |
---|
498 | while (b < a) { |
---|
499 | U8 c = UTF8SKIP(b); |
---|
500 | |
---|
501 | if (a - b < c) |
---|
502 | Perl_croak(aTHX_ "panic: utf8_distance: unaligned end"); |
---|
503 | b += c; |
---|
504 | off++; |
---|
505 | } |
---|
506 | } |
---|
507 | |
---|
508 | return off; |
---|
509 | } |
---|
510 | |
---|
511 | /* |
---|
512 | =for apidoc A|U8*|utf8_hop|U8 *s|I32 off |
---|
513 | |
---|
514 | Return the UTF-8 pointer C<s> displaced by C<off> characters, either |
---|
515 | forward or backward. |
---|
516 | |
---|
517 | WARNING: do not use the following unless you *know* C<off> is within |
---|
518 | the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned |
---|
519 | on the first byte of character or just after the last byte of a character. |
---|
520 | |
---|
521 | =cut */ |
---|
522 | |
---|
523 | U8 * |
---|
524 | Perl_utf8_hop(pTHX_ U8 *s, I32 off) |
---|
525 | { |
---|
526 | /* Note: cannot use UTF8_IS_...() too eagerly here since e.g |
---|
527 | * the bitops (especially ~) can create illegal UTF-8. |
---|
528 | * In other words: in Perl UTF-8 is not just for Unicode. */ |
---|
529 | |
---|
530 | if (off >= 0) { |
---|
531 | while (off--) |
---|
532 | s += UTF8SKIP(s); |
---|
533 | } |
---|
534 | else { |
---|
535 | while (off++) { |
---|
536 | s--; |
---|
537 | while (UTF8_IS_CONTINUATION(*s)) |
---|
538 | s--; |
---|
539 | } |
---|
540 | } |
---|
541 | return s; |
---|
542 | } |
---|
543 | |
---|
544 | /* |
---|
545 | =for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len |
---|
546 | |
---|
547 | Converts a string C<s> of length C<len> from UTF8 into byte encoding. |
---|
548 | Unlike C<bytes_to_utf8>, this over-writes the original string, and |
---|
549 | updates len to contain the new length. |
---|
550 | Returns zero on failure, setting C<len> to -1. |
---|
551 | |
---|
552 | =cut |
---|
553 | */ |
---|
554 | |
---|
555 | U8 * |
---|
556 | Perl_utf8_to_bytes(pTHX_ U8* s, STRLEN *len) |
---|
557 | { |
---|
558 | U8 *send; |
---|
559 | U8 *d; |
---|
560 | U8 *save = s; |
---|
561 | |
---|
562 | /* ensure valid UTF8 and chars < 256 before updating string */ |
---|
563 | for (send = s + *len; s < send; ) { |
---|
564 | U8 c = *s++; |
---|
565 | |
---|
566 | if (c >= 0x80 && |
---|
567 | ((s >= send) || |
---|
568 | ((*s++ & 0xc0) != 0x80) || ((c & 0xfe) != 0xc2))) { |
---|
569 | *len = -1; |
---|
570 | return 0; |
---|
571 | } |
---|
572 | } |
---|
573 | |
---|
574 | d = s = save; |
---|
575 | while (s < send) { |
---|
576 | if (UTF8_IS_ASCII(*s)) { |
---|
577 | *d++ = *s++; |
---|
578 | } |
---|
579 | else { |
---|
580 | STRLEN ulen; |
---|
581 | *d++ = (U8)utf8_to_uv_simple(s, &ulen); |
---|
582 | s += ulen; |
---|
583 | } |
---|
584 | } |
---|
585 | *d = '\0'; |
---|
586 | *len = d - save; |
---|
587 | return save; |
---|
588 | } |
---|
589 | |
---|
590 | /* |
---|
591 | =for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8 |
---|
592 | |
---|
593 | Converts a string C<s> of length C<len> from UTF8 into byte encoding. |
---|
594 | Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to |
---|
595 | the newly-created string, and updates C<len> to contain the new |
---|
596 | length. Returns the original string if no conversion occurs, C<len> |
---|
597 | is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to |
---|
598 | 0 if C<s> is converted or contains all 7bit characters. |
---|
599 | |
---|
600 | =cut */ |
---|
601 | |
---|
602 | U8 * |
---|
603 | Perl_bytes_from_utf8(pTHX_ U8* s, STRLEN *len, bool *is_utf8) |
---|
604 | { |
---|
605 | U8 *send; |
---|
606 | U8 *d; |
---|
607 | U8 *start = s; |
---|
608 | I32 count = 0; |
---|
609 | |
---|
610 | if (!*is_utf8) |
---|
611 | return start; |
---|
612 | |
---|
613 | /* ensure valid UTF8 and chars < 256 before converting string */ |
---|
614 | for (send = s + *len; s < send;) { |
---|
615 | U8 c = *s++; |
---|
616 | if (!UTF8_IS_ASCII(c)) { |
---|
617 | if (UTF8_IS_CONTINUATION(c) || s >= send || |
---|
618 | !UTF8_IS_CONTINUATION(*s) || UTF8_IS_DOWNGRADEABLE_START(c)) |
---|
619 | return start; |
---|
620 | s++, count++; |
---|
621 | } |
---|
622 | } |
---|
623 | |
---|
624 | *is_utf8 = 0; |
---|
625 | |
---|
626 | if (!count) |
---|
627 | return start; |
---|
628 | |
---|
629 | Newz(801, d, (*len) - count + 1, U8); |
---|
630 | s = start; start = d; |
---|
631 | while (s < send) { |
---|
632 | U8 c = *s++; |
---|
633 | |
---|
634 | if (UTF8_IS_ASCII(c)) |
---|
635 | *d++ = c; |
---|
636 | else |
---|
637 | *d++ = UTF8_ACCUMULATE(c, *s++); |
---|
638 | } |
---|
639 | *d = '\0'; |
---|
640 | *len = d - start; |
---|
641 | return start; |
---|
642 | } |
---|
643 | |
---|
644 | /* |
---|
645 | =for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len |
---|
646 | |
---|
647 | Converts a string C<s> of length C<len> from ASCII into UTF8 encoding. |
---|
648 | Returns a pointer to the newly-created string, and sets C<len> to |
---|
649 | reflect the new length. |
---|
650 | |
---|
651 | =cut |
---|
652 | */ |
---|
653 | |
---|
654 | U8* |
---|
655 | Perl_bytes_to_utf8(pTHX_ U8* s, STRLEN *len) |
---|
656 | { |
---|
657 | U8 *send; |
---|
658 | U8 *d; |
---|
659 | U8 *dst; |
---|
660 | send = s + (*len); |
---|
661 | |
---|
662 | Newz(801, d, (*len) * 2 + 1, U8); |
---|
663 | dst = d; |
---|
664 | |
---|
665 | while (s < send) { |
---|
666 | if (UTF8_IS_ASCII(*s)) |
---|
667 | *d++ = *s++; |
---|
668 | else { |
---|
669 | UV uv = *s++; |
---|
670 | |
---|
671 | *d++ = UTF8_EIGHT_BIT_HI(uv); |
---|
672 | *d++ = UTF8_EIGHT_BIT_LO(uv); |
---|
673 | } |
---|
674 | } |
---|
675 | *d = '\0'; |
---|
676 | *len = d-dst; |
---|
677 | return dst; |
---|
678 | } |
---|
679 | |
---|
680 | /* |
---|
681 | * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8. |
---|
682 | * |
---|
683 | * Destination must be pre-extended to 3/2 source. Do not use in-place. |
---|
684 | * We optimize for native, for obvious reasons. */ |
---|
685 | |
---|
686 | U8* |
---|
687 | Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) |
---|
688 | { |
---|
689 | U8* pend; |
---|
690 | U8* dstart = d; |
---|
691 | |
---|
692 | if (bytelen & 1) |
---|
693 | Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen"); |
---|
694 | |
---|
695 | pend = p + bytelen; |
---|
696 | |
---|
697 | while (p < pend) { |
---|
698 | UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */ |
---|
699 | p += 2; |
---|
700 | if (uv < 0x80) { |
---|
701 | *d++ = uv; |
---|
702 | continue; |
---|
703 | } |
---|
704 | if (uv < 0x800) { |
---|
705 | *d++ = (( uv >> 6) | 0xc0); |
---|
706 | *d++ = (( uv & 0x3f) | 0x80); |
---|
707 | continue; |
---|
708 | } |
---|
709 | if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */ |
---|
710 | UV low = *p++; |
---|
711 | if (low < 0xdc00 || low >= 0xdfff) |
---|
712 | Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); |
---|
713 | uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000; |
---|
714 | } |
---|
715 | if (uv < 0x10000) { |
---|
716 | *d++ = (( uv >> 12) | 0xe0); |
---|
717 | *d++ = (((uv >> 6) & 0x3f) | 0x80); |
---|
718 | *d++ = (( uv & 0x3f) | 0x80); |
---|
719 | continue; |
---|
720 | } |
---|
721 | else { |
---|
722 | *d++ = (( uv >> 18) | 0xf0); |
---|
723 | *d++ = (((uv >> 12) & 0x3f) | 0x80); |
---|
724 | *d++ = (((uv >> 6) & 0x3f) | 0x80); |
---|
725 | *d++ = (( uv & 0x3f) | 0x80); |
---|
726 | continue; |
---|
727 | } |
---|
728 | } |
---|
729 | *newlen = d - dstart; |
---|
730 | return d; |
---|
731 | } |
---|
732 | |
---|
733 | /* Note: this one is slightly destructive of the source. */ |
---|
734 | |
---|
735 | U8* |
---|
736 | Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) |
---|
737 | { |
---|
738 | U8* s = (U8*)p; |
---|
739 | U8* send = s + bytelen; |
---|
740 | while (s < send) { |
---|
741 | U8 tmp = s[0]; |
---|
742 | s[0] = s[1]; |
---|
743 | s[1] = tmp; |
---|
744 | s += 2; |
---|
745 | } |
---|
746 | return utf16_to_utf8(p, d, bytelen, newlen); |
---|
747 | } |
---|
748 | |
---|
749 | /* for now these are all defined (inefficiently) in terms of the utf8 versions */ |
---|
750 | |
---|
751 | bool |
---|
752 | Perl_is_uni_alnum(pTHX_ U32 c) |
---|
753 | { |
---|
754 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
755 | uv_to_utf8(tmpbuf, (UV)c); |
---|
756 | return is_utf8_alnum(tmpbuf); |
---|
757 | } |
---|
758 | |
---|
759 | bool |
---|
760 | Perl_is_uni_alnumc(pTHX_ U32 c) |
---|
761 | { |
---|
762 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
763 | uv_to_utf8(tmpbuf, (UV)c); |
---|
764 | return is_utf8_alnumc(tmpbuf); |
---|
765 | } |
---|
766 | |
---|
767 | bool |
---|
768 | Perl_is_uni_idfirst(pTHX_ U32 c) |
---|
769 | { |
---|
770 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
771 | uv_to_utf8(tmpbuf, (UV)c); |
---|
772 | return is_utf8_idfirst(tmpbuf); |
---|
773 | } |
---|
774 | |
---|
775 | bool |
---|
776 | Perl_is_uni_alpha(pTHX_ U32 c) |
---|
777 | { |
---|
778 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
779 | uv_to_utf8(tmpbuf, (UV)c); |
---|
780 | return is_utf8_alpha(tmpbuf); |
---|
781 | } |
---|
782 | |
---|
783 | bool |
---|
784 | Perl_is_uni_ascii(pTHX_ U32 c) |
---|
785 | { |
---|
786 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
787 | uv_to_utf8(tmpbuf, (UV)c); |
---|
788 | return is_utf8_ascii(tmpbuf); |
---|
789 | } |
---|
790 | |
---|
791 | bool |
---|
792 | Perl_is_uni_space(pTHX_ U32 c) |
---|
793 | { |
---|
794 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
795 | uv_to_utf8(tmpbuf, (UV)c); |
---|
796 | return is_utf8_space(tmpbuf); |
---|
797 | } |
---|
798 | |
---|
799 | bool |
---|
800 | Perl_is_uni_digit(pTHX_ U32 c) |
---|
801 | { |
---|
802 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
803 | uv_to_utf8(tmpbuf, (UV)c); |
---|
804 | return is_utf8_digit(tmpbuf); |
---|
805 | } |
---|
806 | |
---|
807 | bool |
---|
808 | Perl_is_uni_upper(pTHX_ U32 c) |
---|
809 | { |
---|
810 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
811 | uv_to_utf8(tmpbuf, (UV)c); |
---|
812 | return is_utf8_upper(tmpbuf); |
---|
813 | } |
---|
814 | |
---|
815 | bool |
---|
816 | Perl_is_uni_lower(pTHX_ U32 c) |
---|
817 | { |
---|
818 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
819 | uv_to_utf8(tmpbuf, (UV)c); |
---|
820 | return is_utf8_lower(tmpbuf); |
---|
821 | } |
---|
822 | |
---|
823 | bool |
---|
824 | Perl_is_uni_cntrl(pTHX_ U32 c) |
---|
825 | { |
---|
826 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
827 | uv_to_utf8(tmpbuf, (UV)c); |
---|
828 | return is_utf8_cntrl(tmpbuf); |
---|
829 | } |
---|
830 | |
---|
831 | bool |
---|
832 | Perl_is_uni_graph(pTHX_ U32 c) |
---|
833 | { |
---|
834 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
835 | uv_to_utf8(tmpbuf, (UV)c); |
---|
836 | return is_utf8_graph(tmpbuf); |
---|
837 | } |
---|
838 | |
---|
839 | bool |
---|
840 | Perl_is_uni_print(pTHX_ U32 c) |
---|
841 | { |
---|
842 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
843 | uv_to_utf8(tmpbuf, (UV)c); |
---|
844 | return is_utf8_print(tmpbuf); |
---|
845 | } |
---|
846 | |
---|
847 | bool |
---|
848 | Perl_is_uni_punct(pTHX_ U32 c) |
---|
849 | { |
---|
850 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
851 | uv_to_utf8(tmpbuf, (UV)c); |
---|
852 | return is_utf8_punct(tmpbuf); |
---|
853 | } |
---|
854 | |
---|
855 | bool |
---|
856 | Perl_is_uni_xdigit(pTHX_ U32 c) |
---|
857 | { |
---|
858 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
859 | uv_to_utf8(tmpbuf, (UV)c); |
---|
860 | return is_utf8_xdigit(tmpbuf); |
---|
861 | } |
---|
862 | |
---|
863 | U32 |
---|
864 | Perl_to_uni_upper(pTHX_ U32 c) |
---|
865 | { |
---|
866 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
867 | uv_to_utf8(tmpbuf, (UV)c); |
---|
868 | return to_utf8_upper(tmpbuf); |
---|
869 | } |
---|
870 | |
---|
871 | U32 |
---|
872 | Perl_to_uni_title(pTHX_ U32 c) |
---|
873 | { |
---|
874 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
875 | uv_to_utf8(tmpbuf, (UV)c); |
---|
876 | return to_utf8_title(tmpbuf); |
---|
877 | } |
---|
878 | |
---|
879 | U32 |
---|
880 | Perl_to_uni_lower(pTHX_ U32 c) |
---|
881 | { |
---|
882 | U8 tmpbuf[UTF8_MAXLEN+1]; |
---|
883 | uv_to_utf8(tmpbuf, (UV)c); |
---|
884 | return to_utf8_lower(tmpbuf); |
---|
885 | } |
---|
886 | |
---|
887 | /* for now these all assume no locale info available for Unicode > 255 */ |
---|
888 | |
---|
889 | bool |
---|
890 | Perl_is_uni_alnum_lc(pTHX_ U32 c) |
---|
891 | { |
---|
892 | return is_uni_alnum(c); /* XXX no locale support yet */ |
---|
893 | } |
---|
894 | |
---|
895 | bool |
---|
896 | Perl_is_uni_alnumc_lc(pTHX_ U32 c) |
---|
897 | { |
---|
898 | return is_uni_alnumc(c); /* XXX no locale support yet */ |
---|
899 | } |
---|
900 | |
---|
901 | bool |
---|
902 | Perl_is_uni_idfirst_lc(pTHX_ U32 c) |
---|
903 | { |
---|
904 | return is_uni_idfirst(c); /* XXX no locale support yet */ |
---|
905 | } |
---|
906 | |
---|
907 | bool |
---|
908 | Perl_is_uni_alpha_lc(pTHX_ U32 c) |
---|
909 | { |
---|
910 | return is_uni_alpha(c); /* XXX no locale support yet */ |
---|
911 | } |
---|
912 | |
---|
913 | bool |
---|
914 | Perl_is_uni_ascii_lc(pTHX_ U32 c) |
---|
915 | { |
---|
916 | return is_uni_ascii(c); /* XXX no locale support yet */ |
---|
917 | } |
---|
918 | |
---|
919 | bool |
---|
920 | Perl_is_uni_space_lc(pTHX_ U32 c) |
---|
921 | { |
---|
922 | return is_uni_space(c); /* XXX no locale support yet */ |
---|
923 | } |
---|
924 | |
---|
925 | bool |
---|
926 | Perl_is_uni_digit_lc(pTHX_ U32 c) |
---|
927 | { |
---|
928 | return is_uni_digit(c); /* XXX no locale support yet */ |
---|
929 | } |
---|
930 | |
---|
931 | bool |
---|
932 | Perl_is_uni_upper_lc(pTHX_ U32 c) |
---|
933 | { |
---|
934 | return is_uni_upper(c); /* XXX no locale support yet */ |
---|
935 | } |
---|
936 | |
---|
937 | bool |
---|
938 | Perl_is_uni_lower_lc(pTHX_ U32 c) |
---|
939 | { |
---|
940 | return is_uni_lower(c); /* XXX no locale support yet */ |
---|
941 | } |
---|
942 | |
---|
943 | bool |
---|
944 | Perl_is_uni_cntrl_lc(pTHX_ U32 c) |
---|
945 | { |
---|
946 | return is_uni_cntrl(c); /* XXX no locale support yet */ |
---|
947 | } |
---|
948 | |
---|
949 | bool |
---|
950 | Perl_is_uni_graph_lc(pTHX_ U32 c) |
---|
951 | { |
---|
952 | return is_uni_graph(c); /* XXX no locale support yet */ |
---|
953 | } |
---|
954 | |
---|
955 | bool |
---|
956 | Perl_is_uni_print_lc(pTHX_ U32 c) |
---|
957 | { |
---|
958 | return is_uni_print(c); /* XXX no locale support yet */ |
---|
959 | } |
---|
960 | |
---|
961 | bool |
---|
962 | Perl_is_uni_punct_lc(pTHX_ U32 c) |
---|
963 | { |
---|
964 | return is_uni_punct(c); /* XXX no locale support yet */ |
---|
965 | } |
---|
966 | |
---|
967 | bool |
---|
968 | Perl_is_uni_xdigit_lc(pTHX_ U32 c) |
---|
969 | { |
---|
970 | return is_uni_xdigit(c); /* XXX no locale support yet */ |
---|
971 | } |
---|
972 | |
---|
973 | U32 |
---|
974 | Perl_to_uni_upper_lc(pTHX_ U32 c) |
---|
975 | { |
---|
976 | return to_uni_upper(c); /* XXX no locale support yet */ |
---|
977 | } |
---|
978 | |
---|
979 | U32 |
---|
980 | Perl_to_uni_title_lc(pTHX_ U32 c) |
---|
981 | { |
---|
982 | return to_uni_title(c); /* XXX no locale support yet */ |
---|
983 | } |
---|
984 | |
---|
985 | U32 |
---|
986 | Perl_to_uni_lower_lc(pTHX_ U32 c) |
---|
987 | { |
---|
988 | return to_uni_lower(c); /* XXX no locale support yet */ |
---|
989 | } |
---|
990 | |
---|
991 | bool |
---|
992 | Perl_is_utf8_alnum(pTHX_ U8 *p) |
---|
993 | { |
---|
994 | if (!is_utf8_char(p)) |
---|
995 | return FALSE; |
---|
996 | if (!PL_utf8_alnum) |
---|
997 | /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true |
---|
998 | * descendant of isalnum(3), in other words, it doesn't |
---|
999 | * contain the '_'. --jhi */ |
---|
1000 | PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0); |
---|
1001 | return swash_fetch(PL_utf8_alnum, p); |
---|
1002 | /* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */ |
---|
1003 | #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */ |
---|
1004 | if (!PL_utf8_alnum) |
---|
1005 | PL_utf8_alnum = swash_init("utf8", "", |
---|
1006 | sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0); |
---|
1007 | return swash_fetch(PL_utf8_alnum, p); |
---|
1008 | #endif |
---|
1009 | } |
---|
1010 | |
---|
1011 | bool |
---|
1012 | Perl_is_utf8_alnumc(pTHX_ U8 *p) |
---|
1013 | { |
---|
1014 | if (!is_utf8_char(p)) |
---|
1015 | return FALSE; |
---|
1016 | if (!PL_utf8_alnum) |
---|
1017 | PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0); |
---|
1018 | return swash_fetch(PL_utf8_alnum, p); |
---|
1019 | /* return is_utf8_alpha(p) || is_utf8_digit(p); */ |
---|
1020 | #ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */ |
---|
1021 | if (!PL_utf8_alnum) |
---|
1022 | PL_utf8_alnum = swash_init("utf8", "", |
---|
1023 | sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0); |
---|
1024 | return swash_fetch(PL_utf8_alnum, p); |
---|
1025 | #endif |
---|
1026 | } |
---|
1027 | |
---|
1028 | bool |
---|
1029 | Perl_is_utf8_idfirst(pTHX_ U8 *p) |
---|
1030 | { |
---|
1031 | return *p == '_' || is_utf8_alpha(p); |
---|
1032 | } |
---|
1033 | |
---|
1034 | bool |
---|
1035 | Perl_is_utf8_alpha(pTHX_ U8 *p) |
---|
1036 | { |
---|
1037 | if (!is_utf8_char(p)) |
---|
1038 | return FALSE; |
---|
1039 | if (!PL_utf8_alpha) |
---|
1040 | PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0); |
---|
1041 | return swash_fetch(PL_utf8_alpha, p); |
---|
1042 | } |
---|
1043 | |
---|
1044 | bool |
---|
1045 | Perl_is_utf8_ascii(pTHX_ U8 *p) |
---|
1046 | { |
---|
1047 | if (!is_utf8_char(p)) |
---|
1048 | return FALSE; |
---|
1049 | if (!PL_utf8_ascii) |
---|
1050 | PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0); |
---|
1051 | return swash_fetch(PL_utf8_ascii, p); |
---|
1052 | } |
---|
1053 | |
---|
1054 | bool |
---|
1055 | Perl_is_utf8_space(pTHX_ U8 *p) |
---|
1056 | { |
---|
1057 | if (!is_utf8_char(p)) |
---|
1058 | return FALSE; |
---|
1059 | if (!PL_utf8_space) |
---|
1060 | PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0); |
---|
1061 | return swash_fetch(PL_utf8_space, p); |
---|
1062 | } |
---|
1063 | |
---|
1064 | bool |
---|
1065 | Perl_is_utf8_digit(pTHX_ U8 *p) |
---|
1066 | { |
---|
1067 | if (!is_utf8_char(p)) |
---|
1068 | return FALSE; |
---|
1069 | if (!PL_utf8_digit) |
---|
1070 | PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0); |
---|
1071 | return swash_fetch(PL_utf8_digit, p); |
---|
1072 | } |
---|
1073 | |
---|
1074 | bool |
---|
1075 | Perl_is_utf8_upper(pTHX_ U8 *p) |
---|
1076 | { |
---|
1077 | if (!is_utf8_char(p)) |
---|
1078 | return FALSE; |
---|
1079 | if (!PL_utf8_upper) |
---|
1080 | PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0); |
---|
1081 | return swash_fetch(PL_utf8_upper, p); |
---|
1082 | } |
---|
1083 | |
---|
1084 | bool |
---|
1085 | Perl_is_utf8_lower(pTHX_ U8 *p) |
---|
1086 | { |
---|
1087 | if (!is_utf8_char(p)) |
---|
1088 | return FALSE; |
---|
1089 | if (!PL_utf8_lower) |
---|
1090 | PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0); |
---|
1091 | return swash_fetch(PL_utf8_lower, p); |
---|
1092 | } |
---|
1093 | |
---|
1094 | bool |
---|
1095 | Perl_is_utf8_cntrl(pTHX_ U8 *p) |
---|
1096 | { |
---|
1097 | if (!is_utf8_char(p)) |
---|
1098 | return FALSE; |
---|
1099 | if (!PL_utf8_cntrl) |
---|
1100 | PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0); |
---|
1101 | return swash_fetch(PL_utf8_cntrl, p); |
---|
1102 | } |
---|
1103 | |
---|
1104 | bool |
---|
1105 | Perl_is_utf8_graph(pTHX_ U8 *p) |
---|
1106 | { |
---|
1107 | if (!is_utf8_char(p)) |
---|
1108 | return FALSE; |
---|
1109 | if (!PL_utf8_graph) |
---|
1110 | PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0); |
---|
1111 | return swash_fetch(PL_utf8_graph, p); |
---|
1112 | } |
---|
1113 | |
---|
1114 | bool |
---|
1115 | Perl_is_utf8_print(pTHX_ U8 *p) |
---|
1116 | { |
---|
1117 | if (!is_utf8_char(p)) |
---|
1118 | return FALSE; |
---|
1119 | if (!PL_utf8_print) |
---|
1120 | PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0); |
---|
1121 | return swash_fetch(PL_utf8_print, p); |
---|
1122 | } |
---|
1123 | |
---|
1124 | bool |
---|
1125 | Perl_is_utf8_punct(pTHX_ U8 *p) |
---|
1126 | { |
---|
1127 | if (!is_utf8_char(p)) |
---|
1128 | return FALSE; |
---|
1129 | if (!PL_utf8_punct) |
---|
1130 | PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0); |
---|
1131 | return swash_fetch(PL_utf8_punct, p); |
---|
1132 | } |
---|
1133 | |
---|
1134 | bool |
---|
1135 | Perl_is_utf8_xdigit(pTHX_ U8 *p) |
---|
1136 | { |
---|
1137 | if (!is_utf8_char(p)) |
---|
1138 | return FALSE; |
---|
1139 | if (!PL_utf8_xdigit) |
---|
1140 | PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0); |
---|
1141 | return swash_fetch(PL_utf8_xdigit, p); |
---|
1142 | } |
---|
1143 | |
---|
1144 | bool |
---|
1145 | Perl_is_utf8_mark(pTHX_ U8 *p) |
---|
1146 | { |
---|
1147 | if (!is_utf8_char(p)) |
---|
1148 | return FALSE; |
---|
1149 | if (!PL_utf8_mark) |
---|
1150 | PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0); |
---|
1151 | return swash_fetch(PL_utf8_mark, p); |
---|
1152 | } |
---|
1153 | |
---|
1154 | UV |
---|
1155 | Perl_to_utf8_upper(pTHX_ U8 *p) |
---|
1156 | { |
---|
1157 | UV uv; |
---|
1158 | |
---|
1159 | if (!PL_utf8_toupper) |
---|
1160 | PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0); |
---|
1161 | uv = swash_fetch(PL_utf8_toupper, p); |
---|
1162 | return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0); |
---|
1163 | } |
---|
1164 | |
---|
1165 | UV |
---|
1166 | Perl_to_utf8_title(pTHX_ U8 *p) |
---|
1167 | { |
---|
1168 | UV uv; |
---|
1169 | |
---|
1170 | if (!PL_utf8_totitle) |
---|
1171 | PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0); |
---|
1172 | uv = swash_fetch(PL_utf8_totitle, p); |
---|
1173 | return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0); |
---|
1174 | } |
---|
1175 | |
---|
1176 | UV |
---|
1177 | Perl_to_utf8_lower(pTHX_ U8 *p) |
---|
1178 | { |
---|
1179 | UV uv; |
---|
1180 | |
---|
1181 | if (!PL_utf8_tolower) |
---|
1182 | PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0); |
---|
1183 | uv = swash_fetch(PL_utf8_tolower, p); |
---|
1184 | return uv ? uv : utf8_to_uv(p,UTF8_MAXLEN,0,0); |
---|
1185 | } |
---|
1186 | |
---|
1187 | /* a "swash" is a swatch hash */ |
---|
1188 | |
---|
1189 | SV* |
---|
1190 | Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none) |
---|
1191 | { |
---|
1192 | SV* retval; |
---|
1193 | SV* tokenbufsv = sv_2mortal(NEWSV(0,0)); |
---|
1194 | dSP; |
---|
1195 | |
---|
1196 | if (!gv_stashpv(pkg, 0)) { /* demand load utf8 */ |
---|
1197 | ENTER; |
---|
1198 | Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv); |
---|
1199 | LEAVE; |
---|
1200 | } |
---|
1201 | SPAGAIN; |
---|
1202 | PUSHSTACKi(PERLSI_MAGIC); |
---|
1203 | PUSHMARK(SP); |
---|
1204 | EXTEND(SP,5); |
---|
1205 | PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg)))); |
---|
1206 | PUSHs(sv_2mortal(newSVpvn(name, strlen(name)))); |
---|
1207 | PUSHs(listsv); |
---|
1208 | PUSHs(sv_2mortal(newSViv(minbits))); |
---|
1209 | PUSHs(sv_2mortal(newSViv(none))); |
---|
1210 | PUTBACK; |
---|
1211 | ENTER; |
---|
1212 | SAVEI32(PL_hints); |
---|
1213 | PL_hints = 0; |
---|
1214 | save_re_context(); |
---|
1215 | if (PL_curcop == &PL_compiling) |
---|
1216 | /* XXX ought to be handled by lex_start */ |
---|
1217 | sv_setpv(tokenbufsv, PL_tokenbuf); |
---|
1218 | if (call_method("SWASHNEW", G_SCALAR)) |
---|
1219 | retval = newSVsv(*PL_stack_sp--); |
---|
1220 | else |
---|
1221 | retval = &PL_sv_undef; |
---|
1222 | LEAVE; |
---|
1223 | POPSTACK; |
---|
1224 | if (PL_curcop == &PL_compiling) { |
---|
1225 | STRLEN len; |
---|
1226 | char* pv = SvPV(tokenbufsv, len); |
---|
1227 | |
---|
1228 | Copy(pv, PL_tokenbuf, len+1, char); |
---|
1229 | PL_curcop->op_private = PL_hints; |
---|
1230 | } |
---|
1231 | if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) |
---|
1232 | Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref"); |
---|
1233 | return retval; |
---|
1234 | } |
---|
1235 | |
---|
1236 | UV |
---|
1237 | Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr) |
---|
1238 | { |
---|
1239 | HV* hv = (HV*)SvRV(sv); |
---|
1240 | U32 klen = UTF8SKIP(ptr) - 1; |
---|
1241 | U32 off = ptr[klen] & 127; /* NB: 64 bit always 0 when len > 1 */ |
---|
1242 | STRLEN slen; |
---|
1243 | STRLEN needents = (klen ? 64 : 128); |
---|
1244 | U8 *tmps; |
---|
1245 | U32 bit; |
---|
1246 | SV *retval; |
---|
1247 | |
---|
1248 | /* |
---|
1249 | * This single-entry cache saves about 1/3 of the utf8 overhead in test |
---|
1250 | * suite. (That is, only 7-8% overall over just a hash cache. Still, |
---|
1251 | * it's nothing to sniff at.) Pity we usually come through at least |
---|
1252 | * two function calls to get here... |
---|
1253 | * |
---|
1254 | * NB: this code assumes that swatches are never modified, once generated! |
---|
1255 | */ |
---|
1256 | |
---|
1257 | if (hv == PL_last_swash_hv && |
---|
1258 | klen == PL_last_swash_klen && |
---|
1259 | (!klen || memEQ(ptr,PL_last_swash_key,klen)) ) |
---|
1260 | { |
---|
1261 | tmps = PL_last_swash_tmps; |
---|
1262 | slen = PL_last_swash_slen; |
---|
1263 | } |
---|
1264 | else { |
---|
1265 | /* Try our second-level swatch cache, kept in a hash. */ |
---|
1266 | SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE); |
---|
1267 | |
---|
1268 | /* If not cached, generate it via utf8::SWASHGET */ |
---|
1269 | if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) { |
---|
1270 | dSP; |
---|
1271 | ENTER; |
---|
1272 | SAVETMPS; |
---|
1273 | save_re_context(); |
---|
1274 | PUSHSTACKi(PERLSI_MAGIC); |
---|
1275 | PUSHMARK(SP); |
---|
1276 | EXTEND(SP,3); |
---|
1277 | PUSHs((SV*)sv); |
---|
1278 | PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, UTF8_MAXLEN, 0, 0) & ~(needents - 1)))); |
---|
1279 | PUSHs(sv_2mortal(newSViv(needents))); |
---|
1280 | PUTBACK; |
---|
1281 | if (call_method("SWASHGET", G_SCALAR)) |
---|
1282 | retval = newSVsv(*PL_stack_sp--); |
---|
1283 | else |
---|
1284 | retval = &PL_sv_undef; |
---|
1285 | POPSTACK; |
---|
1286 | FREETMPS; |
---|
1287 | LEAVE; |
---|
1288 | if (PL_curcop == &PL_compiling) |
---|
1289 | PL_curcop->op_private = PL_hints; |
---|
1290 | |
---|
1291 | svp = hv_store(hv, (char*)ptr, klen, retval, 0); |
---|
1292 | |
---|
1293 | if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8) |
---|
1294 | Perl_croak(aTHX_ "SWASHGET didn't return result of proper length"); |
---|
1295 | } |
---|
1296 | |
---|
1297 | PL_last_swash_hv = hv; |
---|
1298 | PL_last_swash_klen = klen; |
---|
1299 | PL_last_swash_tmps = tmps; |
---|
1300 | PL_last_swash_slen = slen; |
---|
1301 | if (klen) |
---|
1302 | Copy(ptr, PL_last_swash_key, klen, U8); |
---|
1303 | } |
---|
1304 | |
---|
1305 | switch ((int)((slen << 3) / needents)) { |
---|
1306 | case 1: |
---|
1307 | bit = 1 << (off & 7); |
---|
1308 | off >>= 3; |
---|
1309 | return (tmps[off] & bit) != 0; |
---|
1310 | case 8: |
---|
1311 | return tmps[off]; |
---|
1312 | case 16: |
---|
1313 | off <<= 1; |
---|
1314 | return (tmps[off] << 8) + tmps[off + 1] ; |
---|
1315 | case 32: |
---|
1316 | off <<= 2; |
---|
1317 | return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ; |
---|
1318 | } |
---|
1319 | Perl_croak(aTHX_ "panic: swash_fetch"); |
---|
1320 | return 0; |
---|
1321 | } |
---|