source: trunk/third/perl/utf8.c @ 14545

Revision 14545, 18.6 KB checked in by ghudson, 25 years ago (diff)
This commit was generated by cvs2svn to compensate for changes in r14544, which included commits to RCS files with non-trunk default branches.
Line 
1/*    utf8.c
2 *
3 *    Copyright (c) 1998-2000, Larry Wall
4 *
5 *    You may distribute under the terms of either the GNU General Public
6 *    License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to!  And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so.  Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
24#define PERL_IN_UTF8_C
25#include "perl.h"
26
27/* Unicode support */
28
29U8 *
30Perl_uv_to_utf8(pTHX_ U8 *d, UV uv)
31{
32    if (uv < 0x80) {
33        *d++ = uv;
34        return d;
35    }
36    if (uv < 0x800) {
37        *d++ = (( uv >>  6)         | 0xc0);
38        *d++ = (( uv        & 0x3f) | 0x80);
39        return d;
40    }
41    if (uv < 0x10000) {
42        *d++ = (( uv >> 12)         | 0xe0);
43        *d++ = (((uv >>  6) & 0x3f) | 0x80);
44        *d++ = (( uv        & 0x3f) | 0x80);
45        return d;
46    }
47    if (uv < 0x200000) {
48        *d++ = (( uv >> 18)         | 0xf0);
49        *d++ = (((uv >> 12) & 0x3f) | 0x80);
50        *d++ = (((uv >>  6) & 0x3f) | 0x80);
51        *d++ = (( uv        & 0x3f) | 0x80);
52        return d;
53    }
54    if (uv < 0x4000000) {
55        *d++ = (( uv >> 24)         | 0xf8);
56        *d++ = (((uv >> 18) & 0x3f) | 0x80);
57        *d++ = (((uv >> 12) & 0x3f) | 0x80);
58        *d++ = (((uv >>  6) & 0x3f) | 0x80);
59        *d++ = (( uv        & 0x3f) | 0x80);
60        return d;
61    }
62    if (uv < 0x80000000) {
63        *d++ = (( uv >> 30)         | 0xfc);
64        *d++ = (((uv >> 24) & 0x3f) | 0x80);
65        *d++ = (((uv >> 18) & 0x3f) | 0x80);
66        *d++ = (((uv >> 12) & 0x3f) | 0x80);
67        *d++ = (((uv >>  6) & 0x3f) | 0x80);
68        *d++ = (( uv        & 0x3f) | 0x80);
69        return d;
70    }
71#ifdef HAS_QUAD
72    if (uv < 0x1000000000LL)
73#endif
74    {
75        *d++ =                        0xfe;     /* Can't match U+FEFF! */
76        *d++ = (((uv >> 30) & 0x3f) | 0x80);
77        *d++ = (((uv >> 24) & 0x3f) | 0x80);
78        *d++ = (((uv >> 18) & 0x3f) | 0x80);
79        *d++ = (((uv >> 12) & 0x3f) | 0x80);
80        *d++ = (((uv >>  6) & 0x3f) | 0x80);
81        *d++ = (( uv        & 0x3f) | 0x80);
82        return d;
83    }
84#ifdef HAS_QUAD
85    {
86        *d++ =                        0xff;     /* Can't match U+FFFE! */
87        *d++ =                        0x80;     /* 6 Reserved bits */
88        *d++ = (((uv >> 60) & 0x0f) | 0x80);    /* 2 Reserved bits */
89        *d++ = (((uv >> 54) & 0x3f) | 0x80);
90        *d++ = (((uv >> 48) & 0x3f) | 0x80);
91        *d++ = (((uv >> 42) & 0x3f) | 0x80);
92        *d++ = (((uv >> 36) & 0x3f) | 0x80);
93        *d++ = (((uv >> 30) & 0x3f) | 0x80);
94        *d++ = (((uv >> 24) & 0x3f) | 0x80);
95        *d++ = (((uv >> 18) & 0x3f) | 0x80);
96        *d++ = (((uv >> 12) & 0x3f) | 0x80);
97        *d++ = (((uv >>  6) & 0x3f) | 0x80);
98        *d++ = (( uv        & 0x3f) | 0x80);
99        return d;
100    }
101#endif
102}
103
104/* Tests if some arbitrary number of bytes begins in a valid UTF-8 character.
105 * The actual number of bytes in the UTF-8 character will be returned if it
106 * is valid, otherwise 0. */
107int
108Perl_is_utf8_char(pTHX_ U8 *s)
109{
110    U8 u = *s;
111    int slen, len;
112
113    if (!(u & 0x80))
114        return 1;
115
116    if (!(u & 0x40))
117        return 0;
118
119    if      (!(u & 0x20))       { len = 2; }
120    else if (!(u & 0x10))       { len = 3; }
121    else if (!(u & 0x08))       { len = 4; }
122    else if (!(u & 0x04))       { len = 5; }
123    else if (!(u & 0x02))       { len = 6; }
124    else if (!(u & 0x01))       { len = 7; }
125    else                        { len = 13; } /* whoa! */
126
127    slen = len - 1;
128    s++;
129    while (slen--) {
130        if ((*s & 0xc0) != 0x80)
131            return 0;
132        s++;
133    }
134    return len;
135}
136
137UV
138Perl_utf8_to_uv(pTHX_ U8* s, I32* retlen)
139{
140    UV uv = *s;
141    int len;
142    if (!(uv & 0x80)) {
143        if (retlen)
144            *retlen = 1;
145        return *s;
146    }
147    if (!(uv & 0x40)) {
148        dTHR;
149        if (ckWARN_d(WARN_UTF8))     
150            Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
151        if (retlen)
152            *retlen = 1;
153        return *s;
154    }
155
156    if      (!(uv & 0x20))      { len = 2; uv &= 0x1f; }
157    else if (!(uv & 0x10))      { len = 3; uv &= 0x0f; }
158    else if (!(uv & 0x08))      { len = 4; uv &= 0x07; }
159    else if (!(uv & 0x04))      { len = 5; uv &= 0x03; }
160    else if (!(uv & 0x02))      { len = 6; uv &= 0x01; }
161    else if (!(uv & 0x01))      { len = 7;  uv = 0; }
162    else                        { len = 13; uv = 0; } /* whoa! */
163
164    if (retlen)
165        *retlen = len;
166    --len;
167    s++;
168    while (len--) {
169        if ((*s & 0xc0) != 0x80) {
170            dTHR;
171            if (ckWARN_d(WARN_UTF8))     
172                Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
173            if (retlen)
174                *retlen -= len + 1;
175            return 0xfffd;
176        }
177        else
178            uv = (uv << 6) | (*s++ & 0x3f);
179    }
180    return uv;
181}
182
183/* utf8_distance(a,b) is intended to be a - b in pointer arithmetic */
184
185I32
186Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
187{
188    I32 off = 0;
189    if (a < b) {
190        while (a < b) {
191            a += UTF8SKIP(a);
192            off--;
193        }
194    }
195    else {
196        while (b < a) {
197            b += UTF8SKIP(b);
198            off++;
199        }
200    }
201    return off;
202}
203
204/* WARNING: do not use the following unless you *know* off is within bounds */
205
206U8 *
207Perl_utf8_hop(pTHX_ U8 *s, I32 off)
208{
209    if (off >= 0) {
210        while (off--)
211            s += UTF8SKIP(s);
212    }
213    else {
214        while (off++) {
215            s--;
216            if (*s & 0x80) {
217                while ((*s & 0xc0) == 0x80)
218                    s--;
219            }
220        }
221    }
222    return s;
223}
224
225/* XXX NOTHING CALLS THE FOLLOWING TWO ROUTINES YET!!! */
226/*
227 * Convert native or reversed UTF-16 to UTF-8.
228 *
229 * Destination must be pre-extended to 3/2 source.  Do not use in-place.
230 * We optimize for native, for obvious reasons. */
231
232U8*
233Perl_utf16_to_utf8(pTHX_ U16* p, U8* d, I32 bytelen)
234{
235    U16* pend = p + bytelen / 2;
236    while (p < pend) {
237        UV uv = *p++;
238        if (uv < 0x80) {
239            *d++ = uv;
240            continue;
241        }
242        if (uv < 0x800) {
243            *d++ = (( uv >>  6)         | 0xc0);
244            *d++ = (( uv        & 0x3f) | 0x80);
245            continue;
246        }
247        if (uv >= 0xd800 && uv < 0xdbff) {      /* surrogates */
248            dTHR;
249            int low = *p++;
250            if (low < 0xdc00 || low >= 0xdfff) {
251                if (ckWARN_d(WARN_UTF8))     
252                    Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-16 surrogate");
253                p--;
254                uv = 0xfffd;
255            }
256            uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
257        }
258        if (uv < 0x10000) {
259            *d++ = (( uv >> 12)         | 0xe0);
260            *d++ = (((uv >>  6) & 0x3f) | 0x80);
261            *d++ = (( uv        & 0x3f) | 0x80);
262            continue;
263        }
264        else {
265            *d++ = (( uv >> 18)         | 0xf0);
266            *d++ = (((uv >> 12) & 0x3f) | 0x80);
267            *d++ = (((uv >>  6) & 0x3f) | 0x80);
268            *d++ = (( uv        & 0x3f) | 0x80);
269            continue;
270        }
271    }
272    return d;
273}
274
275/* Note: this one is slightly destructive of the source. */
276
277U8*
278Perl_utf16_to_utf8_reversed(pTHX_ U16* p, U8* d, I32 bytelen)
279{
280    U8* s = (U8*)p;
281    U8* send = s + bytelen;
282    while (s < send) {
283        U8 tmp = s[0];
284        s[0] = s[1];
285        s[1] = tmp;
286        s += 2;
287    }
288    return utf16_to_utf8(p, d, bytelen);
289}
290
291/* for now these are all defined (inefficiently) in terms of the utf8 versions */
292
293bool
294Perl_is_uni_alnum(pTHX_ U32 c)
295{
296    U8 tmpbuf[UTF8_MAXLEN];
297    uv_to_utf8(tmpbuf, (UV)c);
298    return is_utf8_alnum(tmpbuf);
299}
300
301bool
302Perl_is_uni_alnumc(pTHX_ U32 c)
303{
304    U8 tmpbuf[UTF8_MAXLEN];
305    uv_to_utf8(tmpbuf, (UV)c);
306    return is_utf8_alnumc(tmpbuf);
307}
308
309bool
310Perl_is_uni_idfirst(pTHX_ U32 c)
311{
312    U8 tmpbuf[UTF8_MAXLEN];
313    uv_to_utf8(tmpbuf, (UV)c);
314    return is_utf8_idfirst(tmpbuf);
315}
316
317bool
318Perl_is_uni_alpha(pTHX_ U32 c)
319{
320    U8 tmpbuf[UTF8_MAXLEN];
321    uv_to_utf8(tmpbuf, (UV)c);
322    return is_utf8_alpha(tmpbuf);
323}
324
325bool
326Perl_is_uni_ascii(pTHX_ U32 c)
327{
328    U8 tmpbuf[UTF8_MAXLEN];
329    uv_to_utf8(tmpbuf, (UV)c);
330    return is_utf8_ascii(tmpbuf);
331}
332
333bool
334Perl_is_uni_space(pTHX_ U32 c)
335{
336    U8 tmpbuf[UTF8_MAXLEN];
337    uv_to_utf8(tmpbuf, (UV)c);
338    return is_utf8_space(tmpbuf);
339}
340
341bool
342Perl_is_uni_digit(pTHX_ U32 c)
343{
344    U8 tmpbuf[UTF8_MAXLEN];
345    uv_to_utf8(tmpbuf, (UV)c);
346    return is_utf8_digit(tmpbuf);
347}
348
349bool
350Perl_is_uni_upper(pTHX_ U32 c)
351{
352    U8 tmpbuf[UTF8_MAXLEN];
353    uv_to_utf8(tmpbuf, (UV)c);
354    return is_utf8_upper(tmpbuf);
355}
356
357bool
358Perl_is_uni_lower(pTHX_ U32 c)
359{
360    U8 tmpbuf[UTF8_MAXLEN];
361    uv_to_utf8(tmpbuf, (UV)c);
362    return is_utf8_lower(tmpbuf);
363}
364
365bool
366Perl_is_uni_cntrl(pTHX_ U32 c)
367{
368    U8 tmpbuf[UTF8_MAXLEN];
369    uv_to_utf8(tmpbuf, (UV)c);
370    return is_utf8_cntrl(tmpbuf);
371}
372
373bool
374Perl_is_uni_graph(pTHX_ U32 c)
375{
376    U8 tmpbuf[UTF8_MAXLEN];
377    uv_to_utf8(tmpbuf, (UV)c);
378    return is_utf8_graph(tmpbuf);
379}
380
381bool
382Perl_is_uni_print(pTHX_ U32 c)
383{
384    U8 tmpbuf[UTF8_MAXLEN];
385    uv_to_utf8(tmpbuf, (UV)c);
386    return is_utf8_print(tmpbuf);
387}
388
389bool
390Perl_is_uni_punct(pTHX_ U32 c)
391{
392    U8 tmpbuf[UTF8_MAXLEN];
393    uv_to_utf8(tmpbuf, (UV)c);
394    return is_utf8_punct(tmpbuf);
395}
396
397bool
398Perl_is_uni_xdigit(pTHX_ U32 c)
399{
400    U8 tmpbuf[UTF8_MAXLEN];
401    uv_to_utf8(tmpbuf, (UV)c);
402    return is_utf8_xdigit(tmpbuf);
403}
404
405U32
406Perl_to_uni_upper(pTHX_ U32 c)
407{
408    U8 tmpbuf[UTF8_MAXLEN];
409    uv_to_utf8(tmpbuf, (UV)c);
410    return to_utf8_upper(tmpbuf);
411}
412
413U32
414Perl_to_uni_title(pTHX_ U32 c)
415{
416    U8 tmpbuf[UTF8_MAXLEN];
417    uv_to_utf8(tmpbuf, (UV)c);
418    return to_utf8_title(tmpbuf);
419}
420
421U32
422Perl_to_uni_lower(pTHX_ U32 c)
423{
424    U8 tmpbuf[UTF8_MAXLEN];
425    uv_to_utf8(tmpbuf, (UV)c);
426    return to_utf8_lower(tmpbuf);
427}
428
429/* for now these all assume no locale info available for Unicode > 255 */
430
431bool
432Perl_is_uni_alnum_lc(pTHX_ U32 c)
433{
434    return is_uni_alnum(c);     /* XXX no locale support yet */
435}
436
437bool
438Perl_is_uni_alnumc_lc(pTHX_ U32 c)
439{
440    return is_uni_alnumc(c);    /* XXX no locale support yet */
441}
442
443bool
444Perl_is_uni_idfirst_lc(pTHX_ U32 c)
445{
446    return is_uni_idfirst(c);   /* XXX no locale support yet */
447}
448
449bool
450Perl_is_uni_alpha_lc(pTHX_ U32 c)
451{
452    return is_uni_alpha(c);     /* XXX no locale support yet */
453}
454
455bool
456Perl_is_uni_ascii_lc(pTHX_ U32 c)
457{
458    return is_uni_ascii(c);     /* XXX no locale support yet */
459}
460
461bool
462Perl_is_uni_space_lc(pTHX_ U32 c)
463{
464    return is_uni_space(c);     /* XXX no locale support yet */
465}
466
467bool
468Perl_is_uni_digit_lc(pTHX_ U32 c)
469{
470    return is_uni_digit(c);     /* XXX no locale support yet */
471}
472
473bool
474Perl_is_uni_upper_lc(pTHX_ U32 c)
475{
476    return is_uni_upper(c);     /* XXX no locale support yet */
477}
478
479bool
480Perl_is_uni_lower_lc(pTHX_ U32 c)
481{
482    return is_uni_lower(c);     /* XXX no locale support yet */
483}
484
485bool
486Perl_is_uni_cntrl_lc(pTHX_ U32 c)
487{
488    return is_uni_cntrl(c);     /* XXX no locale support yet */
489}
490
491bool
492Perl_is_uni_graph_lc(pTHX_ U32 c)
493{
494    return is_uni_graph(c);     /* XXX no locale support yet */
495}
496
497bool
498Perl_is_uni_print_lc(pTHX_ U32 c)
499{
500    return is_uni_print(c);     /* XXX no locale support yet */
501}
502
503bool
504Perl_is_uni_punct_lc(pTHX_ U32 c)
505{
506    return is_uni_punct(c);     /* XXX no locale support yet */
507}
508
509bool
510Perl_is_uni_xdigit_lc(pTHX_ U32 c)
511{
512    return is_uni_xdigit(c);    /* XXX no locale support yet */
513}
514
515U32
516Perl_to_uni_upper_lc(pTHX_ U32 c)
517{
518    return to_uni_upper(c);     /* XXX no locale support yet */
519}
520
521U32
522Perl_to_uni_title_lc(pTHX_ U32 c)
523{
524    return to_uni_title(c);     /* XXX no locale support yet */
525}
526
527U32
528Perl_to_uni_lower_lc(pTHX_ U32 c)
529{
530    return to_uni_lower(c);     /* XXX no locale support yet */
531}
532
533bool
534Perl_is_utf8_alnum(pTHX_ U8 *p)
535{
536    if (!is_utf8_char(p))
537        return FALSE;
538    if (!PL_utf8_alnum)
539        PL_utf8_alnum = swash_init("utf8", "IsAlnum", &PL_sv_undef, 0, 0);
540    return swash_fetch(PL_utf8_alnum, p);
541/*    return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
542#ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
543    if (!PL_utf8_alnum)
544        PL_utf8_alnum = swash_init("utf8", "",
545            sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
546    return swash_fetch(PL_utf8_alnum, p);
547#endif
548}
549
550bool
551Perl_is_utf8_alnumc(pTHX_ U8 *p)
552{
553    if (!is_utf8_char(p))
554        return FALSE;
555    if (!PL_utf8_alnum)
556        PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
557    return swash_fetch(PL_utf8_alnum, p);
558/*    return is_utf8_alpha(p) || is_utf8_digit(p); */
559#ifdef SURPRISINGLY_SLOWER  /* probably because alpha is usually true */
560    if (!PL_utf8_alnum)
561        PL_utf8_alnum = swash_init("utf8", "",
562            sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
563    return swash_fetch(PL_utf8_alnum, p);
564#endif
565}
566
567bool
568Perl_is_utf8_idfirst(pTHX_ U8 *p)
569{
570    return *p == '_' || is_utf8_alpha(p);
571}
572
573bool
574Perl_is_utf8_alpha(pTHX_ U8 *p)
575{
576    if (!is_utf8_char(p))
577        return FALSE;
578    if (!PL_utf8_alpha)
579        PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
580    return swash_fetch(PL_utf8_alpha, p);
581}
582
583bool
584Perl_is_utf8_ascii(pTHX_ U8 *p)
585{
586    if (!is_utf8_char(p))
587        return FALSE;
588    if (!PL_utf8_ascii)
589        PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
590    return swash_fetch(PL_utf8_ascii, p);
591}
592
593bool
594Perl_is_utf8_space(pTHX_ U8 *p)
595{
596    if (!is_utf8_char(p))
597        return FALSE;
598    if (!PL_utf8_space)
599        PL_utf8_space = swash_init("utf8", "IsSpace", &PL_sv_undef, 0, 0);
600    return swash_fetch(PL_utf8_space, p);
601}
602
603bool
604Perl_is_utf8_digit(pTHX_ U8 *p)
605{
606    if (!is_utf8_char(p))
607        return FALSE;
608    if (!PL_utf8_digit)
609        PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
610    return swash_fetch(PL_utf8_digit, p);
611}
612
613bool
614Perl_is_utf8_upper(pTHX_ U8 *p)
615{
616    if (!is_utf8_char(p))
617        return FALSE;
618    if (!PL_utf8_upper)
619        PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
620    return swash_fetch(PL_utf8_upper, p);
621}
622
623bool
624Perl_is_utf8_lower(pTHX_ U8 *p)
625{
626    if (!is_utf8_char(p))
627        return FALSE;
628    if (!PL_utf8_lower)
629        PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
630    return swash_fetch(PL_utf8_lower, p);
631}
632
633bool
634Perl_is_utf8_cntrl(pTHX_ U8 *p)
635{
636    if (!is_utf8_char(p))
637        return FALSE;
638    if (!PL_utf8_cntrl)
639        PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
640    return swash_fetch(PL_utf8_cntrl, p);
641}
642
643bool
644Perl_is_utf8_graph(pTHX_ U8 *p)
645{
646    if (!is_utf8_char(p))
647        return FALSE;
648    if (!PL_utf8_graph)
649        PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
650    return swash_fetch(PL_utf8_graph, p);
651}
652
653bool
654Perl_is_utf8_print(pTHX_ U8 *p)
655{
656    if (!is_utf8_char(p))
657        return FALSE;
658    if (!PL_utf8_print)
659        PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
660    return swash_fetch(PL_utf8_print, p);
661}
662
663bool
664Perl_is_utf8_punct(pTHX_ U8 *p)
665{
666    if (!is_utf8_char(p))
667        return FALSE;
668    if (!PL_utf8_punct)
669        PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
670    return swash_fetch(PL_utf8_punct, p);
671}
672
673bool
674Perl_is_utf8_xdigit(pTHX_ U8 *p)
675{
676    if (!is_utf8_char(p))
677        return FALSE;
678    if (!PL_utf8_xdigit)
679        PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
680    return swash_fetch(PL_utf8_xdigit, p);
681}
682
683bool
684Perl_is_utf8_mark(pTHX_ U8 *p)
685{
686    if (!is_utf8_char(p))
687        return FALSE;
688    if (!PL_utf8_mark)
689        PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
690    return swash_fetch(PL_utf8_mark, p);
691}
692
693UV
694Perl_to_utf8_upper(pTHX_ U8 *p)
695{
696    UV uv;
697
698    if (!PL_utf8_toupper)
699        PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
700    uv = swash_fetch(PL_utf8_toupper, p);
701    return uv ? uv : utf8_to_uv(p,0);
702}
703
704UV
705Perl_to_utf8_title(pTHX_ U8 *p)
706{
707    UV uv;
708
709    if (!PL_utf8_totitle)
710        PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
711    uv = swash_fetch(PL_utf8_totitle, p);
712    return uv ? uv : utf8_to_uv(p,0);
713}
714
715UV
716Perl_to_utf8_lower(pTHX_ U8 *p)
717{
718    UV uv;
719
720    if (!PL_utf8_tolower)
721        PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
722    uv = swash_fetch(PL_utf8_tolower, p);
723    return uv ? uv : utf8_to_uv(p,0);
724}
725
726/* a "swash" is a swatch hash */
727
728SV*
729Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
730{
731    SV* retval;
732    char tmpbuf[256];
733    dSP;   
734
735    if (!gv_stashpv(pkg, 0)) {  /* demand load utf8 */
736        ENTER;
737        Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
738        LEAVE;
739    }
740    SPAGAIN;
741    PUSHSTACKi(PERLSI_MAGIC);
742    PUSHMARK(SP);
743    EXTEND(SP,5);
744    PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
745    PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
746    PUSHs(listsv);
747    PUSHs(sv_2mortal(newSViv(minbits)));
748    PUSHs(sv_2mortal(newSViv(none)));
749    PUTBACK;
750    ENTER;
751    SAVEI32(PL_hints);
752    PL_hints = 0;
753    save_re_context();
754    if (PL_curcop == &PL_compiling)     /* XXX ought to be handled by lex_start */
755        strncpy(tmpbuf, PL_tokenbuf, sizeof tmpbuf);
756    if (call_method("SWASHNEW", G_SCALAR))
757        retval = newSVsv(*PL_stack_sp--);   
758    else
759        retval = &PL_sv_undef;
760    LEAVE;
761    POPSTACK;
762    if (PL_curcop == &PL_compiling) {
763        strncpy(PL_tokenbuf, tmpbuf, sizeof tmpbuf);
764        PL_curcop->op_private = PL_hints;
765    }
766    if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
767        Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
768    return retval;
769}
770
771UV
772Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr)
773{
774    HV* hv = (HV*)SvRV(sv);
775    U32 klen = UTF8SKIP(ptr) - 1;
776    U32 off = ptr[klen] & 127;  /* NB: 64 bit always 0 when len > 1 */
777    STRLEN slen;
778    STRLEN needents = (klen ? 64 : 128);
779    U8 *tmps;
780    U32 bit;
781    SV *retval;
782
783    /*
784     * This single-entry cache saves about 1/3 of the utf8 overhead in test
785     * suite.  (That is, only 7-8% overall over just a hash cache.  Still,
786     * it's nothing to sniff at.)  Pity we usually come through at least
787     * two function calls to get here...
788     *
789     * NB: this code assumes that swatches are never modified, once generated!
790     */
791
792    if (hv == PL_last_swash_hv &&
793        klen == PL_last_swash_klen &&
794        (!klen || memEQ(ptr,PL_last_swash_key,klen)) )
795    {
796        tmps = PL_last_swash_tmps;
797        slen = PL_last_swash_slen;
798    }
799    else {
800        /* Try our second-level swatch cache, kept in a hash. */
801        SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
802
803        /* If not cached, generate it via utf8::SWASHGET */
804        if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
805            dSP;
806            ENTER;
807            SAVETMPS;
808            save_re_context();
809            PUSHSTACKi(PERLSI_MAGIC);
810            PUSHMARK(SP);
811            EXTEND(SP,3);
812            PUSHs((SV*)sv);
813            PUSHs(sv_2mortal(newSViv(utf8_to_uv(ptr, 0) & ~(needents - 1))));
814            PUSHs(sv_2mortal(newSViv(needents)));
815            PUTBACK;
816            if (call_method("SWASHGET", G_SCALAR))
817                retval = newSVsv(*PL_stack_sp--);   
818            else
819                retval = &PL_sv_undef;
820            POPSTACK;
821            FREETMPS;
822            LEAVE;
823            if (PL_curcop == &PL_compiling)
824                PL_curcop->op_private = PL_hints;
825
826            svp = hv_store(hv, (char*)ptr, klen, retval, 0);
827
828            if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || slen < 8)
829                Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
830        }
831
832        PL_last_swash_hv = hv;
833        PL_last_swash_klen = klen;
834        PL_last_swash_tmps = tmps;
835        PL_last_swash_slen = slen;
836        if (klen)
837            Copy(ptr, PL_last_swash_key, klen, U8);
838    }
839
840    switch ((slen << 3) / needents) {
841    case 1:
842        bit = 1 << (off & 7);
843        off >>= 3;
844        return (tmps[off] & bit) != 0;
845    case 8:
846        return tmps[off];
847    case 16:
848        off <<= 1;
849        return (tmps[off] << 8) + tmps[off + 1] ;
850    case 32:
851        off <<= 2;
852        return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
853    }
854    Perl_croak(aTHX_ "panic: swash_fetch");
855    return 0;
856}
Note: See TracBrowser for help on using the repository browser.