src - OpenBSD base system

diff options


context:
space:
mode:

author	Andrew Fresh <afresh1@cvs.openbsd.org>	2019-12-30 02:13:58 +0000
committer	Andrew Fresh <afresh1@cvs.openbsd.org>	2019-12-30 02:13:58 +0000
commit	872e2e229e80c947466f1d4838dc89e6b89140bc (patch)
tree	5cefc6ebf8f6a09292c6ffc9b546de3aeeb41a0a /gnu/usr.bin/perl/pp.c
parent	24f96998b9bcba93445125a9cf010eb192b08a7e (diff)

Fix merge issues, remove excess files - match perl-5.30.1 dist

Timing is good deraadt@, OK sthen@

Diffstat (limited to 'gnu/usr.bin/perl/pp.c')

-rw-r--r--

gnu/usr.bin/perl/pp.c

650

1 files changed, 455 insertions, 195 deletions

diff --git a/gnu/usr.bin/perl/pp.c b/gnu/usr.bin/perl/pp.c
index c6e62daa11c..babf34843e2 100644
--- a/gnu/usr.bin/perl/pp.c
+++ b/gnu/usr.bin/perl/pp.c

@@ -28,12 +28,10 @@

#include "perl.h"

#include "keywords.h"

+#include "invlist_inline.h"

#include "reentr.h"

#include "regcharclass.h"

-static const STRLEN small_mu_len = sizeof(GREEK_SMALL_LETTER_MU_UTF8) - 1;

-static const STRLEN capital_iota_len = sizeof(GREEK_CAPITAL_LETTER_IOTA_UTF8) - 1;

/* variations on pp_null */

PP(pp_stub)

@@ -364,7 +362,7 @@ PP(pp_rv2cv)

cv = SvTYPE(SvRV(gv)) == SVt_PVCV

? MUTABLE_CV(SvRV(gv))

: MUTABLE_CV(gv);

- }

+ }

else

cv = MUTABLE_CV(&PL_sv_undef);

SETs(MUTABLE_SV(cv));

@@ -670,7 +668,7 @@ PP(pp_study)

PP(pp_trans)

{

- dSP;

+ dSP;

SV *sv;

if (PL_op->op_flags & OPf_STACKED)

@@ -1161,18 +1159,18 @@ PP(pp_pow)

else if (result <= (UV)IV_MAX)

/* answer negative, fits in IV */

SETi( -(IV)result );

- else if (result == (UV)IV_MIN)

+ else if (result == (UV)IV_MIN)

/* 2's complement assumption: special case IV_MIN */

SETi( IV_MIN );

else

/* answer negative, doesn't fit */

SETn( -(NV)result );

RETURN;

- }

+ }

}

float_it:

-#endif

+#endif

{

NV right = SvNV_nomg(svr);

NV left = SvNV_nomg(svl);

@@ -1318,8 +1316,9 @@ PP(pp_multiply)

alow = aiv;

auvok = TRUE; /* effectively it's a UV now */

} else {

- /* abs, auvok == false records sign */

- alow = (aiv == IV_MIN) ? (UV)aiv : (UV)(-aiv);

+ /* abs, auvok == false records sign; Using 0- here and

+ * later to silence bogus warning from MS VC */

+ alow = (UV) (0 - (UV) aiv);

}

if (buvok) {

@@ -1331,7 +1330,7 @@ PP(pp_multiply)

buvok = TRUE; /* effectively it's a UV now */

} else {

/* abs, buvok == false records sign */

- blow = (biv == IV_MIN) ? (UV)biv : (UV)(-biv);

+ blow = (UV) (0 - (UV) biv);

}

@@ -1462,7 +1461,7 @@ PP(pp_divide)

right_non_neg = TRUE; /* effectively it's a UV now */

}

else {

- right = (biv == IV_MIN) ? (UV)biv : (UV)(-biv);

+ right = -(UV)biv;

}

/* historically undef()/0 gives a "Use of uninitialized value"

@@ -1483,7 +1482,7 @@ PP(pp_divide)

left_non_neg = TRUE; /* effectively it's a UV now */

}

else {

- left = (aiv == IV_MIN) ? (UV)aiv : (UV)(-aiv);

+ left = -(UV)aiv;

}

@@ -1503,8 +1502,11 @@ PP(pp_divide)

#endif

) {

/* Integer division can't overflow, but it can be imprecise. */

+ /* Modern compilers optimize division followed by

+ * modulo into a single div instruction */

const UV result = left / right;

- if (result * right == left) {

+ if (left % right == 0) {

SP--; /* result is valid */

if (left_non_neg == right_non_neg) {

/* signs identical, result is positive. */

@@ -1563,7 +1565,7 @@ PP(pp_modulo)

right = biv;

right_neg = FALSE; /* effectively it's a UV now */

} else {

- right = (biv == IV_MIN) ? (UV)biv : (UV)(-biv);

+ right = (UV) (0 - (UV) biv);

}

@@ -1593,7 +1595,7 @@ PP(pp_modulo)

left = aiv;

left_neg = FALSE; /* effectively it's a UV now */

} else {

- left = (aiv == IV_MIN) ? (UV)aiv : (UV)(-aiv);

+ left = (UV) (0 - (UV) aiv);

}

@@ -1891,8 +1893,8 @@ PP(pp_subtract)

if (aiv >= 0) {

auv = aiv;

auvok = 1; /* Now acting as a sign flag. */

- } else { /* 2s complement assumption for IV_MIN */

- auv = (aiv == IV_MIN) ? (UV)aiv : (UV)-aiv;

+ } else {

+ auv = (UV) (0 - (UV) aiv);

}

a_valid = 1;

@@ -1903,7 +1905,7 @@ PP(pp_subtract)

UV result;

UV buv;

bool buvok = SvUOK(svr);

if (buvok)

buv = SvUVX(svr);

else {

@@ -1912,7 +1914,7 @@ PP(pp_subtract)

buv = biv;

buvok = 1;

} else

- buv = (biv == IV_MIN) ? (UV)biv : (UV)-biv;

+ buv = (UV) (0 - (UV) biv);

}

/* ?uvok if value is >= 0. basically, flagged as UV if it's +ve,

else "IV" now, independent of how it came in.

@@ -2053,7 +2055,7 @@ PP(pp_lt)

dSP;

SV *left, *right;

- tryAMAGICbin_MG(lt_amg, AMGf_set|AMGf_numeric);

+ tryAMAGICbin_MG(lt_amg, AMGf_numeric);

right = POPs;

left = TOPs;

SETs(boolSV(

@@ -2069,7 +2071,7 @@ PP(pp_gt)

dSP;

SV *left, *right;

- tryAMAGICbin_MG(gt_amg, AMGf_set|AMGf_numeric);

+ tryAMAGICbin_MG(gt_amg, AMGf_numeric);

right = POPs;

left = TOPs;

SETs(boolSV(

@@ -2085,7 +2087,7 @@ PP(pp_le)

dSP;

SV *left, *right;

- tryAMAGICbin_MG(le_amg, AMGf_set|AMGf_numeric);

+ tryAMAGICbin_MG(le_amg, AMGf_numeric);

right = POPs;

left = TOPs;

SETs(boolSV(

@@ -2101,7 +2103,7 @@ PP(pp_ge)

dSP;

SV *left, *right;

- tryAMAGICbin_MG(ge_amg, AMGf_set|AMGf_numeric);

+ tryAMAGICbin_MG(ge_amg, AMGf_numeric);

right = POPs;

left = TOPs;

SETs(boolSV(

@@ -2117,7 +2119,7 @@ PP(pp_ne)

dSP;

SV *left, *right;

- tryAMAGICbin_MG(ne_amg, AMGf_set|AMGf_numeric);

+ tryAMAGICbin_MG(ne_amg, AMGf_numeric);

right = POPs;

left = TOPs;

SETs(boolSV(

@@ -2249,7 +2251,7 @@ PP(pp_sle)

break;

}

- tryAMAGICbin_MG(amg_type, AMGf_set);

+ tryAMAGICbin_MG(amg_type, 0);

{

dPOPTOPssrl;

const int cmp =

@@ -2267,7 +2269,7 @@ PP(pp_sle)

PP(pp_seq)

{

dSP;

- tryAMAGICbin_MG(seq_amg, AMGf_set);

+ tryAMAGICbin_MG(seq_amg, 0);

{

dPOPTOPssrl;

SETs(boolSV(sv_eq_flags(left, right, 0)));

@@ -2278,7 +2280,7 @@ PP(pp_seq)

PP(pp_sne)

{

dSP;

- tryAMAGICbin_MG(sne_amg, AMGf_set);

+ tryAMAGICbin_MG(sne_amg, 0);

{

dPOPTOPssrl;

SETs(boolSV(!sv_eq_flags(left, right, 0)));

@@ -2513,7 +2515,7 @@ PP(pp_not)

dSP;

SV *sv;

- tryAMAGICun_MG(not_amg, AMGf_set);

+ tryAMAGICun_MG(not_amg, 0);

sv = *PL_stack_sp;

*PL_stack_sp = boolSV(!SvTRUE_nomg_NN(sv));

return NORMAL;

@@ -2710,7 +2712,7 @@ PP(pp_i_subtract)

PP(pp_i_lt)

{

dSP;

- tryAMAGICbin_MG(lt_amg, AMGf_set);

+ tryAMAGICbin_MG(lt_amg, 0);

{

dPOPTOPiirl_nomg;

SETs(boolSV(left < right));

@@ -2721,7 +2723,7 @@ PP(pp_i_lt)

PP(pp_i_gt)

{

dSP;

- tryAMAGICbin_MG(gt_amg, AMGf_set);

+ tryAMAGICbin_MG(gt_amg, 0);

{

dPOPTOPiirl_nomg;

SETs(boolSV(left > right));

@@ -2732,7 +2734,7 @@ PP(pp_i_gt)

PP(pp_i_le)

{

dSP;

- tryAMAGICbin_MG(le_amg, AMGf_set);

+ tryAMAGICbin_MG(le_amg, 0);

{

dPOPTOPiirl_nomg;

SETs(boolSV(left <= right));

@@ -2743,7 +2745,7 @@ PP(pp_i_le)

PP(pp_i_ge)

{

dSP;

- tryAMAGICbin_MG(ge_amg, AMGf_set);

+ tryAMAGICbin_MG(ge_amg, 0);

{

dPOPTOPiirl_nomg;

SETs(boolSV(left >= right));

@@ -2754,7 +2756,7 @@ PP(pp_i_ge)

PP(pp_i_eq)

{

dSP;

- tryAMAGICbin_MG(eq_amg, AMGf_set);

+ tryAMAGICbin_MG(eq_amg, 0);

{

dPOPTOPiirl_nomg;

SETs(boolSV(left == right));

@@ -2765,7 +2767,7 @@ PP(pp_i_eq)

PP(pp_i_ne)

{

dSP;

- tryAMAGICbin_MG(ne_amg, AMGf_set);

+ tryAMAGICbin_MG(ne_amg, 0);

{

dPOPTOPiirl_nomg;

SETs(boolSV(left != right));

@@ -2891,7 +2893,7 @@ PP(pp_rand)

{

dSP;

NV value;

if (MAXARG < 1)

{

EXTEND(SP, 1);

@@ -2942,13 +2944,12 @@ PP(pp_srand)

"Integer overflow in srand");

anum = UV_MAX;

}

- (void)srand48_deterministic((Rand_seed_t)anum);

}

else {

anum = seed();

- (void)seedDrand01((Rand_seed_t)anum);

}

+ (void)seedDrand01((Rand_seed_t)anum);

PL_srand_called = TRUE;

if (anum)

XPUSHu(anum);

@@ -3063,7 +3064,7 @@ PP(pp_oct)

/* If Unicode, try to downgrade

* If not possible, croak. */

SV* const tsv = sv_2mortal(newSVsv(sv));

SvUTF8_on(tsv);

sv_utf8_downgrade(tsv, FALSE);

tmps = SvPV_const(tsv, len);

@@ -3538,7 +3539,7 @@ PP(pp_index)

/* $lex = (index() == -1) */

sv_setsv(TARG, TOPs);

}

- else

+ else

PUSHi(retval);

RETURN;

}

@@ -3680,7 +3681,7 @@ PP(pp_crypt)

#endif

}

-/* Generally UTF-8 and UTF-EBCDIC are indistinguishable at this level. So

+/* Generally UTF-8 and UTF-EBCDIC are indistinguishable at this level. So

* most comments below say UTF-8, when in fact they mean UTF-EBCDIC as well */

@@ -3710,6 +3711,7 @@ PP(pp_ucfirst)

STRLEN tculen; /* tculen is the byte length of the freshly titlecased (or

* lowercased) character stored in tmpbuf. May be either

* UTF-8 or not, but in either case is the number of bytes */

+ bool remove_dot_above = FALSE;

s = (const U8*)SvPV_const(source, slen);

@@ -3746,12 +3748,45 @@ PP(pp_ucfirst)

#endif

}

else {

#ifdef USE_LOCALE_CTYPE

_toLOWER_utf8_flags(s, s + slen, tmpbuf, &tculen, IN_LC_RUNTIME(LC_CTYPE));

+ /* In turkic locales, lower casing an 'I' normally yields U+0131,

+ * LATIN SMALL LETTER DOTLESS I, but not if the grapheme also

+ * contains a COMBINING DOT ABOVE. Instead it is treated like

+ * LATIN CAPITAL LETTER I WITH DOT ABOVE lowercased to 'i'. The

+ * call to lowercase above has handled this. But SpecialCasing.txt

+ * says we are supposed to remove the COMBINING DOT ABOVE. We can

+ * tell if we have this situation if I ==> i in a turkic locale. */

+ if ( UNLIKELY(PL_in_utf8_turkic_locale)

+ && IN_LC_RUNTIME(LC_CTYPE)

+ && (UNLIKELY(*s == 'I' && tmpbuf[0] == 'i')))

+ {

+ /* Here, we know there was a COMBINING DOT ABOVE. We won't be

+ * able to handle this in-place. */

+ inplace = FALSE;

+ /* It seems likely that the DOT will immediately follow the

+ * 'I'. If so, we can remove it simply by indicating to the

+ * code below to start copying the source just beyond the DOT.

+ * We know its length is 2 */

+ if (LIKELY(memBEGINs(s + 1, s + slen, COMBINING_DOT_ABOVE_UTF8))) {

+ ulen += 2;

+ }

+ else { /* But if it doesn't follow immediately, set a flag for

+ the code below */

+ remove_dot_above = TRUE;

+ }

#else

+ PERL_UNUSED_VAR(remove_dot_above);

_toLOWER_utf8_flags(s, s + slen, tmpbuf, &tculen, 0);

#endif

- }

+ }

/* we can't do in-place if the length changes. */

if (ulen != tculen) inplace = FALSE;

@@ -3759,47 +3794,69 @@ PP(pp_ucfirst)

}

else { /* Non-zero length, non-UTF-8, Need to consider locale and if

* latin1 is treated as caseless. Note that a locale takes

- * precedence */

+ * precedence */

ulen = 1; /* Original character is 1 byte */

tculen = 1; /* Most characters will require one byte, but this will

* need to be overridden for the tricky ones */

need = slen + 1;

- if (op_type == OP_LCFIRST) {

- /* lower case the first letter: no trickiness for any character */

#ifdef USE_LOCALE_CTYPE

- if (IN_LC_RUNTIME(LC_CTYPE)) {

- *tmpbuf = toLOWER_LC(*s);

- }

- else

-#endif

+ if (IN_LC_RUNTIME(LC_CTYPE)) {

+ if ( UNLIKELY(PL_in_utf8_turkic_locale)

+ && ( (op_type == OP_LCFIRST && UNLIKELY(*s == 'I'))

+ || (op_type == OP_UCFIRST && UNLIKELY(*s == 'i'))))

{

- *tmpbuf = (IN_UNI_8_BIT)

- ? toLOWER_LATIN1(*s)

- : toLOWER(*s);

+ if (*s == 'I') { /* lcfirst('I') */

+ tmpbuf[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);

+ tmpbuf[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);

+ }

+ else { /* ucfirst('i') */

+ tmpbuf[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);

+ tmpbuf[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);

+ }

+ tculen = 2;

+ inplace = FALSE;

+ doing_utf8 = TRUE;

+ convert_source_to_utf8 = TRUE;

+ need += variant_under_utf8_count(s, s + slen);

}

- }

-#ifdef USE_LOCALE_CTYPE

- /* is ucfirst() */

- else if (IN_LC_RUNTIME(LC_CTYPE)) {

- if (IN_UTF8_CTYPE_LOCALE) {

- goto do_uni_rules;

+ else if (op_type == OP_LCFIRST) {

+ /* For lc, there are no gotchas for UTF-8 locales (other than

+ * the turkish ones already handled above) */

+ *tmpbuf = toLOWER_LC(*s);

}

+ else { /* ucfirst */

- *tmpbuf = (U8) toUPPER_LC(*s); /* This would be a bug if any

- locales have upper and title case

- different */

- }

+ /* But for uc, some characters require special handling */

+ if (IN_UTF8_CTYPE_LOCALE) {

+ goto do_uni_rules;

+ }

+ /* This would be a bug if any locales have upper and title case

+ * different */

+ *tmpbuf = (U8) toUPPER_LC(*s);

+ }

+ else

#endif

- else if (! IN_UNI_8_BIT) {

- *tmpbuf = toUPPER(*s); /* Returns caseless for non-ascii, or

- * on EBCDIC machines whatever the

- * native function does */

- }

+ /* Here, not in locale. If not using Unicode rules, is a simple

+ * lower/upper, depending */

+ if (! IN_UNI_8_BIT) {

+ *tmpbuf = (op_type == OP_LCFIRST)

+ ? toLOWER(*s)

+ : toUPPER(*s);

+ }

+ else if (op_type == OP_LCFIRST) {

+ /* lower case the first letter: no trickiness for any character */

+ *tmpbuf = toLOWER_LATIN1(*s);

+ }

else {

/* Here, is ucfirst non-UTF-8, not in locale (unless that locale is

- * UTF-8, which we treat as not in locale), and cased latin1 */

+ * non-turkic UTF-8, which we treat as not in locale), and cased

+ * latin1 */

UV title_ord;

#ifdef USE_LOCALE_CTYPE

do_uni_rules:

@@ -3823,16 +3880,19 @@ PP(pp_ucfirst)

inplace = FALSE;

/* If the result won't fit in a byte, the entire result

- * will have to be in UTF-8. Assume worst case sizing in

- * conversion. (all latin1 characters occupy at most two

- * bytes in utf8) */

+ * will have to be in UTF-8. Allocate enough space for the

+ * expanded first byte, and if UTF-8, the rest of the input

+ * string, some or all of which may also expand to two

+ * bytes, plus the terminating NUL. */

if (title_ord > 255) {

doing_utf8 = TRUE;

convert_source_to_utf8 = TRUE;

- need = slen * 2 + 1;

+ need = slen

+ + variant_under_utf8_count(s, s + slen)

+ + 1;

/* The (converted) UTF-8 and UTF-EBCDIC lengths of all

- * (both) characters whose title case is above 255 is

+ * characters whose title case is above 255 is

* 2. */

ulen = 2;

}

@@ -3876,6 +3936,29 @@ PP(pp_ucfirst)

* of the string. */

sv_setpvn(dest, (char*)tmpbuf, tculen);

if (slen > ulen) {

+ /* But this boolean being set means we are in a turkic

+ * locale, and there is a DOT character that needs to be

+ * removed, and it isn't immediately after the current

+ * character. Keep concatenating characters to the output

+ * one at a time, until we find the DOT, which we simply

+ * skip */

+ if (UNLIKELY(remove_dot_above)) {

+ do {

+ Size_t this_len = UTF8SKIP(s + ulen);

+ sv_catpvn(dest, (char*)(s + ulen), this_len);

+ ulen += this_len;

+ if (memBEGINs(s + ulen, s + slen, COMBINING_DOT_ABOVE_UTF8)) {

+ ulen += 2;

+ break;

+ }

+ } while (s + ulen < s + slen);

+ }

+ /* The rest of the string can be concatenated unchanged,

+ * all at once */

sv_catpvn(dest, (char*)(s + ulen), slen - ulen);

}

@@ -3887,15 +3970,18 @@ PP(pp_ucfirst)

* into tmpbuf. First put that into dest, and then append the

* rest of the source, converting it to UTF-8 as we go. */

- /* Assert tculen is 2 here because the only two characters that

+ /* Assert tculen is 2 here because the only characters that

* get to this part of the code have 2-byte UTF-8 equivalents */

+ assert(tculen == 2);

*d++ = *tmpbuf;

*d++ = *(tmpbuf + 1);

s++; /* We have just processed the 1st char */

- for (; s < send; s++) {

- d = uvchr_to_utf8(d, *s);

- }

+ while (s < send) {

+ append_utf8_from_native_byte(*s, &d);

+ s++;

+ }

*d = '\0';

SvCUR_set(dest, d - (U8*)SvPVX_const(dest));

}

@@ -3907,7 +3993,7 @@ PP(pp_ucfirst)

}

- else { /* Neither source nor dest are in or need to be UTF-8 */

+ else { /* Neither source nor dest are, nor need to be UTF-8 */

if (slen) {

if (inplace) { /* in-place, only need to change the 1st char */

*d = *tmpbuf;

@@ -3927,7 +4013,7 @@ PP(pp_ucfirst)

/* In a "use bytes" we don't treat the source as UTF-8, but, still want

* the destination to retain that flag */

- if (SvUTF8(source) && ! IN_BYTES)

+ if (DO_UTF8(source))

SvUTF8_on(dest);

if (!inplace) { /* Finish the rest of the string, unchanged */

@@ -3948,11 +4034,9 @@ PP(pp_ucfirst)

return NORMAL;

}

-/* There's so much setup/teardown code common between uc and lc, I wonder if

- it would be worth merging the two, and just having a switch outside each

- of the three tight loops. There is less and less commonality though */

PP(pp_uc)

{

+ dVAR;

dSP;

SV *source = TOPs;

STRLEN len;

@@ -4017,6 +4101,8 @@ PP(pp_uc)

const U8 *const send = s + len;

U8 tmpbuf[UTF8_MAXBYTES_CASE+1];

+#define GREEK_CAPITAL_LETTER_IOTA 0x0399

+#define COMBINING_GREEK_YPOGEGRAMMENI 0x0345

/* All occurrences of these are to be moved to follow any other marks.

* This is context-dependent. We may not be passed enough context to

* move the iota subscript beyond all of them, but we do the best we can

@@ -4033,12 +4119,16 @@ PP(pp_uc)

STRLEN u;

STRLEN ulen;

UV uv;

- if (in_iota_subscript && ! _is_utf8_mark(s)) {

+ if (UNLIKELY(in_iota_subscript)) {

+ UV cp = utf8_to_uvchr_buf(s, send, NULL);

- /* A non-mark. Time to output the iota subscript */

- Copy(GREEK_CAPITAL_LETTER_IOTA_UTF8, d, capital_iota_len, U8);

- d += capital_iota_len;

- in_iota_subscript = FALSE;

+ if (! _invlist_contains_cp(PL_utf8_mark, cp)) {

+ /* A non-mark. Time to output the iota subscript */

+ *d++ = UTF8_TWO_BYTE_HI(GREEK_CAPITAL_LETTER_IOTA);

+ *d++ = UTF8_TWO_BYTE_LO(GREEK_CAPITAL_LETTER_IOTA);

+ in_iota_subscript = FALSE;

+ }

}

/* Then handle the current character. Get the changed case value

@@ -4050,8 +4140,6 @@ PP(pp_uc)

#else

uv = _toUPPER_utf8_flags(s, send, tmpbuf, &ulen, 0);

#endif

-#define GREEK_CAPITAL_LETTER_IOTA 0x0399

-#define COMBINING_GREEK_YPOGEGRAMMENI 0x0345

if (uv == GREEK_CAPITAL_LETTER_IOTA

&& utf8_to_uvchr_buf(s, send, 0) == COMBINING_GREEK_YPOGEGRAMMENI)

{

@@ -4065,9 +4153,10 @@ PP(pp_uc)

/* If someone uppercases one million U+03B0s we SvGROW()

* one million times. Or we could try guessing how much to

- * allocate without allocating too much. Such is life.

- * See corresponding comment in lc code for another option

- * */

+ * allocate without allocating too much. But we can't

+ * really guess without examining the rest of the string.

+ * Such is life. See corresponding comment in lc code for

+ * another option */

d = o + (U8*) SvGROW(dest, min);

}

Copy(tmpbuf, d, ulen, U8);

@@ -4076,8 +4165,8 @@ PP(pp_uc)

s += u;

}

if (in_iota_subscript) {

- Copy(GREEK_CAPITAL_LETTER_IOTA_UTF8, d, capital_iota_len, U8);

- d += capital_iota_len;

+ *d++ = UTF8_TWO_BYTE_HI(GREEK_CAPITAL_LETTER_IOTA);

+ *d++ = UTF8_TWO_BYTE_LO(GREEK_CAPITAL_LETTER_IOTA);

}

SvUTF8_on(dest);

*d = '\0';

@@ -4111,16 +4200,27 @@ PP(pp_uc)

do_uni_rules:

#endif

for (; s < send; d++, s++) {

+ Size_t extra;

*d = toUPPER_LATIN1_MOD(*s);

- if (LIKELY(*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {

+ if ( LIKELY(*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)

+#ifdef USE_LOCALE_CTYPE

+ && (LIKELY( ! PL_in_utf8_turkic_locale

+ || ! IN_LC_RUNTIME(LC_CTYPE))

+ || *s != 'i')

+#endif

+ ) {

continue;

}

/* The mainstream case is the tight loop above. To avoid

- * extra tests in that, all three characters that require

- * special handling are mapped by the MOD to the one tested

- * just above.

- * Use the source to distinguish between the three cases */

+ * extra tests in that, all three characters that always

+ * require special handling are mapped by the MOD to the

+ * one tested just above. Use the source to distinguish

+ * between those cases */

#if UNICODE_MAJOR_VERSION > 2 \

|| (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \

@@ -4129,7 +4229,7 @@ PP(pp_uc)

/* uc() of this requires 2 characters, but they are

* ASCII. If not enough room, grow the string */

- if (SvLEN(dest) < ++min) {

+ if (SvLEN(dest) < ++min) {

const UV o = d - (U8*)SvPVX_const(dest);

d = o + (U8*) SvGROW(dest, min);

}

@@ -4138,59 +4238,105 @@ PP(pp_uc)

}

#endif

- /* The other two special handling characters have their

+ /* The other special handling characters have their

* upper cases outside the latin1 range, hence need to be

- * in UTF-8, so the whole result needs to be in UTF-8. So,

- * here we are somewhere in the middle of processing a

- * non-UTF-8 string, and realize that we will have to convert

- * the whole thing to UTF-8. What to do? There are

- * several possibilities. The simplest to code is to

- * convert what we have so far, set a flag, and continue on

- * in the loop. The flag would be tested each time through

- * the loop, and if set, the next character would be

- * converted to UTF-8 and stored. But, I (khw) didn't want

- * to slow down the mainstream case at all for this fairly

- * rare case, so I didn't want to add a test that didn't

- * absolutely have to be there in the loop, besides the

- * possibility that it would get too complicated for

- * optimizers to deal with. Another possibility is to just

- * give up, convert the source to UTF-8, and restart the

- * function that way. Another possibility is to convert

- * both what has already been processed and what is yet to

- * come separately to UTF-8, then jump into the loop that

- * handles UTF-8. But the most efficient time-wise of the

- * ones I could think of is what follows, and turned out to

- * not require much extra code. */

- /* Convert what we have so far into UTF-8, telling the

+ * in UTF-8, so the whole result needs to be in UTF-8.

+ *

+ * So, here we are somewhere in the middle of processing a

+ * non-UTF-8 string, and realize that we will have to

+ * convert the whole thing to UTF-8. What to do? There

+ * are several possibilities. The simplest to code is to

+ * convert what we have so far, set a flag, and continue on

+ * in the loop. The flag would be tested each time through

+ * the loop, and if set, the next character would be

+ * converted to UTF-8 and stored. But, I (khw) didn't want

+ * to slow down the mainstream case at all for this fairly

+ * rare case, so I didn't want to add a test that didn't

+ * absolutely have to be there in the loop, besides the

+ * possibility that it would get too complicated for

+ * optimizers to deal with. Another possibility is to just

+ * give up, convert the source to UTF-8, and restart the

+ * function that way. Another possibility is to convert

+ * both what has already been processed and what is yet to

+ * come separately to UTF-8, then jump into the loop that

+ * handles UTF-8. But the most efficient time-wise of the

+ * ones I could think of is what follows, and turned out to

+ * not require much extra code.

+ *

+ * First, calculate the extra space needed for the

+ * remainder of the source needing to be in UTF-8. Except

+ * for the 'i' in Turkic locales, in UTF-8 strings, the

+ * uppercase of a character below 256 occupies the same

+ * number of bytes as the original. Therefore, the space

+ * needed is the that number plus the number of characters

+ * that become two bytes when converted to UTF-8, plus, in

+ * turkish locales, the number of 'i's. */

+ extra = send - s + variant_under_utf8_count(s, send);

+#ifdef USE_LOCALE_CTYPE

+ if (UNLIKELY(*s == 'i')) { /* We wouldn't get an 'i' here

+ unless are in a Turkic

+ locale */

+ const U8 * s_peek = s;

+ do {

+ extra++;

+ s_peek = (U8 *) memchr(s_peek + 1, 'i',

+ send - (s_peek + 1));

+ } while (s_peek != NULL);

+ }

+#endif

+ /* Convert what we have so far into UTF-8, telling the

* function that we know it should be converted, and to

* allow extra space for what we haven't processed yet.

- * Assume the worst case space requirements for converting

- * what we haven't processed so far: that it will require

- * two bytes for each remaining source character, plus the

- * NUL at the end. This may cause the string pointer to

- * move, so re-find it. */

+ *

+ * This may cause the string pointer to move, so need to

+ * save and re-find it. */

len = d - (U8*)SvPVX_const(dest);

SvCUR_set(dest, len);

len = sv_utf8_upgrade_flags_grow(dest,

SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,

- (send -s) * 2 + 1);

+ extra

+ + 1 /* trailing NUL */ );

d = (U8*)SvPVX(dest) + len;

- /* Now process the remainder of the source, converting to

- * upper and UTF-8. If a resulting byte is invariant in

- * UTF-8, output it as-is, otherwise convert to UTF-8 and

- * append it to the output. */

- for (; s < send; s++) {

- (void) _to_upper_title_latin1(*s, d, &len, 'S');

- d += len;

- }

+ /* Now process the remainder of the source, simultaneously

+ * converting to upper and UTF-8.

+ *

+ * To avoid extra tests in the loop body, and since the

+ * loop is so simple, split out the rare Turkic case into

+ * its own loop */

- /* Here have processed the whole source; no need to continue

- * with the outer loop. Each character has been converted

- * to upper case and converted to UTF-8 */

+#ifdef USE_LOCALE_CTYPE

+ if ( UNLIKELY(PL_in_utf8_turkic_locale)

+ && UNLIKELY(IN_LC_RUNTIME(LC_CTYPE)))

+ {

+ for (; s < send; s++) {

+ if (*s == 'i') {

+ *d++ = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);

+ *d++ = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);

+ }

+ else {

+ (void) _to_upper_title_latin1(*s, d, &len, 'S');

+ d += len;

+ }

+ else

+#endif

+ for (; s < send; s++) {

+ (void) _to_upper_title_latin1(*s, d, &len, 'S');

+ d += len;

+ }

+ /* Here have processed the whole source; no need to

+ * continue with the outer loop. Each character has been

+ * converted to upper case and converted to UTF-8. */

break;

} /* End of processing all latin1-style chars */

} /* End of processing all chars */

@@ -4222,15 +4368,26 @@ PP(pp_lc)

SV *dest;

const U8 *s;

U8 *d;

+ bool has_turkic_I = FALSE;

SvGETMAGIC(source);

if ( SvPADTMP(source)

&& !SvREADONLY(source) && SvPOK(source)

- && !DO_UTF8(source)) {

+ && !DO_UTF8(source)

+#ifdef USE_LOCALE_CTYPE

+ && ( LIKELY(! IN_LC_RUNTIME(LC_CTYPE))

+ || LIKELY(! PL_in_utf8_turkic_locale))

+#endif

+ ) {

- /* We can convert in place, as lowercasing anything in the latin1 range

- * (or else DO_UTF8 would have been on) doesn't lengthen it */

+ /* We can convert in place, as, outside of Turkic UTF-8 locales,

+ * lowercasing anything in the latin1 range (or else DO_UTF8 would have

+ * been on) doesn't lengthen it. */

dest = source;

s = d = (U8*)SvPV_force_nomg(source, len);

min = len + 1;

@@ -4252,7 +4409,38 @@ PP(pp_lc)

#ifdef USE_LOCALE_CTYPE

if (IN_LC_RUNTIME(LC_CTYPE)) {

+ const U8 * next_I;

_CHECK_AND_WARN_PROBLEMATIC_LOCALE;

+ /* Lowercasing in a Turkic locale can cause non-UTF-8 to need to become

+ * UTF-8 for the single case of the character 'I' */

+ if ( UNLIKELY(PL_in_utf8_turkic_locale)

+ && ! DO_UTF8(source)

+ && (next_I = (U8 *) memchr(s, 'I', len)))

+ {

+ Size_t I_count = 0;

+ const U8 *const send = s + len;

+ do {

+ I_count++;

+ next_I = (U8 *) memchr(next_I + 1, 'I',

+ send - (next_I + 1));

+ } while (next_I != NULL);

+ /* Except for the 'I', in UTF-8 strings, the lower case of a

+ * character below 256 occupies the same number of bytes as the

+ * original. Therefore, the space needed is the original length

+ * plus I_count plus the number of characters that become two bytes

+ * when converted to UTF-8 */

+ sv_utf8_upgrade_flags_grow(dest, 0, len

+ + I_count

+ + variant_under_utf8_count(s, send)

+ + 1 /* Trailing NUL */ );

+ d = (U8*)SvPVX(dest);

+ has_turkic_I = TRUE;

+ }

}

#endif

@@ -4263,19 +4451,48 @@ PP(pp_lc)

if (DO_UTF8(source)) {

const U8 *const send = s + len;

U8 tmpbuf[UTF8_MAXBYTES_CASE+1];

+ bool remove_dot_above = FALSE;

while (s < send) {

const STRLEN u = UTF8SKIP(s);

STRLEN ulen;

#ifdef USE_LOCALE_CTYPE

_toLOWER_utf8_flags(s, send, tmpbuf, &ulen, IN_LC_RUNTIME(LC_CTYPE));

+ /* If we are in a Turkic locale, we have to do more work. As noted

+ * in the comments for lcfirst, there is a special case if a 'I'

+ * is in a grapheme with COMBINING DOT ABOVE UTF8. It turns into a

+ * 'i', and the DOT must be removed. We check for that situation,

+ * and set a flag if the DOT is there. Then each time through the

+ * loop, we have to see if we need to remove the next DOT above,

+ * and if so, do it. We know that there is a DOT because

+ * _toLOWER_utf8_flags() wouldn't have returned 'i' unless there

+ * was one in a proper position. */

+ if ( UNLIKELY(PL_in_utf8_turkic_locale)

+ && IN_LC_RUNTIME(LC_CTYPE))

+ {

+ if ( UNLIKELY(remove_dot_above)

+ && memBEGINs(tmpbuf, sizeof(tmpbuf), COMBINING_DOT_ABOVE_UTF8))

+ {

+ s += u;

+ remove_dot_above = FALSE;

+ continue;

+ }

+ else if (UNLIKELY(*s == 'I' && tmpbuf[0] == 'i')) {

+ remove_dot_above = TRUE;

+ }

#else

+ PERL_UNUSED_VAR(remove_dot_above);

_toLOWER_utf8_flags(s, send, tmpbuf, &ulen, 0);

#endif

- /* Here is where we would do context-sensitive actions. See the

- * commit message for 86510fb15 for why there isn't any */

+ /* Here is where we would do context-sensitive actions for the

+ * Greek final sigma. See the commit message for 86510fb15 for why

+ * there isn't any */

if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {

@@ -4301,7 +4518,7 @@ PP(pp_lc)

SvUTF8_on(dest);

*d = '\0';

SvCUR_set(dest, d - (U8*)SvPVX_const(dest));

- } else { /* Not utf8 */

+ } else { /* 'source' not utf8 */

if (len) {

const U8 *const send = s + len;

@@ -4310,8 +4527,22 @@ PP(pp_lc)

* whole thing in a tight loop, for speed, */

#ifdef USE_LOCALE_CTYPE

if (IN_LC_RUNTIME(LC_CTYPE)) {

- for (; s < send; d++, s++)

- *d = toLOWER_LC(*s);

+ if (LIKELY( ! has_turkic_I)) {

+ for (; s < send; d++, s++)

+ *d = toLOWER_LC(*s);

+ }

+ else { /* This is the only case where lc() converts 'dest'

+ into UTF-8 from a non-UTF-8 'source' */

+ for (; s < send; s++) {

+ if (*s == 'I') {

+ *d++ = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);

+ *d++ = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);

+ }

+ else {

+ append_utf8_from_native_byte(toLOWER_LATIN1(*s), &d);

+ }

}

else

#endif

@@ -4371,7 +4602,7 @@ PP(pp_quotemeta)

#ifdef USE_LOCALE_CTYPE

/* In locale, we quote all non-ASCII Latin1 chars.

* Otherwise use the quoting rules */

IN_LC_RUNTIME(LC_CTYPE)

#endif

@@ -4519,52 +4750,80 @@ PP(pp_fc)

#ifdef USE_LOCALE_CTYPE

do_uni_folding:

#endif

- /* For ASCII and the Latin-1 range, there's only two troublesome

- * folds, \x{DF} (\N{LATIN SMALL LETTER SHARP S}), which under full

- * casefolding becomes 'ss'; and \x{B5} (\N{MICRO SIGN}), which

- * under any fold becomes \x{3BC} (\N{GREEK SMALL LETTER MU}) --

+ /* For ASCII and the Latin-1 range, there's potentially three

+ * troublesome folds:

+ * \x{DF} (\N{LATIN SMALL LETTER SHARP S}), which under full

+ * casefolding becomes 'ss';

+ * \x{B5} (\N{MICRO SIGN}), which under any fold becomes

+ * \x{3BC} (\N{GREEK SMALL LETTER MU})

+ * I only in Turkic locales, this folds to \x{131}

+ * \N{LATIN SMALL LETTER DOTLESS I}

* For the rest, the casefold is their lowercase. */

for (; s < send; d++, s++) {

- if (*s == MICRO_SIGN) {

+ if ( UNLIKELY(*s == MICRO_SIGN)

+#ifdef USE_LOCALE_CTYPE

+ || ( UNLIKELY(PL_in_utf8_turkic_locale)

+ && UNLIKELY(IN_LC_RUNTIME(LC_CTYPE))

+ && UNLIKELY(*s == 'I'))

+#endif

+ ) {

+ Size_t extra = send - s

+ + variant_under_utf8_count(s, send);

/* \N{MICRO SIGN}'s casefold is \N{GREEK SMALL LETTER MU},

- * which is outside of the latin-1 range. There's a couple

- * of ways to deal with this -- khw discusses them in

- * pp_lc/uc, so go there :) What we do here is upgrade what

- * we had already casefolded, then enter an inner loop that

- * appends the rest of the characters as UTF-8. */

+ * and 'I' in Turkic locales is \N{LATIN SMALL LETTER

+ * DOTLESS I} both of which are outside of the latin-1

+ * range. There's a couple of ways to deal with this -- khw

+ * discusses them in pp_lc/uc, so go there :) What we do

+ * here is upgrade what we had already casefolded, then

+ * enter an inner loop that appends the rest of the

+ * characters as UTF-8.

+ *

+ * First we calculate the needed size of the upgraded dest

+ * beyond what's been processed already (the upgrade

+ * function figures that out). Except for the 'I' in

+ * Turkic locales, in UTF-8 strings, the fold case of a

+ * character below 256 occupies the same number of bytes as

+ * the original (even the Sharp S). Therefore, the space

+ * needed is the number of bytes remaining plus the number

+ * of characters that become two bytes when converted to

+ * UTF-8 plus, in turkish locales, the number of 'I's */

+ if (UNLIKELY(*s == 'I')) {

+ const U8 * s_peek = s;

+ do {

+ extra++;

+ s_peek = (U8 *) memchr(s_peek + 1, 'i',

+ send - (s_peek + 1));

+ } while (s_peek != NULL);

+ }

+ /* Growing may move things, so have to save and recalculate

+ * 'd' */

len = d - (U8*)SvPVX_const(dest);

SvCUR_set(dest, len);

len = sv_utf8_upgrade_flags_grow(dest,

SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,

- /* The max expansion for latin1

- * chars is 1 byte becomes 2 */

- (send -s) * 2 + 1);

+ extra

+ + 1 /* Trailing NUL */ );

d = (U8*)SvPVX(dest) + len;

- Copy(GREEK_SMALL_LETTER_MU_UTF8, d, small_mu_len, U8);

- d += small_mu_len;

+ *d++ = UTF8_TWO_BYTE_HI(GREEK_SMALL_LETTER_MU);

+ *d++ = UTF8_TWO_BYTE_LO(GREEK_SMALL_LETTER_MU);

s++;

for (; s < send; s++) {

STRLEN ulen;

- UV fc = _to_uni_fold_flags(*s, tmpbuf, &ulen, flags);

- if UVCHR_IS_INVARIANT(fc) {

- if (full_folding

- && *s == LATIN_SMALL_LETTER_SHARP_S)

- {

- *d++ = 's';

- }

- else

- *d++ = (U8)fc;

- }

- else {

- Copy(tmpbuf, d, ulen, U8);

- d += ulen;

- }

+ _to_uni_fold_flags(*s, d, &ulen, flags);

+ d += ulen;

}

break;

}

- else if (full_folding && *s == LATIN_SMALL_LETTER_SHARP_S) {

+ else if ( UNLIKELY(*s == LATIN_SMALL_LETTER_SHARP_S)

+ && full_folding)

+ {

/* Under full casefolding, LATIN SMALL LETTER SHARP S

* becomes "ss", which may require growing the SV. */

if (SvLEN(dest) < ++min) {

@@ -4574,8 +4833,7 @@ PP(pp_fc)

*(d)++ = 's';

*d = 's';

}

- else { /* If it's not one of those two, the fold is their lower

- case */

+ else { /* Else, the fold is the lower case */

*d = toLOWER_LATIN1(*s);

}

@@ -5386,7 +5644,7 @@ PP(pp_splice)

i = -diff;

while (i)

dst[--i] = NULL;

if (newlen) {

Copy( tmparyval, AvARRAY(ary) + offset, newlen, SV* );

Safefree(tmparyval);

@@ -5837,7 +6095,7 @@ PP(pp_split)

} else {

while (m < strend && !isSPACE(*m))

++m;

- }

+ }

if (m >= strend)

break;

@@ -5875,7 +6133,7 @@ PP(pp_split)

} else {

while (s < strend && isSPACE(*s))

++s;

- }

+ }

}

else if (RX_EXTFLAGS(rx) & RXf_START_ONLY) {

@@ -5989,7 +6247,7 @@ PP(pp_split)

/* The rx->minlen is in characters but we want to step

* s ahead by bytes. */

if (do_utf8)

- s = (char*)utf8_hop((U8*)m, len);

+ s = (char*)utf8_hop_forward((U8*) m, len, (U8*) strend);

else

s = m + len; /* Fake \n at the end */

}

@@ -6013,7 +6271,7 @@ PP(pp_split)

/* The rx->minlen is in characters but we want to step

* s ahead by bytes. */

if (do_utf8)

- s = (char*)utf8_hop((U8*)m, len);

+ s = (char*)utf8_hop_forward((U8*)m, len, (U8 *) strend);

else

s = m + len; /* Fake \n at the end */

}

@@ -6559,7 +6817,7 @@ PP(pp_lvref)

}

else if (arg) {

- S_localise_gv_slot(aTHX_ (GV *)arg,

+ S_localise_gv_slot(aTHX_ (GV *)arg,

PL_op->op_private & OPpLVREF_TYPE);

}

else if (!(PL_op->op_private & OPpPAD_STATE))

@@ -6598,10 +6856,12 @@ PP(pp_lvrefslice)

while (++MARK <= SP) {

SV * const elemsv = *MARK;

- if (SvTYPE(av) == SVt_PVAV)

- S_localise_aelem_lval(aTHX_ av, elemsv, can_preserve);

- else

- S_localise_helem_lval(aTHX_ (HV *)av, elemsv, can_preserve);

+ if (UNLIKELY(localizing)) {

+ if (SvTYPE(av) == SVt_PVAV)

+ S_localise_aelem_lval(aTHX_ av, elemsv, can_preserve);

+ else

+ S_localise_helem_lval(aTHX_ (HV *)av, elemsv, can_preserve);

+ }

*MARK = sv_2mortal(newSV_type(SVt_PVMG));

sv_magic(*MARK,(SV *)av,PERL_MAGIC_lvref,(char *)elemsv,HEf_SVKEY);

}

@@ -6640,7 +6900,7 @@ PP(pp_anonconst)

* for $: (OPf_STACKED ? *sp : $_[N])

* for @/%: @_[N..$#_]

- * It's equivalent to

+ * It's equivalent to

* my $foo = $_[N];

* or

* my $foo = (value-on-stack)