diff options
author | Todd C. Miller <millert@cvs.openbsd.org> | 2001-05-24 18:36:42 +0000 |
---|---|---|
committer | Todd C. Miller <millert@cvs.openbsd.org> | 2001-05-24 18:36:42 +0000 |
commit | 8bab8b19946f98d4be49345ca9c42e56674b65fb (patch) | |
tree | bd62d7b5d463fab205d08914b30ba647eb3c8bc8 /gnu/usr.bin/perl/regcomp.c | |
parent | 483d4e680bd2a6db14835b1b4d65be33488d532b (diff) |
merge in perl 5.6.1 with our local changes
Diffstat (limited to 'gnu/usr.bin/perl/regcomp.c')
-rw-r--r-- | gnu/usr.bin/perl/regcomp.c | 817 |
1 files changed, 595 insertions, 222 deletions
diff --git a/gnu/usr.bin/perl/regcomp.c b/gnu/usr.bin/perl/regcomp.c index c0425b766d2..b0d238f168d 100644 --- a/gnu/usr.bin/perl/regcomp.c +++ b/gnu/usr.bin/perl/regcomp.c @@ -69,7 +69,7 @@ * **** Alterations to Henry's code are... **** - **** Copyright (c) 1991-2000, Larry Wall + **** Copyright (c) 1991-2001, Larry Wall **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. @@ -114,11 +114,6 @@ #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ ((*s) == '{' && regcurly(s))) -#ifdef atarist -#define PERL_META "^$.[()|?+*\\" -#else -#define META "^$.[()|?+*\\" -#endif #ifdef SPSTART #undef SPSTART /* dratted cpp namespace... */ @@ -151,6 +146,7 @@ typedef struct scan_data_t { I32 offset_float_max; I32 flags; I32 whilem_c; + I32 *last_closep; struct regnode_charclass_class *start_class; } scan_data_t; @@ -159,7 +155,7 @@ typedef struct scan_data_t { */ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0 }; + 0, 0, 0, 0, 0, 0}; #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) #define SF_BEFORE_SEOL 0x1 @@ -188,6 +184,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define SCF_DO_STCLASS_AND 0x0800 #define SCF_DO_STCLASS_OR 0x1000 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) +#define SCF_WHILEM_VISITED_POS 0x2000 #define RF_utf8 8 #define UTF (PL_reg_flags & RF_utf8) @@ -201,6 +198,185 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) + +/* length of regex to show in messages that don't mark a position within */ +#define RegexLengthToShowInErrorMessages 127 + +/* + * If MARKER[12] are adjusted, be sure to adjust the constants at the top + * of t/op/regmesg.t, the tests in t/op/re_tests, and those in + * op/pragma/warn/regcomp. + */ +#define MARKER1 "HERE" /* marker as it appears in the description */ +#define MARKER2 " << HERE " /* marker as it appears within the regex */ + +#define REPORT_LOCATION " before " MARKER1 " mark in regex m/%.*s" MARKER2 "%s/" + +/* + * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given + * arg. Show regex, up to a maximum length. If it's too long, chop and add + * "...". + */ +#define FAIL(msg) \ + STMT_START { \ + char *ellipses = ""; \ + unsigned len = strlen(PL_regprecomp); \ + \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \ + \ + if (len > RegexLengthToShowInErrorMessages) { \ + /* chop 10 shorter than the max, to ensure meaning of "..." */ \ + len = RegexLengthToShowInErrorMessages - 10; \ + ellipses = "..."; \ + } \ + Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ + msg, (int)len, PL_regprecomp, ellipses); \ + } STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given + * args. Show regex, up to a maximum length. If it's too long, chop and add + * "...". + */ +#define FAIL2(pat,msg) \ + STMT_START { \ + char *ellipses = ""; \ + unsigned len = strlen(PL_regprecomp); \ + \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \ + \ + if (len > RegexLengthToShowInErrorMessages) { \ + /* chop 10 shorter than the max, to ensure meaning of "..." */ \ + len = RegexLengthToShowInErrorMessages - 10; \ + ellipses = "..."; \ + } \ + S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \ + msg, (int)len, PL_regprecomp, ellipses); \ + } STMT_END + + +/* + * Simple_vFAIL -- like FAIL, but marks the current location in the scan + */ +#define Simple_vFAIL(m) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \ + \ + Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ + m, (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() + */ +#define vFAIL(m) \ + STMT_START { \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \ + Simple_vFAIL(m); \ + } STMT_END + +/* + * Like Simple_vFAIL(), but accepts two arguments. + */ +#define Simple_vFAIL2(m,a1) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \ + \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ + (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). + */ +#define vFAIL2(m,a1) \ + STMT_START { \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \ + Simple_vFAIL2(m, a1); \ + } STMT_END + + +/* + * Like Simple_vFAIL(), but accepts three arguments. + */ +#define Simple_vFAIL3(m, a1, a2) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \ + \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ + (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). + */ +#define vFAIL3(m,a1,a2) \ + STMT_START { \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \ + Simple_vFAIL3(m, a1, a2); \ + } STMT_END + +/* + * Like Simple_vFAIL(), but accepts four arguments. + */ +#define Simple_vFAIL4(m, a1, a2, a3) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \ + \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\ + (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + +/* + * Like Simple_vFAIL(), but accepts five arguments. + */ +#define Simple_vFAIL5(m, a1, a2, a3, a4) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\ + (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + + +#define vWARN(loc,m) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-(loc)); \ + Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\ + m, (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END \ + + +#define vWARN2(loc, m, a1) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-(loc)); \ + Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ + a1, \ + (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + +#define vWARN3(loc, m, a1, a2) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp) - (PL_regxend - (loc)); \ + Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ + a1, a2, \ + (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + +#define vWARN4(loc, m, a1, a2, a3) \ + STMT_START { \ + unsigned offset = strlen(PL_regprecomp)-(PL_regxend-(loc)); \ + Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ + a1, a2, a3, \ + (int)offset, PL_regprecomp, PL_regprecomp + offset); \ + } STMT_END + + + /* Allow for side effects in s */ #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END @@ -213,7 +389,6 @@ static void clear_re(pTHXo_ void *r); STATIC void S_scan_commit(pTHX_ scan_data_t *data) { - dTHR; STRLEN l = CHR_SVLEN(data->last_found); STRLEN old_l = CHR_SVLEN(*data->longest); @@ -264,7 +439,7 @@ S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl) { int value; - for (value = 0; value < ANYOF_MAX; value += 2) + for (value = 0; value <= ANYOF_MAX; value += 2) if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) return 1; for (value = 0; value < 256; ++value) @@ -378,7 +553,6 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da /* deltap: Write maxlen-minlen here. */ /* last: Stop before this one. */ { - dTHR; I32 min = 0, pars = 0, code; regnode *scan = *scanp, *next; I32 delta = 0; @@ -417,21 +591,19 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da #endif n = regnext(n); } - else { + else if (stringok) { int oldl = STR_LEN(scan); regnode *nnext = regnext(n); - + if (oldl + STR_LEN(n) > U8_MAX) break; NEXT_OFF(scan) += NEXT_OFF(n); STR_LEN(scan) += STR_LEN(n); next = n + NODE_SZ_STR(n); /* Now we can overwrite *n : */ - Move(STRING(n), STRING(scan) + oldl, - STR_LEN(n), char); + Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char); #ifdef DEBUGGING - if (stringok) - stop = next - 1; + stop = next - 1; #endif n = nnext; } @@ -486,13 +658,17 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da if (flags & SCF_DO_STCLASS) cl_init_zero(&accum); while (OP(scan) == code) { - I32 deltanext, minnext, f = 0; + I32 deltanext, minnext, f = 0, fake = 0; struct regnode_charclass_class this_class; num++; data_fake.flags = 0; - if (data) + if (data) { data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; next = regnext(scan); scan = NEXTOPER(scan); if (code != BRANCH) @@ -502,6 +678,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; /* we suppose the run is continuous, last=next...*/ minnext = study_chunk(&scan, &deltanext, next, &data_fake, f); @@ -664,8 +842,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da flags &= ~SCF_DO_STCLASS; } else if (strchr((char*)PL_varies,OP(scan))) { - I32 mincount, maxcount, minnext, deltanext, pos_before, fl; - I32 f = flags; + I32 mincount, maxcount, minnext, deltanext, fl; + I32 f = flags, pos_before = 0; regnode *oscan = scan; struct regnode_charclass_class this_class; struct regnode_charclass_class *oclass = NULL; @@ -708,6 +886,11 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da mincount = ARG1(scan); maxcount = ARG2(scan); next = regnext(scan); + if (OP(scan) == CURLYX) { + I32 lp = (data ? *(data->last_closep) : 0); + + scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX); + } scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; do_curly: if (flags & SCF_DO_SUBSTR) { @@ -727,6 +910,14 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da f |= SCF_DO_STCLASS_AND; f &= ~SCF_DO_STCLASS_OR; } + /* These are the cases when once a subexpression + fails at a particular position, it cannot succeed + even after backtracking at the enclosing scope. + + XXXX what if minimal match and we are at the + initial run of {n,m}? */ + if ((mincount != maxcount - 1) && (maxcount != REG_INFTY)) + f &= ~SCF_WHILEM_VISITED_POS; /* This will finish on WHILEM, setting scan, or on NULL: */ minnext = study_chunk(&scan, &deltanext, last, data, @@ -764,8 +955,11 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0) && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) && maxcount <= REG_INFTY/3) /* Complement check for big count */ - Perl_warner(aTHX_ WARN_REGEXP, - "Strange *+?{} on zero-length expression"); + { + vWARN(PL_regcomp_parse, + "Quantifier unexpected on zero-length expression"); + } + min += minnext * mincount; is_inf_internal |= ((maxcount == REG_INFTY && (minnext + deltanext) > 0) @@ -828,7 +1022,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ if (OP(nxt) != CLOSE) - FAIL("panic opt close"); + FAIL("Panic opt close"); oscan->flags = ARG(nxt); OP(nxt1) = OPTIMIZED; /* was OPEN. */ OP(nxt) = OPTIMIZED; /* was CLOSE. */ @@ -859,8 +1053,14 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da else oscan->flags = 0; } - else if (OP(oscan) == CURLYX && data && ++data->whilem_c < 16) { - /* This stays as CURLYX, and can put the count/of pair. */ + else if ((OP(oscan) == CURLYX) + && (flags & SCF_WHILEM_VISITED_POS) + /* See the comment on a similar expression above. + However, this time it not a subexpression + we care about, but the expression itself. */ + && (maxcount == REG_INFTY) + && data && ++data->whilem_c < 16) { + /* This stays as CURLYX, we can put the count/of pair. */ /* Find WHILEM (as in regexec.c) */ regnode *nxt = oscan + NEXT_OFF(oscan); @@ -901,6 +1101,11 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da sv_catsv(data->last_found, last_str); data->last_end += l * (mincount - 1); } + } else { + /* start offset must point into the last copy */ + data->last_start_min += minnext * (mincount - 1); + data->last_start_max += is_inf ? 0 : (maxcount - 1) + * (minnext + data->pos_delta); } } /* It is counted once already... */ @@ -1169,29 +1374,35 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da && (scan->flags || data || (flags & SCF_DO_STCLASS)) && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { /* Lookahead/lookbehind */ - I32 deltanext, minnext; + I32 deltanext, minnext, fake = 0; regnode *nscan; struct regnode_charclass_class intrnl; int f = 0; data_fake.flags = 0; - if (data) + if (data) { data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ cl_init(&intrnl); data_fake.start_class = &intrnl; - f = SCF_DO_STCLASS_AND; + f |= SCF_DO_STCLASS_AND; } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); minnext = study_chunk(&nscan, &deltanext, last, &data_fake, f); if (scan->flags) { if (deltanext) { - FAIL("variable length lookbehind not implemented"); + vFAIL("Variable length lookbehind not implemented"); } else if (minnext > U8_MAX) { - FAIL2("lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); } scan->flags = minnext; } @@ -1201,7 +1412,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da data->flags |= SF_HAS_EVAL; if (data) data->whilem_c = data_fake.whilem_c; - if (f) { + if (f & SCF_DO_STCLASS_AND) { int was = (data->start_class->flags & ANYOF_EOS); cl_and(data->start_class, &intrnl); @@ -1212,11 +1423,15 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da else if (OP(scan) == OPEN) { pars++; } - else if (OP(scan) == CLOSE && ARG(scan) == is_par) { - next = regnext(scan); + else if (OP(scan) == CLOSE) { + if (ARG(scan) == is_par) { + next = regnext(scan); - if ( next && (OP(next) != WHILEM) && next < last) - is_par = 0; /* Disable optimization */ + if ( next && (OP(next) != WHILEM) && next < last) + is_par = 0; /* Disable optimization */ + } + if (data) + *(data->last_closep) = ARG(scan); } else if (OP(scan) == EVAL) { if (data) @@ -1259,7 +1474,6 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da STATIC I32 S_add_data(pTHX_ I32 n, char *s) { - dTHR; if (PL_regcomp_rx->data) { Renewc(PL_regcomp_rx->data, sizeof(*PL_regcomp_rx->data) + sizeof(void*) * (PL_regcomp_rx->data->count + n - 1), @@ -1280,7 +1494,6 @@ S_add_data(pTHX_ I32 n, char *s) void Perl_reginitcolors(pTHX) { - dTHR; int i = 0; char *s = PerlEnv_getenv("PERL_RE_COLORS"); @@ -1302,6 +1515,7 @@ Perl_reginitcolors(pTHX) PL_colorset = 1; } + /* - pregcomp - compile a regular expression into internal code * @@ -1320,7 +1534,6 @@ Perl_reginitcolors(pTHX) regexp * Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) { - dTHR; register regexp *r; regnode *scan; regnode *first; @@ -1339,7 +1552,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) else PL_reg_flags = 0; - PL_regprecomp = savepvn(exp, xend - exp); + PL_regprecomp = exp; DEBUG_r(if (!PL_colorset) reginitcolors()); DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", PL_colors[4],PL_colors[5],PL_colors[0], @@ -1365,7 +1578,6 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) REGC((U8)REG_MAGIC, (char*)PL_regcode); #endif if (reg(0, &flags) == NULL) { - Safefree(PL_regprecomp); PL_regprecomp = Nullch; return(NULL); } @@ -1384,14 +1596,15 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) Newc(1001, r, sizeof(regexp) + (unsigned)PL_regsize * sizeof(regnode), char, regexp); if (r == NULL) - FAIL("regexp out of space"); + FAIL("Regexp out of space"); + #ifdef DEBUGGING /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ Zero(r, sizeof(regexp) + (unsigned)PL_regsize * sizeof(regnode), char); #endif r->refcnt = 1; r->prelen = xend - exp; - r->precomp = PL_regprecomp; + r->precomp = savepvn(PL_regprecomp, r->prelen); r->subbeg = NULL; r->reganch = pm->op_pmflags & PMf_COMPILETIME; r->nparens = PL_regnpar - 1; /* set early to validate backrefs */ @@ -1436,6 +1649,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) STRLEN longest_float_length, longest_fixed_length; struct regnode_charclass_class ch_class; int stclass_flag; + I32 last_close = 0; first = scan; /* Skip introductions and multiplicators >= 1. */ @@ -1528,9 +1742,10 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) stclass_flag = SCF_DO_STCLASS_AND; } else /* XXXX Check for BOUND? */ stclass_flag = 0; + data.last_closep = &last_close; minlen = study_chunk(&first, &fake, scan + PL_regsize, /* Up to end */ - &data, SCF_DO_SUBSTR | stclass_flag); + &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag); if ( PL_regnpar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 && !PL_seen_zerolen @@ -1632,12 +1847,14 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* Several toplevels. Best we can is to set minlen. */ I32 fake; struct regnode_charclass_class ch_class; + I32 last_close = 0; DEBUG_r(PerlIO_printf(Perl_debug_log, "\n")); scan = r->program + 1; cl_init(&ch_class); data.start_class = &ch_class; - minlen = study_chunk(&scan, &fake, scan + PL_regsize, &data, SCF_DO_STCLASS_AND); + data.last_closep = &last_close; + minlen = study_chunk(&scan, &fake, scan + PL_regsize, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS); r->check_substr = r->anchored_substr = r->float_substr = Nullsv; if (!(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { @@ -1667,6 +1884,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->reganch |= ROPT_EVAL_SEEN; Newz(1002, r->startp, PL_regnpar, I32); Newz(1002, r->endp, PL_regnpar, I32); + PL_regdata = r->data; /* for regprop() ANYOFUTF8 */ DEBUG_r(regdump(r)); return(r); } @@ -1684,13 +1902,13 @@ STATIC regnode * S_reg(pTHX_ I32 paren, I32 *flagp) /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ { - dTHR; register regnode *ret; /* Will be the head of the group. */ register regnode *br; register regnode *lastbr; register regnode *ender = 0; register I32 parno = 0; I32 flags, oregflags = PL_regflags, have_branch = 0, open = 0; + char *oregcomp_parse = PL_regcomp_parse; char c; *flagp = 0; /* Tentatively. */ @@ -1701,6 +1919,7 @@ S_reg(pTHX_ I32 paren, I32 *flagp) U16 posflags = 0, negflags = 0; U16 *flagsp = &posflags; int logical = 0; + char *seqstart = PL_regcomp_parse; PL_regcomp_parse++; paren = *PL_regcomp_parse++; @@ -1721,7 +1940,7 @@ S_reg(pTHX_ I32 paren, I32 *flagp) break; case '$': case '@': - FAIL2("Sequence (?%c...) not implemented", (int)paren); + vFAIL2("Sequence (?%c...) not implemented", (int)paren); break; case '#': while (*PL_regcomp_parse && *PL_regcomp_parse != ')') @@ -1733,8 +1952,7 @@ S_reg(pTHX_ I32 paren, I32 *flagp) return NULL; case 'p': if (SIZE_ONLY) - Perl_warner(aTHX_ WARN_REGEXP, - "(?p{}) is deprecated - use (??{})"); + vWARN(PL_regcomp_parse, "(?p{}) is deprecated - use (??{})"); /* FALL THROUGH*/ case '?': logical = 1; @@ -1742,7 +1960,6 @@ S_reg(pTHX_ I32 paren, I32 *flagp) /* FALL THROUGH */ case '{': { - dTHR; I32 count = 1, n = 0; char c; char *s = PL_regcomp_parse; @@ -1761,7 +1978,10 @@ S_reg(pTHX_ I32 paren, I32 *flagp) PL_regcomp_parse++; } if (*PL_regcomp_parse != ')') - FAIL("Sequence (?{...}) not terminated or not {}-balanced"); + { + PL_regcomp_parse = s; + vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); + } if (!SIZE_ONLY) { AV *av; @@ -1770,7 +1990,10 @@ S_reg(pTHX_ I32 paren, I32 *flagp) else sv = newSVpvn("", 0); + ENTER; + Perl_save_re_context(aTHX); rop = sv_compile_2op(sv, &sop, "re", &av); + LEAVE; n = add_data(3, "nop"); PL_regcomp_rx->data->data[n] = (void*)rop; @@ -1820,7 +2043,7 @@ S_reg(pTHX_ I32 paren, I32 *flagp) PL_regcomp_parse++; ret = reganode(GROUPP, parno); if ((c = *nextchar()) != ')') - FAIL2("Switch (?(number%c not recognized", c); + vFAIL("Switch condition not recognized"); insert_if: regtail(ret, reganode(IFTHEN, 0)); br = regbranch(&flags, 1); @@ -1842,7 +2065,7 @@ S_reg(pTHX_ I32 paren, I32 *flagp) else lastbr = NULL; if (c != ')') - FAIL("Switch (?(condition)... contains too many branches"); + vFAIL("Switch (?(condition)... contains too many branches"); ender = reg_node(TAIL); regtail(br, ender); if (lastbr) { @@ -1854,11 +2077,12 @@ S_reg(pTHX_ I32 paren, I32 *flagp) return ret; } else { - FAIL2("Unknown condition for (?(%.2s", PL_regcomp_parse); + vFAIL2("Unknown switch condition (?(%.2s", PL_regcomp_parse); } } case 0: - FAIL("Sequence (? incomplete"); + PL_regcomp_parse--; /* for vFAIL to print correctly */ + vFAIL("Sequence (? incomplete"); break; default: --PL_regcomp_parse; @@ -1881,8 +2105,10 @@ S_reg(pTHX_ I32 paren, I32 *flagp) break; } unknown: - if (*PL_regcomp_parse != ')') - FAIL2("Sequence (?%c...) not recognized", *PL_regcomp_parse); + if (*PL_regcomp_parse != ')') { + PL_regcomp_parse++; + vFAIL3("Sequence (%.*s...) not recognized", PL_regcomp_parse-seqstart, seqstart); + } nextchar(); *flagp = TRYAGAIN; return NULL; @@ -1994,15 +2220,17 @@ S_reg(pTHX_ I32 paren, I32 *flagp) if (paren) { PL_regflags = oregflags; if (PL_regcomp_parse >= PL_regxend || *nextchar() != ')') { - FAIL("unmatched () in regexp"); + PL_regcomp_parse = oregcomp_parse; + vFAIL("Unmatched ("); } } else if (!paren && PL_regcomp_parse < PL_regxend) { if (*PL_regcomp_parse == ')') { - FAIL("unmatched () in regexp"); + PL_regcomp_parse++; + vFAIL("Unmatched )"); } else - FAIL("junk on end of regexp"); /* "Can't happen". */ + FAIL("Junk on end of regexp"); /* "Can't happen". */ /* NOTREACHED */ } @@ -2017,7 +2245,6 @@ S_reg(pTHX_ I32 paren, I32 *flagp) STATIC regnode * S_regbranch(pTHX_ I32 *flagp, I32 first) { - dTHR; register regnode *ret; register regnode *chain = NULL; register regnode *latest; @@ -2083,7 +2310,6 @@ S_regbranch(pTHX_ I32 *flagp, I32 first) STATIC regnode * S_regpiece(pTHX_ I32 *flagp) { - dTHR; register regnode *ret; register char op; register char *next; @@ -2127,7 +2353,7 @@ S_regpiece(pTHX_ I32 *flagp) if (!max && *maxpos != '0') max = REG_INFTY; /* meaning "infinity" */ else if (max >= REG_INFTY) - FAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); PL_regcomp_parse = next; nextchar(); @@ -2161,7 +2387,7 @@ S_regpiece(pTHX_ I32 *flagp) if (max > 0) *flagp |= HASWIDTH; if (max && max < min) - FAIL("Can't do {n,m} with n > m"); + vFAIL("Can't do {n,m} with n > m"); if (!SIZE_ONLY) { ARG1_SET(ret, min); ARG2_SET(ret, max); @@ -2177,8 +2403,19 @@ S_regpiece(pTHX_ I32 *flagp) } #if 0 /* Now runtime fix should be reliable. */ + + /* if this is reinstated, don't forget to put this back into perldiag: + + =item Regexp *+ operand could be empty at {#} in regex m/%s/ + + (F) The part of the regexp subject to either the * or + quantifier + could match an empty string. The {#} shows in the regular + expression about where the problem was discovered. + + */ + if (!(flags&HASWIDTH) && op != '?') - FAIL("regexp *+ operand could be empty"); + vFAIL("Regexp *+ operand could be empty"); #endif nextchar(); @@ -2209,8 +2446,10 @@ S_regpiece(pTHX_ I32 *flagp) } nest_check: if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) { - Perl_warner(aTHX_ WARN_REGEXP, "%.*s matches null string many times", - PL_regcomp_parse - origparse, origparse); + vWARN3(PL_regcomp_parse, + "%.*s matches null string many times", + PL_regcomp_parse - origparse, + origparse); } if (*PL_regcomp_parse == '?') { @@ -2218,8 +2457,10 @@ S_regpiece(pTHX_ I32 *flagp) reginsert(MINMOD, ret); regtail(ret, ret + NODE_STEP_REGNODE); } - if (ISMULT2(PL_regcomp_parse)) - FAIL("nested *?+ in regexp"); + if (ISMULT2(PL_regcomp_parse)) { + PL_regcomp_parse++; + vFAIL("Nested quantifiers"); + } return(ret); } @@ -2232,12 +2473,10 @@ S_regpiece(pTHX_ I32 *flagp) * faster to run. Backslashed characters are exceptions, each becoming a * separate node; the code is simpler that way and it's not worth fixing. * - * [Yes, it is worth fixing, some scripts can run twice the speed.] - */ + * [Yes, it is worth fixing, some scripts can run twice the speed.] */ STATIC regnode * S_regatom(pTHX_ I32 *flagp) { - dTHR; register regnode *ret = 0; I32 flags; @@ -2256,9 +2495,9 @@ tryagain: ret = reg_node(BOL); break; case '$': - if (PL_regcomp_parse[1]) - PL_seen_zerolen++; nextchar(); + if (*PL_regcomp_parse) + PL_seen_zerolen++; if (PL_regflags & PMf_MULTILINE) ret = reg_node(MEOL); else if (PL_regflags & PMf_SINGLELINE) @@ -2285,19 +2524,29 @@ tryagain: PL_regnaughty++; break; case '[': - PL_regcomp_parse++; + { + char *oregcomp_parse = ++PL_regcomp_parse; ret = (UTF ? regclassutf8() : regclass()); - if (*PL_regcomp_parse != ']') - FAIL("unmatched [] in regexp"); + if (*PL_regcomp_parse != ']') { + PL_regcomp_parse = oregcomp_parse; + vFAIL("Unmatched ["); + } nextchar(); *flagp |= HASWIDTH|SIMPLE; break; + } case '(': nextchar(); ret = reg(1, &flags); if (ret == NULL) { - if (flags & TRYAGAIN) + if (flags & TRYAGAIN) { + if (PL_regcomp_parse == PL_regxend) { + /* Make parent create an empty node if needed. */ + *flagp |= TRYAGAIN; + return(NULL); + } goto tryagain; + } return(NULL); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE); @@ -2308,7 +2557,7 @@ tryagain: *flagp |= TRYAGAIN; return NULL; } - FAIL2("internal urp in regexp at /%s/", PL_regcomp_parse); + vFAIL("Internal urp"); /* Supposed to be caught earlier. */ break; case '{': @@ -2320,7 +2569,8 @@ tryagain: case '?': case '+': case '*': - FAIL("?+*{} follows nothing in regexp"); + PL_regcomp_parse++; + vFAIL("Quantifier follows nothing"); break; case '\\': switch (*++PL_regcomp_parse) { @@ -2444,8 +2694,11 @@ tryagain: if (PL_regcomp_parse[1] == '{') { PL_regxend = strchr(PL_regcomp_parse, '}'); - if (!PL_regxend) - FAIL("Missing right brace on \\p{}"); + if (!PL_regxend) { + PL_regcomp_parse += 2; + PL_regxend = oldregxend; + vFAIL("Missing right brace on \\p{}"); + } PL_regxend++; } else @@ -2478,15 +2731,16 @@ tryagain: if (num > 9 && num >= PL_regnpar) goto defchar; else { + while (isDIGIT(*PL_regcomp_parse)) + PL_regcomp_parse++; + if (!SIZE_ONLY && num > PL_regcomp_rx->nparens) - FAIL("reference to nonexistent group"); + vFAIL("Reference to nonexistent group"); PL_regsawback = 1; ret = reganode(FOLD ? (LOC ? REFFL : REFF) : REF, num); *flagp |= HASWIDTH; - while (isDIGIT(*PL_regcomp_parse)) - PL_regcomp_parse++; PL_regcomp_parse--; nextchar(); } @@ -2494,7 +2748,7 @@ tryagain: break; case '\0': if (PL_regcomp_parse >= PL_regxend) - FAIL("trailing \\ in regexp"); + FAIL("Trailing \\"); /* FALL THROUGH */ default: /* Do not generate `unrecognized' warnings here, we fall @@ -2512,11 +2766,11 @@ tryagain: /* FALL THROUGH */ default: { - register I32 len; + register STRLEN len; register UV ender; register char *p; char *oldp, *s; - I32 numlen; + STRLEN numlen; PL_regcomp_parse++; @@ -2596,20 +2850,23 @@ tryagain: if (*++p == '{') { char* e = strchr(p, '}'); - if (!e) - FAIL("Missing right brace on \\x{}"); - else if (UTF) { - ender = (UV)scan_hex(p + 1, e - p, &numlen); - if (numlen + len >= 127) { /* numlen is generous */ + if (!e) { + PL_regcomp_parse = p + 1; + vFAIL("Missing right brace on \\x{}"); + } + else { + numlen = 1; /* allow underscores */ + ender = (UV)scan_hex(p + 1, e - p - 1, &numlen); + /* numlen is generous */ + if (numlen + len >= 127) { p--; goto loopdone; } p = e + 1; } - else - FAIL("Can't use \\x{} without 'use utf8' declaration"); } else { + numlen = 0; /* disallow underscores */ ender = (UV)scan_hex(p, 2, &numlen); p += numlen; } @@ -2623,6 +2880,7 @@ tryagain: case '5': case '6': case '7': case '8':case '9': if (*p == '0' || (isDIGIT(p[1]) && atoi(p) >= PL_regnpar) ) { + numlen = 0; /* disallow underscores */ ender = (UV)scan_oct(p, 3, &numlen); p += numlen; } @@ -2633,21 +2891,19 @@ tryagain: break; case '\0': if (p >= PL_regxend) - FAIL("trailing \\ in regexp"); + FAIL("Trailing \\"); /* FALL THROUGH */ default: if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p)) - Perl_warner(aTHX_ WARN_REGEXP, - "/%.127s/: Unrecognized escape \\%c passed through", - PL_regprecomp, - *p); + vWARN2(p +1, "Unrecognized escape \\%c passed through", *p); goto normal_default; } break; default: normal_default: - if ((*p & 0xc0) == 0xc0 && UTF) { - ender = utf8_to_uv((U8*)p, &numlen); + if (UTF8_IS_START(*p) && UTF) { + ender = utf8_to_uv((U8*)p, PL_regxend - p, + &numlen, 0); p += numlen; } else @@ -2665,6 +2921,8 @@ tryagain: if (ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; + /* ender is a Unicode value so it can be > 0xff -- + * in other words, do not use UTF8_IS_CONTINUED(). */ else if (ender >= 0x80 && UTF) { reguni(ender, s, &numlen); s += numlen; @@ -2676,6 +2934,8 @@ tryagain: } break; } + /* ender is a Unicode value so it can be > 0xff -- + * in other words, do not use UTF8_IS_CONTINUED(). */ if (ender >= 0x80 && UTF) { reguni(ender, s, &numlen); s += numlen; @@ -2687,8 +2947,12 @@ tryagain: loopdone: PL_regcomp_parse = p - 1; nextchar(); - if (len < 0) - FAIL("internal disaster in regexp"); + { + /* len is STRLEN which is unsigned, need to copy to signed */ + IV iv = len; + if (iv < 0) + vFAIL("Internal disaster"); + } if (len > 0) *flagp |= HASWIDTH; if (len == 1) @@ -2731,7 +2995,6 @@ S_regwhite(pTHX_ char *p, char *e) STATIC I32 S_regpposixcc(pTHX_ I32 value) { - dTHR; char *posixcc = 0; I32 namedclass = OOB_NAMEDCLASS; @@ -2770,6 +3033,11 @@ S_regpposixcc(pTHX_ I32 value) namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII; break; + case 'b': + if (strnEQ(posixcc, "blank", 5)) + namedclass = + complement ? ANYOF_NBLANK : ANYOF_BLANK; + break; case 'c': if (strnEQ(posixcc, "cntrl", 5)) namedclass = @@ -2801,7 +3069,8 @@ S_regpposixcc(pTHX_ I32 value) case 's': if (strnEQ(posixcc, "space", 5)) namedclass = - complement ? ANYOF_NSPACE : ANYOF_SPACE; + complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC; + break; case 'u': if (strnEQ(posixcc, "upper", 5)) namedclass = @@ -2825,13 +3094,19 @@ S_regpposixcc(pTHX_ I32 value) if (namedclass == OOB_NAMEDCLASS || posixcc[skip] != ':' || posixcc[skip+1] != ']') - Perl_croak(aTHX_ - "Character class [:%.*s:] unknown", - t - s - 1, s + 1); - } else if (ckWARN(WARN_REGEXP) && !SIZE_ONLY) + { + Simple_vFAIL3("POSIX class [:%.*s:] unknown", + t - s - 1, s + 1); + } + } else if (!SIZE_ONLY) { /* [[=foo=]] and [[.foo.]] are still future. */ - Perl_warner(aTHX_ WARN_REGEXP, - "Character class syntax [%c %c] is reserved for future extensions", c, c); + + /* adjust PL_regcomp_parse so the warning shows after + the class closes */ + while (*PL_regcomp_parse && *PL_regcomp_parse != ']') + PL_regcomp_parse++; + Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); + } } else { /* Maternal grandfather: * "[:" ending in ":" but not in ":]" */ @@ -2856,11 +3131,17 @@ S_checkposixcc(pTHX) while(*s && isALNUM(*s)) s++; if (*s && c == *s && s[1] == ']') { - Perl_warner(aTHX_ WARN_REGEXP, - "Character class syntax [%c %c] belongs inside character classes", c, c); + vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c); + + /* [[=foo=]] and [[.foo.]] are still future. */ if (c == '=' || c == '.') - Perl_warner(aTHX_ WARN_REGEXP, - "Character class syntax [%c %c] is reserved for future extensions", c, c); + { + /* adjust PL_regcomp_parse so the error shows after + the class closes */ + while (*PL_regcomp_parse && *PL_regcomp_parse++ != ']') + ; + Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); + } } } } @@ -2868,12 +3149,11 @@ S_checkposixcc(pTHX) STATIC regnode * S_regclass(pTHX) { - dTHR; register U32 value; register I32 lastvalue = OOB_CHAR8; register I32 range = 0; register regnode *ret; - I32 numlen; + STRLEN numlen; I32 namedclass; char *rangebegin; bool need_class = 0; @@ -2913,7 +3193,7 @@ S_regclass(pTHX) else if (value == '\\') { value = UCHARAT(PL_regcomp_parse++); /* Some compilers cannot handle switching on 64-bit integer - * values, therefore value cannot be an UV. --jhi */ + * values, therefore the 'value' cannot be an UV. --jhi */ switch (value) { case 'w': namedclass = ANYOF_ALNUM; break; case 'W': namedclass = ANYOF_NALNUM; break; @@ -2934,6 +3214,7 @@ S_regclass(pTHX) case 'a': value = '\057'; break; #endif case 'x': + numlen = 0; /* disallow underscores */ value = (UV)scan_hex(PL_regcomp_parse, 2, &numlen); PL_regcomp_parse += numlen; break; @@ -2943,15 +3224,14 @@ S_regclass(pTHX) break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + numlen = 0; /* disallow underscores */ value = (UV)scan_oct(--PL_regcomp_parse, 3, &numlen); PL_regcomp_parse += numlen; break; default: if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) - Perl_warner(aTHX_ WARN_REGEXP, - "/%.127s/: Unrecognized escape \\%c in character class passed through", - PL_regprecomp, - (int)value); + + vWARN2(PL_regcomp_parse, "Unrecognized escape \\%c in character class passed through", (int)value); break; } } @@ -2962,12 +3242,11 @@ S_regclass(pTHX) if (range) { /* a-\d, a-[:digit:] */ if (!SIZE_ONLY) { if (ckWARN(WARN_REGEXP)) - Perl_warner(aTHX_ WARN_REGEXP, - "/%.127s/: false [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); + vWARN4(PL_regcomp_parse, + "False [] range \"%*.*s\"", + PL_regcomp_parse - rangebegin, + PL_regcomp_parse - rangebegin, + rangebegin); ANYOF_BITMAP_SET(ret, lastvalue); ANYOF_BITMAP_SET(ret, '-'); } @@ -3093,6 +3372,24 @@ S_regclass(pTHX) #endif /* EBCDIC */ } break; + case ANYOF_BLANK: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_BLANK); + else { + for (value = 0; value < 256; value++) + if (isBLANK(value)) + ANYOF_BITMAP_SET(ret, value); + } + break; + case ANYOF_NBLANK: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NBLANK); + else { + for (value = 0; value < 256; value++) + if (!isBLANK(value)) + ANYOF_BITMAP_SET(ret, value); + } + break; case ANYOF_CNTRL: if (LOC) ANYOF_CLASS_SET(ret, ANYOF_CNTRL); @@ -3166,6 +3463,24 @@ S_regclass(pTHX) ANYOF_BITMAP_SET(ret, value); } break; + case ANYOF_PSXSPC: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_PSXSPC); + else { + for (value = 0; value < 256; value++) + if (isPSXSPC(value)) + ANYOF_BITMAP_SET(ret, value); + } + break; + case ANYOF_NPSXSPC: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC); + else { + for (value = 0; value < 256; value++) + if (!isPSXSPC(value)) + ANYOF_BITMAP_SET(ret, value); + } + break; case ANYOF_PUNCT: if (LOC) ANYOF_CLASS_SET(ret, ANYOF_PUNCT); @@ -3221,7 +3536,7 @@ S_regclass(pTHX) } break; default: - FAIL("invalid [::] class in regexp"); + vFAIL("Invalid [::] class"); break; } if (LOC) @@ -3231,12 +3546,10 @@ S_regclass(pTHX) } if (range) { if (lastvalue > value) /* b-a */ { - Perl_croak(aTHX_ - "/%.127s/: invalid [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); + Simple_vFAIL4("Invalid [] range \"%*.*s\"", + PL_regcomp_parse - rangebegin, + PL_regcomp_parse - rangebegin, + rangebegin); } range = 0; } @@ -3247,12 +3560,11 @@ S_regclass(pTHX) PL_regcomp_parse++; if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ if (ckWARN(WARN_REGEXP)) - Perl_warner(aTHX_ WARN_REGEXP, - "/%.127s/: false [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); + vWARN4(PL_regcomp_parse, + "False [] range \"%*.*s\"", + PL_regcomp_parse - rangebegin, + PL_regcomp_parse - rangebegin, + rangebegin); if (!SIZE_ONLY) ANYOF_BITMAP_SET(ret, '-'); } else @@ -3313,13 +3625,12 @@ S_regclass(pTHX) STATIC regnode * S_regclassutf8(pTHX) { - dTHR; register char *e; register U32 value; register U32 lastvalue = OOB_UTF8; register I32 range = 0; register regnode *ret; - I32 numlen; + STRLEN numlen; I32 n; SV *listsv; U8 flags = 0; @@ -3337,7 +3648,7 @@ S_regclassutf8(pTHX) flags |= ANYOF_FOLD; if (LOC) flags |= ANYOF_LOCALE; - listsv = newSVpvn("# comment\n",10); + listsv = newSVpvn("# comment\n", 10); } if (!SIZE_ONLY && ckWARN(WARN_REGEXP)) @@ -3351,12 +3662,16 @@ S_regclassutf8(pTHX) namedclass = OOB_NAMEDCLASS; if (!range) rangebegin = PL_regcomp_parse; - value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen); + value = utf8_to_uv((U8*)PL_regcomp_parse, + PL_regxend - PL_regcomp_parse, + &numlen, 0); PL_regcomp_parse += numlen; if (value == '[') namedclass = regpposixcc(value); else if (value == '\\') { - value = (U32)utf8_to_uv((U8*)PL_regcomp_parse, &numlen); + value = (U32)utf8_to_uv((U8*)PL_regcomp_parse, + PL_regxend - PL_regcomp_parse, + &numlen, 0); PL_regcomp_parse += numlen; /* Some compilers cannot handle switching on 64-bit integer * values, therefore value cannot be an UV. Yes, this will @@ -3373,7 +3688,7 @@ S_regclassutf8(pTHX) if (*PL_regcomp_parse == '{') { e = strchr(PL_regcomp_parse++, '}'); if (!e) - FAIL("Missing right brace on \\p{}"); + vFAIL("Missing right brace on \\p{}"); n = e - PL_regcomp_parse; } else { @@ -3406,14 +3721,16 @@ S_regclassutf8(pTHX) case 'x': if (*PL_regcomp_parse == '{') { e = strchr(PL_regcomp_parse++, '}'); - if (!e) - FAIL("Missing right brace on \\x{}"); + if (!e) + vFAIL("Missing right brace on \\x{}"); + numlen = 1; /* allow underscores */ value = (UV)scan_hex(PL_regcomp_parse, e - PL_regcomp_parse, &numlen); PL_regcomp_parse = e + 1; } else { + numlen = 0; /* disallow underscores */ value = (UV)scan_hex(PL_regcomp_parse, 2, &numlen); PL_regcomp_parse += numlen; } @@ -3424,15 +3741,15 @@ S_regclassutf8(pTHX) break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + numlen = 0; /* disallow underscores */ value = (UV)scan_oct(--PL_regcomp_parse, 3, &numlen); PL_regcomp_parse += numlen; break; default: if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) - Perl_warner(aTHX_ WARN_REGEXP, - "/%.127s/: Unrecognized escape \\%c in character class passed through", - PL_regprecomp, - (int)value); + vWARN2(PL_regcomp_parse, + "Unrecognized escape \\%c in character class passed through", + (int)value); break; } } @@ -3440,12 +3757,11 @@ S_regclassutf8(pTHX) if (range) { /* a-\d, a-[:digit:] */ if (!SIZE_ONLY) { if (ckWARN(WARN_REGEXP)) - Perl_warner(aTHX_ WARN_REGEXP, - "/%.127s/: false [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); + vWARN4(PL_regcomp_parse, + "False [] range \"%*.*s\"", + PL_regcomp_parse - rangebegin, + PL_regcomp_parse - rangebegin, + rangebegin); Perl_sv_catpvf(aTHX_ listsv, /* 0x002D is Unicode for '-' */ "%04"UVxf"\n002D\n", (UV)lastvalue); @@ -3495,8 +3811,16 @@ S_regclassutf8(pTHX) case ANYOF_NPUNCT: Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break; case ANYOF_SPACE: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");break; case ANYOF_NSPACE: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");break; + case ANYOF_BLANK: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); break; + case ANYOF_NBLANK: + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); break; + case ANYOF_PSXSPC: + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; + case ANYOF_NPSXSPC: Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; case ANYOF_UPPER: Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; @@ -3512,12 +3836,10 @@ S_regclassutf8(pTHX) } if (range) { if (lastvalue > value) { /* b-a */ - Perl_croak(aTHX_ - "/%.127s/: invalid [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); + Simple_vFAIL4("Invalid [] range \"%*.*s\"", + PL_regcomp_parse - rangebegin, + PL_regcomp_parse - rangebegin, + rangebegin); } range = 0; } @@ -3528,12 +3850,11 @@ S_regclassutf8(pTHX) PL_regcomp_parse++; if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ if (ckWARN(WARN_REGEXP)) - Perl_warner(aTHX_ WARN_REGEXP, - "/%.127s/: false [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); + vWARN4(PL_regcomp_parse, + "False [] range \"%*.*s\"", + PL_regcomp_parse - rangebegin, + PL_regcomp_parse - rangebegin, + rangebegin); if (!SIZE_ONLY) Perl_sv_catpvf(aTHX_ listsv, /* 0x002D is Unicode for '-' */ @@ -3554,7 +3875,14 @@ S_regclassutf8(pTHX) if (!SIZE_ONLY) { SV *rv = swash_init("utf8", "", listsv, 1, 0); +#ifdef DEBUGGING + AV *av = newAV(); + av_push(av, rv); + av_push(av, listsv); + rv = newRV_noinc((SV*)av); +#else SvREFCNT_dec(listsv); +#endif n = add_data(1,"s"); PL_regcomp_rx->data->data[n] = (void*)rv; ARG1_SET(ret, flags); @@ -3567,7 +3895,6 @@ S_regclassutf8(pTHX) STATIC char* S_nextchar(pTHX) { - dTHR; char* retval = PL_regcomp_parse++; for (;;) { @@ -3600,7 +3927,6 @@ S_nextchar(pTHX) STATIC regnode * /* Location. */ S_reg_node(pTHX_ U8 op) { - dTHR; register regnode *ret; register regnode *ptr; @@ -3625,7 +3951,6 @@ S_reg_node(pTHX_ U8 op) STATIC regnode * /* Location. */ S_reganode(pTHX_ U8 op, U32 arg) { - dTHR; register regnode *ret; register regnode *ptr; @@ -3648,16 +3973,9 @@ S_reganode(pTHX_ U8 op, U32 arg) - reguni - emit (if appropriate) a Unicode character */ STATIC void -S_reguni(pTHX_ UV uv, char* s, I32* lenp) +S_reguni(pTHX_ UV uv, char* s, STRLEN* lenp) { - dTHR; - if (SIZE_ONLY) { - U8 tmpbuf[UTF8_MAXLEN]; - *lenp = uv_to_utf8(tmpbuf, uv) - tmpbuf; - } - else - *lenp = uv_to_utf8((U8*)s, uv) - (U8*)s; - + *lenp = SIZE_ONLY ? UNISKIP(uv) : (uv_to_utf8((U8*)s, uv) - (U8*)s); } /* @@ -3668,7 +3986,6 @@ S_reguni(pTHX_ UV uv, char* s, I32* lenp) STATIC void S_reginsert(pTHX_ U8 op, regnode *opnd) { - dTHR; register regnode *src; register regnode *dst; register regnode *place; @@ -3699,7 +4016,6 @@ S_reginsert(pTHX_ U8 op, regnode *opnd) STATIC void S_regtail(pTHX_ regnode *p, regnode *val) { - dTHR; register regnode *scan; register regnode *temp; @@ -3729,7 +4045,6 @@ S_regtail(pTHX_ regnode *p, regnode *val) STATIC void S_regoptail(pTHX_ regnode *p, regnode *val) { - dTHR; /* "Operandless" and "op != BRANCH" are synonymous in practice. */ if (p == NULL || SIZE_ONLY) return; @@ -3843,7 +4158,6 @@ void Perl_regdump(pTHX_ regexp *r) { #ifdef DEBUGGING - dTHR; SV *sv = sv_newmortal(); (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0); @@ -3910,7 +4224,7 @@ Perl_regdump(pTHX_ regexp *r) STATIC void S_put_byte(pTHX_ SV *sv, int c) { - if (c <= ' ' || c == 127 || c == 255) + if (isCNTRL(c) || c == 127 || c == 255) Perl_sv_catpvf(aTHX_ sv, "\\%o", c); else if (c == '-' || c == ']' || c == '\\' || c == '^') Perl_sv_catpvf(aTHX_ sv, "\\%c", c); @@ -3925,12 +4239,11 @@ void Perl_regprop(pTHX_ SV *sv, regnode *o) { #ifdef DEBUGGING - dTHR; register int k; sv_setpvn(sv, "", 0); if (OP(o) >= reg_num) /* regnode.type is unsigned */ - FAIL("corrupted regexp opcode"); + FAIL("Corrupted regexp opcode"); sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */ k = PL_regkind[(U8)OP(o)]; @@ -3939,7 +4252,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0], STR_LEN(o), STRING(o), PL_colors[1]); else if (k == CURLY) { - if (OP(o) == CURLYM || OP(o) == CURLYN) + if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); } @@ -3951,8 +4264,10 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ else if (k == ANYOF) { int i, rangestart = -1; - const char * const out[] = { /* Should be syncronized with - a table in regcomp.h */ + bool anyofutf8 = OP(o) == ANYOFUTF8; + U8 flags = anyofutf8 ? ARG1(o) : o->flags; + const char * const anyofs[] = { /* Should be syncronized with + * ANYOF_ #xdefines in regcomp.h */ "\\w", "\\W", "\\s", @@ -3976,38 +4291,94 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) "[:punct:]", "[:^punct:]", "[:upper:]", - "[:!upper:]", + "[:^upper:]", "[:xdigit:]", - "[:^xdigit:]" + "[:^xdigit:]", + "[:space:]", + "[:^space:]", + "[:blank:]", + "[:^blank:]" }; - if (o->flags & ANYOF_LOCALE) + if (flags & ANYOF_LOCALE) sv_catpv(sv, "{loc}"); - if (o->flags & ANYOF_FOLD) + if (flags & ANYOF_FOLD) sv_catpv(sv, "{i}"); Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); - if (o->flags & ANYOF_INVERT) + if (flags & ANYOF_INVERT) sv_catpv(sv, "^"); - for (i = 0; i <= 256; i++) { - if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { - if (rangestart == -1) - rangestart = i; - } else if (rangestart != -1) { - if (i <= rangestart + 3) - for (; rangestart < i; rangestart++) + if (OP(o) == ANYOF) { + for (i = 0; i <= 256; i++) { + if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + if (i <= rangestart + 3) + for (; rangestart < i; rangestart++) + put_byte(sv, rangestart); + else { put_byte(sv, rangestart); - else { - put_byte(sv, rangestart); - sv_catpv(sv, "-"); - put_byte(sv, i - 1); + sv_catpv(sv, "-"); + put_byte(sv, i - 1); + } + rangestart = -1; + } + } + if (o->flags & ANYOF_CLASS) + for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++) + if (ANYOF_CLASS_TEST(o,i)) + sv_catpv(sv, anyofs[i]); + } + else { + SV *rv = (SV*)PL_regdata->data[ARG2(o)]; + AV *av = (AV*)SvRV((SV*)rv); + SV *sw = *av_fetch(av, 0, FALSE); + SV *lv = *av_fetch(av, 1, FALSE); + UV i; + U8 s[UTF8_MAXLEN+1]; + for (i = 0; i <= 256; i++) { /* just the first 256 */ + U8 *e = uv_to_utf8(s, i); + if (i < 256 && swash_fetch(sw, s)) { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + U8 *p; + + if (i <= rangestart + 3) + for (; rangestart < i; rangestart++) { + for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++) + put_byte(sv, *p); + } + else { + for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++) + put_byte(sv, *p); + sv_catpv(sv, "-"); + for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++) + put_byte(sv, *p); + } + rangestart = -1; + } + } + sv_catpv(sv, "..."); + { + char *s = savepv(SvPVX(lv)); + + while(*s && *s != '\n') s++; + if (*s == '\n') { + char *t = ++s; + + while (*s) { + if (*s == '\n') + *s = ' '; + s++; + } + if (s[-1] == ' ') + s[-1] = 0; + + sv_catpv(sv, t); } - rangestart = -1; } } - if (o->flags & ANYOF_CLASS) - for (i = 0; i < sizeof(out)/sizeof(char*); i++) - if (ANYOF_CLASS_TEST(o,i)) - sv_catpv(sv, out[i]); Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) @@ -4037,7 +4408,6 @@ Perl_re_intuit_string(pTHX_ regexp *prog) void Perl_pregfree(pTHX_ struct regexp *r) { - dTHR; DEBUG_r(if (!PL_colorset) reginitcolors()); if (!r || (--r->refcnt > 0)) @@ -4082,8 +4452,13 @@ Perl_pregfree(pTHX_ struct regexp *r) Perl_croak(aTHX_ "panic: pregfree comppad"); old_comppad = PL_comppad; old_curpad = PL_curpad; - PL_comppad = new_comppad; - PL_curpad = AvARRAY(new_comppad); + /* Watch out for global destruction's random ordering. */ + if (SvTYPE(new_comppad) == SVt_PVAV) { + PL_comppad = new_comppad; + PL_curpad = AvARRAY(new_comppad); + } + else + PL_curpad = NULL; op_free((OP_4tree*)r->data->data[n]); PL_comppad = old_comppad; PL_curpad = old_curpad; @@ -4113,7 +4488,6 @@ Perl_pregfree(pTHX_ struct regexp *r) regnode * Perl_regnext(pTHX_ register regnode *p) { - dTHR; register I32 offset; if (p == &PL_regdummy) @@ -4165,7 +4539,6 @@ S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) void Perl_save_re_context(pTHX) { - dTHR; SAVEPPTR(PL_bostr); SAVEPPTR(PL_regprecomp); /* uncompiled string. */ SAVEI32(PL_regnpar); /* () count. */ @@ -4179,9 +4552,8 @@ Perl_save_re_context(pTHX) SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */ SAVEPPTR(PL_regtill); /* How far we are required to go. */ SAVEI8(PL_regprev); /* char before regbol, \n if none */ - SAVEVPTR(PL_reg_start_tmp); /* from regexec.c */ + SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */ PL_reg_start_tmp = 0; - SAVEFREEPV(PL_reg_start_tmp); SAVEI32(PL_reg_start_tmpl); /* from regexec.c */ PL_reg_start_tmpl = 0; SAVEVPTR(PL_regdata); @@ -4207,6 +4579,7 @@ Perl_save_re_context(pTHX) SAVEI32(PL_reg_oldpos); /* from regexec.c */ SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */ SAVEVPTR(PL_reg_curpm); /* from regexec.c */ + SAVEI32(PL_regnpar); /* () count. */ #ifdef DEBUGGING SAVEPPTR(PL_reg_starttry); /* from regexec.c */ #endif |