diff options
author | Andrew Fresh <afresh1@cvs.openbsd.org> | 2017-02-05 00:32:23 +0000 |
---|---|---|
committer | Andrew Fresh <afresh1@cvs.openbsd.org> | 2017-02-05 00:32:23 +0000 |
commit | 80707dac21f0fc477ec75dd64b3ca10edfcaf9c6 (patch) | |
tree | 20a45a5268e2d12c4bf7e666ec2ff12965df40d0 /gnu/usr.bin/perl/regcomp.c | |
parent | 651c07bd0f7d5345bddac9830e68a08e76605399 (diff) |
Fix merge issues, remove excess files - match perl-5.24.1 dist
Diffstat (limited to 'gnu/usr.bin/perl/regcomp.c')
-rw-r--r-- | gnu/usr.bin/perl/regcomp.c | 9670 |
1 files changed, 6625 insertions, 3045 deletions
diff --git a/gnu/usr.bin/perl/regcomp.c b/gnu/usr.bin/perl/regcomp.c index 606b3373b30..be6cb96a700 100644 --- a/gnu/usr.bin/perl/regcomp.c +++ b/gnu/usr.bin/perl/regcomp.c @@ -86,13 +86,14 @@ EXTERN_C const struct regexp_engine my_reg_engine; # include "regcomp.h" #endif -#include "dquote_static.c" -#include "charclass_invlists.h" -#include "inline_invlist.c" +#include "dquote_inline.h" +#include "invlist_inline.h" #include "unicode_constants.h" #define HAS_NONLATIN1_FOLD_CLOSURE(i) \ _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) @@ -100,11 +101,41 @@ EXTERN_C const struct regexp_engine my_reg_engine; #define STATIC static #endif +#ifndef MIN +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#endif + +#ifndef MAX +#define MAX(a,b) ((a) > (b) ? (a) : (b)) +#endif + +/* this is a chain of data about sub patterns we are processing that + need to be handled separately/specially in study_chunk. Its so + we can simulate recursion without losing state. */ +struct scan_frame; +typedef struct scan_frame { + regnode *last_regnode; /* last node to process in this frame */ + regnode *next_regnode; /* next node to process when last is reached */ + U32 prev_recursed_depth; + I32 stopparen; /* what stopparen do we use */ + U32 is_top_frame; /* what flags do we use? */ + + struct scan_frame *this_prev_frame; /* this previous frame */ + struct scan_frame *prev_frame; /* previous frame */ + struct scan_frame *next_frame; /* next frame */ +} scan_frame; + +/* Certain characters are output as a sequence with the first being a + * backslash. */ +#define isBACKSLASHED_PUNCT(c) \ + ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^') + struct RExC_state_t { U32 flags; /* RXf_* are we folding, multilining? */ U32 pm_flags; /* PMf_* stuff from the calling PMOP */ char *precomp; /* uncompiled string. */ + char *precomp_end; /* pointer to end of uncompiled string. */ REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ regexp_internal *rxi; /* internal data for regexp object @@ -112,6 +143,8 @@ struct RExC_state_t { char *start; /* Start of input for compile */ char *end; /* End of input for compile */ char *parse; /* Input-scan pointer. */ + char *adjusted_start; /* 'start', adjusted. See code use */ + STRLEN precomp_adj; /* an offset beyond precomp. See code use */ SSize_t whilem_seen; /* number of WHILEM in this expr */ regnode *emit_start; /* Start of emitted-code area */ regnode *emit_bound; /* First regnode outside of the @@ -135,7 +168,7 @@ struct RExC_state_t { I32 seen_zerolen; regnode **open_parens; /* pointers to open parens */ regnode **close_parens; /* pointers to close parens */ - regnode *opend; /* END node in program */ + regnode *end_op; /* END node in program */ I32 utf8; /* whether the pattern is utf8 or not */ I32 orig_utf8; /* whether the pattern was originally in utf8 */ /* XXX use this for future optimisation of case @@ -146,20 +179,26 @@ struct RExC_state_t { HV *paren_names; /* Paren names */ regnode **recurse; /* Recurse regops */ - I32 recurse_count; /* Number of recurse regops */ - U8 *study_chunk_recursed; /* bitmap of which parens we have moved + I32 recurse_count; /* Number of recurse regops we have generated */ + U8 *study_chunk_recursed; /* bitmap of which subs we have moved through */ U32 study_chunk_recursed_bytes; /* bytes in bitmap */ I32 in_lookbehind; I32 contains_locale; I32 contains_i; I32 override_recoding; +#ifdef EBCDIC + I32 recode_x_to_native; +#endif I32 in_multi_char_class; struct reg_code_block *code_blocks; /* positions of literal (?{}) within pattern */ int num_code_blocks; /* size of code_blocks[] */ int code_index; /* next code_blocks[] slot */ SSize_t maxlen; /* mininum possible number of chars in string to match */ + scan_frame *frame_head; + scan_frame *frame_last; + U32 frame_count; #ifdef ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) @@ -169,15 +208,28 @@ struct RExC_state_t { const char *lastparse; I32 lastnum; AV *paren_name_list; /* idx -> name */ + U32 study_chunk_recursed_count; + SV *mysv1; + SV *mysv2; #define RExC_lastparse (pRExC_state->lastparse) #define RExC_lastnum (pRExC_state->lastnum) #define RExC_paren_name_list (pRExC_state->paren_name_list) +#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count) +#define RExC_mysv (pRExC_state->mysv1) +#define RExC_mysv1 (pRExC_state->mysv1) +#define RExC_mysv2 (pRExC_state->mysv2) + #endif + bool seen_unfolded_sharp_s; + bool strict; }; #define RExC_flags (pRExC_state->flags) #define RExC_pm_flags (pRExC_state->pm_flags) #define RExC_precomp (pRExC_state->precomp) +#define RExC_precomp_adj (pRExC_state->precomp_adj) +#define RExC_adjusted_start (pRExC_state->adjusted_start) +#define RExC_precomp_end (pRExC_state->precomp_end) #define RExC_rx_sv (pRExC_state->rx_sv) #define RExC_rx (pRExC_state->rx) #define RExC_rxi (pRExC_state->rxi) @@ -185,6 +237,17 @@ struct RExC_state_t { #define RExC_end (pRExC_state->end) #define RExC_parse (pRExC_state->parse) #define RExC_whilem_seen (pRExC_state->whilem_seen) + +/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any + * EXACTF node, hence was parsed under /di rules. If later in the parse, + * something forces the pattern into using /ui rules, the sharp s should be + * folded into the sequence 'ss', which takes up more space than previously + * calculated. This means that the sizing pass needs to be restarted. (The + * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node + * that gets converted to /ui (and EXACTFU) occupies the same amount of space, + * so there is no need to resize [perl #125990]. */ +#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s) + #ifdef RE_TRACK_PATTERN_OFFSETS #define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ @@ -193,7 +256,6 @@ struct RExC_state_t { #define RExC_emit_dummy (pRExC_state->emit_dummy) #define RExC_emit_start (pRExC_state->emit_start) #define RExC_emit_bound (pRExC_state->emit_bound) -#define RExC_naughty (pRExC_state->naughty) #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) #define RExC_size (pRExC_state->size) @@ -207,7 +269,7 @@ struct RExC_state_t { #define RExC_orig_utf8 (pRExC_state->orig_utf8) #define RExC_open_parens (pRExC_state->open_parens) #define RExC_close_parens (pRExC_state->close_parens) -#define RExC_opend (pRExC_state->opend) +#define RExC_end_op (pRExC_state->end_op) #define RExC_paren_names (pRExC_state->paren_names) #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) @@ -218,12 +280,32 @@ struct RExC_state_t { #define RExC_contains_locale (pRExC_state->contains_locale) #define RExC_contains_i (pRExC_state->contains_i) #define RExC_override_recoding (pRExC_state->override_recoding) +#ifdef EBCDIC +# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) +#endif #define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) - +#define RExC_frame_head (pRExC_state->frame_head) +#define RExC_frame_last (pRExC_state->frame_last) +#define RExC_frame_count (pRExC_state->frame_count) +#define RExC_strict (pRExC_state->strict) + +/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set + * a flag to disable back-off on the fixed/floating substrings - if it's + * a high complexity pattern we assume the benefit of avoiding a full match + * is worth the cost of checking for the substrings even if they rarely help. + */ +#define RExC_naughty (pRExC_state->naughty) +#define TOO_NAUGHTY (10) +#define MARK_NAUGHTY(add) \ + if (RExC_naughty < TOO_NAUGHTY) \ + RExC_naughty += (add) +#define MARK_NAUGHTY_EXP(exp, add) \ + if (RExC_naughty < TOO_NAUGHTY) \ + RExC_naughty += RExC_naughty / (exp) + (add) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ - ((*s) == '{' && regcurly(s, FALSE))) + ((*s) == '{' && regcurly(s))) /* * Flags to be passed up and down. @@ -239,7 +321,9 @@ struct RExC_state_t { #define SPSTART 0x04 /* Starts with * or + */ #define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ #define TRYAGAIN 0x10 /* Weeded out a declaration. */ -#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */ +#define RESTART_PASS1 0x20 /* Need to restart sizing pass */ +#define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to + calcuate sizes as UTF-8 */ #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) @@ -258,12 +342,30 @@ struct RExC_state_t { #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) -#define REQUIRE_UTF8 STMT_START { \ +#define REQUIRE_UTF8(flagp) STMT_START { \ if (!UTF) { \ - *flagp = RESTART_UTF8; \ + assert(PASS1); \ + *flagp = RESTART_PASS1|NEED_UTF8; \ return NULL; \ } \ - } STMT_END + } STMT_END + +/* Change from /d into /u rules, and restart the parse if we've already seen + * something whose size would increase as a result, by setting *flagp and + * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates + * we've change to /u during the parse. */ +#define REQUIRE_UNI_RULES(flagp, restart_retval) \ + STMT_START { \ + if (DEPENDS_SEMANTICS) { \ + assert(PASS1); \ + set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ + RExC_uni_semantics = 1; \ + if (RExC_seen_unfolded_sharp_s) { \ + *flagp |= RESTART_PASS1; \ + return restart_retval; \ + } \ + } \ + } STMT_END /* This converts the named class defined in regcomp.h to its equivalent class * number defined in handy.h. */ @@ -375,24 +477,6 @@ typedef struct scan_data_t { regnode_ssc *start_class; } scan_data_t; -/* The below is perhaps overboard, but this allows us to save a test at the - * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A' - * and 'a' differ by a single bit; the same with the upper and lower case of - * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart; - * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and - * then inverts it to form a mask, with just a single 0, in the bit position - * where the upper- and lowercase differ. XXX There are about 40 other - * instances in the Perl core where this micro-optimization could be used. - * Should decide if maintenance cost is worse, before changing those - * - * Returns a boolean as to whether or not 'v' is either a lowercase or - * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a - * compile-time constant, the generated code is better than some optimizing - * compilers figure out, amounting to a mask and test. The results are - * meaningless if 'c' is not one of [A-Za-z] */ -#define isARG2_lower_or_UPPER_ARG1(c, v) \ - (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a'))) - /* * Forward declarations for pregcomp()'s friends. */ @@ -418,7 +502,20 @@ static const scan_data_t zero_scan_data = #define SF_HAS_PAR 0x0080 #define SF_IN_PAR 0x0100 #define SF_HAS_EVAL 0x0200 + + +/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the + * longest substring in the pattern. When it is not set the optimiser keeps + * track of position, but does not keep track of the actual strings seen, + * + * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but + * /foo/i will not. + * + * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble" + * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be + * turned off because of the alternation (BRANCH). */ #define SCF_DO_SUBSTR 0x0400 + #define SCF_DO_STCLASS_AND 0x0800 #define SCF_DO_STCLASS_OR 0x1000 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) @@ -427,6 +524,10 @@ static const scan_data_t zero_scan_data = #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ #define SCF_SEEN_ACCEPT 0x8000 #define SCF_TRIE_DOING_RESTUDY 0x10000 +#define SCF_IN_DEFINE 0x20000 + + + #define UTF cBOOL(RExC_utf8) @@ -476,9 +577,71 @@ static const scan_data_t zero_scan_data = #define REPORT_LOCATION " in regex; marked by " MARKER1 \ " in m/%"UTF8f MARKER2 "%"UTF8f"/" -#define REPORT_LOCATION_ARGS(offset) \ - UTF8fARG(UTF, offset, RExC_precomp), \ - UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset) +/* The code in this file in places uses one level of recursion with parsing + * rebased to an alternate string constructed by us in memory. This can take + * the form of something that is completely different from the input, or + * something that uses the input as part of the alternate. In the first case, + * there should be no possibility of an error, as we are in complete control of + * the alternate string. But in the second case we don't control the input + * portion, so there may be errors in that. Here's an example: + * /[abc\x{DF}def]/ui + * is handled specially because \x{df} folds to a sequence of more than one + * character, 'ss'. What is done is to create and parse an alternate string, + * which looks like this: + * /(?:\x{DF}|[abc\x{DF}def])/ui + * where it uses the input unchanged in the middle of something it constructs, + * which is a branch for the DF outside the character class, and clustering + * parens around the whole thing. (It knows enough to skip the DF inside the + * class while in this substitute parse.) 'abc' and 'def' may have errors that + * need to be reported. The general situation looks like this: + * + * sI tI xI eI + * Input: ---------------------------------------------------- + * Constructed: --------------------------------------------------- + * sC tC xC eC EC + * + * The input string sI..eI is the input pattern. The string sC..EC is the + * constructed substitute parse string. The portions sC..tC and eC..EC are + * constructed by us. The portion tC..eC is an exact duplicate of the input + * pattern tI..eI. In the diagram, these are vertically aligned. Suppose that + * while parsing, we find an error at xC. We want to display a message showing + * the real input string. Thus we need to find the point xI in it which + * corresponds to xC. xC >= tC, since the portion of the string sC..tC has + * been constructed by us, and so shouldn't have errors. We get: + * + * xI = sI + (tI - sI) + (xC - tC) + * + * and, the offset into sI is: + * + * (xI - sI) = (tI - sI) + (xC - tC) + * + * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj, + * and we save tC as RExC_adjusted_start. + * + * During normal processing of the input pattern, everything points to that, + * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI. + */ + +#define tI_sI RExC_precomp_adj +#define tC RExC_adjusted_start +#define sC RExC_precomp +#define xI_offset(xC) ((IV) (tI_sI + (xC - tC))) +#define xI(xC) (sC + xI_offset(xC)) +#define eC RExC_precomp_end + +#define REPORT_LOCATION_ARGS(xC) \ + UTF8fARG(UTF, \ + (xI(xC) > eC) /* Don't run off end */ \ + ? eC - sC /* Length before the <--HERE */ \ + : xI_offset(xC), \ + sC), /* The input pattern printed up to the <--HERE */ \ + UTF8fARG(UTF, \ + (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \ + (xI(xC) > eC) ? eC : xI(xC)) /* pattern after <--HERE */ + +/* Used to point after bad bytes for an error message, but avoid skipping + * past a nul byte. */ +#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1) /* * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given @@ -487,7 +650,7 @@ static const scan_data_t zero_scan_data = */ #define _FAIL(code) STMT_START { \ const char *ellipses = ""; \ - IV len = RExC_end - RExC_precomp; \ + IV len = RExC_precomp_end - RExC_precomp; \ \ if (!SIZE_ONLY) \ SAVEFREESV(RExC_rx_sv); \ @@ -511,9 +674,8 @@ static const scan_data_t zero_scan_data = * Simple_vFAIL -- like FAIL, but marks the current location in the scan */ #define Simple_vFAIL(m) STMT_START { \ - const IV offset = RExC_parse - RExC_precomp; \ Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ - m, REPORT_LOCATION_ARGS(offset)); \ + m, REPORT_LOCATION_ARGS(RExC_parse)); \ } STMT_END /* @@ -529,9 +691,8 @@ static const scan_data_t zero_scan_data = * Like Simple_vFAIL(), but accepts two arguments. */ #define Simple_vFAIL2(m,a1) STMT_START { \ - const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ - REPORT_LOCATION_ARGS(offset)); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ } STMT_END /* @@ -548,9 +709,8 @@ static const scan_data_t zero_scan_data = * Like Simple_vFAIL(), but accepts three arguments. */ #define Simple_vFAIL3(m, a1, a2) STMT_START { \ - const IV offset = RExC_parse - RExC_precomp; \ S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ - REPORT_LOCATION_ARGS(offset)); \ + REPORT_LOCATION_ARGS(RExC_parse)); \ } STMT_END /* @@ -566,9 +726,8 @@ static const scan_data_t zero_scan_data = * Like Simple_vFAIL(), but accepts four arguments. */ #define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ - const IV offset = RExC_parse - RExC_precomp; \ - S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ - REPORT_LOCATION_ARGS(offset)); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ } STMT_END #define vFAIL4(m,a1,a2,a3) STMT_START { \ @@ -578,95 +737,108 @@ static const scan_data_t zero_scan_data = } STMT_END /* A specialized version of vFAIL2 that works with UTF8f */ -#define vFAIL2utf8f(m, a1) STMT_START { \ - const IV offset = RExC_parse - RExC_precomp; \ - if (!SIZE_ONLY) \ - SAVEFREESV(RExC_rx_sv); \ - S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ - REPORT_LOCATION_ARGS(offset)); \ +#define vFAIL2utf8f(m, a1) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ +} STMT_END + +#define vFAIL3utf8f(m, a1, a2) STMT_START { \ + if (!SIZE_ONLY) \ + SAVEFREESV(RExC_rx_sv); \ + S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ } STMT_END +/* These have asserts in them because of [perl #122671] Many warnings in + * regcomp.c can occur twice. If they get output in pass1 and later in that + * pass, the pattern has to be converted to UTF-8 and the pass restarted, they + * would get output again. So they should be output in pass2, and these + * asserts make sure new warnings follow that paradigm. */ /* m is not necessarily a "literal string", in this macro */ #define reg_warn_non_literal_string(loc, m) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \ - m, REPORT_LOCATION_ARGS(offset)); \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + "%s" REPORT_LOCATION, \ + m, REPORT_LOCATION_ARGS(loc)); \ } STMT_END #define ckWARNreg(loc,m) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(offset)); \ + __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc)); \ +} STMT_END + +#define vWARN(loc, m) STMT_START { \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END #define vWARN_dep(loc, m) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(offset)); \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END #define ckWARNdep(loc,m) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(offset)); \ + __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END -#define ckWARNregdep(loc,m) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ - m REPORT_LOCATION, \ - REPORT_LOCATION_ARGS(offset)); \ +#define ckWARNregdep(loc,m) STMT_START { \ + __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \ + WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END -#define ckWARN2reg_d(loc,m, a1) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ - m REPORT_LOCATION, \ - a1, REPORT_LOCATION_ARGS(offset)); \ +#define ckWARN2reg_d(loc,m, a1) STMT_START { \ + __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(loc)); \ } STMT_END -#define ckWARN2reg(loc, m, a1) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, REPORT_LOCATION_ARGS(offset)); \ +#define ckWARN2reg(loc, m, a1) STMT_START { \ + __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(loc)); \ } STMT_END -#define vWARN3(loc, m, a1, a2) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, REPORT_LOCATION_ARGS(offset)); \ +#define vWARN3(loc, m, a1, a2) STMT_START { \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, REPORT_LOCATION_ARGS(loc)); \ } STMT_END -#define ckWARN3reg(loc, m, a1, a2) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, REPORT_LOCATION_ARGS(offset)); \ +#define ckWARN3reg(loc, m, a1, a2) STMT_START { \ + __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END #define vWARN4(loc, m, a1, a2, a3) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, a3, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END #define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \ + __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, a3, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END #define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \ - const IV offset = loc - RExC_precomp; \ - Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ - a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \ -} STMT_END - - -/* Allow for side effects in s */ -#define REGC(c,s) STMT_START { \ - if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ + __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, a3, a4, \ + REPORT_LOCATION_ARGS(loc)); \ } STMT_END /* Macros for recording node offsets. 20001227 mjd@plover.com @@ -739,64 +911,115 @@ static const scan_data_t zero_scan_data = #define EXPERIMENTAL_INPLACESCAN #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ -#define DEBUG_RExC_seen() \ +#ifdef DEBUGGING +int +Perl_re_printf(pTHX_ const char *fmt, ...) +{ + va_list ap; + int result; + PerlIO *f= Perl_debug_log; + PERL_ARGS_ASSERT_RE_PRINTF; + va_start(ap, fmt); + result = PerlIO_vprintf(f, fmt, ap); + va_end(ap); + return result; +} + +int +Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...) +{ + va_list ap; + int result; + PerlIO *f= Perl_debug_log; + PERL_ARGS_ASSERT_RE_INDENTF; + va_start(ap, depth); + PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, ""); + result = PerlIO_vprintf(f, fmt, ap); + va_end(ap); + return result; +} +#endif /* DEBUGGING */ + +#define DEBUG_RExC_seen() \ DEBUG_OPTIMISE_MORE_r({ \ - PerlIO_printf(Perl_debug_log,"RExC_seen: "); \ + Perl_re_printf( aTHX_ "RExC_seen: "); \ \ if (RExC_seen & REG_ZERO_LEN_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \ + Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \ \ if (RExC_seen & REG_LOOKBEHIND_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \ + Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \ \ if (RExC_seen & REG_GPOS_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ - \ - if (RExC_seen & REG_CANY_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \ + Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \ \ if (RExC_seen & REG_RECURSE_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ + Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \ \ - if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \ \ if (RExC_seen & REG_VERBARG_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \ + Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \ \ if (RExC_seen & REG_CUTGROUP_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \ + Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \ \ if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \ + Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \ \ if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \ - \ - if (RExC_seen & REG_GOSTART_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ + Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \ \ - if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \ \ - PerlIO_printf(Perl_debug_log,"\n"); \ + Perl_re_printf( aTHX_ "\n"); \ }); +#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \ + if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag) + +#define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \ + if ( ( flags ) ) { \ + Perl_re_printf( aTHX_ "%s", open_str); \ + DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \ + DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \ + DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \ + DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \ + DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \ + DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \ + DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \ + DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \ + DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \ + DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \ + DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \ + DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \ + DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \ + DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \ + DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \ + Perl_re_printf( aTHX_ "%s", close_str); \ + } + + #define DEBUG_STUDYDATA(str,data,depth) \ DEBUG_OPTIMISE_MORE_r(if(data){ \ - PerlIO_printf(Perl_debug_log, \ - "%*s" str "Pos:%"IVdf"/%"IVdf \ - " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \ - (int)(depth)*2, "", \ + Perl_re_indentf( aTHX_ "" str "Pos:%"IVdf"/%"IVdf \ + " Flags: 0x%"UVXf, \ + depth, \ (IV)((data)->pos_min), \ (IV)((data)->pos_delta), \ - (UV)((data)->flags), \ + (UV)((data)->flags) \ + ); \ + DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \ + Perl_re_printf( aTHX_ \ + " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \ (IV)((data)->whilem_c), \ (IV)((data)->last_closep ? *((data)->last_closep) : -1), \ is_inf ? "INF " : "" \ ); \ if ((data)->last_found) \ - PerlIO_printf(Perl_debug_log, \ + Perl_re_printf( aTHX_ \ "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \ " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \ SvPVX_const((data)->last_found), \ @@ -813,9 +1036,178 @@ DEBUG_OPTIMISE_MORE_r(if(data){ \ (IV)((data)->offset_float_min), \ (IV)((data)->offset_float_max) \ ); \ - PerlIO_printf(Perl_debug_log,"\n"); \ + Perl_re_printf( aTHX_ "\n"); \ }); + +/* ========================================================= + * BEGIN edit_distance stuff. + * + * This calculates how many single character changes of any type are needed to + * transform a string into another one. It is taken from version 3.1 of + * + * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS + */ + +/* Our unsorted dictionary linked list. */ +/* Note we use UVs, not chars. */ + +struct dictionary{ + UV key; + UV value; + struct dictionary* next; +}; +typedef struct dictionary item; + + +PERL_STATIC_INLINE item* +push(UV key,item* curr) +{ + item* head; + Newxz(head, 1, item); + head->key = key; + head->value = 0; + head->next = curr; + return head; +} + + +PERL_STATIC_INLINE item* +find(item* head, UV key) +{ + item* iterator = head; + while (iterator){ + if (iterator->key == key){ + return iterator; + } + iterator = iterator->next; + } + + return NULL; +} + +PERL_STATIC_INLINE item* +uniquePush(item* head,UV key) +{ + item* iterator = head; + + while (iterator){ + if (iterator->key == key) { + return head; + } + iterator = iterator->next; + } + + return push(key,head); +} + +PERL_STATIC_INLINE void +dict_free(item* head) +{ + item* iterator = head; + + while (iterator) { + item* temp = iterator; + iterator = iterator->next; + Safefree(temp); + } + + head = NULL; +} + +/* End of Dictionary Stuff */ + +/* All calculations/work are done here */ +STATIC int +S_edit_distance(const UV* src, + const UV* tgt, + const STRLEN x, /* length of src[] */ + const STRLEN y, /* length of tgt[] */ + const SSize_t maxDistance +) +{ + item *head = NULL; + UV swapCount,swapScore,targetCharCount,i,j; + UV *scores; + UV score_ceil = x + y; + + PERL_ARGS_ASSERT_EDIT_DISTANCE; + + /* intialize matrix start values */ + Newxz(scores, ( (x + 2) * (y + 2)), UV); + scores[0] = score_ceil; + scores[1 * (y + 2) + 0] = score_ceil; + scores[0 * (y + 2) + 1] = score_ceil; + scores[1 * (y + 2) + 1] = 0; + head = uniquePush(uniquePush(head,src[0]),tgt[0]); + + /* work loops */ + /* i = src index */ + /* j = tgt index */ + for (i=1;i<=x;i++) { + if (i < x) + head = uniquePush(head,src[i]); + scores[(i+1) * (y + 2) + 1] = i; + scores[(i+1) * (y + 2) + 0] = score_ceil; + swapCount = 0; + + for (j=1;j<=y;j++) { + if (i == 1) { + if(j < y) + head = uniquePush(head,tgt[j]); + scores[1 * (y + 2) + (j + 1)] = j; + scores[0 * (y + 2) + (j + 1)] = score_ceil; + } + + targetCharCount = find(head,tgt[j-1])->value; + swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount; + + if (src[i-1] != tgt[j-1]){ + scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1)); + } + else { + swapCount = j; + scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore); + } + } + + find(head,src[i-1])->value = i; + } + + { + IV score = scores[(x+1) * (y + 2) + (y + 1)]; + dict_free(head); + Safefree(scores); + return (maxDistance != 0 && maxDistance < score)?(-1):score; + } +} + +/* END of edit_distance() stuff + * ========================================================= */ + +/* is c a control character for which we have a mnemonic? */ +#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) + +STATIC const char * +S_cntrl_to_mnemonic(const U8 c) +{ + /* Returns the mnemonic string that represents character 'c', if one + * exists; NULL otherwise. The only ones that exist for the purposes of + * this routine are a few control characters */ + + switch (c) { + case '\a': return "\\a"; + case '\b': return "\\b"; + case ESC_NATIVE: return "\\e"; + case '\f': return "\\f"; + case '\n': return "\\n"; + case '\r': return "\\r"; + case '\t': return "\\t"; + } + + return NULL; +} + /* Mark that we cannot extend a found fixed substring at this point. Update the longest found anchored substring and the longest found floating substrings if needed. */ @@ -845,8 +1237,8 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, else { /* *data->longest == data->longest_float */ data->offset_float_min = l ? data->last_start_min : data->pos_min; data->offset_float_max = (l - ? data->last_start_max - : (data->pos_delta == SSize_t_MAX + ? data->last_start_max + : (data->pos_delta > SSize_t_MAX - data->pos_min ? SSize_t_MAX : data->pos_min + data->pos_delta)); if (is_inf @@ -889,11 +1281,11 @@ S_ssc_anything(pTHX_ regnode_ssc *ssc) ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ _append_range_to_invlist(ssc->invlist, 0, UV_MAX); - ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */ + ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */ } STATIC int -S_ssc_is_anything(pTHX_ const regnode_ssc *ssc) +S_ssc_is_anything(const regnode_ssc *ssc) { /* Returns TRUE if the SSC 'ssc' can match the empty string and any code * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys @@ -907,7 +1299,7 @@ S_ssc_is_anything(pTHX_ const regnode_ssc *ssc) assert(is_ANYOF_SYNTHETIC(ssc)); - if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) { + if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) { return FALSE; } @@ -946,16 +1338,16 @@ S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) Zero(ssc, 1, regnode_ssc); set_ANYOF_SYNTHETIC(ssc); - ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY); + ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP); ssc_anything(ssc); - /* If any portion of the regex is to operate under locale rules, - * initialization includes it. The reason this isn't done for all regexes - * is that the optimizer was written under the assumption that locale was - * all-or-nothing. Given the complexity and lack of documentation in the - * optimizer, and that there are inadequate test cases for locale, many - * parts of it may not work properly, it is safest to avoid locale unless - * necessary. */ + /* If any portion of the regex is to operate under locale rules that aren't + * fully known at compile time, initialization includes it. The reason + * this isn't done for all regexes is that the optimizer was written under + * the assumption that locale was all-or-nothing. Given the complexity and + * lack of documentation in the optimizer, and that there are inadequate + * test cases for locale, many parts of it may not work properly, it is + * safest to avoid locale unless necessary. */ if (RExC_contains_locale) { ANYOF_POSIXL_SETALL(ssc); } @@ -965,8 +1357,8 @@ S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) } STATIC int -S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state, - const regnode_ssc *ssc) +S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, + const regnode_ssc *ssc) { /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only * to the list of code points matched, and locale posix classes; hence does @@ -1007,7 +1399,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, * returned list must, and will, contain every code point that is a * possibility. */ - SV* invlist = sv_2mortal(_new_invlist(0)); + SV* invlist = NULL; SV* only_utf8_locale_invlist = NULL; unsigned int i; const U32 n = ARG(node); @@ -1016,7 +1408,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC; /* Look at the data structure created by S_set_ANYOF_arg() */ - if (n != ANYOF_NONBITMAP_EMPTY) { + if (n != ANYOF_ONLY_HAS_BITMAP) { SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]); AV * const av = MUTABLE_AV(SvRV(rv)); SV **const ary = AvARRAY(av); @@ -1029,6 +1421,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, /* Here, no compile-time swash, and there are things that won't be * known until runtime -- we have to assume it could be anything */ + invlist = sv_2mortal(_new_invlist(1)); return _add_range_to_invlist(invlist, 0, UV_MAX); } else if (ary[3] && ary[3] != &PL_sv_undef) { @@ -1039,22 +1432,27 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, } /* Get the code points valid only under UTF-8 locales */ - if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + if ((ANYOF_FLAGS(node) & ANYOFL_FOLD) && ary[2] && ary[2] != &PL_sv_undef) { only_utf8_locale_invlist = ary[2]; } } - /* An ANYOF node contains a bitmap for the first 256 code points, and an - * inversion list for the others, but if there are code points that should - * match only conditionally on the target string being UTF-8, those are - * placed in the inversion list, and not the bitmap. Since there are - * circumstances under which they could match, they are included in the - * SSC. But if the ANYOF node is to be inverted, we have to exclude them - * here, so that when we invert below, the end result actually does include - * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here - * before we add the unconditionally matched code points */ + if (! invlist) { + invlist = sv_2mortal(_new_invlist(0)); + } + + /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS + * code points, and an inversion list for the others, but if there are code + * points that should match only conditionally on the target string being + * UTF-8, those are placed in the inversion list, and not the bitmap. + * Since there are circumstances under which they could match, they are + * included in the SSC. But if the ANYOF node is to be inverted, we have + * to exclude them here, so that when we invert below, the end result + * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We + * have to do this here before we add the unconditionally matched code + * points */ if (ANYOF_FLAGS(node) & ANYOF_INVERT) { _invlist_intersection_complement_2nd(invlist, PL_UpperLatin1, @@ -1062,28 +1460,37 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, } /* Add in the points from the bit map */ - for (i = 0; i < 256; i++) { + for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { if (ANYOF_BITMAP_TEST(node, i)) { - invlist = add_cp_to_invlist(invlist, i); + unsigned int start = i++; + + for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) { + /* empty */ + } + invlist = _add_range_to_invlist(invlist, start, i-1); new_node_has_latin1 = TRUE; } } /* If this can match all upper Latin1 code points, have to add them - * as well */ - if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) { + * as well. But don't add them if inverting, as when that gets done below, + * it would exclude all these characters, including the ones it shouldn't + * that were added just above */ + if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD + && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) + { _invlist_union(invlist, PL_UpperLatin1, &invlist); } /* Similarly for these */ - if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { - invlist = _add_range_to_invlist(invlist, 256, UV_MAX); + if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { + _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist); } if (ANYOF_FLAGS(node) & ANYOF_INVERT) { _invlist_invert(invlist); } - else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) { + else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) { /* Under /li, any 0-255 could fold to any other 0-255, depending on the * locale. We can skip this if there are no 0-255 at all. */ @@ -1110,8 +1517,8 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, #define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) /* 'AND' a given class with another one. Can create false positives. 'ssc' - * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if - * 'and_with' is a regnode_charclass instead of a regnode_ssc. */ + * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be + * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */ STATIC void S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, @@ -1149,12 +1556,24 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * that should be; while the consequences for having /l bugs is * incorrect matches */ if (ssc_is_anything((regnode_ssc *)and_with)) { - anded_flags |= ANYOF_WARN_SUPER; + anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; } } else { anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); - anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + if (OP(and_with) == ANYOFD) { + anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + } + else { + anded_flags = ANYOF_FLAGS(and_with) + &( ANYOF_COMMON_FLAGS + |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER + |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP); + if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) { + anded_flags &= + ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; + } + } } ANYOF_FLAGS(ssc) &= anded_flags; @@ -1202,7 +1621,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, /* If either P1 or P2 is empty, the intersection will be also; can skip * the loop */ - if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) { + if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) { ANYOF_POSIXL_ZERO(ssc); } else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { @@ -1261,16 +1680,16 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, else { ssc->invlist = anded_cp_list; ANYOF_POSIXL_ZERO(ssc); - if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) { ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); } } } else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) - || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) + || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) { /* One or the other of P1, P2 is non-empty. */ - if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) { + if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) { ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); } ssc_union(ssc, anded_cp_list, FALSE); @@ -1305,6 +1724,16 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, else { ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; + if (OP(or_with) != ANYOFD) { + ored_flags + |= ANYOF_FLAGS(or_with) + & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER + |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP); + if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) { + ored_flags |= + ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; + } + } } ANYOF_FLAGS(ssc) |= ored_flags; @@ -1332,7 +1761,7 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, { /* We ignore P2, leaving P1 going forward */ } /* else Not inverted */ - else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) { + else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) { ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { unsigned int i; @@ -1410,10 +1839,9 @@ S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp) } PERL_STATIC_INLINE void -S_ssc_clear_locale(pTHX_ regnode_ssc *ssc) +S_ssc_clear_locale(regnode_ssc *ssc) { /* Set the SSC 'ssc' to not match any locale things */ - PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE; assert(is_ANYOF_SYNTHETIC(ssc)); @@ -1422,12 +1850,71 @@ S_ssc_clear_locale(pTHX_ regnode_ssc *ssc) ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS; } +#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C + +STATIC bool +S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc) +{ + /* The synthetic start class is used to hopefully quickly winnow down + * places where a pattern could start a match in the target string. If it + * doesn't really narrow things down that much, there isn't much point to + * having the overhead of using it. This function uses some very crude + * heuristics to decide if to use the ssc or not. + * + * It returns TRUE if 'ssc' rules out more than half what it considers to + * be the "likely" possible matches, but of course it doesn't know what the + * actual things being matched are going to be; these are only guesses + * + * For /l matches, it assumes that the only likely matches are going to be + * in the 0-255 range, uniformly distributed, so half of that is 127 + * For /a and /d matches, it assumes that the likely matches will be just + * the ASCII range, so half of that is 63 + * For /u and there isn't anything matching above the Latin1 range, it + * assumes that that is the only range likely to be matched, and uses + * half that as the cut-off: 127. If anything matches above Latin1, + * it assumes that all of Unicode could match (uniformly), except for + * non-Unicode code points and things in the General Category "Other" + * (unassigned, private use, surrogates, controls and formats). This + * is a much large number. */ + + U32 count = 0; /* Running total of number of code points matched by + 'ssc' */ + UV start, end; /* Start and end points of current range in inversion + list */ + const U32 max_code_points = (LOC) + ? 256 + : (( ! UNI_SEMANTICS + || invlist_highest(ssc->invlist) < 256) + ? 128 + : NON_OTHER_COUNT); + const U32 max_match = max_code_points / 2; + + PERL_ARGS_ASSERT_IS_SSC_WORTH_IT; + + invlist_iterinit(ssc->invlist); + while (invlist_iternext(ssc->invlist, &start, &end)) { + if (start >= max_code_points) { + break; + } + end = MIN(end, max_code_points - 1); + count += end - start + 1; + if (count >= max_match) { + invlist_iterfinish(ssc->invlist); + return FALSE; + } + } + + return TRUE; +} + + STATIC void S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) { /* The inversion list in the SSC is marked mortal; now we need a more * permanent copy, which is stored the same way that is done in a regular - * ANYOF node, with the first 256 code points in a bit map */ + * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit + * map */ SV* invlist = invlist_clone(ssc->invlist); @@ -1436,9 +1923,12 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) assert(is_ANYOF_SYNTHETIC(ssc)); /* The code in this file assumes that all but these flags aren't relevant - * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the - * time we reach here */ - assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); + * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared + * by the time we reach here */ + assert(! (ANYOF_FLAGS(ssc) + & ~( ANYOF_COMMON_FLAGS + |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER + |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP))); populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); @@ -1449,7 +1939,11 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) ssc->invlist = NULL; if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { - ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; + ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL; + } + + if (RExC_contains_locale) { + OP(ssc) = ANYOFL; } assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale); @@ -1496,14 +1990,13 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, PERL_ARGS_ASSERT_DUMP_TRIE; - PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", - (int)depth * 2 + 2,"", - "Match","Base","Ofs" ); + Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ", + depth+1, "Match","Base","Ofs" ); for( state = 0 ; state < trie->uniquecharcount ; state++ ) { SV ** const tmp = av_fetch( revcharmap, state, 0); if ( tmp ) { - PerlIO_printf( Perl_debug_log, "%*s", + Perl_re_printf( aTHX_ "%*s", colwidth, pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, PL_colors[0], PL_colors[1], @@ -1513,27 +2006,25 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, ); } } - PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------", - (int)depth * 2 + 2,""); + Perl_re_printf( aTHX_ "\n"); + Perl_re_indentf( aTHX_ "State|-----------------------", depth+1); for( state = 0 ; state < trie->uniquecharcount ; state++ ) - PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------"); - PerlIO_printf( Perl_debug_log, "\n"); + Perl_re_printf( aTHX_ "%.*s", colwidth, "--------"); + Perl_re_printf( aTHX_ "\n"); for( state = 1 ; state < trie->statecount ; state++ ) { const U32 base = trie->states[ state ].trans.base; - PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", - (int)depth * 2 + 2,"", (UV)state); + Perl_re_indentf( aTHX_ "#%4"UVXf"|", depth+1, (UV)state); if ( trie->states[ state ].wordnum ) { - PerlIO_printf( Perl_debug_log, " W%4X", - trie->states[ state ].wordnum ); + Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum ); } else { - PerlIO_printf( Perl_debug_log, "%6s", "" ); + Perl_re_printf( aTHX_ "%6s", "" ); } - PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base ); + Perl_re_printf( aTHX_ " @%4"UVXf" ", (UV)base ); if ( base ) { U32 ofs = 0; @@ -1544,7 +2035,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, != state)) ofs++; - PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); + Perl_re_printf( aTHX_ "+%2"UVXf"[ ", (UV)ofs); for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { if ( ( base + ofs >= trie->uniquecharcount ) @@ -1553,28 +2044,27 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, && trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) { - PerlIO_printf( Perl_debug_log, "%*"UVXf, - colwidth, - (UV)trie->trans[ base + ofs - - trie->uniquecharcount ].next ); + Perl_re_printf( aTHX_ "%*"UVXf, colwidth, + (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next + ); } else { - PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); + Perl_re_printf( aTHX_ "%*s",colwidth," ." ); } } - PerlIO_printf( Perl_debug_log, "]"); + Perl_re_printf( aTHX_ "]"); } - PerlIO_printf( Perl_debug_log, "\n" ); + Perl_re_printf( aTHX_ "\n" ); } - PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", - (int)depth*2, ""); + Perl_re_indentf( aTHX_ "word_info N:(prev,len)=", + depth); for (word=1; word <= trie->wordcount; word++) { - PerlIO_printf(Perl_debug_log, " %d:(%d,%d)", + Perl_re_printf( aTHX_ " %d:(%d,%d)", (int)word, (int)(trie->wordinfo[word].prev), (int)(trie->wordinfo[word].len)); } - PerlIO_printf(Perl_debug_log, "\n" ); + Perl_re_printf( aTHX_ "\n" ); } /* Dumps a fully constructed but uncompressed trie in list form. @@ -1595,19 +2085,20 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; /* print out the table precompression. */ - PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", - (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", - "------:-----+-----------------\n" ); + Perl_re_indentf( aTHX_ "State :Word | Transition Data\n", + depth+1 ); + Perl_re_indentf( aTHX_ "%s", + depth+1, "------:-----+-----------------\n" ); for( state=1 ; state < next_alloc ; state ++ ) { U16 charid; - PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :", - (int)depth * 2 + 2,"", (UV)state ); + Perl_re_indentf( aTHX_ " %4"UVXf" :", + depth+1, (UV)state ); if ( ! trie->states[ state ].wordnum ) { - PerlIO_printf( Perl_debug_log, "%5s| ",""); + Perl_re_printf( aTHX_ "%5s| ",""); } else { - PerlIO_printf( Perl_debug_log, "W%4x| ", + Perl_re_printf( aTHX_ "W%4x| ", trie->states[ state ].wordnum ); } @@ -1615,7 +2106,7 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0); if ( tmp ) { - PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", + Perl_re_printf( aTHX_ "%*s:%3X=%4"UVXf" | ", colwidth, pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, @@ -1627,11 +2118,11 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, (UV)TRIE_LIST_ITEM(state,charid).newstate ); if (!(charid % 10)) - PerlIO_printf(Perl_debug_log, "\n%*s| ", + Perl_re_printf( aTHX_ "\n%*s| ", (int)((depth * 2) + 14), ""); } } - PerlIO_printf( Perl_debug_log, "\n"); + Perl_re_printf( aTHX_ "\n"); } } @@ -1659,12 +2150,12 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, that they are identical. */ - PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" ); + Perl_re_indentf( aTHX_ "Char : ", depth+1 ); for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { SV ** const tmp = av_fetch( revcharmap, charid, 0); if ( tmp ) { - PerlIO_printf( Perl_debug_log, "%*s", + Perl_re_printf( aTHX_ "%*s", colwidth, pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, PL_colors[0], PL_colors[1], @@ -1675,32 +2166,33 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, } } - PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" ); + Perl_re_printf( aTHX_ "\n"); + Perl_re_indentf( aTHX_ "State+-", depth+1 ); for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { - PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------"); + Perl_re_printf( aTHX_ "%.*s", colwidth,"--------"); } - PerlIO_printf( Perl_debug_log, "\n" ); + Perl_re_printf( aTHX_ "\n" ); for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { - PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", - (int)depth * 2 + 2,"", + Perl_re_indentf( aTHX_ "%4"UVXf" : ", + depth+1, (UV)TRIE_NODENUM( state ) ); for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); if (v) - PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v ); + Perl_re_printf( aTHX_ "%*"UVXf, colwidth, v ); else - PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); + Perl_re_printf( aTHX_ "%*s", colwidth, "." ); } if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { - PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", + Perl_re_printf( aTHX_ " (%4"UVXf")\n", (UV)trie->trans[ state ].check ); } else { - PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", + Perl_re_printf( aTHX_ " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check, trie->states[ TRIE_NODENUM( state ) ].wordnum ); } @@ -1718,7 +2210,7 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, May be the same as tail. tail : item following the branch sequence count : words in the sequence - flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/ + flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/ depth : indent depth Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. @@ -1751,7 +2243,7 @@ then read 'r' and go to state 8 followed by 's' which takes us to state 9 which is also accepting. Thus we know that we can match both 'he' and 'hers' with a single traverse. We store a mapping from accepting to state to which word was matched, and then when we have multiple possibilities we try to complete the -rest of the regex in the order in which they occured in the alternation. +rest of the regex in the order in which they occurred in the alternation. The only prior NFA like behaviour that would be changed by the TRIE support is the silent ignoring of duplicate alternations which are of the form: @@ -1830,7 +2322,7 @@ is the recommended Unicode-aware way of saying #define TRIE_STORE_REVCHAR(val) \ STMT_START { \ if (UTF) { \ - SV *zlopp = newSV(7); /* XXX: optimize me */ \ + SV *zlopp = newSV(UTF8_MAXBYTES); \ unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ SvCUR_set(zlopp, kapow - flrbbbbb); \ @@ -1947,7 +2439,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth) { - dVAR; /* first pass, loop through and scan words */ reg_trie_data *trie; HV *widecharmap = NULL; @@ -1983,10 +2474,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, #endif switch (flags) { - case EXACT: break; + case EXACT: case EXACTL: break; case EXACTFA: case EXACTFU_SS: - case EXACTFU: folder = PL_fold_latin1; break; + case EXACTFU: + case EXACTFLU8: folder = PL_fold_latin1; break; case EXACTF: folder = PL_fold; break; default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); } @@ -1997,7 +2489,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, trie->wordcount = word_count; RExC_rxi->data->data[ data_slot ] = (void*)trie; trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); - if (flags == EXACT) + if (flags == EXACT || flags == EXACTL) trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( trie->wordcount+1, sizeof(reg_trie_wordinfo)); @@ -2007,13 +2499,14 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, }); re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + assert(re_trie_maxbuff); if (!SvIOK(re_trie_maxbuff)) { sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); } DEBUG_TRIE_COMPILE_r({ - PerlIO_printf( Perl_debug_log, - "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", - (int)depth * 2 + 2, "", + Perl_re_indentf( aTHX_ + "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", + depth+1, REG_NODE_NUM(startbranch),REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); }); @@ -2052,8 +2545,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, for ( cur = first ; cur < last ; cur = regnext( cur ) ) { regnode *noper = NEXTOPER( cur ); - const U8 *uc = (U8*)STRING( noper ); - const U8 *e = uc + STR_LEN( noper ); + const U8 *uc; + const U8 *e; int foldlen = 0; U32 wordlen = 0; /* required init */ STRLEN minchars = 0; @@ -2063,17 +2556,19 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); - if (noper_next != tail && OP(noper_next) == flags) { - noper = noper_next; - uc= (U8*)STRING(noper); - e= uc + STR_LEN(noper); - trie->minlen= STR_LEN(noper); - } else { - trie->minlen= 0; - continue; - } + if (noper_next < tail) + noper= noper_next; + } + + if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) { + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + } else { + trie->minlen= 0; + continue; } + if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte regardless of encoding */ @@ -2216,9 +2711,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, } } /* end first pass */ DEBUG_TRIE_COMPILE_r( - PerlIO_printf( Perl_debug_log, - "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", - (int)depth * 2 + 2,"", + Perl_re_indentf( aTHX_ + "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", + depth+1, ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, (int)trie->minlen, (int)trie->maxlen ) @@ -2266,9 +2761,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, STRLEN transcount = 1; - DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, - "%*sCompiling trie using list compiler\n", - (int)depth * 2 + 2, "")); + DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n", + depth+1)); trie->states = (reg_trie_state *) PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, @@ -2279,22 +2773,20 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, for ( cur = first ; cur < last ; cur = regnext( cur ) ) { regnode *noper = NEXTOPER( cur ); - U8 *uc = (U8*)STRING( noper ); - const U8 *e = uc + STR_LEN( noper ); U32 state = 1; /* required init */ U16 charid = 0; /* sanity init */ U32 wordlen = 0; /* required init */ if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); - if (noper_next != tail && OP(noper_next) == flags) { - noper = noper_next; - uc= (U8*)STRING(noper); - e= uc + STR_LEN(noper); - } + if (noper_next < tail) + noper= noper_next; } - if (OP(noper) != NOTHING) { + if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) { + const U8 *uc= (U8*)STRING(noper); + const U8 *e= uc + STR_LEN(noper); + for ( ; uc < e ; uc += len ) { TRIE_READ_CHAR; @@ -2376,7 +2868,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, /* DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp) + Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp) ); */ @@ -2438,7 +2930,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, } /* DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf( Perl_debug_log, " base: %d\n",base); + Perl_re_printf( aTHX_ " base: %d\n",base); ); */ trie->states[ state ].trans.base=base; @@ -2481,9 +2973,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, we have to use TRIE_NODENUM() to convert. */ - DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, - "%*sCompiling trie using table compiler\n", - (int)depth * 2 + 2, "")); + DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n", + depth+1)); trie->trans = (reg_trie_trans *) PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) @@ -2498,8 +2989,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, for ( cur = first ; cur < last ; cur = regnext( cur ) ) { regnode *noper = NEXTOPER( cur ); - const U8 *uc = (U8*)STRING( noper ); - const U8 *e = uc + STR_LEN( noper ); U32 state = 1; /* required init */ @@ -2510,14 +2999,14 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); - if (noper_next != tail && OP(noper_next) == flags) { - noper = noper_next; - uc= (U8*)STRING(noper); - e= uc + STR_LEN(noper); - } + if (noper_next < tail) + noper= noper_next; } - if ( OP(noper) != NOTHING ) { + if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) { + const U8 *uc= (U8*)STRING(noper); + const U8 *e= uc + STR_LEN(noper); + for ( ; uc < e ; uc += len ) { TRIE_READ_CHAR; @@ -2675,9 +3164,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, PerlMemShared_realloc( trie->states, laststate * sizeof(reg_trie_state) ); DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf( Perl_debug_log, - "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", - (int)depth * 2 + 2,"", + Perl_re_indentf( aTHX_ "Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", + depth+1, (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ), (IV)next_alloc, @@ -2688,9 +3176,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, } /* end table compress */ } DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf(Perl_debug_log, - "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", - (int)depth * 2 + 2, "", + Perl_re_indentf( aTHX_ "Statecount:%"UVxf" Lasttrans:%"UVxf"\n", + depth+1, (UV)trie->statecount, (UV)trie->lasttrans) ); @@ -2740,9 +3227,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, }); } DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, - "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", - (int)depth * 2 + 2, "", + Perl_re_indentf( aTHX_ "MJD offset:%"UVuf" MJD length:%"UVuf"\n", + depth+1, (UV)mjd_offset, (UV)mjd_nodelen) ); #endif @@ -2772,9 +3258,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( count == 2 ) { Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, - "%*sNew Start State=%"UVuf" Class: [", - (int)depth * 2 + 2, "", + Perl_re_indentf( aTHX_ "New Start State=%"UVuf" Class: [", + depth+1, (UV)state)); if (idx >= 0) { SV ** const tmp = av_fetch( revcharmap, idx, 0); @@ -2784,14 +3269,14 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( folder ) TRIE_BITMAP_SET(trie, folder[ *ch ]); DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, "%s", (char*)ch) + Perl_re_printf( aTHX_ "%s", (char*)ch) ); } } TRIE_BITMAP_SET(trie,*ch); if ( folder ) TRIE_BITMAP_SET(trie,folder[ *ch ]); - DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch)); + DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch)); } idx = ofs; } @@ -2802,9 +3287,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, char *ch = SvPV( *tmp, len ); DEBUG_OPTIMISE_r({ SV *sv=sv_newmortal(); - PerlIO_printf( Perl_debug_log, - "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", - (int)depth * 2 + 2, "", + Perl_re_indentf( aTHX_ "Prefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", + depth+1, (UV)state, (UV)idx, pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, PL_colors[0], PL_colors[1], @@ -2824,7 +3308,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, } else { #ifdef DEBUGGING if (state>1) - DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n")); + DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n")); #endif break; } @@ -2981,8 +3465,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, : MADE_TRIE; } -STATIC void -S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) +STATIC regnode * +S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) { /* The Trie is constructed and compressed now so we can build a fail array if * it's needed @@ -3020,13 +3504,27 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode U32 *fail; reg_ac_data *aho; const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); + regnode *stclass; GET_RE_DEBUG_FLAGS_DECL; - PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; + PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; + PERL_UNUSED_CONTEXT; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif + if ( OP(source) == TRIE ) { + struct regnode_1 *op = (struct regnode_1 *) + PerlMemShared_calloc(1, sizeof(struct regnode_1)); + StructCopy(source,op,struct regnode_1); + stclass = (regnode *)op; + } else { + struct regnode_charclass *op = (struct regnode_charclass *) + PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); + StructCopy(source,op,struct regnode_charclass); + stclass = (regnode *)op; + } + OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */ ARG_SET( stclass, data_slot ); aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); @@ -3082,31 +3580,31 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode */ fail[ 0 ] = fail[ 1 ] = 0; DEBUG_TRIE_COMPILE_r({ - PerlIO_printf(Perl_debug_log, - "%*sStclass Failtable (%"UVuf" states): 0", - (int)(depth * 2), "", (UV)numstates + Perl_re_indentf( aTHX_ "Stclass Failtable (%"UVuf" states): 0", + depth, (UV)numstates ); for( q_read=1; q_read<numstates; q_read++ ) { - PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]); + Perl_re_printf( aTHX_ ", %"UVuf, (UV)fail[q_read]); } - PerlIO_printf(Perl_debug_log, "\n"); + Perl_re_printf( aTHX_ "\n"); }); Safefree(q); /*RExC_seen |= REG_TRIEDFA_SEEN;*/ + return stclass; } -#define DEBUG_PEEP(str,scan,depth) \ - DEBUG_OPTIMISE_r({if (scan){ \ - SV * const mysv=sv_newmortal(); \ - regnode *Next = regnext(scan); \ - regprop(RExC_rx, mysv, scan, NULL); \ - PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \ - (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\ - Next ? (REG_NODE_NUM(Next)) : 0 ); \ +#define DEBUG_PEEP(str,scan,depth) \ + DEBUG_OPTIMISE_r({if (scan){ \ + regnode *Next = regnext(scan); \ + regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);\ + Perl_re_indentf( aTHX_ "" str ">%3d: %s (%d)", \ + depth, REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\ + Next ? (REG_NODE_NUM(Next)) : 0 );\ + DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\ + Perl_re_printf( aTHX_ "\n"); \ }}); - /* The below joins as many adjacent EXACTish nodes as possible into a single * one. The regop may be changed if the node(s) contain certain sequences that * require special handling. The joining is only done if: @@ -3117,6 +3615,14 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * The adjacent nodes actually may be separated by NOTHING-kind nodes, and * these get optimized out * + * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full + * as possible, even if that means splitting an existing node so that its first + * part is moved to the preceeding node. This would maximise the efficiency of + * memEQ during matching. Elsewhere in this file, khw proposes splitting + * EXACTFish nodes into portions that don't change under folding vs those that + * do. Those portions that don't change may be the only things in the pattern that + * could be used to find fixed and floating strings. + * * If a node is to match under /i (folded), the number of characters it matches * can be different than its character length if it contains a multi-character * fold. *min_subtract is set to the total delta number of characters of the @@ -3324,7 +3830,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, * this final joining, sequences could have been split over boundaries, and * hence missed). The sequences only happen in folding, hence for any * non-EXACT EXACTish node */ - if (OP(scan) != EXACT) { + if (OP(scan) != EXACT && OP(scan) != EXACTL) { U8* s0 = (U8*) STRING(scan); U8* s = s0; U8* s_end = s0 + STR_LEN(scan); @@ -3416,9 +3922,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, else { /* Here is a generic multi-char fold. */ U8* multi_end = s + len; - /* Count how many characters in it. In the case of /aa, no - * folds which contain ASCII code points are allowed, so - * check for those, and skip if found. */ + /* Count how many characters are in it. In the case of + * /aa, no folds which contain ASCII code points are + * allowed, so check for those, and skip if found. */ if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { count = utf8_length(s, multi_end); s = multi_end; @@ -3465,6 +3971,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, * this function, we need to flag any occurrences of the sharp s. * This character forbids trie formation (because of added * complexity) */ +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) while (s < s_end) { if (*s == LATIN_SMALL_LETTER_SHARP_S) { OP(scan) = EXACTFA_NO_TRIE; @@ -3472,7 +3981,6 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, break; } s++; - continue; } } else { @@ -3500,8 +4008,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, } if (len == 2 - && isARG2_lower_or_UPPER_ARG1('s', *s) - && isARG2_lower_or_UPPER_ARG1('s', *(s+1))) + && isALPHA_FOLD_EQ(*s, 's') + && isALPHA_FOLD_EQ(*(s+1), 's')) { /* EXACTF nodes need to know that the minimum length @@ -3518,6 +4026,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, *min_subtract += len - 1; s += len; } +#endif } } @@ -3547,17 +4056,17 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, Newx(and_withp,1, regnode_ssc); \ SAVEFREEPV(and_withp) -/* this is a chain of data about sub patterns we are processing that - need to be handled separately/specially in study_chunk. Its so - we can simulate recursion without losing state. */ -struct scan_frame; -typedef struct scan_frame { - regnode *last; /* last node to process in this frame */ - regnode *next; /* next node to process when last is reached */ - struct scan_frame *prev; /*previous frame*/ - U32 prev_recursed_depth; - I32 stop; /* what stopparen do we use */ -} scan_frame; + +static void +S_unwind_scan_frames(pTHX_ const void *p) +{ + scan_frame *f= (scan_frame *)p; + do { + scan_frame *n= f->next_frame; + Safefree(f); + f= n; + } while (f); +} STATIC SSize_t @@ -3577,7 +4086,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* recursed: which subroutines have we recursed into */ /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ { - dVAR; /* There must be at least this number of characters to match */ SSize_t min = 0; I32 pars = 0, code; @@ -3595,9 +4103,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, PERL_ARGS_ASSERT_STUDY_CHUNK; -#ifdef DEBUGGING - StructCopy(&zero_scan_data, &data_fake, scan_data_t); -#endif + if ( depth == 0 ) { while (first_non_open && OP(first_non_open) == OPEN) first_non_open=regnext(first_non_open); @@ -3605,42 +4111,57 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, fake_study_recurse: + DEBUG_r( + RExC_study_chunk_recursed_count++; + ); + DEBUG_OPTIMISE_MORE_r( + { + Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p", + depth, (long)stopparen, + (unsigned long)RExC_study_chunk_recursed_count, + (unsigned long)depth, (unsigned long)recursed_depth, + scan, + last); + if (recursed_depth) { + U32 i; + U32 j; + for ( j = 0 ; j < recursed_depth ; j++ ) { + for ( i = 0 ; i < (U32)RExC_npar ; i++ ) { + if ( + PAREN_TEST(RExC_study_chunk_recursed + + ( j * RExC_study_chunk_recursed_bytes), i ) + && ( + !j || + !PAREN_TEST(RExC_study_chunk_recursed + + (( j - 1 ) * RExC_study_chunk_recursed_bytes), i) + ) + ) { + Perl_re_printf( aTHX_ " %d",(int)i); + break; + } + } + if ( j + 1 < recursed_depth ) { + Perl_re_printf( aTHX_ ","); + } + } + } + Perl_re_printf( aTHX_ "\n"); + } + ); while ( scan && OP(scan) != END && scan < last ){ UV min_subtract = 0; /* How mmany chars to subtract from the minimum node length to get a real minimum (because the folded version may be shorter) */ bool unfolded_multi_char = FALSE; /* Peephole optimizer: */ - DEBUG_OPTIMISE_MORE_r( - { - PerlIO_printf(Perl_debug_log, - "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ", - ((int) depth*2), "", (long)stopparen, - (unsigned long)depth, (unsigned long)recursed_depth); - if (recursed_depth) { - U32 i; - U32 j; - for ( j = 0 ; j < recursed_depth ; j++ ) { - PerlIO_printf(Perl_debug_log,"["); - for ( i = 0 ; i < (U32)RExC_npar ; i++ ) - PerlIO_printf(Perl_debug_log,"%d", - PAREN_TEST(RExC_study_chunk_recursed + - (j * RExC_study_chunk_recursed_bytes), i) - ? 1 : 0 - ); - PerlIO_printf(Perl_debug_log,"]"); - } - } - PerlIO_printf(Perl_debug_log,"\n"); - } - ); DEBUG_STUDYDATA("Peep:", data, depth); DEBUG_PEEP("Peep", scan, depth); - /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/ - * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled - * by a different invocation of reg() -- Yves + /* The reason we do this here is that we need to deal with things like + * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT + * parsing code, as each (?:..) is handled by a different invocation of + * reg() -- Yves */ JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); @@ -3667,17 +4188,47 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, NEXT_OFF(scan) = off; } - - /* The principal pseudo-switch. Cannot be a switch, since we look into several different things. */ - if (OP(scan) == BRANCH || OP(scan) == BRANCHJ - || OP(scan) == IFTHEN) { + if ( OP(scan) == DEFINEP ) { + SSize_t minlen = 0; + SSize_t deltanext = 0; + SSize_t fake_last_close = 0; + I32 f = SCF_IN_DEFINE; + + StructCopy(&zero_scan_data, &data_fake, scan_data_t); + scan = regnext(scan); + assert( OP(scan) == IFTHEN ); + DEBUG_PEEP("expect IFTHEN", scan, depth); + + data_fake.last_closep= &fake_last_close; + minlen = *minlenp; + next = regnext(scan); + scan = NEXTOPER(NEXTOPER(scan)); + DEBUG_PEEP("scan", scan, depth); + DEBUG_PEEP("next", next, depth); + + /* we suppose the run is continuous, last=next... + * NOTE we dont use the return here! */ + (void)study_chunk(pRExC_state, &scan, &minlen, + &deltanext, next, &data_fake, stopparen, + recursed_depth, NULL, f, depth+1); + + scan = next; + } else + if ( + OP(scan) == BRANCH || + OP(scan) == BRANCHJ || + OP(scan) == IFTHEN + ) { next = regnext(scan); code = OP(scan); - /* demq: the op(next)==code check is to see if we have - * "branch-branch" AFAICT */ + /* The op(next)==code check below is to see if we + * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN" + * IFTHEN is special as it might not appear in pairs. + * Not sure whether BRANCH-BRANCHJ is possible, regardless + * we dont handle it cleanly. */ if (OP(next) == code || code == IFTHEN) { /* NOTE - There is similar code to this block below for * handling TRIE nodes on a re-study. If you change stuff here @@ -3699,8 +4250,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 f = 0; regnode_ssc this_class; + DEBUG_PEEP("Branch", scan, depth); + num++; - data_fake.flags = 0; + StructCopy(&zero_scan_data, &data_fake, scan_data_t); if (data) { data_fake.whilem_c = data->whilem_c; data_fake.last_closep = data->last_closep; @@ -3710,9 +4263,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data_fake.pos_delta = delta; next = regnext(scan); - scan = NEXTOPER(scan); - if (code != BRANCH) + + scan = NEXTOPER(scan); /* everything */ + if (code != BRANCH) /* everything but BRANCH */ scan = NEXTOPER(scan); + if (flags & SCF_DO_STCLASS) { ssc_init(pRExC_state, &this_class); data_fake.start_class = &this_class; @@ -3725,6 +4280,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext, next, &data_fake, stopparen, recursed_depth, NULL, f,depth+1); + if (min1 > minnext) min1 = minnext; if (deltanext == SSize_t_MAX) { @@ -3850,9 +4406,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, U8 trietype = 0; U32 count=0; -#ifdef DEBUGGING - SV * const mysv = sv_newmortal(); /* for dumping */ -#endif /* var tail is used because there may be a TAIL regop in the way. Ie, the exacts will point to the thing following the TAIL, but the last branch will @@ -3868,11 +4421,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, tail, NULL); - PerlIO_printf( Perl_debug_log, "%*s%s%s\n", - (int)depth * 2 + 2, "", - "Looking for TRIE'able sequences. Tail node is: ", - SvPV_nolen_const( mysv ) + regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state); + Perl_re_indentf( aTHX_ "%s %"UVuf":%s\n", + depth+1, + "Looking for TRIE'able sequences. Tail node is ", + (UV)(tail - RExC_emit_start), + SvPV_nolen_const( RExC_mysv ) ); }); @@ -3928,14 +4482,24 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, EXACTFU | EXACTFU EXACTFU_SS | EXACTFU EXACTFA | EXACTFA + EXACTL | EXACTL + EXACTFLU8 | EXACTFLU8 */ -#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ - ( EXACT == (X) ) ? EXACT : \ - ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \ - ( EXACTFA == (X) ) ? EXACTFA : \ - 0 ) +#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \ + ? NOTHING \ + : ( EXACT == (X) ) \ + ? EXACT \ + : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \ + ? EXACTFU \ + : ( EXACTFA == (X) ) \ + ? EXACTFA \ + : ( EXACTL == (X) ) \ + ? EXACTL \ + : ( EXACTFLU8 == (X) ) \ + ? EXACTFLU8 \ + : 0 ) /* dont use tail as the end marker for this traverse */ for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { @@ -3944,25 +4508,26 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, U8 noper_trietype = TRIE_TYPE( noper_type ); #if defined(DEBUGGING) || defined(NOJUMPTRIE) regnode * const noper_next = regnext( noper ); - U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0; - U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0; + U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0; #endif DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur, NULL); - PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", - (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); + regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); + Perl_re_indentf( aTHX_ "- %d:%s (%d)", + depth+1, + REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) ); - regprop(RExC_rx, mysv, noper, NULL); - PerlIO_printf( Perl_debug_log, " -> %s", - SvPV_nolen_const(mysv)); + regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state); + Perl_re_printf( aTHX_ " -> %d:%s", + REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv)); if ( noper_next ) { - regprop(RExC_rx, mysv, noper_next, NULL); - PerlIO_printf( Perl_debug_log,"\t=> %s\t", - SvPV_nolen_const(mysv)); + regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state); + Perl_re_printf( aTHX_ "\t=> %d:%s\t", + REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv)); } - PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n", + Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n", REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] ); @@ -3973,12 +4538,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( noper_trietype && ( - ( noper_trietype == NOTHING) + ( noper_trietype == NOTHING ) || ( trietype == NOTHING ) || ( trietype == noper_trietype ) ) #ifdef NOJUMPTRIE - && noper_next == tail + && noper_next >= tail #endif && count < U16_MAX) { @@ -3991,7 +4556,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( noper_trietype == NOTHING ) { #if !defined(DEBUGGING) && !defined(NOJUMPTRIE) regnode * const noper_next = regnext( noper ); - U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0; + U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; #endif @@ -4038,7 +4603,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } if ( noper_trietype #ifdef NOJUMPTRIE - && noper_next == tail + && noper_next >= tail #endif ){ /* noper is triable, so we can start a new @@ -4057,11 +4622,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* end handle unmergable node */ } /* loop over branches */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur, NULL); - PerlIO_printf( Perl_debug_log, - "%*s- %s (%d) <SCAN FINISHED>\n", - (int)depth * 2 + 2, - "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); + regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); + Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ", + depth+1, SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur)); + Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n", + REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), + PL_reg_name[trietype] + ); }); if ( last && trietype ) { @@ -4079,7 +4646,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, depth==0 ) { flags |= SCF_TRIE_RESTUDY; if ( startbranch == first - && scan == tail ) + && scan >= tail ) { RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; } @@ -4097,10 +4664,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * something like this: (?:|) So we can * turn it into a plain NOTHING op. */ DEBUG_TRIE_COMPILE_r({ - regprop(RExC_rx, mysv, cur, NULL); - PerlIO_printf( Perl_debug_log, - "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2, - "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); + regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); + Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n", + depth+1, + SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur)); }); OP(startbranch)= NOTHING; @@ -4120,29 +4687,64 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else /* single branch is optimized. */ scan = NEXTOPER(scan); continue; - } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) { - scan_frame *newframe = NULL; - I32 paren; - regnode *start; - regnode *end; + } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) { + I32 paren = 0; + regnode *start = NULL; + regnode *end = NULL; U32 my_recursed_depth= recursed_depth; - if (OP(scan) != SUSPEND) { - /* set the pointer */ - if (OP(scan) == GOSUB) { - paren = ARG(scan); - RExC_recurse[ARG2L(scan)] = scan; - start = RExC_open_parens[paren-1]; - end = RExC_close_parens[paren-1]; - } else { - paren = 0; - start = RExC_rxi->program + 1; - end = RExC_opend; + if (OP(scan) != SUSPEND) { /* GOSUB */ + /* Do setup, note this code has side effects beyond + * the rest of this block. Specifically setting + * RExC_recurse[] must happen at least once during + * study_chunk(). */ + paren = ARG(scan); + RExC_recurse[ARG2L(scan)] = scan; + start = RExC_open_parens[paren]; + end = RExC_close_parens[paren]; + + /* NOTE we MUST always execute the above code, even + * if we do nothing with a GOSUB */ + if ( + ( flags & SCF_IN_DEFINE ) + || + ( + (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF)) + && + ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 ) + ) + ) { + /* no need to do anything here if we are in a define. */ + /* or we are after some kind of infinite construct + * so we can skip recursing into this item. + * Since it is infinite we will not change the maxlen + * or delta, and if we miss something that might raise + * the minlen it will merely pessimise a little. + * + * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/ + * might result in a minlen of 1 and not of 4, + * but this doesn't make us mismatch, just try a bit + * harder than we should. + * */ + scan= regnext(scan); + continue; } - if (!recursed_depth + + if ( + !recursed_depth || !PAREN_TEST(RExC_study_chunk_recursed + ((recursed_depth-1) * RExC_study_chunk_recursed_bytes), paren) ) { + /* it is quite possible that there are more efficient ways + * to do this. We maintain a bitmap per level of recursion + * of which patterns we have entered so we can detect if a + * pattern creates a possible infinite loop. When we + * recurse down a level we copy the previous levels bitmap + * down. When we are at recursion level 0 we zero the top + * level bitmap. It would be nice to implement a different + * more efficient way of doing this. In particular the top + * level bitmap may be unnecessary. + */ if (!recursed_depth) { Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8); } else { @@ -4151,12 +4753,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, RExC_study_chunk_recursed_bytes, U8); } /* we havent recursed into this paren yet, so recurse into it */ - DEBUG_STUDYDATA("set:", data,depth); + DEBUG_STUDYDATA("gosub-set:", data,depth); PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); my_recursed_depth= recursed_depth + 1; - Newx(newframe,1,scan_frame); } else { - DEBUG_STUDYDATA("inf:", data,depth); + DEBUG_STUDYDATA("gosub-inf:", data,depth); /* some form of infinite recursion, assume infinite length * */ if (flags & SCF_DO_SUBSTR) { @@ -4167,22 +4768,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ ssc_anything(data->start_class); flags &= ~SCF_DO_STCLASS; + + start= NULL; /* reset start so we dont recurse later on. */ } } else { - Newx(newframe,1,scan_frame); paren = stopparen; - start = scan+2; + start = scan + 2; end = regnext(scan); } - if (newframe) { - assert(start); + if (start) { + scan_frame *newframe; assert(end); - SAVEFREEPV(newframe); - newframe->next = regnext(scan); - newframe->last = last; - newframe->stop = stopparen; - newframe->prev = frame; + if (!RExC_frame_last) { + Newxz(newframe, 1, scan_frame); + SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe); + RExC_frame_head= newframe; + RExC_frame_count++; + } else if (!RExC_frame_last->next_frame) { + Newxz(newframe,1,scan_frame); + RExC_frame_last->next_frame= newframe; + newframe->prev_frame= RExC_frame_last; + RExC_frame_count++; + } else { + newframe= RExC_frame_last->next_frame; + } + RExC_frame_last= newframe; + + newframe->next_regnode = regnext(scan); + newframe->last_regnode = last; + newframe->stopparen = stopparen; newframe->prev_recursed_depth = recursed_depth; + newframe->this_prev_frame= frame; DEBUG_STUDYDATA("frame-new:",data,depth); DEBUG_PEEP("fnew", scan, depth); @@ -4197,7 +4813,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, continue; } } - else if (OP(scan) == EXACT) { + else if (OP(scan) == EXACT || OP(scan) == EXACTL) { SSize_t l = STR_LEN(scan); UV uc; if (UTF) { @@ -4236,7 +4852,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * can't match null string */ if (flags & SCF_DO_STCLASS_AND) { ssc_cp_and(data->start_class, uc); - ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; ssc_clear_locale(data->start_class); } else if (flags & SCF_DO_STCLASS_OR) { @@ -4244,15 +4860,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; } - else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ + else if (PL_regkind[OP(scan)] == EXACT) { + /* But OP != EXACT!, so is EXACTFish */ SSize_t l = STR_LEN(scan); - UV uc = *((U8*)STRING(scan)); - SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2 - separate code points */ + const U8 * s = (U8*)STRING(scan); /* Search for fixed substrings supports EXACT only. */ if (flags & SCF_DO_SUBSTR) { @@ -4260,8 +4875,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, scan_commit(pRExC_state, data, minlenp, is_inf); } if (UTF) { - const U8 * const s = (U8 *)STRING(scan); - uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); } if (unfolded_multi_char) { @@ -4280,88 +4893,28 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - if (OP(scan) == EXACTFL) { - /* We don't know what the folds are; it could be anything. XXX - * Actually, we only support UTF-8 encoding for code points - * above Latin1, so we could know what those folds are. */ - EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, - 0, - UV_MAX); - } - else { /* Non-locale EXACTFish */ - EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (flags & SCF_DO_STCLASS) { + SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan); + + assert(EXACTF_invlist); if (flags & SCF_DO_STCLASS_AND) { - ssc_clear_locale(data->start_class); + if (OP(scan) != EXACTFL) + ssc_clear_locale(data->start_class); + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; + ANYOF_POSIXL_ZERO(data->start_class); + ssc_intersection(data->start_class, EXACTF_invlist, FALSE); } - if (uc < 256) { /* We know what the Latin1 folds are ... */ - if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we - know if anything folds - with this */ - EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, - PL_fold_latin1[uc]); - if (OP(scan) != EXACTFA) { /* The folds below aren't - legal under /iaa */ - if (isARG2_lower_or_UPPER_ARG1('s', uc)) { - EXACTF_invlist - = add_cp_to_invlist(EXACTF_invlist, - LATIN_SMALL_LETTER_SHARP_S); - } - else if (uc == LATIN_SMALL_LETTER_SHARP_S) { - EXACTF_invlist - = add_cp_to_invlist(EXACTF_invlist, 's'); - EXACTF_invlist - = add_cp_to_invlist(EXACTF_invlist, 'S'); - } - } + else { /* SCF_DO_STCLASS_OR */ + ssc_union(data->start_class, EXACTF_invlist, FALSE); + ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - /* We also know if there are above-Latin1 code points - * that fold to this (none legal for ASCII and /iaa) */ - if ((! isASCII(uc) || OP(scan) != EXACTFA) - && HAS_NONLATIN1_FOLD_CLOSURE(uc)) - { - /* XXX We could know exactly what does fold to this - * if the reverse folds are loaded, as currently in - * S_regclass() */ - _invlist_union(EXACTF_invlist, - PL_AboveLatin1, - &EXACTF_invlist); - } - } - } - else { /* Non-locale, above Latin1. XXX We don't currently - know what participates in folds with this, so have - to assume anything could */ - - /* XXX We could know exactly what does fold to this if the - * reverse folds are loaded, as currently in S_regclass(). - * But we do know that under /iaa nothing in the ASCII - * range can participate */ - if (OP(scan) == EXACTFA) { - _invlist_union_complement_2nd(EXACTF_invlist, - PL_XPosix_ptrs[_CC_ASCII], - &EXACTF_invlist); - } - else { - EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, - 0, UV_MAX); - } + /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; } + flags &= ~SCF_DO_STCLASS; + SvREFCNT_dec(EXACTF_invlist); } - if (flags & SCF_DO_STCLASS_AND) { - ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; - ANYOF_POSIXL_ZERO(data->start_class); - ssc_intersection(data->start_class, EXACTF_invlist, FALSE); - } - else if (flags & SCF_DO_STCLASS_OR) { - ssc_union(data->start_class, EXACTF_invlist, FALSE); - ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp); - - /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; - } - flags &= ~SCF_DO_STCLASS; - SvREFCNT_dec(EXACTF_invlist); } else if (REGNODE_VARIES(OP(scan))) { SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0; @@ -4378,7 +4931,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, case PLUS: if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { next = NEXTOPER(scan); - if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { + if (OP(next) == EXACT + || OP(next) == EXACTL + || (flags & SCF_DO_STCLASS)) + { mincount = 1; maxcount = REG_INFTY; next = regnext(scan); @@ -4389,7 +4945,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_SUBSTR) data->pos_min++; min++; - /* Fall through. */ + /* FALLTHROUGH */ case STAR: if (flags & SCF_DO_STCLASS) { mincount = 0; @@ -4477,7 +5033,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, flags &= ~SCF_DO_STCLASS_AND; StructCopy(&this_class, data->start_class, regnode_ssc); flags |= SCF_DO_STCLASS_OR; - ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) + |= SSC_MATCHES_EMPTY_STRING; } } else { /* Non-zero len */ if (flags & SCF_DO_STCLASS_OR) { @@ -4500,8 +5057,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, { /* Fatal warnings may leak the regexp without this: */ SAVEFREESV(RExC_rx_sv); - ckWARNreg(RExC_parse, - "Quantifier unexpected on zero-length expression"); + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), + "Quantifier unexpected on zero-length expression " + "in regex m/%"UTF8f"/", + UTF8fARG(UTF, RExC_precomp_end - RExC_precomp, + RExC_precomp)); (void)ReREFCNT_inc(RExC_rx_sv); } @@ -4540,8 +5100,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (OP(nxt) != CLOSE) goto nogo; if (RExC_open_parens) { - RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ - RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/ + RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)]=nxt+2; /*close->while*/ } /* Now we know that nxt2 is the only contents: */ oscan->flags = (U8)ARG(nxt); @@ -4587,8 +5147,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, oscan->flags = (U8)ARG(nxt); if (RExC_open_parens) { - RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ - RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/ + RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)]=nxt2+1; /*close->NOTHING*/ } OP(nxt1) = OPTIMIZED; /* was OPEN. */ OP(nxt) = OPTIMIZED; /* was CLOSE. */ @@ -4685,20 +5245,23 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else { /* start offset must point into the last copy */ data->last_start_min += minnext * (mincount - 1); - data->last_start_max += is_inf ? SSize_t_MAX - : (maxcount - 1) * (minnext + data->pos_delta); + data->last_start_max = + is_inf + ? SSize_t_MAX + : data->last_start_max + + (maxcount - 1) * (minnext + data->pos_delta); } } /* It is counted once already... */ data->pos_min += minnext * (mincount - counted); #if 0 -PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf - " SSize_t_MAX=%"UVdf" minnext=%"UVdf - " maxcount=%"UVdf" mincount=%"UVdf"\n", +Perl_re_printf( aTHX_ "counted=%"UVuf" deltanext=%"UVuf + " SSize_t_MAX=%"UVuf" minnext=%"UVuf + " maxcount=%"UVuf" mincount=%"UVuf"\n", (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, (UV)mincount); if (deltanext != SSize_t_MAX) -PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", +Perl_re_printf( aTHX_ "LHS=%"UVuf" RHS=%"UVuf"\n", (UV)(-counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); #endif @@ -4773,7 +5336,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", ssc_intersection(data->start_class, PL_XPosix_ptrs[_CC_VERTSPACE], FALSE); ssc_clear_locale(data->start_class); - ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) + &= ~SSC_MATCHES_EMPTY_STRING; } else if (flags & SCF_DO_STCLASS_OR) { ssc_union(data->start_class, @@ -4783,12 +5347,14 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", /* See commit msg for * 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) + &= ~SSC_MATCHES_EMPTY_STRING; } flags &= ~SCF_DO_STCLASS; } min++; - delta++; /* Because of the 2 char string cr-lf */ + if (delta != SSize_t_MAX) + delta++; /* Because of the 2 char string cr-lf */ if (flags & SCF_DO_SUBSTR) { /* Cannot expect anything... */ scan_commit(pRExC_state, data, minlenp, is_inf); @@ -4810,7 +5376,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", U8 namedclass; /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */ - ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING; /* Some of the logic below assumes that switching locale on will only add false positives. */ @@ -4821,7 +5387,6 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); #endif - case CANY: case SANY: if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ ssc_match_all_cp(data->start_class); @@ -4850,6 +5415,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", } break; + case ANYOFD: + case ANYOFL: case ANYOF: if (flags & SCF_DO_STCLASS_AND) ssc_and(pRExC_state, data->start_class, @@ -4861,7 +5428,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", case NPOSIXL: invert = 1; - /* FALL THROUGH */ + /* FALLTHROUGH */ case POSIXL: namedclass = classnum_to_namedclass(FLAGS(scan)) + invert; @@ -4902,7 +5469,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", case NPOSIXA: /* For these, we always know the exact set of what's matched */ invert = 1; - /* FALL THROUGH */ + /* FALLTHROUGH */ case POSIXA: if (FLAGS(scan) == _CC_ASCII) { my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]); @@ -4917,7 +5484,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", case NPOSIXD: case NPOSIXU: invert = 1; - /* FALL THROUGH */ + /* FALLTHROUGH */ case POSIXD: case POSIXU: my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]); @@ -4959,34 +5526,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", else if ( PL_regkind[OP(scan)] == BRANCHJ /* Lookbehind, or need to calculate parens/evals/stclass: */ && (scan->flags || data || (flags & SCF_DO_STCLASS)) - && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { - if ( OP(scan) == UNLESSM && - scan->flags == 0 && - OP(NEXTOPER(NEXTOPER(scan))) == NOTHING && - OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED - ) { - regnode *opt; - regnode *upto= regnext(scan); - DEBUG_PARSE_r({ - SV * const mysv_val=sv_newmortal(); - DEBUG_STUDYDATA("OPFAIL",data,depth); - - /*DEBUG_PARSE_MSG("opfail");*/ - regprop(RExC_rx, mysv_val, upto, NULL); - PerlIO_printf(Perl_debug_log, - "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", - SvPV_nolen_const(mysv_val), - (IV)REG_NODE_NUM(upto), - (IV)(upto - scan) - ); - }); - OP(scan) = OPFAIL; - NEXT_OFF(scan) = upto - scan; - for (opt= scan + 1; opt < upto ; opt++) - OP(opt) = OPTIMIZED; - scan= upto; - continue; - } + && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) + { if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY || OP(scan) == UNLESSM ) { @@ -4999,7 +5540,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", regnode_ssc intrnl; int f = 0; - data_fake.flags = 0; + StructCopy(&zero_scan_data, &data_fake, scan_data_t); if (data) { data_fake.whilem_c = data->whilem_c; data_fake.last_closep = data->last_closep; @@ -5051,7 +5592,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", * assertions are zero-length, so can match an EMPTY * string */ ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); - ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) + |= SSC_MATCHES_EMPTY_STRING; } } } @@ -5123,7 +5665,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", if (f & SCF_DO_STCLASS_AND) { ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl); - ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING; + ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING; } if (data) { if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -5242,7 +5784,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", SSize_t deltanext=0, minnext=0, f = 0, fake; regnode_ssc this_class; - data_fake.flags = 0; + StructCopy(&zero_scan_data, &data_fake, scan_data_t); if (data) { data_fake.whilem_c = data->whilem_c; data_fake.last_closep = data->last_closep; @@ -5305,7 +5847,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", data->longest = &(data->longest_float); } min += min1; - delta += max1 - min1; + if (delta != SSize_t_MAX) + delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum); if (min1) { @@ -5365,16 +5908,19 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", } */ if (frame) { + depth = depth - 1; + DEBUG_STUDYDATA("frame-end:",data,depth); DEBUG_PEEP("fend", scan, depth); + /* restore previous context */ - last = frame->last; - scan = frame->next; - stopparen = frame->stop; + last = frame->last_regnode; + scan = frame->next_regnode; + stopparen = frame->stopparen; recursed_depth = frame->prev_recursed_depth; - depth = depth - 1; - frame = frame->prev; + RExC_frame_last = frame->prev_frame; + frame = frame->this_prev_frame; goto fake_study_recurse; } @@ -5407,12 +5953,15 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", { SSize_t final_minlen= min < stopmin ? min : stopmin; - if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) { - RExC_maxlen = final_minlen + delta; + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) { + if (final_minlen > SSize_t_MAX - delta) + RExC_maxlen = SSize_t_MAX; + else if (RExC_maxlen < final_minlen + delta) + RExC_maxlen = final_minlen + delta; } return final_minlen; } - /* not-reached */ + NOT_REACHED; /* NOTREACHED */ } STATIC U32 @@ -5434,12 +5983,12 @@ S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) return count; } -/*XXX: todo make this not included in a non debugging perl */ +/*XXX: todo make this not included in a non debugging perl, but appears to be + * used anyway there, in 'use re' */ #ifndef PERL_IN_XSUB_RE void Perl_reginitcolors(pTHX) { - dVAR; const char * const s = PerlEnv_getenv("PERL_RE_COLORS"); if (s) { char *t = savepv(s); @@ -5493,8 +6042,6 @@ Perl_reginitcolors(pTHX) regexp_engine const * Perl_current_re_engine(pTHX) { - dVAR; - if (IN_PERL_COMPILETIME) { HV * const table = GvHV(PL_hintgv); SV **ptr; @@ -5521,7 +6068,6 @@ Perl_current_re_engine(pTHX) REGEXP * Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) { - dVAR; regexp_engine const *eng = current_re_engine(); GET_RE_DEBUG_FLAGS_DECL; @@ -5529,7 +6075,7 @@ Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) /* Dispatch a request to compile a regexp to correct regexp engine. */ DEBUG_COMPILE_r({ - PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", + Perl_re_printf( aTHX_ "Using engine %"UVxf"\n", PTR2UV(eng)); }); return CALLREGCOMP_ENG(eng, pattern, flags); @@ -5568,42 +6114,37 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, char **pat_p, STRLEN *plen_p, int num_code_blocks) { U8 *const src = (U8*)*pat_p; - U8 *dst; + U8 *dst, *d; int n=0; - STRLEN s = 0, d = 0; + STRLEN s = 0; bool do_end = 0; GET_RE_DEBUG_FLAGS_DECL; - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + DEBUG_PARSE_r(Perl_re_printf( aTHX_ "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); Newx(dst, *plen_p * 2 + 1, U8); + d = dst; while (s < *plen_p) { - if (NATIVE_BYTE_IS_INVARIANT(src[s])) - dst[d] = src[s]; - else { - dst[d++] = UTF8_EIGHT_BIT_HI(src[s]); - dst[d] = UTF8_EIGHT_BIT_LO(src[s]); - } + append_utf8_from_native_byte(src[s], &d); if (n < num_code_blocks) { if (!do_end && pRExC_state->code_blocks[n].start == s) { - pRExC_state->code_blocks[n].start = d; - assert(dst[d] == '('); + pRExC_state->code_blocks[n].start = d - dst - 1; + assert(*(d - 1) == '('); do_end = 1; } else if (do_end && pRExC_state->code_blocks[n].end == s) { - pRExC_state->code_blocks[n].end = d; - assert(dst[d] == ')'); + pRExC_state->code_blocks[n].end = d - dst - 1; + assert(*(d - 1) == ')'); do_end = 0; n++; } } s++; - d++; } - dst[d] = '\0'; - *plen_p = d; + *d = '\0'; + *plen_p = d - dst; *pat_p = (char*) dst; SAVEFREEPV(*pat_p); RExC_orig_utf8 = RExC_utf8 = 1; @@ -5637,7 +6178,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, /* if we know we have at least two args, create an empty string, * then concatenate args to that. For no args, return an empty string */ if (!pat && pat_count != 1) { - pat = newSVpvn("", 0); + pat = newSVpvs(""); SAVEFREESV(pat); alloced = TRUE; } @@ -5673,7 +6214,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, if (oplist) { assert(oplist->op_type == OP_PADAV || oplist->op_type == OP_RV2AV); - oplist = oplist->op_sibling;; + oplist = OpSIBLING(oplist); } if (SvRMAGICAL(av)) { @@ -5720,10 +6261,10 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, pRExC_state->code_blocks[n].src_regex = NULL; n++; code = 1; - oplist = oplist->op_sibling; /* skip CONST */ + oplist = OpSIBLING(oplist); /* skip CONST */ assert(oplist); } - oplist = oplist->op_sibling;; + oplist = OpSIBLING(oplist);; } /* apply magic and QR overloading to arg */ @@ -5843,6 +6384,8 @@ S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, { int n = 0; STRLEN s; + + PERL_UNUSED_CONTEXT; for (s = 0; s < plen; s++) { if (n < pRExC_state->num_code_blocks @@ -5945,7 +6488,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, *p++ = 'x'; *p++ = '\0'; DEBUG_COMPILE_r({ - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "%sre-parsing pattern for runtime code:%s %s\n", PL_colors[4],PL_colors[5],newpat); }); @@ -6158,7 +6701,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, OP *expr, const regexp_engine* eng, REGEXP *old_re, bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags) { - dVAR; REGEXP *rx; struct regexp *r; regexp_internal *ri; @@ -6195,18 +6737,35 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, DEBUG_r(if (!PL_colorset) reginitcolors()); -#ifndef PERL_IN_XSUB_RE /* Initialize these here instead of as-needed, as is quick and avoids * having to test them each time otherwise */ if (! PL_AboveLatin1) { +#ifdef DEBUGGING + char * dump_len_string; +#endif + PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); PL_Latin1 = _new_invlist_C_array(Latin1_invlist); PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); PL_utf8_foldable = _new_invlist_C_array(_Perl_Any_Folds_invlist); PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Folds_To_Multi_Char_invlist); - } + + /* This is calculated here, because the Perl program that generates the + * static global ones doesn't currently have access to + * NUM_ANYOF_CODE_POINTS */ + PL_InBitmap = _new_invlist(2); + PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0, + NUM_ANYOF_CODE_POINTS - 1); +#ifdef DEBUGGING + dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN"); + if ( ! dump_len_string + || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL)) + { + PL_dump_re_max_len = 0; + } #endif + } pRExC_state->code_blocks = NULL; pRExC_state->num_code_blocks = 0; @@ -6220,7 +6779,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, OP *o; int ncode = 0; - for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) + for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) ncode++; /* count of DO blocks */ if (ncode) { @@ -6241,7 +6800,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (expr->op_type == OP_CONST) n = 1; else - for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) { + for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) { if (o->op_type == OP_CONST) n++; } @@ -6257,14 +6816,14 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (expr->op_type == OP_CONST) new_patternp[n] = cSVOPx_sv(expr); else - for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) { + for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) { if (o->op_type == OP_CONST) new_patternp[n++] = cSVOPo_sv; } } - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Assembling pattern from %d elements%s\n", pat_count, orig_rx_flags & RXf_SPLIT ? " for split" : "")); @@ -6277,7 +6836,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, assert( expr->op_type == OP_PUSHMARK || (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK) || expr->op_type == OP_PADRANGE); - expr = expr->op_sibling; + expr = OpSIBLING(expr); } pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count, @@ -6293,7 +6852,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, *is_bare_re = TRUE; SvREFCNT_inc(re); Safefree(pRExC_state->code_blocks); - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Precompiled pattern%s\n", orig_rx_flags & RXf_SPLIT ? " for split" : "")); @@ -6318,21 +6877,31 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* ignore the utf8ness if the pattern is 0 length */ RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); + RExC_uni_semantics = 0; + RExC_seen_unfolded_sharp_s = 0; RExC_contains_locale = 0; RExC_contains_i = 0; + RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT); pRExC_state->runtime_code_qr = NULL; + RExC_frame_head= NULL; + RExC_frame_last= NULL; + RExC_frame_count= 0; + DEBUG_r({ + RExC_mysv1= sv_newmortal(); + RExC_mysv2= sv_newmortal(); + }); DEBUG_COMPILE_r({ SV *dsv= sv_newmortal(); RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60); - PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", + Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n", PL_colors[4],PL_colors[5],s); }); redo_first_pass: - /* we jump here if we upgrade the pattern to utf8 and have to - * recompile */ + /* we jump here if we have to recompile, e.g., from upgrading the pattern + * to utf8 */ if ((pm_flags & PMf_USE_RE_EVAL) /* this second condition covers the non-regex literal case, @@ -6366,7 +6935,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (rx_flags & PMf_FOLD) { RExC_contains_i = 1; } - if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + if ( initial_charset == REGEX_DEPENDS_CHARSET + && (RExC_utf8 ||RExC_uni_semantics)) + { /* Set to use unicode semantics if the pattern is in utf8 and has the * 'depends' charset specified, as it means unicode when utf8 */ @@ -6374,11 +6945,13 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, } RExC_precomp = exp; + RExC_precomp_adj = 0; RExC_flags = rx_flags; RExC_pm_flags = pm_flags; if (runtime_code) { - if (TAINTING_get && TAINT_get) + assert(TAINTING_get || !TAINT_get); + if (TAINT_get) Perl_croak(aTHX_ "Eval-group in insecure regular expression"); if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { @@ -6399,12 +6972,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; RExC_override_recoding = 0; +#ifdef EBCDIC + RExC_recode_x_to_native = 0; +#endif RExC_in_multi_char_class = 0; /* First pass: determine size, legality. */ RExC_parse = exp; - RExC_start = exp; + RExC_start = RExC_adjusted_start = exp; RExC_end = exp + plen; + RExC_precomp_end = RExC_end; RExC_naughty = 0; RExC_npar = 1; RExC_nestroot = 0; @@ -6413,7 +6990,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_whilem_seen = 0; RExC_open_parens = NULL; RExC_close_parens = NULL; - RExC_opend = NULL; + RExC_end_op = NULL; RExC_paren_names = NULL; #ifdef DEBUGGING RExC_paren_name_list = NULL; @@ -6424,12 +7001,17 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_recurse_count = 0; pRExC_state->code_index = 0; -#if 0 /* REGC() is (currently) a NOP at the first pass. - * Clever compilers notice this and complain. --jhi */ - REGC((U8)REG_MAGIC, (char*)RExC_emit); -#endif + /* This NUL is guaranteed because the pattern comes from an SV*, and the sv + * code makes sure the final byte is an uncounted NUL. But should this + * ever not be the case, lots of things could read beyond the end of the + * buffer: loops like + * while(isFOO(*RExC_parse)) RExC_parse++; + * strchr(RExC_parse, "foo"); + * etc. So it is worth noting. */ + assert(*RExC_end == '\0'); + DEBUG_PARSE_r( - PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"); + Perl_re_printf( aTHX_ "Starting first pass (sizing)\n"); RExC_lastnum=0; RExC_lastparse=NULL; ); @@ -6453,9 +7035,16 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, at least some part of the pattern, and therefore must convert the whole thing. -- dmq */ - if (flags & RESTART_UTF8) { - S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, + if (flags & RESTART_PASS1) { + if (flags & NEED_UTF8) { + S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, pRExC_state->num_code_blocks); + } + else { + DEBUG_PARSE_r(Perl_re_printf( aTHX_ + "Need to redo pass 1\n")); + } + goto redo_first_pass; } Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags); @@ -6464,7 +7053,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ DEBUG_PARSE_r({ - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "Required size %"IVdf" nodes\n" "Starting second pass (creation)\n", (IV)RExC_size); @@ -6522,7 +7111,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, for (n = 0; n < pRExC_state->num_code_blocks; n++) if (pRExC_state->code_blocks[n].src_regex) SAVEFREESV(pRExC_state->code_blocks[n].src_regex); - SAVEFREEPV(pRExC_state->code_blocks); + if(pRExC_state->code_blocks) + SAVEFREEPV(pRExC_state->code_blocks); /* often null */ } { @@ -6537,25 +7127,25 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, || ! has_charset); bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN) == REG_RUN_ON_COMMENT_SEEN); - U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) + U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); - const char *fptr = STD_PAT_MODS; /*"msix"*/ + const char *fptr = STD_PAT_MODS; /*"msixn"*/ char *p; - /* Allocate for the worst case, which is all the std flags are turned - * on. If more precision is desired, we could do a population count of - * the flags set. This could be done with a small lookup table, or by - * shifting, masking and adding, or even, when available, assembly - * language for a machine-language population count. - * We never output a minus, as all those are defaults, so are + + /* We output all the necessary flags; we never output a minus, as all + * those are defaults, so are * covered by the caret */ const STRLEN wraplen = plen + has_p + has_runon + has_default /* If needs a caret */ + + PL_bitcount[reganch] /* 1 char for each set standard flag */ /* If needs a character set specifier */ + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0) - + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); + /* make sure PL_bitcount bounds not exceeded */ + assert(sizeof(STD_PAT_MODS) <= 8); + Newx(p, wraplen + 1, char); /* +1 for the ending NUL */ r->xpv_len_u.xpvlenu_pv = p; if (RExC_utf8) @@ -6598,28 +7188,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, r->intflags = 0; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ - /* setup various meta data about recursion, this all requires - * RExC_npar to be correctly set, and a bit later on we clear it */ - if (RExC_seen & REG_RECURSE_SEEN) { - Newxz(RExC_open_parens, RExC_npar,regnode *); - SAVEFREEPV(RExC_open_parens); - Newxz(RExC_close_parens,RExC_npar,regnode *); - SAVEFREEPV(RExC_close_parens); - } - if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) { - /* Note, RExC_npar is 1 + the number of parens in a pattern. - * So its 1 if there are no parens. */ - RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + - ((RExC_npar & 0x07) != 0); - Newx(RExC_study_chunk_recursed, - RExC_study_chunk_recursed_bytes * RExC_npar, U8); - SAVEFREEPV(RExC_study_chunk_recursed); - } - /* Useful during FAIL. */ #ifdef RE_TRACK_PATTERN_OFFSETS Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ - DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, + DEBUG_OFFSETS_r(Perl_re_printf( aTHX_ "%s %"UVuf" bytes for offset annotations.\n", ri->u.offsets ? "Got" : "Couldn't get", (UV)((2*RExC_size+1) * sizeof(U32)))); @@ -6635,17 +7207,51 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_parse = exp; RExC_end = exp + plen; RExC_naughty = 0; - RExC_npar = 1; RExC_emit_start = ri->program; RExC_emit = ri->program; RExC_emit_bound = ri->program + RExC_size + 1; pRExC_state->code_index = 0; - REGC((U8)REG_MAGIC, (char*) RExC_emit++); + *((char*) RExC_emit++) = (char) REG_MAGIC; + /* setup various meta data about recursion, this all requires + * RExC_npar to be correctly set, and a bit later on we clear it */ + if (RExC_seen & REG_RECURSE_SEEN) { + DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ + "%*s%*s Setting up open/close parens\n", + 22, "| |", (int)(0 * 2 + 1), "")); + + /* setup RExC_open_parens, which holds the address of each + * OPEN tag, and to make things simpler for the 0 index + * the start of the program - this is used later for offsets */ + Newxz(RExC_open_parens, RExC_npar,regnode *); + SAVEFREEPV(RExC_open_parens); + RExC_open_parens[0] = RExC_emit; + + /* setup RExC_close_parens, which holds the address of each + * CLOSE tag, and to make things simpler for the 0 index + * the end of the program - this is used later for offsets */ + Newxz(RExC_close_parens, RExC_npar,regnode *); + SAVEFREEPV(RExC_close_parens); + /* we dont know where end op starts yet, so we dont + * need to set RExC_close_parens[0] like we do RExC_open_parens[0] above */ + + /* Note, RExC_npar is 1 + the number of parens in a pattern. + * So its 1 if there are no parens. */ + RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + + ((RExC_npar & 0x07) != 0); + Newx(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + SAVEFREEPV(RExC_study_chunk_recursed); + } + RExC_npar = 1; if (reg(pRExC_state, 0, &flags,1) == NULL) { ReREFCNT_dec(rx); Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); } + DEBUG_OPTIMISE_r( + Perl_re_printf( aTHX_ "Starting post parse optimization\n"); + ); + /* XXXX To minimize changes to RE engine we always allocate 3-units-long substrs field. */ Newx(r->substrs, 1, struct reg_substr_data); @@ -6654,12 +7260,17 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, SAVEFREEPV(RExC_recurse); } -reStudy: + reStudy: r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0; + DEBUG_r( + RExC_study_chunk_recursed_count= 0; + ); Zero(r->substrs, 1, struct reg_substr_data); - if (RExC_study_chunk_recursed) + if (RExC_study_chunk_recursed) { Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes * RExC_npar, U8); + } + #ifdef TRIE_STUDY_OPT if (!restudied) { @@ -6667,7 +7278,7 @@ reStudy: copyRExC_state = RExC_state; } else { U32 seen=RExC_seen; - DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); + DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n")); RExC_state = copyRExC_state; if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) @@ -6687,7 +7298,7 @@ reStudy: if (UTF) SvUTF8_on(rx); /* Unicode in it? */ ri->regstclass = NULL; - if (RExC_naughty >= 10) /* Probably an expensive pattern. */ + if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */ r->intflags |= PREGf_NAUGHTY; scan = ri->program + 1; /* First BRANCH. */ @@ -6746,7 +7357,7 @@ reStudy: DEBUG_PEEP("first:",first,0); /* Ignore EXACT as we deal with it later. */ if (PL_regkind[OP(first)] == EXACT) { - if (OP(first) == EXACT) + if (OP(first) == EXACT || OP(first) == EXACTL) NOOP; /* Empty, get anchored substr later. */ else ri->regstclass = first; @@ -6755,22 +7366,8 @@ reStudy: else if (PL_regkind[OP(first)] == TRIE && ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) { - regnode *trie_op; - /* this can happen only on restudy */ - if ( OP(first) == TRIE ) { - struct regnode_1 *trieop = (struct regnode_1 *) - PerlMemShared_calloc(1, sizeof(struct regnode_1)); - StructCopy(first,trieop,struct regnode_1); - trie_op=(regnode *)trieop; - } else { - struct regnode_charclass *trieop = (struct regnode_charclass *) - PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); - StructCopy(first,trieop,struct regnode_charclass); - trie_op=(regnode *)trieop; - } - OP(trie_op)+=2; - make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); - ri->regstclass = trie_op; + /* this can happen only on restudy */ + ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0); } #endif else if (REGNODE_SIMPLE(OP(first))) @@ -6781,9 +7378,7 @@ reStudy: else if (PL_regkind[OP(first)] == BOL) { r->intflags |= (OP(first) == MBOL ? PREGf_ANCH_MBOL - : (OP(first) == SBOL - ? PREGf_ANCH_SBOL - : PREGf_ANCH_BOL)); + : PREGf_ANCH_SBOL); first = NEXTOPER(first); goto again; } @@ -6793,6 +7388,7 @@ reStudy: goto again; } else if ((!sawopen || !RExC_sawback) && + !sawlookahead && (OP(first) == STAR && PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks) @@ -6816,12 +7412,12 @@ reStudy: #ifdef TRIE_STUDY_OPT DEBUG_PARSE_r( if (!restudied) - PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", + Perl_re_printf( aTHX_ "first at %"IVdf"\n", (IV)(first - scan + 1)) ); #else DEBUG_PARSE_r( - PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", + Perl_re_printf( aTHX_ "first at %"IVdf"\n", (IV)(first - scan + 1)) ); #endif @@ -6935,8 +7531,8 @@ reStudy: if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) && stclass_flag - && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) - && !ssc_is_anything(data.start_class)) + && ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) + && is_ssc_worth_it(pRExC_state, data.start_class)) { const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); @@ -6949,8 +7545,8 @@ reStudy: ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class, NULL); - PerlIO_printf(Perl_debug_log, + regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state); + Perl_re_printf( aTHX_ "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); data.start_class = NULL; @@ -6995,7 +7591,7 @@ reStudy: regnode_ssc ch_class; SSize_t last_close = 0; - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); + DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n")); scan = ri->program + 1; ssc_init(pRExC_state, &ch_class); @@ -7015,8 +7611,8 @@ reStudy: r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; - if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING) - && ! ssc_is_anything(data.start_class)) + if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING) + && is_ssc_worth_it(pRExC_state, data.start_class)) { const U32 n = add_data(pRExC_state, STR_WITH_LEN("f")); @@ -7029,8 +7625,8 @@ reStudy: ri->regstclass = (regnode*)RExC_rxi->data->data[n]; r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); - regprop(r, sv, (regnode*)data.start_class, NULL); - PerlIO_printf(Perl_debug_log, + regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state); + Perl_re_printf( aTHX_ "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); data.start_class = NULL; @@ -7048,13 +7644,17 @@ reStudy: /* Guard against an embedded (?=) or (?<=) with a longer minlen than the "real" pattern. */ DEBUG_OPTIMISE_r({ - PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%ld\n", - (IV)minlen, (IV)r->minlen, RExC_maxlen); + Perl_re_printf( aTHX_ "minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n", + (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen); }); r->minlenret = minlen; if (r->minlen < minlen) r->minlen = minlen; + if (RExC_seen & REG_RECURSE_SEEN ) { + r->intflags |= PREGf_RECURSE_SEEN; + Newxz(r->recurse_locinput, r->nparens + 1, char *); + } if (RExC_seen & REG_GPOS_SEEN) r->intflags |= PREGf_GPOS_SEEN; if (RExC_seen & REG_LOOKBEHIND_SEEN) @@ -7062,8 +7662,6 @@ reStudy: lookbehind */ if (pRExC_state->num_code_blocks) r->extflags |= RXf_EVAL_SEEN; - if (RExC_seen & REG_CANY_SEEN) - r->intflags |= PREGf_CANY_SEEN; if (RExC_seen & REG_VERBARG_SEEN) { r->intflags |= PREGf_VERBARG_SEEN; @@ -7094,22 +7692,27 @@ reStudy: * flags appropriately - Yves */ regnode *first = ri->program + 1; U8 fop = OP(first); - regnode *next = NEXTOPER(first); + regnode *next = regnext(first); U8 nop = OP(next); if (PL_regkind[fop] == NOTHING && nop == END) r->extflags |= RXf_NULL; - else if (PL_regkind[fop] == BOL && nop == END) + else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END) + /* when fop is SBOL first->flags will be true only when it was + * produced by parsing /\A/, and not when parsing /^/. This is + * very important for the split code as there we want to + * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m. + * See rt #122761 for more details. -- Yves */ r->extflags |= RXf_START_ONLY; else if (fop == PLUS && PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE - && OP(regnext(first)) == END) + && nop == END) r->extflags |= RXf_WHITE; else if ( r->extflags & RXf_SPLIT - && fop == EXACT + && (fop == EXACT || fop == EXACTL) && STR_LEN(first) == 1 && *(STRING(first)) == ' ' - && OP(regnext(first)) == END ) + && nop == END ) r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); } @@ -7125,20 +7728,22 @@ reStudy: = (void*)SvREFCNT_inc(RExC_paren_name_list); } else #endif - ri->name_list_idx = 0; + ri->name_list_idx = 0; - if (RExC_recurse_count) { - for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { - const regnode *scan = RExC_recurse[RExC_recurse_count-1]; - ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); - } + while ( RExC_recurse_count > 0 ) { + const regnode *scan = RExC_recurse[ --RExC_recurse_count ]; + ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - scan ); } + Newxz(r->offs, RExC_npar, regexp_paren_pair); /* assume we don't need to swap parens around before we match */ - + DEBUG_TEST_r({ + Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n", + (unsigned long)RExC_study_chunk_recursed_count); + }); DEBUG_DUMP_r({ DEBUG_RExC_seen(); - PerlIO_printf(Perl_debug_log,"Final program:\n"); + Perl_re_printf( aTHX_ "Final program:\n"); regdump(r); }); #ifdef RE_TRACK_PATTERN_OFFSETS @@ -7146,14 +7751,14 @@ reStudy: const STRLEN len = ri->u.offsets[0]; STRLEN i; GET_RE_DEBUG_FLAGS_DECL; - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); for (i = 1; i <= len; i++) { if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) - PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", + Perl_re_printf( aTHX_ "%"UVuf":%"UVuf"[%"UVuf"] ", (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); } - PerlIO_printf(Perl_debug_log, "\n"); + Perl_re_printf( aTHX_ "\n"); }); #endif @@ -7467,13 +8072,8 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, sv_setpvn(sv, s, i); TAINT_set(oldtainted); #endif - if ( (rx->intflags & PREGf_CANY_SEEN) - ? (RXp_MATCH_UTF8(rx) - && (!i || is_utf8_string((U8*)s, i))) - : (RXp_MATCH_UTF8(rx)) ) - { + if (RXp_MATCH_UTF8(rx)) SvUTF8_on(sv); - } else SvUTF8_off(sv); if (TAINTING_get) { @@ -7630,7 +8230,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) assert (RExC_parse <= RExC_end); if (RExC_parse == RExC_end) NOOP; else if (isIDFIRST_lazy_if(RExC_parse, UTF)) { - /* skip IDFIRST by using do...while */ + /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by + * using do...while */ if (UTF) do { RExC_parse += UTF8SKIP(RExC_parse); @@ -7667,39 +8268,37 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", (unsigned long) flags); } - assert(0); /* NOT REACHED */ + NOT_REACHED; /* NOTREACHED */ } return NULL; } #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ - int rem=(int)(RExC_end - RExC_parse); \ - int cut; \ int num; \ - int iscut=0; \ - if (rem>10) { \ - rem=10; \ - iscut=1; \ - } \ - cut=10-rem; \ - if (RExC_lastparse!=RExC_parse) \ - PerlIO_printf(Perl_debug_log," >%.*s%-*s", \ - rem, RExC_parse, \ - cut + 4, \ - iscut ? "..." : "<" \ + if (RExC_lastparse!=RExC_parse) { \ + Perl_re_printf( aTHX_ "%s", \ + Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \ + RExC_end - RExC_parse, 16, \ + "", "", \ + PERL_PV_ESCAPE_UNI_DETECT | \ + PERL_PV_PRETTY_ELLIPSES | \ + PERL_PV_PRETTY_LTGT | \ + PERL_PV_ESCAPE_RE | \ + PERL_PV_PRETTY_EXACTSIZE \ + ) \ ); \ - else \ - PerlIO_printf(Perl_debug_log,"%16s",""); \ + } else \ + Perl_re_printf( aTHX_ "%16s",""); \ \ if (SIZE_ONLY) \ num = RExC_size + 1; \ else \ num=REG_NODE_NUM(RExC_emit); \ if (RExC_lastnum!=num) \ - PerlIO_printf(Perl_debug_log,"|%4d",num); \ + Perl_re_printf( aTHX_ "|%4d",num); \ else \ - PerlIO_printf(Perl_debug_log,"|%4s",""); \ - PerlIO_printf(Perl_debug_log,"|%*s%-4s", \ + Perl_re_printf( aTHX_ "|%4s",""); \ + Perl_re_printf( aTHX_ "|%*s%-4s", \ (int)((depth*2)), "", \ (funcname) \ ); \ @@ -7711,11 +8310,11 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ DEBUG_PARSE_MSG((funcname)); \ - PerlIO_printf(Perl_debug_log,"%4s","\n"); \ + Perl_re_printf( aTHX_ "%4s","\n"); \ }) -#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \ +#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\ DEBUG_PARSE_MSG((funcname)); \ - PerlIO_printf(Perl_debug_log,fmt "\n",args); \ + Perl_re_printf( aTHX_ fmt "\n",args); \ }) /* This section of code defines the inversion list object and its methods. The @@ -7760,10 +8359,10 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * Some of the methods should always be private to the implementation, and some * should eventually be made public */ -/* The header definitions are in F<inline_invlist.c> */ +/* The header definitions are in F<invlist_inline.h> */ PERL_STATIC_INLINE UV* -S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0) +S__invlist_array_init(SV* const invlist, const bool will_have_0) { /* Returns a pointer to the first element in the inversion list's array. * This is called upon initialization of an inversion list. Where the @@ -7788,33 +8387,12 @@ S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0) return zero_addr + *offset; } -PERL_STATIC_INLINE UV* -S_invlist_array(pTHX_ SV* const invlist) -{ - /* Returns the pointer to the inversion list's array. Every time the - * length changes, this needs to be called in case malloc or realloc moved - * it */ - - PERL_ARGS_ASSERT_INVLIST_ARRAY; - - /* Must not be empty. If these fail, you probably didn't check for <len> - * being non-zero before trying to get the array */ - assert(_invlist_len(invlist)); - - /* The very first element always contains zero, The array begins either - * there, or if the inversion list is offset, at the element after it. - * The offset header field determines which; it contains 0 or 1 to indicate - * how much additionally to add */ - assert(0 == *(SvPVX(invlist))); - return ((UV *) SvPVX(invlist) + *get_invlist_offset_addr(invlist)); -} - PERL_STATIC_INLINE void S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) { /* Sets the current number of elements stored in the inversion list. * Updates SvCUR correspondingly */ - + PERL_UNUSED_CONTEXT; PERL_ARGS_ASSERT_INVLIST_SET_LEN; assert(SvTYPE(invlist) == SVt_INVLIST); @@ -7826,12 +8404,59 @@ S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist)); } +#ifndef PERL_IN_XSUB_RE + +STATIC void +S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src) +{ + /* Replaces the inversion list in 'src' with the one in 'dest'. It steals + * the list from 'src', so 'src' is made to have a NULL list. This is + * similar to what SvSetMagicSV() would do, if it were implemented on + * inversion lists, though this routine avoids a copy */ + + const UV src_len = _invlist_len(src); + const bool src_offset = *get_invlist_offset_addr(src); + const STRLEN src_byte_len = SvLEN(src); + char * array = SvPVX(src); + + const int oldtainted = TAINT_get; + + PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC; + + assert(SvTYPE(src) == SVt_INVLIST); + assert(SvTYPE(dest) == SVt_INVLIST); + assert(! invlist_is_iterating(src)); + assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src)); + + /* Make sure it ends in the right place with a NUL, as our inversion list + * manipulations aren't careful to keep this true, but sv_usepvn_flags() + * asserts it */ + array[src_byte_len - 1] = '\0'; + + TAINT_NOT; /* Otherwise it breaks */ + sv_usepvn_flags(dest, + (char *) array, + src_byte_len - 1, + + /* This flag is documented to cause a copy to be avoided */ + SV_HAS_TRAILING_NUL); + TAINT_set(oldtainted); + SvPV_set(src, 0); + SvLEN_set(src, 0); + SvCUR_set(src, 0); + + /* Finish up copying over the other fields in an inversion list */ + *get_invlist_offset_addr(dest) = src_offset; + invlist_set_len(dest, src_len, src_offset); + *get_invlist_previous_index_addr(dest) = 0; + invlist_iterfinish(dest); +} + PERL_STATIC_INLINE IV* -S_get_invlist_previous_index_addr(pTHX_ SV* invlist) +S_get_invlist_previous_index_addr(SV* invlist) { /* Return the address of the IV that is reserved to hold the cached index * */ - PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; assert(SvTYPE(invlist) == SVt_INVLIST); @@ -7840,7 +8465,7 @@ S_get_invlist_previous_index_addr(pTHX_ SV* invlist) } PERL_STATIC_INLINE IV -S_invlist_previous_index(pTHX_ SV* const invlist) +S_invlist_previous_index(SV* const invlist) { /* Returns cached index of previous search */ @@ -7850,7 +8475,7 @@ S_invlist_previous_index(pTHX_ SV* const invlist) } PERL_STATIC_INLINE void -S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index) +S_invlist_set_previous_index(SV* const invlist, const IV index) { /* Caches <index> for later retrieval */ @@ -7861,8 +8486,45 @@ S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index) *get_invlist_previous_index_addr(invlist) = index; } +PERL_STATIC_INLINE void +S_invlist_trim(SV* invlist) +{ + /* Free the not currently-being-used space in an inversion list */ + + /* But don't free up the space needed for the 0 UV that is always at the + * beginning of the list, nor the trailing NUL */ + const UV min_size = TO_INTERNAL_SIZE(1) + 1; + + PERL_ARGS_ASSERT_INVLIST_TRIM; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1)); +} + +PERL_STATIC_INLINE void +S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */ +{ + PERL_ARGS_ASSERT_INVLIST_CLEAR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + invlist_set_len(invlist, 0, 0); + invlist_trim(invlist); +} + +#endif /* ifndef PERL_IN_XSUB_RE */ + +PERL_STATIC_INLINE bool +S_invlist_is_iterating(SV* const invlist) +{ + PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; + + return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; +} + PERL_STATIC_INLINE UV -S_invlist_max(pTHX_ SV* const invlist) +S_invlist_max(SV* const invlist) { /* Returns the maximum number of elements storable in the inversion list's * array, without having to realloc() */ @@ -7980,18 +8642,6 @@ S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1)); } -PERL_STATIC_INLINE void -S_invlist_trim(pTHX_ SV* const invlist) -{ - PERL_ARGS_ASSERT_INVLIST_TRIM; - - assert(SvTYPE(invlist) == SVt_INVLIST); - - /* Change the length of the inversion list to how many entries it currently - * has */ - SvPV_shrink_to_cur((SV *) invlist); -} - STATIC void S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) @@ -8080,12 +8730,14 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, #ifndef PERL_IN_XSUB_RE IV -Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) +Perl__invlist_search(SV* const invlist, const UV cp) { /* Searches the inversion list for the entry that contains the input code * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the * return value is the index into the list's array of the range that - * contains <cp> */ + * contains <cp>, that is, 'i' such that + * array[i] <= cp < array[i+1] + */ IV low = 0; IV mid; @@ -8104,7 +8756,10 @@ Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) array = invlist_array(invlist); mid = invlist_previous_index(invlist); - assert(mid >=0 && mid <= highest_element); + assert(mid >=0); + if (mid > highest_element) { + mid = highest_element; + } /* <mid> contains the cache of the result of the previous call to this * function (0 the first time). See if this call is for the same result, @@ -8168,8 +8823,8 @@ Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) } void -Perl__invlist_populate_swatch(pTHX_ SV* const invlist, - const UV start, const UV end, U8* swatch) +Perl__invlist_populate_swatch(SV* const invlist, + const UV start, const UV end, U8* swatch) { /* populates a swatch of a swash the same way swatch_get() does in utf8.c, * but is used when the swash has an inversion list. This makes this much @@ -8236,7 +8891,7 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, swatch[offset >> 3] |= 1 << (offset & 7); } - join_end_of_list: + join_end_of_list: /* Quit if at the end of the list */ if (i >= len) { @@ -8268,10 +8923,10 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Take the union of two inversion lists and point <output> to it. *output * SHOULD BE DEFINED upon input, and if it points to one of the two lists, * the reference count to that list will be decremented if not already a - * temporary (mortal); otherwise *output will be made correspondingly - * mortal. The first list, <a>, may be NULL, in which case a copy of the - * second list is returned. If <complement_b> is TRUE, the union is taken - * of the complement (inversion) of <b> instead of b itself. + * temporary (mortal); otherwise just its contents will be modified to be + * the union. The first list, <a>, may be NULL, in which case a copy of + * the second list is returned. If <complement_b> is TRUE, the union is + * taken of the complement (inversion) of <b> instead of b itself. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -8293,7 +8948,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, SV* u; /* the resulting union */ UV* array_u; - UV len_u; + UV len_u = 0; UV i_a = 0; /* current index into a's array */ UV i_b = 0; @@ -8310,56 +8965,103 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; assert(a != b); - /* If either one is empty, the union is the other one */ - if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { - bool make_temp = FALSE; /* Should we mortalize the result? */ + len_b = _invlist_len(b); + if (len_b == 0) { - if (*output == a) { - if (a != NULL) { - if (! (make_temp = cBOOL(SvTEMP(a)))) { - SvREFCNT_dec_NN(a); - } + /* Here, 'b' is empty. If the output is the complement of 'b', the + * union is all possible code points, and we need not even look at 'a'. + * It's easiest to create a new inversion list that matches everything. + * */ + if (complement_b) { + SV* everything = _new_invlist(1); + _append_range_to_invlist(everything, 0, UV_MAX); + + /* If the output didn't exist, just point it at the new list */ + if (*output == NULL) { + *output = everything; + return; } - } - if (*output != b) { - *output = invlist_clone(b); - if (complement_b) { - _invlist_invert(*output); + + /* Otherwise, replace its contents with the new list */ + invlist_replace_list_destroys_src(*output, everything); + SvREFCNT_dec_NN(everything); + return; + } + + /* Here, we don't want the complement of 'b', and since it is empty, + * the union will come entirely from 'a'. If 'a' is NULL or empty, the + * output will be empty */ + + if (a == NULL) { + *output = _new_invlist(0); + return; + } + + if (_invlist_len(a) == 0) { + invlist_clear(*output); + return; + } + + /* Here, 'a' is not empty, and entirely determines the union. If the + * output is not to overwrite 'b', we can just return 'a'. */ + if (*output != b) { + + /* If the output is to overwrite 'a', we have a no-op, as it's + * already in 'a' */ + if (*output == a) { + return; } - } /* else *output already = b; */ - if (make_temp) { - sv_2mortal(*output); + /* But otherwise we have to copy 'a' to the output */ + *output = invlist_clone(a); + return; } + + /* Here, 'b' is to be overwritten by the output, which will be 'a' */ + u = invlist_clone(a); + invlist_replace_list_destroys_src(*output, u); + SvREFCNT_dec_NN(u); + return; } - else if ((len_b = _invlist_len(b)) == 0) { - bool make_temp = FALSE; - if (*output == b) { - if (! (make_temp = cBOOL(SvTEMP(b)))) { - SvREFCNT_dec_NN(b); + + if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { + + /* Here, 'a' is empty (and b is not). That means the union will come + * entirely from 'b'. If the output is not to overwrite 'a', we can + * just return what's in 'b'. */ + if (*output != a) { + + /* If the output is to overwrite 'b', it's already in 'b', but + * otherwise we have to copy 'b' to the output */ + if (*output != b) { + *output = invlist_clone(b); } - } - /* The complement of an empty list is a list that has everything in it, - * so the union with <a> includes everything too */ - if (complement_b) { - if (a == *output) { - if (! (make_temp = cBOOL(SvTEMP(a)))) { - SvREFCNT_dec_NN(a); - } + /* And if the output is to be the inversion of 'b', do that */ + if (complement_b) { + _invlist_invert(*output); } - *output = _new_invlist(1); - _append_range_to_invlist(*output, 0, UV_MAX); + + return; } - else if (*output != a) { - *output = invlist_clone(a); + + /* Here, 'a', which is empty or even NULL, is to be overwritten by the + * output, which will either be 'b' or the complement of 'b' */ + + if (a == NULL) { + *output = invlist_clone(b); } - /* else *output already = a; */ + else { + u = invlist_clone(b); + invlist_replace_list_destroys_src(*output, u); + SvREFCNT_dec_NN(u); + } - if (make_temp) { - sv_2mortal(*output); + if (complement_b) { + _invlist_invert(*output); } + return; } @@ -8425,7 +9127,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Here, have chosen which of the two inputs to look at. Only output * if the running count changes to/from 0, which marks the - * beginning/end of a range in that's in the set */ + * beginning/end of a range that's in the set */ if (cp_in_set) { if (count == 0) { array_u[i_u++] = cp; @@ -8459,7 +9161,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * 4) the exhausted wasn't in its set, non-exhausted is, count is 1; * decrementing to 0 insures that we look at the remainder of the * non-exhausted set */ - if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) { count--; @@ -8475,8 +9177,9 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, len_u += (len_a - i_a) + (len_b - i_b); } - /* Set result to final length, which can change the pointer to array_u, so - * re-find it */ + /* Set the result to the final length, which can change the pointer to + * array_u, so re-find it. (Note that it is unlikely that this will + * change, as we are shrinking the space, not enlarging it) */ if (len_u != _invlist_len(u)) { invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); invlist_trim(u); @@ -8486,7 +9189,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* When 'count' is 0, the list that was exhausted (if one was shorter than * the other) ended with everything above it not in its set. That means * that the remaining part of the union is precisely the same as the - * non-exhausted list, so can just copy it unchanged. (If both list were + * non-exhausted list, so can just copy it unchanged. (If both lists were * exhausted at the same time, then the operations below will be both 0.) */ if (count == 0) { @@ -8499,21 +9202,32 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } } - /* We may be removing a reference to one of the inputs. If so, the output - * is made mortal if the input was. (Mortal SVs shouldn't have their ref - * count decremented) */ - if (a == *output || b == *output) { + /* If the output is not to overwrite either of the inputs, just return the + * calculated union */ + if (a != *output && b != *output) { + *output = u; + } + else { + /* Here, the output is to be the same as one of the input scalars, + * hence replacing it. The simple thing to do is to free the input + * scalar, making it instead be the output one. But experience has + * shown [perl #127392] that if the input is a mortal, we can get a + * huge build-up of these during regex compilation before they get + * freed. So for that case, replace just the input's interior with + * the output's, and then free the output */ + assert(! invlist_is_iterating(*output)); - if ((SvTEMP(*output))) { - sv_2mortal(u); + + if (! SvTEMP(*output)) { + SvREFCNT_dec_NN(*output); + *output = u; } else { - SvREFCNT_dec_NN(*output); + invlist_replace_list_destroys_src(*output, u); + SvREFCNT_dec_NN(u); } } - *output = u; - return; } @@ -8524,11 +9238,11 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Take the intersection of two inversion lists and point <i> to it. *i * SHOULD BE DEFINED upon input, and if it points to one of the two lists, * the reference count to that list will be decremented if not already a - * temporary (mortal); otherwise *i will be made correspondingly mortal. - * The first list, <a>, may be NULL, in which case an empty list is - * returned. If <complement_b> is TRUE, the result will be the - * intersection of <a> and the complement (or inversion) of <b> instead of - * <b> directly. + * temporary (mortal); otherwise just its contents will be modified to be + * the intersection. The first list, <a>, may be NULL, in which case an + * empty list is returned. If <complement_b> is TRUE, the result will be + * the intersection of <a> and the complement (or inversion) of <b> instead + * of <b> directly. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -8546,7 +9260,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, SV* r; /* the resulting intersection */ UV* array_r; - UV len_r; + UV len_r = 0; UV i_a = 0; /* current index into a's array */ UV i_b = 0; @@ -8566,48 +9280,38 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Special case if either one is empty */ len_a = (a == NULL) ? 0 : _invlist_len(a); if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { - bool make_temp = FALSE; - if (len_a != 0 && complement_b) { - /* Here, 'a' is not empty, therefore from the above 'if', 'b' must - * be empty. Here, also we are using 'b's complement, which hence - * must be every possible code point. Thus the intersection is - * simply 'a'. */ - if (*i != a) { - if (*i == b) { - if (! (make_temp = cBOOL(SvTEMP(b)))) { - SvREFCNT_dec_NN(b); - } - } + /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b' + * must be empty. Here, also we are using 'b's complement, which + * hence must be every possible code point. Thus the intersection + * is simply 'a'. */ - *i = invlist_clone(a); + if (*i == a) { /* No-op */ + return; } - /* else *i is already 'a' */ - if (make_temp) { - sv_2mortal(*i); + /* If not overwriting either input, just make a copy of 'a' */ + if (*i != b) { + *i = invlist_clone(a); + return; } + + /* Here we are overwriting 'b' with 'a's contents */ + r = invlist_clone(a); + invlist_replace_list_destroys_src(*i, r); + SvREFCNT_dec_NN(r); return; } /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The * intersection must be empty */ - if (*i == a) { - if (! (make_temp = cBOOL(SvTEMP(a)))) { - SvREFCNT_dec_NN(a); - } - } - else if (*i == b) { - if (! (make_temp = cBOOL(SvTEMP(b)))) { - SvREFCNT_dec_NN(b); - } - } - *i = _new_invlist(0); - if (make_temp) { - sv_2mortal(*i); + if (*i == NULL) { + *i = _new_invlist(0); + return; } + invlist_clear(*i); return; } @@ -8705,7 +9409,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, * everything that remains in the non-exhausted set. * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and * remains 1. And the intersection has nothing more. */ - if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) { count++; @@ -8719,8 +9423,9 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, len_r += (len_a - i_a) + (len_b - i_b); } - /* Set result to final length, which can change the pointer to array_r, so - * re-find it */ + /* Set the result to the final length, which can change the pointer to + * array_r, so re-find it. (Note that it is unlikely that this will + * change, as we are shrinking the space, not enlarging it) */ if (len_r != _invlist_len(r)) { invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); invlist_trim(r); @@ -8738,21 +9443,38 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } } - /* We may be removing a reference to one of the inputs. If so, the output - * is made mortal if the input was. (Mortal SVs shouldn't have their ref - * count decremented) */ - if (a == *i || b == *i) { + /* If the output is not to overwrite either of the inputs, just return the + * calculated intersection */ + if (a != *i && b != *i) { + *i = r; + } + else { + /* Here, the output is to be the same as one of the input scalars, + * hence replacing it. The simple thing to do is to free the input + * scalar, making it instead be the output one. But experience has + * shown [perl #127392] that if the input is a mortal, we can get a + * huge build-up of these during regex compilation before they get + * freed. So for that case, replace just the input's interior with + * the output's, and then free the output. A short-cut in this case + * is if the output is empty, we can just set the input to be empty */ + assert(! invlist_is_iterating(*i)); - if (SvTEMP(*i)) { - sv_2mortal(r); + + if (! SvTEMP(*i)) { + SvREFCNT_dec_NN(*i); + *i = r; } else { - SvREFCNT_dec_NN(*i); + if (len_r) { + invlist_replace_list_destroys_src(*i, r); + } + else { + invlist_clear(*i); + } + SvREFCNT_dec_NN(r); } } - *i = r; - return; } @@ -8762,7 +9484,7 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) /* Add the range from 'start' to 'end' inclusive to the inversion list's * set. A pointer to the inversion list is returned. This may actually be * a new list, in which case the passed in one has been destroyed. The - * passed in inversion list can be NULL, in which case a new one is created + * passed-in inversion list can be NULL, in which case a new one is created * with just the one range in it */ SV* range_invlist; @@ -8787,7 +9509,13 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) } /* Here, can't just append things, create and return a new inversion list - * which is the union of this range and the existing inversion list */ + * which is the union of this range and the existing inversion list. (If + * the new range is well-behaved wrt to the old one, we could just insert + * it, doing a Move() down on the tail of the old one (potentially growing + * it first). But to determine that means we would have the extra + * (possibly throw-away) work of first finding where the new one goes and + * whether it disrupts (splits) an existing range, so it doesn't appear to + * me (khw) that it's worth it) */ range_invlist = _new_invlist(2); _append_range_to_invlist(range_invlist, start, end); @@ -8881,7 +9609,7 @@ S_invlist_clone(pTHX_ SV* const invlist) } PERL_STATIC_INLINE STRLEN* -S_get_invlist_iter_addr(pTHX_ SV* invlist) +S_get_invlist_iter_addr(SV* invlist) { /* Return the address of the UV that contains the current iteration * position */ @@ -8894,7 +9622,7 @@ S_get_invlist_iter_addr(pTHX_ SV* invlist) } PERL_STATIC_INLINE void -S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */ +S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */ { PERL_ARGS_ASSERT_INVLIST_ITERINIT; @@ -8902,7 +9630,7 @@ S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */ } PERL_STATIC_INLINE void -S_invlist_iterfinish(pTHX_ SV* invlist) +S_invlist_iterfinish(SV* invlist) { /* Terminate iterator for invlist. This is to catch development errors. * Any iteration that is interrupted before completed should call this @@ -8918,7 +9646,7 @@ S_invlist_iterfinish(pTHX_ SV* invlist) } STATIC bool -S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) +S_invlist_iternext(SV* invlist, UV* start, UV* end) { /* An C<invlist_iterinit> call on <invlist> must be used to set this up. * This call sets in <*start> and <*end>, the next range in <invlist>. @@ -8952,16 +9680,8 @@ S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) return TRUE; } -PERL_STATIC_INLINE bool -S_invlist_is_iterating(pTHX_ SV* const invlist) -{ - PERL_ARGS_ASSERT_INVLIST_IS_ITERATING; - - return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; -} - PERL_STATIC_INLINE UV -S_invlist_highest(pTHX_ SV* const invlist) +S_invlist_highest(SV* const invlist) { /* Returns the highest code point that matches an inversion list. This API * has an ambiguity, as it returns 0 under either the highest is actually @@ -8990,38 +9710,56 @@ S_invlist_highest(pTHX_ SV* const invlist) : array[len - 1] - 1; } -#ifndef PERL_IN_XSUB_RE -SV * -Perl__invlist_contents(pTHX_ SV* const invlist) +STATIC SV * +S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style) { /* Get the contents of an inversion list into a string SV so that they can - * be printed out. It uses the format traditionally done for debug tracing - */ + * be printed out. If 'traditional_style' is TRUE, it uses the format + * traditionally done for debug tracing; otherwise it uses a format + * suitable for just copying to the output, with blanks between ranges and + * a dash between range components */ UV start, end; - SV* output = newSVpvs("\n"); + SV* output; + const char intra_range_delimiter = (traditional_style ? '\t' : '-'); + const char inter_range_delimiter = (traditional_style ? '\n' : ' '); + + if (traditional_style) { + output = newSVpvs("\n"); + } + else { + output = newSVpvs(""); + } - PERL_ARGS_ASSERT__INVLIST_CONTENTS; + PERL_ARGS_ASSERT_INVLIST_CONTENTS; assert(! invlist_is_iterating(invlist)); invlist_iterinit(invlist); while (invlist_iternext(invlist, &start, &end)) { if (end == UV_MAX) { - Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start); + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%cINFINITY%c", + start, intra_range_delimiter, + inter_range_delimiter); } else if (end != start) { - Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n", - start, end); + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c%04"UVXf"%c", + start, + intra_range_delimiter, + end, inter_range_delimiter); } else { - Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start); + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c", + start, inter_range_delimiter); } } + if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */ + SvCUR_set(output, SvCUR(output) - 1); + } + return output; } -#endif #ifndef PERL_IN_XSUB_RE void @@ -9073,11 +9811,28 @@ Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, count += 2; } } + +void +Perl__load_PL_utf8_foldclosures (pTHX) +{ + assert(! PL_utf8_foldclosures); + + /* If the folds haven't been read in, call a fold function + * to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES_CASE+1]; + + /* This string is just a short named one above \xff */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + assert(PL_utf8_tofold); /* Verify that worked */ + } + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); +} #endif -#ifdef PERL_ARGS_ASSERT__INVLISTEQ +#if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE) bool -S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) +Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) { /* Return a boolean as to if the two passed in inversion lists are * identical. The final argument, if TRUE, says to take the complement of @@ -9137,6 +9892,152 @@ S__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) } #endif +/* + * As best we can, determine the characters that can match the start of + * the given EXACTF-ish node. + * + * Returns the invlist as a new SV*; it is the caller's responsibility to + * call SvREFCNT_dec() when done with it. + */ +STATIC SV* +S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) +{ + const U8 * s = (U8*)STRING(node); + SSize_t bytelen = STR_LEN(node); + UV uc; + /* Start out big enough for 2 separate code points */ + SV* invlist = _new_invlist(4); + + PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST; + + if (! UTF) { + uc = *s; + + /* We punt and assume can match anything if the node begins + * with a multi-character fold. Things are complicated. For + * example, /ffi/i could match any of: + * "\N{LATIN SMALL LIGATURE FFI}" + * "\N{LATIN SMALL LIGATURE FF}I" + * "F\N{LATIN SMALL LIGATURE FI}" + * plus several other things; and making sure we have all the + * possibilities is hard. */ + if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) { + invlist = _add_range_to_invlist(invlist, 0, UV_MAX); + } + else { + /* Any Latin1 range character can potentially match any + * other depending on the locale */ + if (OP(node) == EXACTFL) { + _invlist_union(invlist, PL_Latin1, &invlist); + } + else { + /* But otherwise, it matches at least itself. We can + * quickly tell if it has a distinct fold, and if so, + * it matches that as well */ + invlist = add_cp_to_invlist(invlist, uc); + if (IS_IN_SOME_FOLD_L1(uc)) + invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]); + } + + /* Some characters match above-Latin1 ones under /i. This + * is true of EXACTFL ones when the locale is UTF-8 */ + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) + && (! isASCII(uc) || (OP(node) != EXACTFA + && OP(node) != EXACTFA_NO_TRIE))) + { + add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist); + } + } + } + else { /* Pattern is UTF-8 */ + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + STRLEN foldlen = UTF8SKIP(s); + const U8* e = s + bytelen; + SV** listp; + + uc = utf8_to_uvchr_buf(s, s + bytelen, NULL); + + /* The only code points that aren't folded in a UTF EXACTFish + * node are are the problematic ones in EXACTFL nodes */ + if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) { + /* We need to check for the possibility that this EXACTFL + * node begins with a multi-char fold. Therefore we fold + * the first few characters of it so that we can make that + * check */ + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD(*s); + s++; + } + else { + STRLEN len; + to_utf8_fold(s, d, &len); + d += len; + s += UTF8SKIP(s); + } + } + + /* And set up so the code below that looks in this folded + * buffer instead of the node's string */ + e = d; + foldlen = UTF8SKIP(folded); + s = folded; + } + + /* When we reach here 's' points to the fold of the first + * character(s) of the node; and 'e' points to far enough along + * the folded string to be just past any possible multi-char + * fold. 'foldlen' is the length in bytes of the first + * character in 's' + * + * Unlike the non-UTF-8 case, the macro for determining if a + * string is a multi-char fold requires all the characters to + * already be folded. This is because of all the complications + * if not. Note that they are folded anyway, except in EXACTFL + * nodes. Like the non-UTF case above, we punt if the node + * begins with a multi-char fold */ + + if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { + invlist = _add_range_to_invlist(invlist, 0, UV_MAX); + } + else { /* Single char fold */ + + /* It matches all the things that fold to it, which are + * found in PL_utf8_foldclosures (including itself) */ + invlist = add_cp_to_invlist(invlist, uc); + if (! PL_utf8_foldclosures) + _load_PL_utf8_foldclosures(); + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) s, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex_nomg(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE) + && isASCII(c) != isASCII(uc)) + { + continue; + } + + invlist = add_cp_to_invlist(invlist, c); + } + } + } + } + + return invlist; +} + #undef HEADER_LENGTH #undef TO_INTERNAL_SIZE #undef FROM_INTERNAL_SIZE @@ -9167,6 +10068,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) regex_charset cs; bool has_use_defaults = FALSE; const char* const seqstart = RExC_parse - 1; /* Point to the '?' */ + int x_mod_count = 0; PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS; @@ -9187,14 +10089,14 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) cs = REGEX_UNICODE_CHARSET; } - while (*RExC_parse) { + while (RExC_parse < RExC_end) { /* && strchr("iogcmsx", *RExC_parse) */ /* (?g), (?gc) and (?o) are useless here and must be globally applied -- japhy */ switch (*RExC_parse) { - /* Code for the imsx flags */ - CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); + /* Code for the imsxn flags */ + CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count); case LOCALE_PAT_MOD: if (has_charset_modifier) { @@ -9252,7 +10154,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) : REGEX_DEPENDS_CHARSET; has_charset_modifier = DEPENDS_PAT_MOD; break; - excess_modifier: + excess_modifier: RExC_parse++; if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) { vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); @@ -9264,15 +10166,15 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) else { vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); } - /*NOTREACHED*/ - neg_modifier: + NOT_REACHED; /*NOTREACHED*/ + neg_modifier: RExC_parse++; vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1)); - /*NOTREACHED*/ + NOT_REACHED; /*NOTREACHED*/ case ONCE_PAT_MOD: /* 'o' */ case GLOBAL_PAT_MOD: /* 'g' */ - if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + if (PASS2 && ckWARN(WARN_REGEXP)) { const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; @@ -9292,7 +10194,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) break; case CONTINUE_PAT_MOD: /* 'c' */ - if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { + if (PASS2 && ckWARN(WARN_REGEXP)) { if (! (wastedflags & WASTED_C) ) { wastedflags |= WASTED_GC; /* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */ @@ -9307,7 +10209,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) break; case KEEPCOPY_PAT_MOD: /* 'p' */ if (flagsp == &negflags) { - if (SIZE_ONLY) + if (PASS2) ckWARNreg(RExC_parse + 1,"Useless use of (?-p)"); } else { *flagsp |= RXf_PMf_KEEPCOPY; @@ -9331,19 +10233,24 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) if (RExC_flags & RXf_PMf_FOLD) { RExC_contains_i = 1; } + if (PASS2) { + STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count); + } return; /*NOTREACHED*/ default: - fail_modifiers: - RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + fail_modifiers: + RExC_parse += SKIP_IF_CHAR(RExC_parse); /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized", UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); - /*NOTREACHED*/ + NOT_REACHED; /*NOTREACHED*/ } - ++RExC_parse; + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; } + + vFAIL("Sequence (?... not terminated"); } /* @@ -9362,11 +10269,59 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) #endif +PERL_STATIC_INLINE regnode * +S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, + I32 *flagp, + char * parse_start, + char ch + ) +{ + regnode *ret; + char* name_start = RExC_parse; + U32 num = 0; + SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY + ? REG_RSN_RETURN_NULL + : REG_RSN_RETURN_DATA); + GET_RE_DEBUG_FLAGS_DECL; + + PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF; + + if (RExC_parse == name_start || *RExC_parse != ch) { + /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ + vFAIL2("Sequence %.3s... not terminated",parse_start); + } + + if (!SIZE_ONLY) { + num = add_data( pRExC_state, STR_WITH_LEN("S")); + RExC_rxi->data->data[num]=(void*)sv_dat; + SvREFCNT_inc_simple_void(sv_dat); + } + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? NREF + : (ASCII_FOLD_RESTRICTED) + ? NREFFA + : (AT_LEAST_UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), + num); + *flagp |= HASWIDTH; + + Set_Node_Offset(ret, parse_start+1); + Set_Node_Cur_Length(ret, parse_start); + + nextchar(pRExC_state); + return ret; +} + /* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets - flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan - needs to be restarted. - Otherwise would only return NULL if regbranch() returns NULL, which - cannot happen. */ + flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan + needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be + upgraded to UTF-8. Otherwise would only return NULL if regbranch() returns + NULL, which cannot happen. */ STATIC regnode * S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter. @@ -9374,7 +10329,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) * RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and * this flag alerts us to the need to check for that */ { - dVAR; regnode *ret; /* Will be the head of the group. */ regnode *br; regnode *lastbr; @@ -9386,6 +10340,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) bool is_open = 0; I32 freeze_paren = 0; I32 after_freeze = 0; + I32 num; /* numeric backreferences */ char * parse_start = RExC_parse; /* MJD */ char * const oregcomp_parse = RExC_parse; @@ -9397,6 +10352,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp = 0; /* Tentatively. */ + /* Having this true makes it feasible to have a lot fewer tests for the + * parse pointer being in scope. For example, we can write + * while(isFOO(*RExC_parse)) RExC_parse++; + * instead of + * while(RExC_parse < RExC_end && isFOO(*RExC_parse)) RExC_parse++; + */ + assert(*RExC_end == '\0'); /* Make an OPEN node, if parenthesized. */ if (paren) { @@ -9407,40 +10369,49 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) * indivisible */ bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '('; + if (RExC_parse >= RExC_end) { + vFAIL("Unmatched ("); + } + if ( *RExC_parse == '*') { /* (*VERB:ARG) */ - char *start_verb = RExC_parse; - STRLEN verb_len = 0; + char *start_verb = RExC_parse + 1; + STRLEN verb_len; char *start_arg = NULL; unsigned char op = 0; - int argok = 1; - int internal_argval = 0; /* internal_argval is only useful if - !argok */ + int arg_required = 0; + int internal_argval = -1; /* if >-1 we are not allowed an argument*/ - if (has_intervening_patws && SIZE_ONLY) { - ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated"); + if (has_intervening_patws) { + RExC_parse++; /* past the '*' */ + vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent"); } - while ( *RExC_parse && *RExC_parse != ')' ) { + while (RExC_parse < RExC_end && *RExC_parse != ')' ) { if ( *RExC_parse == ':' ) { start_arg = RExC_parse + 1; break; } - RExC_parse++; + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; } - ++start_verb; verb_len = RExC_parse - start_verb; if ( start_arg ) { - RExC_parse++; - while ( *RExC_parse && *RExC_parse != ')' ) - RExC_parse++; - if ( *RExC_parse != ')' ) + if (RExC_parse >= RExC_end) { + goto unterminated_verb_pattern; + } + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + while ( RExC_parse < RExC_end && *RExC_parse != ')' ) + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) + unterminated_verb_pattern: vFAIL("Unterminated verb pattern argument"); if ( RExC_parse == start_arg ) start_arg = NULL; } else { - if ( *RExC_parse != ')' ) + if ( RExC_parse >= RExC_end || *RExC_parse != ')' ) vFAIL("Unterminated verb pattern"); } + /* Here, we know that RExC_parse < RExC_end */ + switch ( *start_verb ) { case 'A': /* (*ACCEPT) */ if ( memEQs(start_verb,verb_len,"ACCEPT") ) { @@ -9455,14 +10426,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case 'F': /* (*FAIL) */ if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { op = OPFAIL; - argok = 0; } break; case ':': /* (*:NAME) */ case 'M': /* (*MARK:NAME) */ if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { op = MARKPOINT; - argok = -1; + arg_required = 1; } break; case 'P': /* (*PRUNE) */ @@ -9487,100 +10457,78 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) "Unknown verb pattern '%"UTF8f"'", UTF8fARG(UTF, verb_len, start_verb)); } - if ( argok ) { - if ( start_arg && internal_argval ) { - vFAIL3("Verb pattern '%.*s' may not have an argument", - verb_len, start_verb); - } else if ( argok < 0 && !start_arg ) { - vFAIL3("Verb pattern '%.*s' has a mandatory argument", - verb_len, start_verb); - } else { - ret = reganode(pRExC_state, op, internal_argval); - if ( ! internal_argval && ! SIZE_ONLY ) { - if (start_arg) { - SV *sv = newSVpvn( start_arg, - RExC_parse - start_arg); - ARG(ret) = add_data( pRExC_state, - STR_WITH_LEN("S")); - RExC_rxi->data->data[ARG(ret)]=(void*)sv; - ret->flags = 0; - } else { - ret->flags = 1; - } - } - } - if (!internal_argval) - RExC_seen |= REG_VERBARG_SEEN; - } else if ( start_arg ) { - vFAIL3("Verb pattern '%.*s' may not have an argument", - verb_len, start_verb); - } else { - ret = reg_node(pRExC_state, op); - } + if ( arg_required && !start_arg ) { + vFAIL3("Verb pattern '%.*s' has a mandatory argument", + verb_len, start_verb); + } + if (internal_argval == -1) { + ret = reganode(pRExC_state, op, 0); + } else { + ret = reg2Lanode(pRExC_state, op, 0, internal_argval); + } + RExC_seen |= REG_VERBARG_SEEN; + if ( ! SIZE_ONLY ) { + if (start_arg) { + SV *sv = newSVpvn( start_arg, + RExC_parse - start_arg); + ARG(ret) = add_data( pRExC_state, + STR_WITH_LEN("S")); + RExC_rxi->data->data[ARG(ret)]=(void*)sv; + ret->flags = 1; + } else { + ret->flags = 0; + } + if ( internal_argval != -1 ) + ARG2L_SET(ret, internal_argval); + } nextchar(pRExC_state); return ret; } else if (*RExC_parse == '?') { /* (?...) */ bool is_logical = 0; const char * const seqstart = RExC_parse; - if (has_intervening_patws && SIZE_ONLY) { - ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated"); + const char * endptr; + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(?...)', the '(' and '?' must be adjacent"); } - RExC_parse++; - paren = *RExC_parse++; + RExC_parse++; /* past the '?' */ + paren = *RExC_parse; /* might be a trailing NUL, if not + well-formed */ + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + if (RExC_parse > RExC_end) { + paren = '\0'; + } ret = NULL; /* For look-ahead/behind. */ switch (paren) { case 'P': /* (?P...) variants for those used to PCRE/Python */ - paren = *RExC_parse++; - if ( paren == '<') /* (?P<...>) named capture */ + paren = *RExC_parse; + if ( paren == '<') { /* (?P<...>) named capture */ + RExC_parse++; + if (RExC_parse >= RExC_end) { + vFAIL("Sequence (?P<... not terminated"); + } goto named_capture; + } else if (paren == '>') { /* (?P>name) named recursion */ + RExC_parse++; + if (RExC_parse >= RExC_end) { + vFAIL("Sequence (?P>... not terminated"); + } goto named_recursion; } else if (paren == '=') { /* (?P=...) named backref */ - /* this pretty much dupes the code for \k<NAME> in - * regatom(), if you change this make sure you change that - * */ - char* name_start = RExC_parse; - U32 num = 0; - SV *sv_dat = reg_scan_name(pRExC_state, - SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); - if (RExC_parse == name_start || *RExC_parse != ')') - /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */ - vFAIL2("Sequence %.3s... not terminated",parse_start); - - if (!SIZE_ONLY) { - num = add_data( pRExC_state, STR_WITH_LEN("S")); - RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc_simple_void(sv_dat); - } - RExC_sawback = 1; - ret = reganode(pRExC_state, - ((! FOLD) - ? NREF - : (ASCII_FOLD_RESTRICTED) - ? NREFFA - : (AT_LEAST_UNI_SEMANTICS) - ? NREFFU - : (LOC) - ? NREFFL - : NREFF), - num); - *flagp |= HASWIDTH; - - Set_Node_Offset(ret, parse_start+1); - Set_Node_Cur_Length(ret, parse_start); - - nextchar(pRExC_state); - return ret; + RExC_parse++; + return handle_named_backref(pRExC_state, flagp, + parse_start, ')'); } - RExC_parse++; + RExC_parse += SKIP_IF_CHAR(RExC_parse); /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); - /*NOTREACHED*/ + NOT_REACHED; /*NOTREACHED*/ case '<': /* (?<...) */ if (*RExC_parse == '!') paren = ','; @@ -9590,15 +10538,20 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) char *name_start; SV *svname; paren= '>'; + /* FALLTHROUGH */ case '\'': /* (?'...') */ - name_start= RExC_parse; - svname = reg_scan_name(pRExC_state, + name_start = RExC_parse; + svname = reg_scan_name(pRExC_state, SIZE_ONLY /* reverse test from the others */ ? REG_RSN_RETURN_NAME : REG_RSN_RETURN_NULL); - if (RExC_parse == name_start || *RExC_parse != paren) + if ( RExC_parse == name_start + || RExC_parse >= RExC_end + || *RExC_parse != paren) + { vFAIL2("Sequence (?%c... not terminated", paren=='>' ? '<' : paren); + } if (SIZE_ONLY) { HE *he_str; SV *sv_dat = NULL; @@ -9666,13 +10619,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_seen |= REG_LOOKBEHIND_SEEN; RExC_in_lookbehind++; RExC_parse++; + assert(RExC_parse < RExC_end); + /* FALLTHROUGH */ case '=': /* (?=...) */ RExC_seen_zerolen++; break; case '!': /* (?!...) */ RExC_seen_zerolen++; + /* check if we're really just a "FAIL" assertion */ + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't force to /x */ ); if (*RExC_parse == ')') { - ret=reg_node(pRExC_state, OPFAIL); + ret=reganode(pRExC_state, OPFAIL, 0); nextchar(pRExC_state); return ret; } @@ -9690,30 +10648,16 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '@': /* (?@...) */ vFAIL2("Sequence (?%c...) not implemented", (int)paren); break; - case '#': /* (?#...) */ - /* XXX As soon as we disallow separating the '?' and '*' (by - * spaces or (?#...) comment), it is believed that this case - * will be unreachable and can be removed. See - * [perl #117327] */ - while (*RExC_parse && *RExC_parse != ')') - RExC_parse++; - if (*RExC_parse != ')') - FAIL("Sequence (?#... not terminated"); - nextchar(pRExC_state); - *flagp = TRYAGAIN; - return NULL; case '0' : /* (?0) */ case 'R' : /* (?R) */ - if (*RExC_parse != ')') + if (RExC_parse == RExC_end || *RExC_parse != ')') FAIL("Sequence (?R) not terminated"); - ret = reg_node(pRExC_state, GOSTART); - RExC_seen |= REG_GOSTART_SEEN; + num = 0; + RExC_seen |= REG_RECURSE_SEEN; *flagp |= POSTPONED; - nextchar(pRExC_state); - return ret; + goto gen_recurse_regop; /*notreached*/ - { /* named and numeric backreferences */ - I32 num; + /* named and numeric backreferences */ case '&': /* (?&NAME) */ parse_start = RExC_parse - 1; named_recursion: @@ -9722,33 +10666,47 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } - if (RExC_parse == RExC_end || *RExC_parse != ')') + if (RExC_parse >= RExC_end || *RExC_parse != ')') vFAIL("Sequence (?&... not terminated"); goto gen_recurse_regop; - assert(0); /* NOT REACHED */ + /* NOTREACHED */ case '+': if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { RExC_parse++; vFAIL("Illegal pattern"); } goto parse_recursion; - /* NOT REACHED*/ + /* NOTREACHED*/ case '-': /* (?-1) */ if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { RExC_parse--; /* rewind to let it be handled later */ goto parse_flags; } - /*FALLTHROUGH */ + /* FALLTHROUGH */ case '1': case '2': case '3': case '4': /* (?1) */ case '5': case '6': case '7': case '8': case '9': - RExC_parse--; + RExC_parse = (char *) seqstart + 1; /* Point to the digit */ parse_recursion: - num = atoi(RExC_parse); - parse_start = RExC_parse - 1; /* MJD */ - if (*RExC_parse == '-') - RExC_parse++; - while (isDIGIT(*RExC_parse)) - RExC_parse++; + { + bool is_neg = FALSE; + UV unum; + parse_start = RExC_parse - 1; /* MJD */ + if (*RExC_parse == '-') { + RExC_parse++; + is_neg = TRUE; + } + if (grok_atoUV(RExC_parse, &unum, &endptr) + && unum <= I32_MAX + ) { + num = (I32)unum; + RExC_parse = (char*)endptr; + } else + num = I32_MAX; + if (is_neg) { + /* Some limit for num? */ + num = -num; + } + } if (*RExC_parse!=')') vFAIL("Expecting close bracket"); @@ -9773,44 +10731,53 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else if ( paren == '+' ) { num = RExC_npar + num - 1; } + /* We keep track how many GOSUB items we have produced. + To start off the ARG2L() of the GOSUB holds its "id", + which is used later in conjunction with RExC_recurse + to calculate the offset we need to jump for the GOSUB, + which it will store in the final representation. + We have to defer the actual calculation until much later + as the regop may move. + */ - ret = reganode(pRExC_state, GOSUB, num); + ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count); if (!SIZE_ONLY) { if (num > (I32)RExC_rx->nparens) { RExC_parse++; vFAIL("Reference to nonexistent group"); } - ARG2L_SET( ret, RExC_recurse_count++); - RExC_emit++; - DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Recurse #%"UVuf" to %"IVdf"\n", + RExC_recurse_count++; + DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ + "%*s%*s Recurse #%"UVuf" to %"IVdf"\n", + 22, "| |", (int)(depth * 2 + 1), "", (UV)ARG(ret), (IV)ARG2L(ret))); - } else { - RExC_size++; - } - RExC_seen |= REG_RECURSE_SEEN; + } + RExC_seen |= REG_RECURSE_SEEN; + Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ *flagp |= POSTPONED; + assert(*RExC_parse == ')'); nextchar(pRExC_state); return ret; - } /* named and numeric backreferences */ - assert(0); /* NOT REACHED */ + + /* NOTREACHED */ case '?': /* (??...) */ is_logical = 1; if (*RExC_parse != '{') { - RExC_parse++; + RExC_parse += SKIP_IF_CHAR(RExC_parse); /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ vFAIL2utf8f( "Sequence (%"UTF8f"...) not recognized", UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); - /*NOTREACHED*/ + NOT_REACHED; /*NOTREACHED*/ } *flagp |= POSTPONED; - paren = *RExC_parse++; - /* FALL THROUGH */ + paren = '{'; + RExC_parse++; + /* FALLTHROUGH */ case '{': /* (?{...}) */ { U32 n = 0; @@ -9851,17 +10818,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (is_logical) { regnode *eval; ret = reg_node(pRExC_state, LOGICAL); - eval = reganode(pRExC_state, EVAL, n); + + eval = reg2Lanode(pRExC_state, EVAL, + n, + + /* for later propagation into (??{}) + * return value */ + RExC_flags & RXf_PMf_COMPILETIME + ); if (!SIZE_ONLY) { ret->flags = 2; - /* for later propagation into (??{}) return value */ - eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME); } REGTAIL(pRExC_state, ret, eval); /* deal with the length of this later - MJD */ return ret; } - ret = reganode(pRExC_state, EVAL, n); + ret = reg2Lanode(pRExC_state, EVAL, n, 0); Set_Node_Length(ret, RExC_parse - parse_start + 1); Set_Node_Offset(ret, parse_start); return ret; @@ -9869,10 +10841,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '(': /* (?(?{...})...) and (?(?=...)...) */ { int is_define= 0; + const int DEFINE_len = sizeof("DEFINE") - 1; if (RExC_parse[0] == '?') { /* (?(?...)) */ - if (RExC_parse[1] == '=' || RExC_parse[1] == '!' - || RExC_parse[1] == '<' - || RExC_parse[1] == '{') { /* Lookahead or eval. */ + if ( RExC_parse < RExC_end - 1 + && ( RExC_parse[1] == '=' + || RExC_parse[1] == '!' + || RExC_parse[1] == '<' + || RExC_parse[1] == '{') + ) { /* Lookahead or eval. */ I32 flag; regnode *tail; @@ -9881,13 +10857,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret->flags = 1; tail = reg(pRExC_state, 1, &flag, depth+1); - if (flag & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flag & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flag & (RESTART_PASS1|NEED_UTF8); return NULL; } REGTAIL(pRExC_state, ret, tail); goto insert_if; } + /* Fall through to ‘Unknown switch condition’ at the + end of the if/else chain. */ } else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */ || RExC_parse[0] == '\'' ) /* (?('NAME')...) */ @@ -9897,9 +10875,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) U32 num = 0; SV *sv_dat=reg_scan_name(pRExC_state, SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); - if (RExC_parse == name_start || *RExC_parse != ch) + if ( RExC_parse == name_start + || RExC_parse >= RExC_end + || *RExC_parse != ch) + { vFAIL2("Sequence (?(%c... not terminated", (ch == '>' ? '<' : ch)); + } RExC_parse++; if (!SIZE_ONLY) { num = add_data( pRExC_state, STR_WITH_LEN("S")); @@ -9909,25 +10891,34 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state,NGROUPP,num); goto insert_if_check_paren; } - else if (RExC_parse[0] == 'D' && - RExC_parse[1] == 'E' && - RExC_parse[2] == 'F' && - RExC_parse[3] == 'I' && - RExC_parse[4] == 'N' && - RExC_parse[5] == 'E') - { + else if (RExC_end - RExC_parse >= DEFINE_len + && strnEQ(RExC_parse, "DEFINE", DEFINE_len)) + { ret = reganode(pRExC_state,DEFINEP,0); - RExC_parse +=6 ; + RExC_parse += DEFINE_len; is_define = 1; goto insert_if_check_paren; } else if (RExC_parse[0] == 'R') { RExC_parse++; + /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval" + * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)" + * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)" + */ parno = 0; - if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { - parno = atoi(RExC_parse++); - while (isDIGIT(*RExC_parse)) - RExC_parse++; + if (RExC_parse[0] == '0') { + parno = 1; + RExC_parse++; + } + else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + UV uv; + if (grok_atoUV(RExC_parse, &uv, &endptr) + && uv <= I32_MAX + ) { + parno = (I32)uv + 1; + RExC_parse = (char*)endptr; + } + /* else "Switch condition not recognized" below */ } else if (RExC_parse[0] == '&') { SV *sv_dat; RExC_parse++; @@ -9935,7 +10926,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); - parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; + + /* we should only have a false sv_dat when + * SIZE_ONLY is true, and we always have false + * sv_dat when SIZE_ONLY is true. + * reg_scan_name() will VFAIL() if the name is + * unknown when SIZE_ONLY is false, and otherwise + * will return something, and when SIZE_ONLY is + * true, reg_scan_name() just parses the string, + * and doesnt return anything. (in theory) */ + assert(SIZE_ONLY ? !sv_dat : !!sv_dat); + + if (sv_dat) + parno = 1 + *((I32 *)SvPVX(sv_dat)); } ret = reganode(pRExC_state,INSUBP,parno); goto insert_if_check_paren; @@ -9943,28 +10946,30 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { /* (?(1)...) */ char c; - char *tmp; - parno = atoi(RExC_parse++); - - while (isDIGIT(*RExC_parse)) - RExC_parse++; + UV uv; + if (grok_atoUV(RExC_parse, &uv, &endptr) + && uv <= I32_MAX + ) { + parno = (I32)uv; + RExC_parse = (char*)endptr; + } + else { + vFAIL("panic: grok_atoUV returned FALSE"); + } ret = reganode(pRExC_state, GROUPP, parno); insert_if_check_paren: - if (*(tmp = nextchar(pRExC_state)) != ')') { - /* nextchar also skips comments, so undo its work - * and skip over the the next character. - */ - RExC_parse = tmp; + if (UCHARAT(RExC_parse) != ')') { RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; vFAIL("Switch condition not recognized"); } + nextchar(pRExC_state); insert_if: REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); br = regbranch(pRExC_state, &flags, 1,depth+1); if (br == NULL) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", @@ -9972,7 +10977,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); - c = *nextchar(pRExC_state); + c = UCHARAT(RExC_parse); + nextchar(pRExC_state); if (flags&HASWIDTH) *flagp |= HASWIDTH; if (c == '|') { @@ -9983,8 +10989,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) lastbr = reganode(pRExC_state, IFTHEN, 0); if (!regbranch(pRExC_state, &flags, 1,depth+1)) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", @@ -9993,12 +10999,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) REGTAIL(pRExC_state, ret, lastbr); if (flags&HASWIDTH) *flagp |= HASWIDTH; - c = *nextchar(pRExC_state); + c = UCHARAT(RExC_parse); + nextchar(pRExC_state); } else lastbr = NULL; - if (c != ')') - vFAIL("Switch (?(condition)... contains too many branches"); + if (c != ')') { + if (RExC_parse >= RExC_end) + vFAIL("Switch (?(condition)... not terminated"); + else + vFAIL("Switch (?(condition)... contains too many branches"); + } ender = reg_node(pRExC_state, TAIL); REGTAIL(pRExC_state, br, ender); if (lastbr) { @@ -10012,24 +11023,23 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) but I can't figure out why. -- dmq*/ return ret; } - else { - RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; - vFAIL("Unknown switch condition (?(...))"); - } + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unknown switch condition (?(...))"); } case '[': /* (?[ ... ]) */ return handle_regex_sets(pRExC_state, NULL, flagp, depth, oregcomp_parse); - case 0: + case 0: /* A NUL */ RExC_parse--; /* for vFAIL to print correctly */ vFAIL("Sequence (? incomplete"); break; default: /* e.g., (?i) */ - --RExC_parse; + RExC_parse = (char *) seqstart + 1; parse_flags: parse_lparen_question_flags(pRExC_state); if (UCHARAT(RExC_parse) != ':') { - nextchar(pRExC_state); + if (RExC_parse < RExC_end) + nextchar(pRExC_state); *flagp = TRYAGAIN; return NULL; } @@ -10039,7 +11049,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) goto parse_rest; } /* end switch */ } - else { /* (...) */ + else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */ capturing_parens: parno = RExC_npar; RExC_npar++; @@ -10048,18 +11058,22 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY ){ if (!RExC_nestroot) RExC_nestroot = parno; - if (RExC_seen & REG_RECURSE_SEEN - && !RExC_open_parens[parno-1]) + if (RExC_open_parens && !RExC_open_parens[parno]) { - DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Setting open paren #%"IVdf" to %d\n", + DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ + "%*s%*s Setting open paren #%"IVdf" to %d\n", + 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ret))); - RExC_open_parens[parno-1]= ret; + RExC_open_parens[parno]= ret; } } Set_Node_Length(ret, 1); /* MJD */ Set_Node_Offset(ret, RExC_parse); /* MJD */ is_open = 1; + } else { + /* with RXf_PMf_NOCAPTURE treat (...) as (?:...) */ + paren = ':'; + ret = NULL; } } else /* ! paren */ @@ -10073,8 +11087,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* branch_len = (paren != 0); */ if (br == NULL) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); @@ -10120,8 +11134,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) br = regbranch(pRExC_state, &flags, 0, depth+1); if (br == NULL) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); @@ -10139,11 +11153,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; case 1: case 2: ender = reganode(pRExC_state, CLOSE, parno); - if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) { - DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, - "Setting close paren #%"IVdf" to %d\n", - (IV)parno, REG_NODE_NUM(ender))); - RExC_close_parens[parno-1]= ender; + if ( RExC_close_parens ) { + DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ + "%*s%*s Setting close paren #%"IVdf" to %d\n", + 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender))); + RExC_close_parens[parno]= ender; if (RExC_nestroot == parno) RExC_nestroot = 0; } @@ -10155,28 +11169,33 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '=': case '!': *flagp &= ~HASWIDTH; - /* FALL THROUGH */ + /* FALLTHROUGH */ case '>': ender = reg_node(pRExC_state, SUCCEED); break; case 0: ender = reg_node(pRExC_state, END); if (!SIZE_ONLY) { - assert(!RExC_opend); /* there can only be one! */ - RExC_opend = ender; + assert(!RExC_end_op); /* there can only be one! */ + RExC_end_op = ender; + if (RExC_close_parens) { + DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ + "%*s%*s Setting close paren #0 (END) to %d\n", + 22, "| |", (int)(depth * 2 + 1), "", REG_NODE_NUM(ender))); + + RExC_close_parens[0]= ender; + } } break; } DEBUG_PARSE_r(if (!SIZE_ONLY) { - SV * const mysv_val1=sv_newmortal(); - SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("lsbr"); - regprop(RExC_rx, mysv_val1, lastbr, NULL); - regprop(RExC_rx, mysv_val2, ender, NULL); - PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", - SvPV_nolen_const(mysv_val1), + regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state); + regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state); + Perl_re_printf( aTHX_ "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(RExC_mysv1), (IV)REG_NODE_NUM(lastbr), - SvPV_nolen_const(mysv_val2), + SvPV_nolen_const(RExC_mysv2), (IV)REG_NODE_NUM(ender), (IV)(ender - lastbr) ); @@ -10209,15 +11228,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (is_nothing) { br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret; DEBUG_PARSE_r(if (!SIZE_ONLY) { - SV * const mysv_val1=sv_newmortal(); - SV * const mysv_val2=sv_newmortal(); DEBUG_PARSE_MSG("NADA"); - regprop(RExC_rx, mysv_val1, ret, NULL); - regprop(RExC_rx, mysv_val2, ender, NULL); - PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", - SvPV_nolen_const(mysv_val1), + regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state); + regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state); + Perl_re_printf( aTHX_ "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(RExC_mysv1), (IV)REG_NODE_NUM(ret), - SvPV_nolen_const(mysv_val2), + SvPV_nolen_const(RExC_mysv2), (IV)REG_NODE_NUM(ender), (IV)(ender - ret) ); @@ -10256,12 +11273,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* Check for proper termination. */ if (paren) { - /* restore original flags, but keep (?p) */ + /* restore original flags, but keep (?p) and, if we've changed from /d + * rules to /u, keep the /u */ RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY); - if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { + if (DEPENDS_SEMANTICS && RExC_uni_semantics) { + set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); + } + if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') { RExC_parse = oregcomp_parse; vFAIL("Unmatched ("); } + nextchar(pRExC_state); } else if (!paren && RExC_parse < RExC_end) { if (*RExC_parse == ')') { @@ -10270,7 +11292,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else FAIL("Junk on end of regexp"); /* "Can't happen". */ - assert(0); /* NOTREACHED */ + NOT_REACHED; /* NOTREACHED */ } if (RExC_in_lookbehind) { @@ -10286,13 +11308,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) * * Implements the concatenation operator. * - * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be - * restarted. + * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be + * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8 */ STATIC regnode * S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) { - dVAR; regnode *ret; regnode *chain = NULL; regnode *latest; @@ -10319,16 +11340,16 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) *flagp = WORST; /* Tentatively. */ - RExC_parse--; - nextchar(pRExC_state); + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't force to /x */ ); while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { flags &= ~TRYAGAIN; latest = regpiece(pRExC_state, &flags,depth+1); if (latest == NULL) { if (flags & TRYAGAIN) continue; - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags); @@ -10339,7 +11360,9 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) if (chain == NULL) /* First piece. */ *flagp |= flags&SPSTART; else { - RExC_naughty++; + /* FIXME adding one for every branch after the first is probably + * excessive now we have TRIE support. (hv) */ + MARK_NAUGHTY(1); REGTAIL(pRExC_state, chain, latest); } chain = latest; @@ -10368,13 +11391,12 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) * * Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with * TRYAGAIN. - * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be - * restarted. + * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be + * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8 */ STATIC regnode * S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { - dVAR; regnode *ret; char op; char *next; @@ -10386,6 +11408,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) char *parse_start; #endif const char *maxpos = NULL; + UV uv; /* Save the original in case we change the emitted regop to a FAIL. */ regnode * const orig_emit = RExC_emit; @@ -10398,8 +11421,8 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) ret = regatom(pRExC_state, &flags,depth+1); if (ret == NULL) { - if (flags & (TRYAGAIN|RESTART_UTF8)) - *flagp |= flags & (TRYAGAIN|RESTART_UTF8); + if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8)) + *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8); else FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags); return(NULL); @@ -10407,7 +11430,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) op = *RExC_parse; - if (op == '{' && regcurly(RExC_parse, FALSE)) { + if (op == '{' && regcurly(RExC_parse)) { maxpos = NULL; #ifdef RE_TRACK_PATTERN_OFFSETS parse_start = RExC_parse; /* MJD */ @@ -10423,53 +11446,77 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) next++; } if (*next == '}') { /* got one */ + const char* endptr; if (!maxpos) maxpos = next; RExC_parse++; - min = atoi(RExC_parse); + if (isDIGIT(*RExC_parse)) { + if (!grok_atoUV(RExC_parse, &uv, &endptr)) + vFAIL("Invalid quantifier in {,}"); + if (uv >= REG_INFTY) + vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + min = (I32)uv; + } else { + min = 0; + } if (*maxpos == ',') maxpos++; else maxpos = RExC_parse; - max = atoi(maxpos); - if (!max && *maxpos != '0') + if (isDIGIT(*maxpos)) { + if (!grok_atoUV(maxpos, &uv, &endptr)) + vFAIL("Invalid quantifier in {,}"); + if (uv >= REG_INFTY) + vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + max = (I32)uv; + } else { max = REG_INFTY; /* meaning "infinity" */ - else if (max >= REG_INFTY) - vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + } RExC_parse = next; nextchar(pRExC_state); if (max < min) { /* If can't match, warn and optimize to fail unconditionally */ if (SIZE_ONLY) { - ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); /* We can't back off the size because we have to reserve * enough space for all the things we are about to throw - * away, but we can shrink it by the ammount we are about + * away, but we can shrink it by the amount we are about * to re-use here */ - RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; + RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; } else { + ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); RExC_emit = orig_emit; } - ret = reg_node(pRExC_state, OPFAIL); + ret = reganode(pRExC_state, OPFAIL, 0); return ret; } - else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?') + else if (min == max && *RExC_parse == '?') { - if (SIZE_ONLY) { + if (PASS2) { ckWARN2reg(RExC_parse + 1, "Useless use of greediness modifier '%c'", *RExC_parse); } - /* Absorb the modifier, so later code doesn't see nor use - * it */ - nextchar(pRExC_state); } - do_curly: + do_curly: if ((flags&SIMPLE)) { - RExC_naughty += 2 + RExC_naughty / 2; + if (min == 0 && max == REG_INFTY) { + reginsert(pRExC_state, STAR, ret, depth+1); + ret->flags = 0; + MARK_NAUGHTY(4); + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + goto nest_check; + } + if (min == 1 && max == REG_INFTY) { + reginsert(pRExC_state, PLUS, ret, depth+1); + ret->flags = 0; + MARK_NAUGHTY(3); + RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; + goto nest_check; + } + MARK_NAUGHTY_EXP(2, 2); reginsert(pRExC_state, CURLY, ret, depth+1); Set_Node_Offset(ret, parse_start+1); /* MJD */ Set_Node_Cur_Length(ret, parse_start); @@ -10495,7 +11542,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); if (SIZE_ONLY) RExC_whilem_seen++, RExC_extralen += 3; - RExC_naughty += 4 + RExC_naughty; /* compound interest */ + MARK_NAUGHTY_EXP(1, 4); /* compound interest */ } ret->flags = 0; @@ -10542,22 +11589,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); - if (op == '*' && (flags&SIMPLE)) { - reginsert(pRExC_state, STAR, ret, depth+1); - ret->flags = 0; - RExC_naughty += 4; - RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; - } - else if (op == '*') { + if (op == '*') { min = 0; goto do_curly; } - else if (op == '+' && (flags&SIMPLE)) { - reginsert(pRExC_state, PLUS, ret, depth+1); - ret->flags = 0; - RExC_naughty += 3; - RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN; - } else if (op == '+') { min = 1; goto do_curly; @@ -10578,13 +11613,12 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) (void)ReREFCNT_inc(RExC_rx_sv); } - if (RExC_parse < RExC_end && *RExC_parse == '?') { + if (*RExC_parse == '?') { nextchar(pRExC_state); reginsert(pRExC_state, MINMOD, ret, depth+1); REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE); } - else - if (RExC_parse < RExC_end && *RExC_parse == '+') { + else if (*RExC_parse == '+') { regnode *ender; nextchar(pRExC_state); ender = reg_node(pRExC_state, SUCCEED); @@ -10595,7 +11629,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) REGTAIL(pRExC_state, ret, ender); } - if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) { + if (ISMULT2(RExC_parse)) { RExC_parse++; vFAIL("Nested quantifiers"); } @@ -10604,64 +11638,95 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } STATIC bool -S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, - UV *valuep, I32 *flagp, U32 depth, bool in_char_class, - const bool strict /* Apply stricter parsing rules? */ +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, + regnode ** node_p, + UV * code_point_p, + int * cp_count, + I32 * flagp, + const bool strict, + const U32 depth ) { - - /* This is expected to be called by a parser routine that has recognized '\N' - and needs to handle the rest. RExC_parse is expected to point at the first - char following the N at the time of the call. On successful return, - RExC_parse has been updated to point to just after the sequence identified - by this routine, and <*flagp> has been updated. - - The \N may be inside (indicated by the boolean <in_char_class>) or outside a - character class. - - \N may begin either a named sequence, or if outside a character class, mean - to match a non-newline. For non single-quoted regexes, the tokenizer has - attempted to decide which, and in the case of a named sequence, converted it - into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, - where c1... are the characters in the sequence. For single-quoted regexes, - the tokenizer passes the \N sequence through unchanged; this code will not - attempt to determine this nor expand those, instead raising a syntax error. - The net effect is that if the beginning of the passed-in pattern isn't '{U+' - or there is no '}', it signals that this \N occurrence means to match a - non-newline. - - Only the \N{U+...} form should occur in a character class, for the same - reason that '.' inside a character class means to just match a period: it - just doesn't make sense. - - The function raises an error (via vFAIL), and doesn't return for various - syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on - success; it returns FALSE otherwise. Returns FALSE, setting *flagp to - RESTART_UTF8 if the sizing scan needs to be restarted. Such a restart is - only possible if node_p is non-NULL. - - - If <valuep> is non-null, it means the caller can accept an input sequence - consisting of a just a single code point; <*valuep> is set to that value - if the input is such. - - If <node_p> is non-null it signifies that the caller can accept any other - legal sequence (i.e., one that isn't just a single code point). <*node_p> - is set as follows: - 1) \N means not-a-NL: points to a newly created REG_ANY node; - 2) \N{}: points to a new NOTHING node; - 3) otherwise: points to a new EXACT node containing the resolved - string. - Note that FALSE is returned for single code point sequences if <valuep> is - null. - */ - - char * endbrace; /* '}' following the name */ - char* p; + /* This routine teases apart the various meanings of \N and returns + * accordingly. The input parameters constrain which meaning(s) is/are valid + * in the current context. + * + * Exactly one of <node_p> and <code_point_p> must be non-NULL. + * + * If <code_point_p> is not NULL, the context is expecting the result to be a + * single code point. If this \N instance turns out to a single code point, + * the function returns TRUE and sets *code_point_p to that code point. + * + * If <node_p> is not NULL, the context is expecting the result to be one of + * the things representable by a regnode. If this \N instance turns out to be + * one such, the function generates the regnode, returns TRUE and sets *node_p + * to point to that regnode. + * + * If this instance of \N isn't legal in any context, this function will + * generate a fatal error and not return. + * + * On input, RExC_parse should point to the first char following the \N at the + * time of the call. On successful return, RExC_parse will have been updated + * to point to just after the sequence identified by this routine. Also + * *flagp has been updated as needed. + * + * When there is some problem with the current context and this \N instance, + * the function returns FALSE, without advancing RExC_parse, nor setting + * *node_p, nor *code_point_p, nor *flagp. + * + * If <cp_count> is not NULL, the caller wants to know the length (in code + * points) that this \N sequence matches. This is set even if the function + * returns FALSE, as detailed below. + * + * There are 5 possibilities here, as detailed in the next 5 paragraphs. + * + * Probably the most common case is for the \N to specify a single code point. + * *cp_count will be set to 1, and *code_point_p will be set to that code + * point. + * + * Another possibility is for the input to be an empty \N{}, which for + * backwards compatibility we accept. *cp_count will be set to 0. *node_p + * will be set to a generated NOTHING node. + * + * Still another possibility is for the \N to mean [^\n]. *cp_count will be + * set to 0. *node_p will be set to a generated REG_ANY node. + * + * The fourth possibility is that \N resolves to a sequence of more than one + * code points. *cp_count will be set to the number of code points in the + * sequence. *node_p * will be set to a generated node returned by this + * function calling S_reg(). + * + * The final possibility is that it is premature to be calling this function; + * that pass1 needs to be restarted. This can happen when this changes from + * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The + * latter occurs only when the fourth possibility would otherwise be in + * effect, and is because one of those code points requires the pattern to be + * recompiled as UTF-8. The function returns FALSE, and sets the + * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate. When this + * happens, the caller needs to desist from continuing parsing, and return + * this information to its caller. This is not set for when there is only one + * code point, as this can be called as part of an ANYOF node, and they can + * store above-Latin1 code points without the pattern having to be in UTF-8. + * + * For non-single-quoted regexes, the tokenizer has resolved character and + * sequence names inside \N{...} into their Unicode values, normalizing the + * result into what we should see here: '\N{U+c1.c2...}', where c1... are the + * hex-represented code points in the sequence. This is done there because + * the names can vary based on what charnames pragma is in scope at the time, + * so we need a way to take a snapshot of what they resolve to at the time of + * the original parse. [perl #56444]. + * + * That parsing is skipped for single-quoted regexes, so we may here get + * '\N{NAME}'. This is a fatal error. These names have to be resolved by the + * parser. But if the single-quoted regex is something like '\N{U+41}', that + * is legal and handled here. The code point is Unicode, and has to be + * translated into the native character set for non-ASCII platforms. + */ + + char * endbrace; /* points to '}' following the name */ char *endchar; /* Points to '.' or '}' ending cur char in the input stream */ - bool has_multiple_chars; /* true if the input stream contains a sequence of - more than one character */ + char* p = RExC_parse; /* Temporary */ GET_RE_DEBUG_FLAGS_DECL; @@ -10669,32 +11734,35 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, GET_RE_DEBUG_FLAGS; - assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ + assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */ + assert(! (node_p && cp_count)); /* At most 1 should be set */ + + if (cp_count) { /* Initialize return for the most common case */ + *cp_count = 1; + } /* The [^\n] meaning of \N ignores spaces and comments under the /x - * modifier. The other meaning does not, so use a temporary until we find + * modifier. The other meanings do not, so use a temporary until we find * out which we are being called with */ - p = (RExC_flags & RXf_PMf_EXTENDED) - ? regwhite( pRExC_state, RExC_parse ) - : RExC_parse; + skip_to_be_ignored_text(pRExC_state, &p, + FALSE /* Don't force to /x */ ); /* Disambiguate between \N meaning a named character versus \N meaning - * [^\n]. The former is assumed when it can't be the latter. */ - if (*p != '{' || regcurly(p, FALSE)) { + * [^\n]. The latter is assumed when the {...} following the \N is a legal + * quantifier, or there is no '{' at all */ + if (*p != '{' || regcurly(p)) { RExC_parse = p; + if (cp_count) { + *cp_count = -1; + } + if (! node_p) { - /* no bare \N allowed in a charclass */ - if (in_char_class) { - vFAIL("\\N in a character class must be a named character: \\N{...}"); - } return FALSE; } - RExC_parse--; /* Need to back off so nextchar() doesn't skip the - current char */ - nextchar(pRExC_state); + *node_p = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; - RExC_naughty++; + MARK_NAUGHTY(1); Set_Node_Length(*node_p, 1); /* MJD */ return TRUE; } @@ -10711,118 +11779,131 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, RExC_parse++; /* Skip past the '{' */ - if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ + if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ || ! (endbrace == RExC_parse /* nothing between the {} */ - || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below - */ - && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) - */ + || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */ + && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better + error msg) */ { if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ vFAIL("\\N{NAME} must be resolved by the lexer"); } + REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode + semantics */ + if (endbrace == RExC_parse) { /* empty: \N{} */ - bool ret = TRUE; - if (node_p) { - *node_p = reg_node(pRExC_state,NOTHING); - } - else if (in_char_class) { - if (SIZE_ONLY && in_char_class) { - if (strict) { - RExC_parse++; /* Position after the "}" */ - vFAIL("Zero length \\N{}"); - } - else { - ckWARNreg(RExC_parse, - "Ignoring zero length \\N{} in character class"); - } - } - ret = FALSE; - } - else { - return FALSE; + if (strict) { + RExC_parse++; /* Position after the "}" */ + vFAIL("Zero length \\N{}"); + } + if (cp_count) { + *cp_count = 0; } nextchar(pRExC_state); - return ret; + if (! node_p) { + return FALSE; + } + + *node_p = reg_node(pRExC_state,NOTHING); + return TRUE; } - RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ RExC_parse += 2; /* Skip past the 'U+' */ + /* Because toke.c has generated a special construct for us guaranteed not + * to have NULs, we can use a str function */ endchar = RExC_parse + strcspn(RExC_parse, ".}"); /* Code points are separated by dots. If none, there is only one code * point, and is terminated by the brace */ - has_multiple_chars = (endchar < endbrace); - if (valuep && (! has_multiple_chars || in_char_class)) { - /* We only pay attention to the first char of - multichar strings being returned in char classes. I kinda wonder - if this makes sense as it does change the behaviour - from earlier versions, OTOH that behaviour was broken - as well. XXX Solution is to recharacterize as - [rest-of-class]|multi1|multi2... */ + if (endchar >= endbrace) { + STRLEN length_of_hex; + I32 grok_hex_flags; - STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse); - I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES - | PERL_SCAN_DISALLOW_PREFIX - | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); + /* Here, exactly one code point. If that isn't what is wanted, fail */ + if (! code_point_p) { + RExC_parse = p; + return FALSE; + } - *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL); + /* Convert code point from hex */ + length_of_hex = (STRLEN)(endchar - RExC_parse); + grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX + + /* No errors in the first pass (See [perl + * #122671].) We let the code below find the + * errors when there are multiple chars. */ + | ((SIZE_ONLY) + ? PERL_SCAN_SILENT_ILLDIGIT + : 0); + + /* This routine is the one place where both single- and double-quotish + * \N{U+xxxx} are evaluated. The value is a Unicode code point which + * must be converted to native. */ + *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse, + &length_of_hex, + &grok_hex_flags, + NULL)); /* The tokenizer should have guaranteed validity, but it's possible to - * bypass it by using single quoting, so check */ - if (length_of_hex == 0 - || length_of_hex != (STRLEN)(endchar - RExC_parse) ) - { - RExC_parse += length_of_hex; /* Includes all the valid */ - RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ - ? UTF8SKIP(RExC_parse) - : 1; - /* Guard against malformed utf8 */ - if (RExC_parse >= endchar) { + * bypass it by using single quoting, so check. Don't do the check + * here when there are multiple chars; we do it below anyway. */ + if (length_of_hex == 0 + || length_of_hex != (STRLEN)(endchar - RExC_parse) ) + { + RExC_parse += length_of_hex; /* Includes all the valid */ + RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ + ? UTF8SKIP(RExC_parse) + : 1; + /* Guard against malformed utf8 */ + if (RExC_parse >= endchar) { RExC_parse = endchar; } - vFAIL("Invalid hexadecimal number in \\N{U+...}"); - } - - if (in_char_class && has_multiple_chars) { - if (strict) { - RExC_parse = endbrace; - vFAIL("\\N{} in character class restricted to one character"); - } - else { - ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); - } + vFAIL("Invalid hexadecimal number in \\N{U+...}"); } RExC_parse = endbrace + 1; + return TRUE; } - else if (! node_p || ! has_multiple_chars) { - - /* Here, the input is legal, but not according to the caller's - * options. We fail without advancing the parse, so that the - * caller can try again */ - RExC_parse = p; - return FALSE; - } - else { - - /* What is done here is to convert this to a sub-pattern of the form - * (?:\x{char1}\x{char2}...) - * and then call reg recursively. That way, it retains its atomicness, - * while not having to worry about special handling that some code - * points may have. toke.c has converted the original Unicode values - * to native, so that we can just pass on the hex values unchanged. We - * do have to set a flag to keep recoding from happening in the - * recursion */ - - SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP); + else { /* Is a multiple character sequence */ + SV * substitute_parse; STRLEN len; char *orig_end = RExC_end; + char *save_start = RExC_start; I32 flags; + /* Count the code points, if desired, in the sequence */ + if (cp_count) { + *cp_count = 0; + while (RExC_parse < endbrace) { + /* Point to the beginning of the next character in the sequence. */ + RExC_parse = endchar + 1; + endchar = RExC_parse + strcspn(RExC_parse, ".}"); + (*cp_count)++; + } + } + + /* Fail if caller doesn't want to handle a multi-code-point sequence. + * But don't backup up the pointer if the caller want to know how many + * code points there are (they can then handle things) */ + if (! node_p) { + if (! cp_count) { + RExC_parse = p; + } + return FALSE; + } + + /* What is done here is to convert this to a sub-pattern of the form + * \x{char1}\x{char2}... and then call reg recursively to parse it + * (enclosing in "(?: ... )" ). That way, it retains its atomicness, + * while not having to worry about special handling that some code + * points may have. */ + + substitute_parse = newSVpvs("?:"); + while (RExC_parse < endbrace) { /* Convert to notation the rest of the code understands */ @@ -10833,38 +11914,54 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, /* Point to the beginning of the next character in the sequence. */ RExC_parse = endchar + 1; endchar = RExC_parse + strcspn(RExC_parse, ".}"); + } - sv_catpv(substitute_parse, ")"); + sv_catpv(substitute_parse, ")"); - RExC_parse = SvPV(substitute_parse, len); + RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse, + len); /* Don't allow empty number */ - if (len < 8) { + if (len < (STRLEN) 8) { + RExC_parse = endbrace; vFAIL("Invalid hexadecimal number in \\N{U+...}"); } RExC_end = RExC_parse + len; - /* The values are Unicode, and therefore not subject to recoding */ + /* The values are Unicode, and therefore not subject to recoding, but + * have to be converted to native on a non-Unicode (meaning non-ASCII) + * platform. */ RExC_override_recoding = 1; +#ifdef EBCDIC + RExC_recode_x_to_native = 1; +#endif - if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) { - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; - return FALSE; + if (node_p) { + if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) { + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); + return FALSE; + } + FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", + (UV) flags); } - FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", - (UV) flags); + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); } - *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); + /* Restore the saved values */ + RExC_start = RExC_adjusted_start = save_start; RExC_parse = endbrace; RExC_end = orig_end; RExC_override_recoding = 0; +#ifdef EBCDIC + RExC_recode_x_to_native = 0; +#endif + SvREFCNT_dec_NN(substitute_parse); nextchar(pRExC_state); - } - return TRUE; + return TRUE; + } } @@ -10879,10 +11976,10 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, * it returns U+FFFD (Replacement character) and sets *encp to NULL. */ STATIC UV -S_reg_recode(pTHX_ const char value, SV **encp) +S_reg_recode(pTHX_ const U8 value, SV **encp) { STRLEN numlen = 1; - SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP); + SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP); const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); const STRLEN newlen = SvCUR(sv); UV uv = UNICODE_REPLACEMENT; @@ -10902,14 +11999,16 @@ S_reg_recode(pTHX_ const char value, SV **encp) } PERL_STATIC_INLINE U8 -S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) +S_compute_EXACTish(RExC_state_t *pRExC_state) { U8 op; PERL_ARGS_ASSERT_COMPUTE_EXACTISH; if (! FOLD) { - return EXACT; + return (LOC) + ? EXACTL + : EXACT; } op = get_regex_charset(RExC_flags); @@ -10966,11 +12065,11 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, if (! len_passed_in) { if (UTF) { - if (UNI_IS_INVARIANT(code_point)) { + if (UVCHR_IS_INVARIANT(code_point)) { if (LOC || ! FOLD) { /* /l defers folding until runtime */ *character = (U8) code_point; } - else { /* Here is /i and not /l (toFOLD() is defined on just + else { /* Here is /i and not /l. (toFOLD() is defined on just ASCII, which isn't the same thing as INVARIANT on EBCDIC, but it works there, as the extra invariants fold to themselves) */ @@ -11001,10 +12100,15 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, ? FOLD_FLAGS_NOMIX_ASCII : 0)); if (downgradable - && folded == code_point + && folded == code_point /* This quickly rules out many + cases, avoiding the + _invlist_contains_cp() overhead + for those. */ && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) { - OP(node) = EXACT; + OP(node) = (LOC) + ? EXACTL + : EXACT; } } else if (code_point <= MAX_UTF8_TWO_BYTE) { @@ -11023,8 +12127,13 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, *character = (U8) code_point; len = 1; } /* Else is folded non-UTF8 */ +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { - +#else + else if (1) { +#endif /* We don't fold any non-UTF8 except possibly the Sharp s (see * comments at join_exact()); */ *character = (U8) code_point; @@ -11067,10 +12176,14 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, /* A single character node is SIMPLE, except for the special-cased SHARP S * under /di. */ - if ((len == 1 || (UTF && len == UNISKIP(code_point))) - && (code_point != LATIN_SMALL_LETTER_SHARP_S - || ! FOLD || ! DEPENDS_SEMANTICS)) - { + if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point))) +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) + && ( code_point != LATIN_SMALL_LETTER_SHARP_S + || ! FOLD || ! DEPENDS_SEMANTICS) +#endif + ) { *flagp |= SIMPLE; } @@ -11081,18 +12194,17 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, } -/* return atoi(p), unless it's too big to sensibly be a backref, +/* Parse backref decimal value, unless it's too big to sensibly be a backref, * in which case return I32_MAX (rather than possibly 32-bit wrapping) */ static I32 S_backref_value(char *p) { - char *q = p; - - for (;isDIGIT(*q); q++); /* calculate length of num */ - if (q - p == 0 || q - p > 9) - return I32_MAX; - return atoi(p); + const char* endptr; + UV val; + if (grok_atoUV(p, &val, &endptr) && val <= I32_MAX) + return (I32)val; + return I32_MAX; } @@ -11157,20 +12269,20 @@ S_backref_value(char *p) Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with TRYAGAIN. - Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be - restarted. + Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be + restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8 Otherwise does not return NULL. */ STATIC regnode * S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { - dVAR; regnode *ret = NULL; I32 flags = 0; - char *parse_start = RExC_parse; + char *parse_start; U8 op; int invert = 0; + U8 arg; GET_RE_DEBUG_FLAGS_DECL; @@ -11180,17 +12292,17 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) PERL_ARGS_ASSERT_REGATOM; -tryagain: + tryagain: + parse_start = RExC_parse; + assert(RExC_parse < RExC_end); switch ((U8)*RExC_parse) { case '^': RExC_seen_zerolen++; nextchar(pRExC_state); if (RExC_flags & RXf_PMf_MULTILINE) ret = reg_node(pRExC_state, MBOL); - else if (RExC_flags & RXf_PMf_SINGLELINE) - ret = reg_node(pRExC_state, SBOL); else - ret = reg_node(pRExC_state, BOL); + ret = reg_node(pRExC_state, SBOL); Set_Node_Length(ret, 1); /* MJD */ break; case '$': @@ -11199,10 +12311,8 @@ tryagain: RExC_seen_zerolen++; if (RExC_flags & RXf_PMf_MULTILINE) ret = reg_node(pRExC_state, MEOL); - else if (RExC_flags & RXf_PMf_SINGLELINE) - ret = reg_node(pRExC_state, SEOL); else - ret = reg_node(pRExC_state, EOL); + ret = reg_node(pRExC_state, SEOL); Set_Node_Length(ret, 1); /* MJD */ break; case '.': @@ -11212,7 +12322,7 @@ tryagain: else ret = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; - RExC_naughty++; + MARK_NAUGHTY(1); Set_Node_Length(ret, 1); /* MJD */ break; case '[': @@ -11222,17 +12332,20 @@ tryagain: FALSE, /* means parse the whole char class */ TRUE, /* allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ + (bool) RExC_strict, + TRUE, /* Allow an optimized regnode result */ + NULL, NULL); - if (*RExC_parse != ']') { - RExC_parse = oregcomp_parse; - vFAIL("Unmatched ["); - } if (ret == NULL) { - if (*flagp & RESTART_UTF8) + if (*flagp & (RESTART_PASS1|NEED_UTF8)) return NULL; FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", (UV) *flagp); } + if (*RExC_parse != ']') { + RExC_parse = oregcomp_parse; + vFAIL("Unmatched ["); + } nextchar(pRExC_state); Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ break; @@ -11242,15 +12355,15 @@ tryagain: ret = reg(pRExC_state, 2, &flags,depth+1); if (ret == NULL) { if (flags & TRYAGAIN) { - if (RExC_parse == RExC_end) { + if (RExC_parse >= RExC_end) { /* Make parent create an empty node if needed. */ *flagp |= TRYAGAIN; return(NULL); } goto tryagain; } - if (flags & RESTART_UTF8) { - *flagp = RESTART_UTF8; + if (flags & (RESTART_PASS1|NEED_UTF8)) { + *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", @@ -11267,12 +12380,6 @@ tryagain: vFAIL("Internal urp"); /* Supposed to be caught earlier. */ break; - case '{': - if (!regcurly(RExC_parse, FALSE)) { - RExC_parse++; - goto defchar; - } - /* FALL THROUGH */ case '?': case '+': case '*': @@ -11292,12 +12399,17 @@ tryagain: required, as the default for this switch is to jump to the literal text handling code. */ - switch ((U8)*++RExC_parse) { - U8 arg; + RExC_parse++; + switch ((U8)*RExC_parse) { /* Special Escapes */ case 'A': RExC_seen_zerolen++; ret = reg_node(pRExC_state, SBOL); + /* SBOL is shared with /^/ so we set the flags so we can tell + * /\A/ from /^/ in split. We check ret because first pass we + * have no regop struct to set the flags on. */ + if (PASS2) + ret->flags = 1; *flagp |= SIMPLE; goto finish_meta_pat; case 'G': @@ -11326,10 +12438,7 @@ tryagain: RExC_seen_zerolen++; /* Do not optimize RE away */ goto finish_meta_pat; case 'C': - ret = reg_node(pRExC_state, CANY); - RExC_seen |= REG_CANY_SEEN; - *flagp |= HASWIDTH|SIMPLE; - goto finish_meta_pat; + vFAIL("\\C no longer supported"); case 'X': ret = reg_node(pRExC_state, CLUMP); *flagp |= HASWIDTH; @@ -11342,49 +12451,127 @@ tryagain: arg = ANYOF_WORDCHAR; goto join_posix; + case 'B': + invert = 1; + /* FALLTHROUGH */ case 'b': + { + regex_charset charset = get_regex_charset(RExC_flags); + RExC_seen_zerolen++; RExC_seen |= REG_LOOKBEHIND_SEEN; - op = BOUND + get_regex_charset(RExC_flags); - if (op > BOUNDA) { /* /aa is same as /a */ - op = BOUNDA; - } - else if (op == BOUNDL) { + op = BOUND + charset; + + if (op == BOUNDL) { RExC_contains_locale = 1; } + ret = reg_node(pRExC_state, op); - FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; - if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - /* diag_listed_as: Use "%s" instead of "%s" */ - vFAIL("Use \"\\b\\{\" instead of \"\\b{\""); - } - goto finish_meta_pat; - case 'B': - RExC_seen_zerolen++; - RExC_seen |= REG_LOOKBEHIND_SEEN; - op = NBOUND + get_regex_charset(RExC_flags); - if (op > NBOUNDA) { /* /aa is same as /a */ - op = NBOUNDA; - } - else if (op == NBOUNDL) { - RExC_contains_locale = 1; + if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') { + FLAGS(ret) = TRADITIONAL_BOUND; + if (PASS2 && op > BOUNDA) { /* /aa is same as /a */ + OP(ret) = BOUNDA; + } } - ret = reg_node(pRExC_state, op); - FLAGS(ret) = get_regex_charset(RExC_flags); - *flagp |= SIMPLE; - if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - /* diag_listed_as: Use "%s" instead of "%s" */ - vFAIL("Use \"\\B\\{\" instead of \"\\B{\""); + else { + STRLEN length; + char name = *RExC_parse; + char * endbrace; + RExC_parse += 2; + endbrace = strchr(RExC_parse, '}'); + + if (! endbrace) { + vFAIL2("Missing right brace on \\%c{}", name); + } + /* XXX Need to decide whether to take spaces or not. Should be + * consistent with \p{}, but that currently is SPACE, which + * means vertical too, which seems wrong + * while (isBLANK(*RExC_parse)) { + RExC_parse++; + }*/ + if (endbrace == RExC_parse) { + RExC_parse++; /* After the '}' */ + vFAIL2("Empty \\%c{}", name); + } + length = endbrace - RExC_parse; + /*while (isBLANK(*(RExC_parse + length - 1))) { + length--; + }*/ + switch (*RExC_parse) { + case 'g': + if (length != 1 + && (length != 3 || strnNE(RExC_parse + 1, "cb", 2))) + { + goto bad_bound_type; + } + FLAGS(ret) = GCB_BOUND; + break; + case 'l': + if (length != 2 || *(RExC_parse + 1) != 'b') { + goto bad_bound_type; + } + FLAGS(ret) = LB_BOUND; + break; + case 's': + if (length != 2 || *(RExC_parse + 1) != 'b') { + goto bad_bound_type; + } + FLAGS(ret) = SB_BOUND; + break; + case 'w': + if (length != 2 || *(RExC_parse + 1) != 'b') { + goto bad_bound_type; + } + FLAGS(ret) = WB_BOUND; + break; + default: + bad_bound_type: + RExC_parse = endbrace; + vFAIL2utf8f( + "'%"UTF8f"' is an unknown bound type", + UTF8fARG(UTF, length, endbrace - length)); + NOT_REACHED; /*NOTREACHED*/ + } + RExC_parse = endbrace; + REQUIRE_UNI_RULES(flagp, NULL); + + if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */ + OP(ret) = BOUNDU; + length += 4; + + /* Don't have to worry about UTF-8, in this message because + * to get here the contents of the \b must be ASCII */ + ckWARN4reg(RExC_parse + 1, /* Include the '}' in msg */ + "Using /u for '%.*s' instead of /%s", + (unsigned) length, + endbrace - length + 1, + (charset == REGEX_ASCII_RESTRICTED_CHARSET) + ? ASCII_RESTRICT_PAT_MODS + : ASCII_MORE_RESTRICT_PAT_MODS); + } } + + if (PASS2 && invert) { + OP(ret) += NBOUND - BOUND; + } goto finish_meta_pat; + } case 'D': invert = 1; /* FALLTHROUGH */ case 'd': arg = ANYOF_DIGIT; - goto join_posix; + if (! DEPENDS_SEMANTICS) { + goto join_posix; + } + + /* \d doesn't have any matches in the upper Latin1 range, hence /d + * is equivalent to /u. Changing to /u saves some branches at + * runtime */ + op = POSIXU; + goto join_posix_op_known; case 'R': ret = reg_node(pRExC_state, LNBREAK); @@ -11413,7 +12600,7 @@ tryagain: case 's': arg = ANYOF_SPACE; - join_posix: + join_posix: op = POSIXD + get_regex_charset(RExC_flags); if (op > POSIXA) { /* /aa is same as /a */ @@ -11423,7 +12610,7 @@ tryagain: RExC_contains_locale = 1; } - join_posix_op_known: + join_posix_op_known: if (invert) { op += NPOSIXD - POSIXD; @@ -11435,106 +12622,95 @@ tryagain: } *flagp |= HASWIDTH|SIMPLE; - /* FALL THROUGH */ + /* FALLTHROUGH */ - finish_meta_pat: + finish_meta_pat: nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ break; case 'p': case 'P': - { -#ifdef DEBUGGING - char* parse_start = RExC_parse - 2; -#endif - - RExC_parse--; - - ret = regclass(pRExC_state, flagp,depth+1, - TRUE, /* means just parse this element */ - FALSE, /* don't allow multi-char folds */ - FALSE, /* don't silence non-portable warnings. - It would be a bug if these returned - non-portables */ - NULL); - /* regclass() can only return RESTART_UTF8 if multi-char folds - are allowed. */ - if (!ret) - FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", - (UV) *flagp); + RExC_parse--; + + ret = regclass(pRExC_state, flagp,depth+1, + TRUE, /* means just parse this element */ + FALSE, /* don't allow multi-char folds */ + FALSE, /* don't silence non-portable warnings. It + would be a bug if these returned + non-portables */ + (bool) RExC_strict, + TRUE, /* Allow an optimized regnode result */ + NULL, + NULL); + if (*flagp & RESTART_PASS1) + return NULL; + /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if + * multi-char folds are allowed. */ + if (!ret) + FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", + (UV) *flagp); - RExC_parse--; + RExC_parse--; - Set_Node_Offset(ret, parse_start + 2); - Set_Node_Cur_Length(ret, parse_start); - nextchar(pRExC_state); - } + Set_Node_Offset(ret, parse_start); + Set_Node_Cur_Length(ret, parse_start - 2); + nextchar(pRExC_state); break; case 'N': - /* Handle \N and \N{NAME} with multiple code points here and not - * below because it can be multicharacter. join_exact() will join - * them up later on. Also this makes sure that things like - * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq. - * The options to the grok function call causes it to fail if the - * sequence is just a single code point. We then go treat it as - * just another character in the current EXACT node, and hence it - * gets uniform treatment with all the other characters. The - * special treatment for quantifiers is not needed for such single - * character sequences */ + /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the + * \N{...} evaluates to a sequence of more than one code points). + * The function call below returns a regnode, which is our result. + * The parameters cause it to fail if the \N{} evaluates to a + * single code point; we handle those like any other literal. The + * reason that the multicharacter case is handled here and not as + * part of the EXACtish code is because of quantifiers. In + * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it + * this way makes that Just Happen. dmq. + * join_exact() will join this up with adjacent EXACTish nodes + * later on, if appropriate. */ ++RExC_parse; - if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE, - FALSE /* not strict */ )) { - if (*flagp & RESTART_UTF8) - return NULL; - RExC_parse--; - goto defchar; + if (grok_bslash_N(pRExC_state, + &ret, /* Want a regnode returned */ + NULL, /* Fail if evaluates to a single code + point */ + NULL, /* Don't need a count of how many code + points */ + flagp, + RExC_strict, + depth) + ) { + break; } - break; + + if (*flagp & RESTART_PASS1) + return NULL; + + /* Here, evaluates to a single code point. Go get that */ + RExC_parse = parse_start; + goto defchar; + case 'k': /* Handle \k<NAME> and \k'NAME' */ - parse_named_seq: + parse_named_seq: { - char ch= RExC_parse[1]; - if (ch != '<' && ch != '\'' && ch != '{') { + char ch; + if ( RExC_parse >= RExC_end - 1 + || (( ch = RExC_parse[1]) != '<' + && ch != '\'' + && ch != '{')) + { RExC_parse++; /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ vFAIL2("Sequence %.2s... not terminated",parse_start); } else { - /* this pretty much dupes the code for (?P=...) in reg(), if - you change this make sure you change that */ - char* name_start = (RExC_parse += 2); - U32 num = 0; - SV *sv_dat = reg_scan_name(pRExC_state, - SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); - ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\''; - if (RExC_parse == name_start || *RExC_parse != ch) - /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */ - vFAIL2("Sequence %.3s... not terminated",parse_start); - - if (!SIZE_ONLY) { - num = add_data( pRExC_state, STR_WITH_LEN("S")); - RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc_simple_void(sv_dat); - } - - RExC_sawback = 1; - ret = reganode(pRExC_state, - ((! FOLD) - ? NREF - : (ASCII_FOLD_RESTRICTED) - ? NREFFA - : (AT_LEAST_UNI_SEMANTICS) - ? NREFFU - : (LOC) - ? NREFFL - : NREFF), - num); - *flagp |= HASWIDTH; - - /* override incorrect value set in reganode MJD */ - Set_Node_Offset(ret, parse_start+1); - Set_Node_Cur_Length(ret, parse_start); - nextchar(pRExC_state); - + RExC_parse += 2; + ret = handle_named_backref(pRExC_state, + flagp, + parse_start, + (ch == '<') + ? '>' + : (ch == '{') + ? '}' + : '\''); } break; } @@ -11563,6 +12739,9 @@ tryagain: goto parse_named_seq; } + if (RExC_parse >= RExC_end) { + goto unterminated_g; + } num = S_backref_value(RExC_parse); if (num == 0) vFAIL("Reference to invalid group 0"); @@ -11570,6 +12749,7 @@ tryagain: if (isDIGIT(*RExC_parse)) vFAIL("Reference to nonexistent group"); else + unterminated_g: vFAIL("Unterminated \\g... pattern"); } @@ -11581,82 +12761,105 @@ tryagain: } else { num = S_backref_value(RExC_parse); - /* bare \NNN might be backref or octal - if it is larger than or equal - * RExC_npar then it is assumed to be and octal escape. - * Note RExC_npar is +1 from the actual number of parens*/ - if (num == I32_MAX || (num > 9 && num >= RExC_npar - && *RExC_parse != '8' && *RExC_parse != '9')) + /* bare \NNN might be backref or octal - if it is larger + * than or equal RExC_npar then it is assumed to be an + * octal escape. Note RExC_npar is +1 from the actual + * number of parens. */ + /* Note we do NOT check if num == I32_MAX here, as that is + * handled by the RExC_npar check */ + + if ( + /* any numeric escape < 10 is always a backref */ + num > 9 + /* any numeric escape < RExC_npar is a backref */ + && num >= RExC_npar + /* cannot be an octal escape if it starts with 8 */ + && *RExC_parse != '8' + /* cannot be an octal escape it it starts with 9 */ + && *RExC_parse != '9' + ) { - /* Probably a character specified in octal, e.g. \35 */ + /* Probably not a backref, instead likely to be an + * octal character escape, e.g. \35 or \777. + * The above logic should make it obvious why using + * octal escapes in patterns is problematic. - Yves */ + RExC_parse = parse_start; goto defchar; } } - /* at this point RExC_parse definitely points to a backref - * number */ - { -#ifdef RE_TRACK_PATTERN_OFFSETS - char * const parse_start = RExC_parse - 1; /* MJD */ -#endif - while (isDIGIT(*RExC_parse)) - RExC_parse++; - if (hasbrace) { - if (*RExC_parse != '}') - vFAIL("Unterminated \\g{...} pattern"); - RExC_parse++; - } - if (!SIZE_ONLY) { - if (num > (I32)RExC_rx->nparens) - vFAIL("Reference to nonexistent group"); - } - RExC_sawback = 1; - ret = reganode(pRExC_state, - ((! FOLD) - ? REF - : (ASCII_FOLD_RESTRICTED) - ? REFFA - : (AT_LEAST_UNI_SEMANTICS) - ? REFFU - : (LOC) - ? REFFL - : REFF), - num); - *flagp |= HASWIDTH; - - /* override incorrect value set in reganode MJD */ - Set_Node_Offset(ret, parse_start+1); - Set_Node_Cur_Length(ret, parse_start); - RExC_parse--; - nextchar(pRExC_state); - } + /* At this point RExC_parse points at a numeric escape like + * \12 or \88 or something similar, which we should NOT treat + * as an octal escape. It may or may not be a valid backref + * escape. For instance \88888888 is unlikely to be a valid + * backref. */ + while (isDIGIT(*RExC_parse)) + RExC_parse++; + if (hasbrace) { + if (*RExC_parse != '}') + vFAIL("Unterminated \\g{...} pattern"); + RExC_parse++; + } + if (!SIZE_ONLY) { + if (num > (I32)RExC_rx->nparens) + vFAIL("Reference to nonexistent group"); + } + RExC_sawback = 1; + ret = reganode(pRExC_state, + ((! FOLD) + ? REF + : (ASCII_FOLD_RESTRICTED) + ? REFFA + : (AT_LEAST_UNI_SEMANTICS) + ? REFFU + : (LOC) + ? REFFL + : REFF), + num); + *flagp |= HASWIDTH; + + /* override incorrect value set in reganode MJD */ + Set_Node_Offset(ret, parse_start); + Set_Node_Cur_Length(ret, parse_start-1); + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't force to /x */ ); } break; case '\0': if (RExC_parse >= RExC_end) FAIL("Trailing \\"); - /* FALL THROUGH */ + /* FALLTHROUGH */ default: /* Do not generate "unrecognized" warnings here, we fall back into the quick-grab loop below */ - parse_start--; + RExC_parse = parse_start; goto defchar; - } + } /* end of switch on a \foo sequence */ break; case '#': - if (RExC_flags & RXf_PMf_EXTENDED) { - if ( reg_skipcomment( pRExC_state ) ) + + /* '#' comments should have been spaced over before this function was + * called */ + assert((RExC_flags & RXf_PMf_EXTENDED) == 0); + /* + if (RExC_flags & RXf_PMf_EXTENDED) { + RExC_parse = reg_skipcomment( pRExC_state, RExC_parse ); + if (RExC_parse < RExC_end) goto tryagain; } - /* FALL THROUGH */ + */ - default: + /* FALLTHROUGH */ - parse_start = RExC_parse - 1; + default: + defchar: { - RExC_parse++; + /* Here, we have determined that the next thing is probably a + * literal character. RExC_parse points to the first byte of its + * definition. (It still may be an escape sequence that evaluates + * to a single character) */ - defchar: { STRLEN len = 0; UV ender = 0; char *p; @@ -11674,7 +12877,7 @@ tryagain: * string's UTF8ness. The reason to do this is that EXACTF is not * trie-able, EXACTFU is. * - * Similarly, we can convert EXACTFL nodes to EXACTFU if they + * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they * contain only above-Latin1 characters (hence must be in UTF8), * which don't participate in folds with Latin1-range characters, * as the latter's folds aren't known until runtime. (We don't @@ -11695,10 +12898,15 @@ tryagain: s0 = s; - reparse: + reparse: - /* We do the EXACTFish to EXACT node only if folding. (And we - * don't need to figure this out until pass 2) */ + /* We look for the EXACTFish to EXACT node optimizaton only if + * folding. (And we don't need to figure this out until pass 2). + * XXX It might actually make sense to split the node into portions + * that are exact and ones that aren't, so that we could later use + * the exact ones to find the longest fixed and floating strings. + * One would want to join them back into a larger node. One could + * use a pseudo regnode like 'EXACT_ORIG_FOLD' */ maybe_exact = FOLD && PASS2; /* XXX The node can hold up to 255 bytes, yet this only goes to @@ -11719,14 +12927,21 @@ tryagain: * could back off to end with only a code point that isn't such a * non-final, but it is possible for there not to be any in the * entire node. */ - for (p = RExC_parse - 1; + + assert( ! UTF /* Is at the beginning of a character */ + || UTF8_IS_INVARIANT(UCHARAT(RExC_parse)) + || UTF8_IS_START(UCHARAT(RExC_parse))); + + for (p = RExC_parse; len < upper_parse && p < RExC_end; len++) { oldp = p; - if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite( pRExC_state, p ); + /* White space has already been ignored */ + assert( (RExC_flags & RXf_PMf_EXTENDED) == 0 + || ! is_PATWS_safe((p), RExC_end, UTF)); + switch ((U8)*p) { case '^': case '$': @@ -11778,22 +12993,33 @@ tryagain: p++; break; case 'N': /* Handle a single-code point named character. */ - /* The options cause it to fail if a multiple code - * point sequence. Handle those in the switch() above - * */ RExC_parse = p + 1; - if (! grok_bslash_N(pRExC_state, NULL, &ender, - flagp, depth, FALSE, - FALSE /* not strict */ )) - { - if (*flagp & RESTART_UTF8) - FAIL("panic: grok_bslash_N set RESTART_UTF8"); + if (! grok_bslash_N(pRExC_state, + NULL, /* Fail if evaluates to + anything other than a + single code point */ + &ender, /* The returned single code + point */ + NULL, /* Don't need a count of + how many code points */ + flagp, + RExC_strict, + depth) + ) { + if (*flagp & NEED_UTF8) + FAIL("panic: grok_bslash_N set NEED_UTF8"); + if (*flagp & RESTART_PASS1) + return NULL; + + /* Here, it wasn't a single code point. Go close + * up this EXACTish node. The switch() prior to + * this switch handles the other cases */ RExC_parse = p = oldp; goto loopdone; } p = RExC_parse; if (ender > 0xff) { - REQUIRE_UTF8; + REQUIRE_UTF8(flagp); } break; case 'r': @@ -11809,11 +13035,11 @@ tryagain: p++; break; case 'e': - ender = ASCII_TO_NATIVE('\033'); + ender = ESC_NATIVE; p++; break; case 'a': - ender = '\a'; + ender = '\a'; p++; break; case 'o': @@ -11824,8 +13050,8 @@ tryagain: bool valid = grok_bslash_o(&p, &result, &error_msg, - TRUE, /* out warnings */ - FALSE, /* not strict */ + PASS2, /* out warnings */ + (bool) RExC_strict, TRUE, /* Output warnings for non- portables */ @@ -11836,11 +13062,11 @@ tryagain: vFAIL(error_msg); } ender = result; - if (PL_encoding && ender < 0x100) { + if (IN_ENCODING && ender < 0x100) { goto recode_encoding; } if (ender > 0xff) { - REQUIRE_UTF8; + REQUIRE_UTF8(flagp); } break; } @@ -11853,9 +13079,9 @@ tryagain: bool valid = grok_bslash_x(&p, &result, &error_msg, - TRUE, /* out warnings */ - FALSE, /* not strict */ - TRUE, /* Output warnings + PASS2, /* out warnings */ + (bool) RExC_strict, + TRUE, /* Silence warnings for non- portables */ UTF); @@ -11866,20 +13092,31 @@ tryagain: } ender = result; - if (PL_encoding && ender < 0x100) { - goto recode_encoding; + if (ender < 0x100) { +#ifdef EBCDIC + if (RExC_recode_x_to_native) { + ender = LATIN1_TO_NATIVE(ender); + } + else +#endif + if (IN_ENCODING) { + goto recode_encoding; + } } - if (ender > 0xff) { - REQUIRE_UTF8; + else { + REQUIRE_UTF8(flagp); } break; } case 'c': p++; - ender = grok_bslash_c(*p++, SIZE_ONLY); + ender = grok_bslash_c(*p++, PASS2); break; case '8': case '9': /* must be a backreference */ --p; + /* we have an escape like \8 which cannot be an octal escape + * so we exit the loop, and let the outer loop handle this + * escape which may or may not be a legitimate backref. */ goto loopdone; case '1': case '2': case '3':case '4': case '5': case '6': case '7': @@ -11888,8 +13125,8 @@ tryagain: * from \1 - \9 is a backreference, any multi-digit * escape which does not start with 0 and which when * evaluated as decimal could refer to an already - * parsed capture buffer is a backslash. Anything else - * is octal. + * parsed capture buffer is a back reference. Anything + * else is octal. * * Note this implies that \118 could be interpreted as * 118 OR as "\11" . "8" depending on whether there @@ -11905,18 +13142,18 @@ tryagain: --p; goto loopdone; } + /* FALLTHROUGH */ case '0': { I32 flags = PERL_SCAN_SILENT_ILLDIGIT; STRLEN numlen = 3; ender = grok_oct(p, &numlen, &flags, NULL); if (ender > 0xff) { - REQUIRE_UTF8; + REQUIRE_UTF8(flagp); } p += numlen; - if (SIZE_ONLY /* like \08, \178 */ + if (PASS2 /* like \08, \178 */ && numlen < 3 - && p < RExC_end && isDIGIT(*p) && ckWARN(WARN_REGEXP)) { reg_warn_non_literal_string( @@ -11924,25 +13161,25 @@ tryagain: form_short_octal_warning(p, numlen)); } } - if (PL_encoding && ender < 0x100) + if (IN_ENCODING && ender < 0x100) goto recode_encoding; break; - recode_encoding: + recode_encoding: if (! RExC_override_recoding) { - SV* enc = PL_encoding; - ender = reg_recode((const char)(U8)ender, &enc); - if (!enc && SIZE_ONLY) + SV* enc = _get_encoding(); + ender = reg_recode((U8)ender, &enc); + if (!enc && PASS2) ckWARNreg(p, "Invalid escape in the specified encoding"); - REQUIRE_UTF8; + REQUIRE_UTF8(flagp); } break; case '\0': if (p >= RExC_end) FAIL("Trailing \\"); - /* FALL THROUGH */ + /* FALLTHROUGH */ default: if (!SIZE_ONLY&& isALPHANUMERIC(*p)) { - /* Include any { following the alpha to emphasize + /* Include any left brace following the alpha to emphasize * that it could be part of an escape at some point * in the future */ int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1; @@ -11951,19 +13188,20 @@ tryagain: goto normal_default; } /* End of switch on '\' */ break; + case '{': + /* Currently we don't warn when the lbrace is at the start + * of a construct. This catches it in the middle of a + * literal string, or when it's the first thing after + * something like "\b" */ + if (! SIZE_ONLY + && (len || (p > RExC_start && isALPHA_A(*(p -1))))) + { + ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through"); + } + /*FALLTHROUGH*/ default: /* A literal character */ - - if (! SIZE_ONLY - && RExC_flags & RXf_PMf_EXTENDED - && ckWARN_d(WARN_DEPRECATED) - && is_PATWS_non_low_safe(p, RExC_end, UTF)) - { - vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1), - "Escape literal pattern white space under /x"); - } - normal_default: - if (UTF8_IS_START(*p) && UTF) { + if (! UTF8_IS_INVARIANT(*p) && UTF) { STRLEN numlen; ender = utf8n_to_uvchr((U8*)p, RExC_end - p, &numlen, UTF8_ALLOW_DEFAULT); @@ -11975,11 +13213,13 @@ tryagain: } /* End of switch on the literal */ /* Here, have looked at the literal character and <ender> - * contains its ordinal, <p> points to the character after it - */ - - if ( RExC_flags & RXf_PMf_EXTENDED) - p = regwhite( pRExC_state, p ); + * contains its ordinal, <p> points to the character after it. + * We need to check if the next non-ignored thing is a + * quantifier. Move <p> to after anything that should be + * ignored, which, as a side effect, positions <p> for the next + * loop iteration */ + skip_to_be_ignored_text(pRExC_state, &p, + FALSE /* Don't force to /x */ ); /* If the next thing is a quantifier, it applies to this * character only, which means that this character has to be in @@ -11988,82 +13228,107 @@ tryagain: * the node, close the node with just them, and set up to do * this character again next time through, when it will be the * only thing in its new node */ - if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len) + if ((next_is_quantifier = ( LIKELY(p < RExC_end) + && UNLIKELY(ISMULT2(p)))) + && LIKELY(len)) { p = oldp; goto loopdone; } - if (! FOLD /* The simple case, just append the literal */ - || (LOC /* Also don't fold for tricky chars under /l */ - && is_PROBLEMATIC_LOCALE_FOLD_cp(ender))) - { - if (UTF) { - - /* Normally, we don't need the representation of the - * character in the sizing pass--just its size, but if - * folding, we have to actually put the character out - * even in the sizing pass, because the size could - * change as we juggle things at the end of this loop - * to avoid splitting a too-full node in the middle of - * a potential multi-char fold [perl #123539] */ - const STRLEN unilen = (SIZE_ONLY && ! FOLD) - ? UNISKIP(ender) - : (uvchr_to_utf8((U8*)s, ender) - (U8*)s); - if (unilen > 0) { - s += unilen; - len += unilen; - } + /* Ready to add 'ender' to the node */ - /* The loop increments <len> each time, as all but this - * path (and one other) through it add a single byte to - * the EXACTish node. But this one has changed len to - * be the correct final value, so subtract one to - * cancel out the increment that follows */ - len--; - } - else if (FOLD) { - /* See comment above for [perl #123539] */ - *(s++) = (char) ender; - } - else { - REGC((char)ender, s++); - } + if (! FOLD) { /* The simple case, just append the literal */ - /* Can get here if folding only if is one of the /l - * characters whose fold depends on the locale. The - * occurrence of any of these indicate that we can't - * simplify things */ - if (FOLD) { - maybe_exact = FALSE; - maybe_exactfu = FALSE; + /* In the sizing pass, we need only the size of the + * character we are appending, hence we can delay getting + * its representation until PASS2. */ + if (SIZE_ONLY) { + if (UTF) { + const STRLEN unilen = UVCHR_SKIP(ender); + s += unilen; + + /* We have to subtract 1 just below (and again in + * the corresponding PASS2 code) because the loop + * increments <len> each time, as all but this path + * (and one other) through it add a single byte to + * the EXACTish node. But these paths would change + * len to be the correct final value, so cancel out + * the increment that follows */ + len += unilen - 1; + } + else { + s++; + } + } else { /* PASS2 */ + not_fold_common: + if (UTF) { + U8 * new_s = uvchr_to_utf8((U8*)s, ender); + len += (char *) new_s - s - 1; + s = (char *) new_s; + } + else { + *(s++) = (char) ender; + } } } - else /* FOLD */ - if (! ( UTF - /* See comments for join_exact() as to why we fold this - * non-UTF at compile time */ - || (node_type == EXACTFU - && ender == LATIN_SMALL_LETTER_SHARP_S))) - { + else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) { + + /* Here are folding under /l, and the code point is + * problematic. First, we know we can't simplify things */ + maybe_exact = FALSE; + maybe_exactfu = FALSE; + + /* A problematic code point in this context means that its + * fold isn't known until runtime, so we can't fold it now. + * (The non-problematic code points are the above-Latin1 + * ones that fold to also all above-Latin1. Their folds + * don't vary no matter what the locale is.) But here we + * have characters whose fold depends on the locale. + * Unlike the non-folding case above, we have to keep track + * of these in the sizing pass, so that we can make sure we + * don't split too-long nodes in the middle of a potential + * multi-char fold. And unlike the regular fold case + * handled in the else clauses below, we don't actually + * fold and don't have special cases to consider. What we + * do for both passes is the PASS2 code for non-folding */ + goto not_fold_common; + } + else /* A regular FOLD code point */ + if (! ( UTF +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) + /* See comments for join_exact() as to why we fold + * this non-UTF at compile time */ + || ( node_type == EXACTFU + && ender == LATIN_SMALL_LETTER_SHARP_S) +#endif + )) { /* Here, are folding and are not UTF-8 encoded; therefore * the character must be in the range 0-255, and is not /l * (Not /l because we already handled these under /l in - * is_PROBLEMATIC_LOCALE_FOLD_cp */ + * is_PROBLEMATIC_LOCALE_FOLD_cp) */ if (IS_IN_SOME_FOLD_L1(ender)) { maybe_exact = FALSE; /* See if the character's fold differs between /d and * /u. This includes the multi-char fold SHARP S to * 'ss' */ - if (maybe_exactfu + if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) { + RExC_seen_unfolded_sharp_s = 1; + maybe_exactfu = FALSE; + } + else if (maybe_exactfu && (PL_fold[ender] != PL_fold_latin1[ender] - || ender == LATIN_SMALL_LETTER_SHARP_S - || (len > 0 - && isARG2_lower_or_UPPER_ARG1('s', ender) - && isARG2_lower_or_UPPER_ARG1('s', - *(s-1))))) - { +#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ + || UNICODE_DOT_DOT_VERSION > 0) + || ( len > 0 + && isALPHA_FOLD_EQ(ender, 's') + && isALPHA_FOLD_EQ(*(s-1), 's')) +#endif + )) { maybe_exactfu = FALSE; } } @@ -12072,18 +13337,17 @@ tryagain: * we have an array that finds its fold quickly */ *(s++) = (char) ender; } - else { /* FOLD and UTF */ + else { /* FOLD, and UTF (or sharp s) */ /* Unlike the non-fold case, we do actually have to * calculate the results here in pass 1. This is for two * reasons, the folded length may be longer than the * unfolded, and we have to calculate how many EXACTish * nodes it will take; and we may run out of room in a node * in the middle of a potential multi-char fold, and have - * to back off accordingly. (Hence we can't use REGC for - * the simple case just below.) */ + * to back off accordingly. */ UV folded; - if (isASCII(ender)) { + if (isASCII_uni(ender)) { folded = toFOLD(ender); *(s)++ = (U8) folded; } @@ -12203,7 +13467,7 @@ tryagain: } } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { - if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE( + if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE( *s, *(s+1)))) { break; @@ -12246,7 +13510,7 @@ tryagain: * as if it turns into an EXACTFU, it could later get * joined with another 's' that would then wrongly match * the sharp s */ - if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender)) + if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's')) { maybe_exactfu = FALSE; } @@ -12299,8 +13563,8 @@ tryagain: } } /* End of verifying node ends with an appropriate char */ - loopdone: /* Jumped to when encounters something that shouldn't be in - the node */ + loopdone: /* Jumped to when encounters something that shouldn't be + in the node */ /* I (khw) don't know if you can get here with zero length, but the * old code handled this situation by creating a zero-length EXACT @@ -12316,10 +13580,14 @@ tryagain: * differently depending on UTF8ness of the target string * (for /u), or depending on locale for /l */ if (maybe_exact) { - OP(ret) = EXACT; + OP(ret) = (LOC) + ? EXACTL + : EXACT; } else if (maybe_exactfu) { - OP(ret) = EXACTFU; + OP(ret) = (LOC) + ? EXACTFLU8 + : EXACTFU; } } alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, @@ -12332,7 +13600,9 @@ tryagain: RExC_parse = p - 1; Set_Node_Cur_Length(ret, parse_start); - nextchar(pRExC_state); + RExC_parse = p; + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't force to /x */ ); { /* len is STRLEN which is unsigned, need to copy to signed */ IV iv = len; @@ -12347,66 +13617,6 @@ tryagain: return(ret); } -STATIC char * -S_regwhite( RExC_state_t *pRExC_state, char *p ) -{ - const char *e = RExC_end; - - PERL_ARGS_ASSERT_REGWHITE; - - while (p < e) { - if (isSPACE(*p)) - ++p; - else if (*p == '#') { - bool ended = 0; - do { - if (*p++ == '\n') { - ended = 1; - break; - } - } while (p < e); - if (!ended) - RExC_seen |= REG_RUN_ON_COMMENT_SEEN; - } - else - break; - } - return p; -} - -STATIC char * -S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) -{ - /* Returns the next non-pattern-white space, non-comment character (the - * latter only if 'recognize_comment is true) in the string p, which is - * ended by RExC_end. If there is no line break ending a comment, - * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */ - const char *e = RExC_end; - - PERL_ARGS_ASSERT_REGPATWS; - - while (p < e) { - STRLEN len; - if ((len = is_PATWS_safe(p, e, UTF))) { - p += len; - } - else if (recognize_comment && *p == '#') { - bool ended = 0; - do { - p++; - if (is_LNBREAK_safe(p, e, UTF)) { - ended = 1; - break; - } - } while (p < e); - if (!ended) - RExC_seen |= REG_RUN_ON_COMMENT_SEEN; - } - else - break; - } - return p; -} STATIC void S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) @@ -12432,22 +13642,21 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) UV high; int i; - if (end == UV_MAX && start <= 256) { - ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL; - } - else if (end >= 256) { - ANYOF_FLAGS(node) |= ANYOF_UTF8; + if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) { + ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP; } /* Quit if are above what we should change */ - if (start > 255) { + if (start >= NUM_ANYOF_CODE_POINTS) { break; } change_invlist = TRUE; /* Set all the bits in the range, up to the max that we are doing */ - high = (end < 255) ? end : 255; + high = (end < NUM_ANYOF_CODE_POINTS - 1) + ? end + : NUM_ANYOF_CODE_POINTS - 1; for (i = start; i <= (int) high; i++) { if (! ANYOF_BITMAP_TEST(node, i)) { ANYOF_BITMAP_SET(node, i); @@ -12457,13 +13666,13 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) invlist_iterfinish(*invlist_ptr); /* Done with loop; remove any code points that are in the bitmap from - * *invlist_ptr; similarly for code points above latin1 if we have a - * flag to match all of them anyways */ + * *invlist_ptr; similarly for code points above the bitmap if we have + * a flag to match all of them anyways */ if (change_invlist) { - _invlist_subtract(*invlist_ptr, PL_Latin1, invlist_ptr); + _invlist_subtract(*invlist_ptr, PL_InBitmap, invlist_ptr); } - if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) { - _invlist_intersection(*invlist_ptr, PL_Latin1, invlist_ptr); + if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { + _invlist_intersection(*invlist_ptr, PL_InBitmap, invlist_ptr); } /* If have completely emptied it, remove it completely */ @@ -12483,202 +13692,843 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) #define POSIXCC_DONE(c) ((c) == ':') #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) +#define MAYBE_POSIXCC(c) (POSIXCC(c) || (c) == '^' || (c) == ';') + +#define WARNING_PREFIX "Assuming NOT a POSIX class since " +#define NO_BLANKS_POSIX_WARNING "no blanks are allowed in one" +#define SEMI_COLON_POSIX_WARNING "a semi-colon was found instead of a colon" + +#define NOT_MEANT_TO_BE_A_POSIX_CLASS (OOB_NAMEDCLASS - 1) + +/* 'posix_warnings' and 'warn_text' are names of variables in the following + * routine. q.v. */ +#define ADD_POSIX_WARNING(p, text) STMT_START { \ + if (posix_warnings) { \ + if (! warn_text) warn_text = newAV(); \ + av_push(warn_text, Perl_newSVpvf(aTHX_ \ + WARNING_PREFIX \ + text \ + REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(p))); \ + } \ + } STMT_END -PERL_STATIC_INLINE I32 -S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict) +STATIC int +S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, + + const char * const s, /* Where the putative posix class begins. + Normally, this is one past the '['. This + parameter exists so it can be somewhere + besides RExC_parse. */ + char ** updated_parse_ptr, /* Where to set the updated parse pointer, or + NULL */ + AV ** posix_warnings, /* Where to place any generated warnings, or + NULL */ + const bool check_only /* Don't die if error */ +) { - dVAR; - I32 namedclass = OOB_NAMEDCLASS; + /* This parses what the caller thinks may be one of the three POSIX + * constructs: + * 1) a character class, like [:blank:] + * 2) a collating symbol, like [. .] + * 3) an equivalence class, like [= =] + * In the latter two cases, it croaks if it finds a syntactically legal + * one, as these are not handled by Perl. + * + * The main purpose is to look for a POSIX character class. It returns: + * a) the class number + * if it is a completely syntactically and semantically legal class. + * 'updated_parse_ptr', if not NULL, is set to point to just after the + * closing ']' of the class + * b) OOB_NAMEDCLASS + * if it appears that one of the three POSIX constructs was meant, but + * its specification was somehow defective. 'updated_parse_ptr', if + * not NULL, is set to point to the character just after the end + * character of the class. See below for handling of warnings. + * c) NOT_MEANT_TO_BE_A_POSIX_CLASS + * if it doesn't appear that a POSIX construct was intended. + * 'updated_parse_ptr' is not changed. No warnings nor errors are + * raised. + * + * In b) there may be errors or warnings generated. If 'check_only' is + * TRUE, then any errors are discarded. Warnings are returned to the + * caller via an AV* created into '*posix_warnings' if it is not NULL. If + * instead it is NULL, warnings are suppressed. This is done in all + * passes. The reason for this is that the rest of the parsing is heavily + * dependent on whether this routine found a valid posix class or not. If + * it did, the closing ']' is absorbed as part of the class. If no class, + * or an invalid one is found, any ']' will be considered the terminator of + * the outer bracketed character class, leading to very different results. + * In particular, a '(?[ ])' construct will likely have a syntax error if + * the class is parsed other than intended, and this will happen in pass1, + * before the warnings would normally be output. This mechanism allows the + * caller to output those warnings in pass1 just before dieing, giving a + * much better clue as to what is wrong. + * + * The reason for this function, and its complexity is that a bracketed + * character class can contain just about anything. But it's easy to + * mistype the very specific posix class syntax but yielding a valid + * regular bracketed class, so it silently gets compiled into something + * quite unintended. + * + * The solution adopted here maintains backward compatibility except that + * it adds a warning if it looks like a posix class was intended but + * improperly specified. The warning is not raised unless what is input + * very closely resembles one of the 14 legal posix classes. To do this, + * it uses fuzzy parsing. It calculates how many single-character edits it + * would take to transform what was input into a legal posix class. Only + * if that number is quite small does it think that the intention was a + * posix class. Obviously these are heuristics, and there will be cases + * where it errs on one side or another, and they can be tweaked as + * experience informs. + * + * The syntax for a legal posix class is: + * + * qr/(?xa: \[ : \^? [:lower:]{4,6} : \] )/ + * + * What this routine considers syntactically to be an intended posix class + * is this (the comments indicate some restrictions that the pattern + * doesn't show): + * + * qr/(?x: \[? # The left bracket, possibly + * # omitted + * \h* # possibly followed by blanks + * (?: \^ \h* )? # possibly a misplaced caret + * [:;]? # The opening class character, + * # possibly omitted. A typo + * # semi-colon can also be used. + * \h* + * \^? # possibly a correctly placed + * # caret, but not if there was also + * # a misplaced one + * \h* + * .{3,15} # The class name. If there are + * # deviations from the legal syntax, + * # its edit distance must be close + * # to a real class name in order + * # for it to be considered to be + * # an intended posix class. + * \h* + * [:punct:]? # The closing class character, + * # possibly omitted. If not a colon + * # nor semi colon, the class name + * # must be even closer to a valid + * # one + * \h* + * \]? # The right bracket, possibly + * # omitted. + * )/ + * + * In the above, \h must be ASCII-only. + * + * These are heuristics, and can be tweaked as field experience dictates. + * There will be cases when someone didn't intend to specify a posix class + * that this warns as being so. The goal is to minimize these, while + * maximizing the catching of things intended to be a posix class that + * aren't parsed as such. + */ - PERL_ARGS_ASSERT_REGPPOSIXCC; + const char* p = s; + const char * const e = RExC_end; + unsigned complement = 0; /* If to complement the class */ + bool found_problem = FALSE; /* Assume OK until proven otherwise */ + bool has_opening_bracket = FALSE; + bool has_opening_colon = FALSE; + int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find + valid class */ + AV* warn_text = NULL; /* any warning messages */ + const char * possible_end = NULL; /* used for a 2nd parse pass */ + const char* name_start; /* ptr to class name first char */ - if (value == '[' && RExC_parse + 1 < RExC_end && - /* I smell either [: or [= or [. -- POSIX has been here, right? */ - POSIXCC(UCHARAT(RExC_parse))) + /* If the number of single-character typos the input name is away from a + * legal name is no more than this number, it is considered to have meant + * the legal name */ + int max_distance = 2; + + /* to store the name. The size determines the maximum length before we + * decide that no posix class was intended. Should be at least + * sizeof("alphanumeric") */ + UV input_text[15]; + + PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX; + + if (p >= e) { + return NOT_MEANT_TO_BE_A_POSIX_CLASS; + } + + if (*(p - 1) != '[') { + ADD_POSIX_WARNING(p, "it doesn't start with a '['"); + found_problem = TRUE; + } + else { + has_opening_bracket = TRUE; + } + + /* They could be confused and think you can put spaces between the + * components */ + if (isBLANK(*p)) { + found_problem = TRUE; + + do { + p++; + } while (p < e && isBLANK(*p)); + + ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); + } + + /* For [. .] and [= =]. These are quite different internally from [: :], + * so they are handled separately. */ + if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']' + and 1 for at least one char in it + */ { - const char c = UCHARAT(RExC_parse); - char* const s = RExC_parse++; + const char open_char = *p; + const char * temp_ptr = p + 1; + + /* These two constructs are not handled by perl, and if we find a + * syntactically valid one, we croak. khw, who wrote this code, finds + * this explanation of them very unclear: + * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html + * And searching the rest of the internet wasn't very helpful either. + * It looks like just about any byte can be in these constructs, + * depending on the locale. But unless the pattern is being compiled + * under /l, which is very rare, Perl runs under the C or POSIX locale. + * In that case, it looks like [= =] isn't allowed at all, and that + * [. .] could be any single code point, but for longer strings the + * constituent characters would have to be the ASCII alphabetics plus + * the minus-hyphen. Any sensible locale definition would limit itself + * to these. And any portable one definitely should. Trying to parse + * the general case is a nightmare (see [perl #127604]). So, this code + * looks only for interiors of these constructs that match: + * qr/.|[-\w]{2,}/ + * Using \w relaxes the apparent rules a little, without adding much + * danger of mistaking something else for one of these constructs. + * + * [. .] in some implementations described on the internet is usable to + * escape a character that otherwise is special in bracketed character + * classes. For example [.].] means a literal right bracket instead of + * the ending of the class + * + * [= =] can legitimately contain a [. .] construct, but we don't + * handle this case, as that [. .] construct will later get parsed + * itself and croak then. And [= =] is checked for even when not under + * /l, as Perl has long done so. + * + * The code below relies on there being a trailing NUL, so it doesn't + * have to keep checking if the parse ptr < e. + */ + if (temp_ptr[1] == open_char) { + temp_ptr++; + } + else while ( temp_ptr < e + && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-')) + { + temp_ptr++; + } - while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) - RExC_parse++; - if (RExC_parse == RExC_end) { - if (strict) { + if (*temp_ptr == open_char) { + temp_ptr++; + if (*temp_ptr == ']') { + temp_ptr++; + if (! found_problem && ! check_only) { + RExC_parse = (char *) temp_ptr; + vFAIL3("POSIX syntax [%c %c] is reserved for future " + "extensions", open_char, open_char); + } - /* Try to give a better location for the error (than the end of - * the string) by looking for the matching ']' */ - RExC_parse = s; - while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') { - RExC_parse++; + /* Here, the syntax wasn't completely valid, or else the call + * is to check-only */ + if (updated_parse_ptr) { + *updated_parse_ptr = (char *) temp_ptr; } - vFAIL2("Unmatched '%c' in POSIX class", c); + + return OOB_NAMEDCLASS; } - /* Grandfather lone [:, [=, [. */ - RExC_parse = s; } - else { - const char* const t = RExC_parse++; /* skip over the c */ - assert(*t == c); - - if (UCHARAT(RExC_parse) == ']') { - const char *posixcc = s + 1; - RExC_parse++; /* skip over the ending ] */ - - if (*s == ':') { - const I32 complement = *posixcc == '^' ? *posixcc++ : 0; - const I32 skip = t - posixcc; - - /* Initially switch on the length of the name. */ - switch (skip) { - case 4: - if (memEQ(posixcc, "word", 4)) /* this is not POSIX, - this is the Perl \w - */ - namedclass = ANYOF_WORDCHAR; - break; - case 5: - /* Names all of length 5. */ - /* alnum alpha ascii blank cntrl digit graph lower - print punct space upper */ - /* Offset 4 gives the best switch position. */ - switch (posixcc[4]) { - case 'a': - if (memEQ(posixcc, "alph", 4)) /* alpha */ - namedclass = ANYOF_ALPHA; - break; - case 'e': - if (memEQ(posixcc, "spac", 4)) /* space */ - namedclass = ANYOF_PSXSPC; - break; - case 'h': - if (memEQ(posixcc, "grap", 4)) /* graph */ - namedclass = ANYOF_GRAPH; - break; - case 'i': - if (memEQ(posixcc, "asci", 4)) /* ascii */ - namedclass = ANYOF_ASCII; - break; - case 'k': - if (memEQ(posixcc, "blan", 4)) /* blank */ - namedclass = ANYOF_BLANK; - break; - case 'l': - if (memEQ(posixcc, "cntr", 4)) /* cntrl */ - namedclass = ANYOF_CNTRL; - break; - case 'm': - if (memEQ(posixcc, "alnu", 4)) /* alnum */ - namedclass = ANYOF_ALPHANUMERIC; - break; - case 'r': - if (memEQ(posixcc, "lowe", 4)) /* lower */ - namedclass = (FOLD) ? ANYOF_CASED : ANYOF_LOWER; - else if (memEQ(posixcc, "uppe", 4)) /* upper */ - namedclass = (FOLD) ? ANYOF_CASED : ANYOF_UPPER; - break; - case 't': - if (memEQ(posixcc, "digi", 4)) /* digit */ - namedclass = ANYOF_DIGIT; - else if (memEQ(posixcc, "prin", 4)) /* print */ - namedclass = ANYOF_PRINT; - else if (memEQ(posixcc, "punc", 4)) /* punct */ - namedclass = ANYOF_PUNCT; - break; - } - break; - case 6: - if (memEQ(posixcc, "xdigit", 6)) - namedclass = ANYOF_XDIGIT; - break; - } - if (namedclass == OOB_NAMEDCLASS) - vFAIL2utf8f( - "POSIX class [:%"UTF8f":] unknown", - UTF8fARG(UTF, t - s - 1, s + 1)); + /* If we find something that started out to look like one of these + * constructs, but isn't, we continue below so that it can be checked + * for being a class name with a typo of '.' or '=' instead of a colon. + * */ + } - /* The #defines are structured so each complement is +1 to - * the normal one */ - if (complement) { - namedclass++; - } - assert (posixcc[skip] == ':'); - assert (posixcc[skip+1] == ']'); - } else if (!SIZE_ONLY) { - /* [[=foo=]] and [[.foo.]] are still future. */ - - /* adjust RExC_parse so the warning shows after - the class closes */ - while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') - RExC_parse++; - vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); - } - } else { - /* Maternal grandfather: - * "[:" ending in ":" but not in ":]" */ - if (strict) { - vFAIL("Unmatched '[' in POSIX class"); - } + /* Here, we think there is a possibility that a [: :] class was meant, and + * we have the first real character. It could be they think the '^' comes + * first */ + if (*p == '^') { + found_problem = TRUE; + ADD_POSIX_WARNING(p + 1, "the '^' must come after the colon"); + complement = 1; + p++; - /* Grandfather lone [:, [=, [. */ - RExC_parse = s; - } - } + if (isBLANK(*p)) { + found_problem = TRUE; + + do { + p++; + } while (p < e && isBLANK(*p)); + + ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); + } } - return namedclass; -} + /* But the first character should be a colon, which they could have easily + * mistyped on a qwerty keyboard as a semi-colon (and which may be hard to + * distinguish from a colon, so treat that as a colon). */ + if (*p == ':') { + p++; + has_opening_colon = TRUE; + } + else if (*p == ';') { + found_problem = TRUE; + p++; + ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); + has_opening_colon = TRUE; + } + else { + found_problem = TRUE; + ADD_POSIX_WARNING(p, "there must be a starting ':'"); -STATIC bool -S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state) -{ - /* This applies some heuristics at the current parse position (which should - * be at a '[') to see if what follows might be intended to be a [:posix:] - * class. It returns true if it really is a posix class, of course, but it - * also can return true if it thinks that what was intended was a posix - * class that didn't quite make it. - * - * It will return true for - * [:alphanumerics: - * [:alphanumerics] (as long as the ] isn't followed immediately by a - * ')' indicating the end of the (?[ - * [:any garbage including %^&$ punctuation:] - * - * This is designed to be called only from S_handle_regex_sets; it could be - * easily adapted to be called from the spot at the beginning of regclass() - * that checks to see in a normal bracketed class if the surrounding [] - * have been omitted ([:word:] instead of [[:word:]]). But doing so would - * change long-standing behavior, so I (khw) didn't do that */ - char* p = RExC_parse + 1; - char first_char = *p; + /* Consider an initial punctuation (not one of the recognized ones) to + * be a left terminator */ + if (*p != '^' && *p != ']' && isPUNCT(*p)) { + p++; + } + } - PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS; + /* They may think that you can put spaces between the components */ + if (isBLANK(*p)) { + found_problem = TRUE; - assert(*(p - 1) == '['); + do { + p++; + } while (p < e && isBLANK(*p)); - if (! POSIXCC(first_char)) { - return FALSE; + ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); } - p++; - while (p < RExC_end && isWORDCHAR(*p)) p++; + if (*p == '^') { - if (p >= RExC_end) { - return FALSE; + /* We consider something like [^:^alnum:]] to not have been intended to + * be a posix class, but XXX maybe we should */ + if (complement) { + return NOT_MEANT_TO_BE_A_POSIX_CLASS; + } + + complement = 1; + p++; } - if (p - RExC_parse > 2 /* Got at least 1 word character */ - && (*p == first_char - || (*p == ']' && p + 1 < RExC_end && *(p + 1) != ')'))) - { - return TRUE; + /* Again, they may think that you can put spaces between the components */ + if (isBLANK(*p)) { + found_problem = TRUE; + + do { + p++; + } while (p < e && isBLANK(*p)); + + ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); } - p = (char *) memchr(RExC_parse, ']', RExC_end - RExC_parse); + if (*p == ']') { + + /* XXX This ']' may be a typo, and something else was meant. But + * treating it as such creates enough complications, that that + * possibility isn't currently considered here. So we assume that the + * ']' is what is intended, and if we've already found an initial '[', + * this leaves this construct looking like [:] or [:^], which almost + * certainly weren't intended to be posix classes */ + if (has_opening_bracket) { + return NOT_MEANT_TO_BE_A_POSIX_CLASS; + } + + /* But this function can be called when we parse the colon for + * something like qr/[alpha:]]/, so we back up to look for the + * beginning */ + p--; + + if (*p == ';') { + found_problem = TRUE; + ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); + } + else if (*p != ':') { + + /* XXX We are currently very restrictive here, so this code doesn't + * consider the possibility that, say, /[alpha.]]/ was intended to + * be a posix class. */ + return NOT_MEANT_TO_BE_A_POSIX_CLASS; + } + + /* Here we have something like 'foo:]'. There was no initial colon, + * and we back up over 'foo. XXX Unlike the going forward case, we + * don't handle typos of non-word chars in the middle */ + has_opening_colon = FALSE; + p--; + + while (p > RExC_start && isWORDCHAR(*p)) { + p--; + } + p++; + + /* Here, we have positioned ourselves to where we think the first + * character in the potential class is */ + } + + /* Now the interior really starts. There are certain key characters that + * can end the interior, or these could just be typos. To catch both + * cases, we may have to do two passes. In the first pass, we keep on + * going unless we come to a sequence that matches + * qr/ [[:punct:]] [[:blank:]]* \] /xa + * This means it takes a sequence to end the pass, so two typos in a row if + * that wasn't what was intended. If the class is perfectly formed, just + * this one pass is needed. We also stop if there are too many characters + * being accumulated, but this number is deliberately set higher than any + * real class. It is set high enough so that someone who thinks that + * 'alphanumeric' is a correct name would get warned that it wasn't. + * While doing the pass, we keep track of where the key characters were in + * it. If we don't find an end to the class, and one of the key characters + * was found, we redo the pass, but stop when we get to that character. + * Thus the key character was considered a typo in the first pass, but a + * terminator in the second. If two key characters are found, we stop at + * the second one in the first pass. Again this can miss two typos, but + * catches a single one + * + * In the first pass, 'possible_end' starts as NULL, and then gets set to + * point to the first key character. For the second pass, it starts as -1. + * */ + + name_start = p; + parse_name: + { + bool has_blank = FALSE; + bool has_upper = FALSE; + bool has_terminating_colon = FALSE; + bool has_terminating_bracket = FALSE; + bool has_semi_colon = FALSE; + unsigned int name_len = 0; + int punct_count = 0; + + while (p < e) { + + /* Squeeze out blanks when looking up the class name below */ + if (isBLANK(*p) ) { + has_blank = TRUE; + found_problem = TRUE; + p++; + continue; + } + + /* The name will end with a punctuation */ + if (isPUNCT(*p)) { + const char * peek = p + 1; + + /* Treat any non-']' punctuation followed by a ']' (possibly + * with intervening blanks) as trying to terminate the class. + * ']]' is very likely to mean a class was intended (but + * missing the colon), but the warning message that gets + * generated shows the error position better if we exit the + * loop at the bottom (eventually), so skip it here. */ + if (*p != ']') { + if (peek < e && isBLANK(*peek)) { + has_blank = TRUE; + found_problem = TRUE; + do { + peek++; + } while (peek < e && isBLANK(*peek)); + } + + if (peek < e && *peek == ']') { + has_terminating_bracket = TRUE; + if (*p == ':') { + has_terminating_colon = TRUE; + } + else if (*p == ';') { + has_semi_colon = TRUE; + has_terminating_colon = TRUE; + } + else { + found_problem = TRUE; + } + p = peek + 1; + goto try_posix; + } + } + + /* Here we have punctuation we thought didn't end the class. + * Keep track of the position of the key characters that are + * more likely to have been class-enders */ + if (*p == ']' || *p == '[' || *p == ':' || *p == ';') { + + /* Allow just one such possible class-ender not actually + * ending the class. */ + if (possible_end) { + break; + } + possible_end = p; + } + + /* If we have too many punctuation characters, no use in + * keeping going */ + if (++punct_count > max_distance) { + break; + } + + /* Treat the punctuation as a typo. */ + input_text[name_len++] = *p; + p++; + } + else if (isUPPER(*p)) { /* Use lowercase for lookup */ + input_text[name_len++] = toLOWER(*p); + has_upper = TRUE; + found_problem = TRUE; + p++; + } else if (! UTF || UTF8_IS_INVARIANT(*p)) { + input_text[name_len++] = *p; + p++; + } + else { + input_text[name_len++] = utf8_to_uvchr_buf((U8 *) p, e, NULL); + p+= UTF8SKIP(p); + } + + /* The declaration of 'input_text' is how long we allow a potential + * class name to be, before saying they didn't mean a class name at + * all */ + if (name_len >= C_ARRAY_LENGTH(input_text)) { + break; + } + } + + /* We get to here when the possible class name hasn't been properly + * terminated before: + * 1) we ran off the end of the pattern; or + * 2) found two characters, each of which might have been intended to + * be the name's terminator + * 3) found so many punctuation characters in the purported name, + * that the edit distance to a valid one is exceeded + * 4) we decided it was more characters than anyone could have + * intended to be one. */ + + found_problem = TRUE; + + /* In the final two cases, we know that looking up what we've + * accumulated won't lead to a match, even a fuzzy one. */ + if ( name_len >= C_ARRAY_LENGTH(input_text) + || punct_count > max_distance) + { + /* If there was an intermediate key character that could have been + * an intended end, redo the parse, but stop there */ + if (possible_end && possible_end != (char *) -1) { + possible_end = (char *) -1; /* Special signal value to say + we've done a first pass */ + p = name_start; + goto parse_name; + } + + /* Otherwise, it can't have meant to have been a class */ + return NOT_MEANT_TO_BE_A_POSIX_CLASS; + } + + /* If we ran off the end, and the final character was a punctuation + * one, back up one, to look at that final one just below. Later, we + * will restore the parse pointer if appropriate */ + if (name_len && p == e && isPUNCT(*(p-1))) { + p--; + name_len--; + } + + if (p < e && isPUNCT(*p)) { + if (*p == ']') { + has_terminating_bracket = TRUE; + + /* If this is a 2nd ']', and the first one is just below this + * one, consider that to be the real terminator. This gives a + * uniform and better positioning for the warning message */ + if ( possible_end + && possible_end != (char *) -1 + && *possible_end == ']' + && name_len && input_text[name_len - 1] == ']') + { + name_len--; + p = possible_end; + + /* And this is actually equivalent to having done the 2nd + * pass now, so set it to not try again */ + possible_end = (char *) -1; + } + } + else { + if (*p == ':') { + has_terminating_colon = TRUE; + } + else if (*p == ';') { + has_semi_colon = TRUE; + has_terminating_colon = TRUE; + } + p++; + } + } + + try_posix: + + /* Here, we have a class name to look up. We can short circuit the + * stuff below for short names that can't possibly be meant to be a + * class name. (We can do this on the first pass, as any second pass + * will yield an even shorter name) */ + if (name_len < 3) { + return NOT_MEANT_TO_BE_A_POSIX_CLASS; + } + + /* Find which class it is. Initially switch on the length of the name. + * */ + switch (name_len) { + case 4: + if (memEQ(name_start, "word", 4)) { + /* this is not POSIX, this is the Perl \w */ + class_number = ANYOF_WORDCHAR; + } + break; + case 5: + /* Names all of length 5: alnum alpha ascii blank cntrl digit + * graph lower print punct space upper + * Offset 4 gives the best switch position. */ + switch (name_start[4]) { + case 'a': + if (memEQ(name_start, "alph", 4)) /* alpha */ + class_number = ANYOF_ALPHA; + break; + case 'e': + if (memEQ(name_start, "spac", 4)) /* space */ + class_number = ANYOF_SPACE; + break; + case 'h': + if (memEQ(name_start, "grap", 4)) /* graph */ + class_number = ANYOF_GRAPH; + break; + case 'i': + if (memEQ(name_start, "asci", 4)) /* ascii */ + class_number = ANYOF_ASCII; + break; + case 'k': + if (memEQ(name_start, "blan", 4)) /* blank */ + class_number = ANYOF_BLANK; + break; + case 'l': + if (memEQ(name_start, "cntr", 4)) /* cntrl */ + class_number = ANYOF_CNTRL; + break; + case 'm': + if (memEQ(name_start, "alnu", 4)) /* alnum */ + class_number = ANYOF_ALPHANUMERIC; + break; + case 'r': + if (memEQ(name_start, "lowe", 4)) /* lower */ + class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER; + else if (memEQ(name_start, "uppe", 4)) /* upper */ + class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER; + break; + case 't': + if (memEQ(name_start, "digi", 4)) /* digit */ + class_number = ANYOF_DIGIT; + else if (memEQ(name_start, "prin", 4)) /* print */ + class_number = ANYOF_PRINT; + else if (memEQ(name_start, "punc", 4)) /* punct */ + class_number = ANYOF_PUNCT; + break; + } + break; + case 6: + if (memEQ(name_start, "xdigit", 6)) + class_number = ANYOF_XDIGIT; + break; + } + + /* If the name exactly matches a posix class name the class number will + * here be set to it, and the input almost certainly was meant to be a + * posix class, so we can skip further checking. If instead the syntax + * is exactly correct, but the name isn't one of the legal ones, we + * will return that as an error below. But if neither of these apply, + * it could be that no posix class was intended at all, or that one + * was, but there was a typo. We tease these apart by doing fuzzy + * matching on the name */ + if (class_number == OOB_NAMEDCLASS && found_problem) { + const UV posix_names[][6] = { + { 'a', 'l', 'n', 'u', 'm' }, + { 'a', 'l', 'p', 'h', 'a' }, + { 'a', 's', 'c', 'i', 'i' }, + { 'b', 'l', 'a', 'n', 'k' }, + { 'c', 'n', 't', 'r', 'l' }, + { 'd', 'i', 'g', 'i', 't' }, + { 'g', 'r', 'a', 'p', 'h' }, + { 'l', 'o', 'w', 'e', 'r' }, + { 'p', 'r', 'i', 'n', 't' }, + { 'p', 'u', 'n', 'c', 't' }, + { 's', 'p', 'a', 'c', 'e' }, + { 'u', 'p', 'p', 'e', 'r' }, + { 'w', 'o', 'r', 'd' }, + { 'x', 'd', 'i', 'g', 'i', 't' } + }; + /* The names of the above all have added NULs to make them the same + * size, so we need to also have the real lengths */ + const UV posix_name_lengths[] = { + sizeof("alnum") - 1, + sizeof("alpha") - 1, + sizeof("ascii") - 1, + sizeof("blank") - 1, + sizeof("cntrl") - 1, + sizeof("digit") - 1, + sizeof("graph") - 1, + sizeof("lower") - 1, + sizeof("print") - 1, + sizeof("punct") - 1, + sizeof("space") - 1, + sizeof("upper") - 1, + sizeof("word") - 1, + sizeof("xdigit")- 1 + }; + unsigned int i; + int temp_max = max_distance; /* Use a temporary, so if we + reparse, we haven't changed the + outer one */ + + /* Use a smaller max edit distance if we are missing one of the + * delimiters */ + if ( has_opening_bracket + has_opening_colon < 2 + || has_terminating_bracket + has_terminating_colon < 2) + { + temp_max--; + } - return (p - && p - RExC_parse > 2 /* [:] evaluates to colon; - [::] is a bad posix class. */ - && first_char == *(p - 1)); + /* See if the input name is close to a legal one */ + for (i = 0; i < C_ARRAY_LENGTH(posix_names); i++) { + + /* Short circuit call if the lengths are too far apart to be + * able to match */ + if (abs( (int) (name_len - posix_name_lengths[i])) + > temp_max) + { + continue; + } + + if (edit_distance(input_text, + posix_names[i], + name_len, + posix_name_lengths[i], + temp_max + ) + > -1) + { /* If it is close, it probably was intended to be a class */ + goto probably_meant_to_be; + } + } + + /* Here the input name is not close enough to a valid class name + * for us to consider it to be intended to be a posix class. If + * we haven't already done so, and the parse found a character that + * could have been terminators for the name, but which we absorbed + * as typos during the first pass, repeat the parse, signalling it + * to stop at that character */ + if (possible_end && possible_end != (char *) -1) { + possible_end = (char *) -1; + p = name_start; + goto parse_name; + } + + /* Here neither pass found a close-enough class name */ + return NOT_MEANT_TO_BE_A_POSIX_CLASS; + } + + probably_meant_to_be: + + /* Here we think that a posix specification was intended. Update any + * parse pointer */ + if (updated_parse_ptr) { + *updated_parse_ptr = (char *) p; + } + + /* If a posix class name was intended but incorrectly specified, we + * output or return the warnings */ + if (found_problem) { + + /* We set flags for these issues in the parse loop above instead of + * adding them to the list of warnings, because we can parse it + * twice, and we only want one warning instance */ + if (has_upper) { + ADD_POSIX_WARNING(p, "the name must be all lowercase letters"); + } + if (has_blank) { + ADD_POSIX_WARNING(p, NO_BLANKS_POSIX_WARNING); + } + if (has_semi_colon) { + ADD_POSIX_WARNING(p, SEMI_COLON_POSIX_WARNING); + } + else if (! has_terminating_colon) { + ADD_POSIX_WARNING(p, "there is no terminating ':'"); + } + if (! has_terminating_bracket) { + ADD_POSIX_WARNING(p, "there is no terminating ']'"); + } + + if (warn_text) { + if (posix_warnings) { + /* mortalize to avoid a leak with FATAL warnings */ + *posix_warnings = (AV *) sv_2mortal((SV *) warn_text); + } + else { + SvREFCNT_dec_NN(warn_text); + } + } + } + else if (class_number != OOB_NAMEDCLASS) { + /* If it is a known class, return the class. The class number + * #defines are structured so each complement is +1 to the normal + * one */ + return class_number + complement; + } + else if (! check_only) { + + /* Here, it is an unrecognized class. This is an error (unless the + * call is to check only, which we've already handled above) */ + const char * const complement_string = (complement) + ? "^" + : ""; + RExC_parse = (char *) p; + vFAIL3utf8f("POSIX class [:%s%"UTF8f":] unknown", + complement_string, + UTF8fARG(UTF, RExC_parse - name_start - 2, name_start)); + } + } + + return OOB_NAMEDCLASS; +} +#undef ADD_POSIX_WARNING + +STATIC unsigned int +S_regex_set_precedence(const U8 my_operator) { + + /* Returns the precedence in the (?[...]) construct of the input operator, + * specified by its character representation. The precedence follows + * general Perl rules, but it extends this so that ')' and ']' have (low) + * precedence even though they aren't really operators */ + + switch (my_operator) { + case '!': + return 5; + case '&': + return 4; + case '^': + case '|': + case '+': + case '-': + return 3; + case ')': + return 2; + case ']': + return 1; + } + + NOT_REACHED; /* NOTREACHED */ + return 0; /* Silence compiler warning */ } STATIC regnode * @@ -12688,24 +14538,41 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, { /* Handle the (?[...]) construct to do set operations */ - U8 curchar; - UV start, end; /* End points of code point ranges */ - SV* result_string; - char *save_end, *save_parse; - SV* final; - STRLEN len; - regnode* node; - AV* stack; - const bool save_fold = FOLD; + U8 curchar; /* Current character being parsed */ + UV start, end; /* End points of code point ranges */ + SV* final = NULL; /* The end result inversion list */ + SV* result_string; /* 'final' stringified */ + AV* stack; /* stack of operators and operands not yet + resolved */ + AV* fence_stack = NULL; /* A stack containing the positions in + 'stack' of where the undealt-with left + parens would be if they were actually + put there */ + /* The 'VOL' (expanding to 'volatile') is a workaround for an optimiser bug + * in Solaris Studio 12.3. See RT #127455 */ + VOL IV fence = 0; /* Position of where most recent undealt- + with left paren in stack is; -1 if none. + */ + STRLEN len; /* Temporary */ + regnode* node; /* Temporary, and final regnode returned by + this function */ + const bool save_fold = FOLD; /* Temporary */ + char *save_end, *save_parse; /* Temporaries */ + const bool in_locale = LOC; /* we turn off /l during processing */ + AV* posix_warnings = NULL; GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; - if (LOC) { - vFAIL("(?[...]) not valid in locale"); + if (in_locale) { + set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); } - RExC_uni_semantics = 1; + + REQUIRE_UNI_RULES(flagp, NULL); /* The use of this operator implies /u. + This is required so that the compile + time values are valid in all runtime + cases */ /* This will return only an ANYOF regnode, or (unlikely) something smaller * (such as EXACT). Thus we can skip most everything if just sizing. We @@ -12717,57 +14584,59 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, if (SIZE_ONLY) { UV depth = 0; /* how many nested (?[...]) constructs */ - Perl_ck_warner_d(aTHX_ - packWARN(WARN_EXPERIMENTAL__REGEX_SETS), - "The regex_sets feature is experimental" REPORT_LOCATION, - UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp), - UTF8fARG(UTF, - RExC_end - RExC_start - (RExC_parse - RExC_precomp), - RExC_precomp + (RExC_parse - RExC_precomp))); - while (RExC_parse < RExC_end) { SV* current = NULL; - RExC_parse = regpatws(pRExC_state, RExC_parse, - TRUE); /* means recognize comments */ + + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + TRUE /* Force /x */ ); + switch (*RExC_parse) { case '?': if (RExC_parse[1] == '[') depth++, RExC_parse++; - /* FALL THROUGH */ + /* FALLTHROUGH */ default: break; case '\\': - /* Skip the next byte (which could cause us to end up in - * the middle of a UTF-8 character, but since none of those - * are confusable with anything we currently handle in this - * switch (invariants all), it's safe. We'll just hit the - * default: case next time and keep on incrementing until - * we find one of the invariants we do handle. */ + /* Skip past this, so the next character gets skipped, after + * the switch */ RExC_parse++; + if (*RExC_parse == 'c') { + /* Skip the \cX notation for control characters */ + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + } break; + case '[': { - /* If this looks like it is a [:posix:] class, leave the - * parse pointer at the '[' to fool regclass() into - * thinking it is part of a '[[:posix:]]'. That function - * will use strict checking to force a syntax error if it - * doesn't work out to a legitimate class */ - bool is_posix_class - = could_it_be_a_POSIX_class(pRExC_state); + /* See if this is a [:posix:] class. */ + bool is_posix_class = (OOB_NAMEDCLASS + < handle_possible_posix(pRExC_state, + RExC_parse + 1, + NULL, + NULL, + TRUE /* checking only */)); + /* If it is a posix class, leave the parse pointer at the + * '[' to fool regclass() into thinking it is part of a + * '[[:posix:]]'. */ if (! is_posix_class) { RExC_parse++; } - /* regclass() can only return RESTART_UTF8 if multi-char - folds are allowed. */ + /* regclass() can only return RESTART_PASS1 and NEED_UTF8 + * if multi-char folds are allowed. */ if (!regclass(pRExC_state, flagp,depth+1, is_posix_class, /* parse the whole char class only if not a posix class */ FALSE, /* don't allow multi-char folds */ TRUE, /* silence non-portable warnings. */ - ¤t)) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", - (UV) *flagp); + TRUE, /* strict */ + FALSE, /* Require return to be an ANYOF */ + ¤t, + &posix_warnings + )) + FAIL2("panic: regclass returned NULL to handle_sets, " + "flags=%#"UVxf"", (UV) *flagp); /* function call leaves parse pointing to the ']', except * if we faked it */ @@ -12782,88 +14651,150 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, case ']': if (depth--) break; RExC_parse++; - if (RExC_parse < RExC_end - && *RExC_parse == ')') - { + if (*RExC_parse == ')') { node = reganode(pRExC_state, ANYOF, 0); RExC_size += ANYOF_SKIP; nextchar(pRExC_state); Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ + if (in_locale) { + set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); + } + return node; } goto no_close; } - RExC_parse++; + + RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + } + + no_close: + /* We output the messages even if warnings are off, because we'll fail + * the very next thing, and these give a likely diagnosis for that */ + if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) { + output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL); } - no_close: FAIL("Syntax error in (?[...])"); } - /* Pass 2 only after this. Everything in this construct is a - * metacharacter. Operands begin with either a '\' (for an escape - * sequence), or a '[' for a bracketed character class. Any other - * character should be an operator, or parenthesis for grouping. Both - * types of operands are handled by calling regclass() to parse them. It - * is called with a parameter to indicate to return the computed inversion - * list. The parsing here is implemented via a stack. Each entry on the - * stack is a single character representing one of the operators, or the - * '('; or else a pointer to an operand inversion list. */ - -#define IS_OPERAND(a) (! SvIOK(a)) - - /* The stack starts empty. It is a syntax error if the first thing parsed - * is a binary operator; everything else is pushed on the stack. When an - * operand is parsed, the top of the stack is examined. If it is a binary - * operator, the item before it should be an operand, and both are replaced - * by the result of doing that operation on the new operand and the one on - * the stack. Thus a sequence of binary operands is reduced to a single - * one before the next one is parsed. + /* Pass 2 only after this. */ + Perl_ck_warner_d(aTHX_ + packWARN(WARN_EXPERIMENTAL__REGEX_SETS), + "The regex_sets feature is experimental" REPORT_LOCATION, + REPORT_LOCATION_ARGS(RExC_parse)); + + /* Everything in this construct is a metacharacter. Operands begin with + * either a '\' (for an escape sequence), or a '[' for a bracketed + * character class. Any other character should be an operator, or + * parenthesis for grouping. Both types of operands are handled by calling + * regclass() to parse them. It is called with a parameter to indicate to + * return the computed inversion list. The parsing here is implemented via + * a stack. Each entry on the stack is a single character representing one + * of the operators; or else a pointer to an operand inversion list. */ + +#define IS_OPERATOR(a) SvIOK(a) +#define IS_OPERAND(a) (! IS_OPERATOR(a)) + + /* The stack is kept in Łukasiewicz order. (That's pronounced similar + * to luke-a-shave-itch (or -itz), but people who didn't want to bother + * with pronouncing it called it Reverse Polish instead, but now that YOU + * know how to pronounce it you can use the correct term, thus giving due + * credit to the person who invented it, and impressing your geek friends. + * Wikipedia says that the pronounciation of "Ł" has been changing so that + * it is now more like an English initial W (as in wonk) than an L.) + * + * This means that, for example, 'a | b & c' is stored on the stack as + * + * c [4] + * b [3] + * & [2] + * a [1] + * | [0] + * + * where the numbers in brackets give the stack [array] element number. + * In this implementation, parentheses are not stored on the stack. + * Instead a '(' creates a "fence" so that the part of the stack below the + * fence is invisible except to the corresponding ')' (this allows us to + * replace testing for parens, by using instead subtraction of the fence + * position). As new operands are processed they are pushed onto the stack + * (except as noted in the next paragraph). New operators of higher + * precedence than the current final one are inserted on the stack before + * the lhs operand (so that when the rhs is pushed next, everything will be + * in the correct positions shown above. When an operator of equal or + * lower precedence is encountered in parsing, all the stacked operations + * of equal or higher precedence are evaluated, leaving the result as the + * top entry on the stack. This makes higher precedence operations + * evaluate before lower precedence ones, and causes operations of equal + * precedence to left associate. * - * A unary operator may immediately follow a binary in the input, for - * example + * The only unary operator '!' is immediately pushed onto the stack when + * encountered. When an operand is encountered, if the top of the stack is + * a '!", the complement is immediately performed, and the '!' popped. The + * resulting value is treated as a new operand, and the logic in the + * previous paragraph is executed. Thus in the expression * [a] + ! [b] - * When an operand is parsed and the top of the stack is a unary operator, - * the operation is performed, and then the stack is rechecked to see if - * this new operand is part of a binary operation; if so, it is handled as - * above. + * the stack looks like * - * A '(' is simply pushed on the stack; it is valid only if the stack is - * empty, or the top element of the stack is an operator or another '(' - * (for which the parenthesized expression will become an operand). By the - * time the corresponding ')' is parsed everything in between should have - * been parsed and evaluated to a single operand (or else is a syntax - * error), and is handled as a regular operand */ + * ! + * a + * + + * + * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack + * becomes + * + * !b + * a + * + + * + * A ')' is treated as an operator with lower precedence than all the + * aforementioned ones, which causes all operations on the stack above the + * corresponding '(' to be evaluated down to a single resultant operand. + * Then the fence for the '(' is removed, and the operand goes through the + * algorithm above, without the fence. + * + * A separate stack is kept of the fence positions, so that the position of + * the latest so-far unbalanced '(' is at the top of it. + * + * The ']' ending the construct is treated as the lowest operator of all, + * so that everything gets evaluated down to a single operand, which is the + * result */ sv_2mortal((SV *)(stack = newAV())); + sv_2mortal((SV *)(fence_stack = newAV())); while (RExC_parse < RExC_end) { - I32 top_index = av_tindex(stack); - SV** top_ptr; - SV* current = NULL; + I32 top_index; /* Index of top-most element in 'stack' */ + SV** top_ptr; /* Pointer to top 'stack' element */ + SV* current = NULL; /* To contain the current inversion list + operand */ + SV* only_to_avoid_leaks; - /* Skip white space */ - RExC_parse = regpatws(pRExC_state, RExC_parse, - TRUE); /* means recognize comments */ + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + TRUE /* Force /x */ ); if (RExC_parse >= RExC_end) { Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); } - if ((curchar = UCHARAT(RExC_parse)) == ']') { - break; - } + + curchar = UCHARAT(RExC_parse); + +redo_curchar: + + top_index = av_tindex_nomg(stack); switch (curchar) { + SV** stacked_ptr; /* Ptr to something already on 'stack' */ + char stacked_operator; /* The topmost operator on the 'stack'. */ + SV* lhs; /* Operand to the left of the operator */ + SV* rhs; /* Operand to the right of the operator */ + SV* fence_ptr; /* Pointer to top element of the fence + stack */ - case '?': - if (av_tindex(stack) >= 0 /* This makes sure that we can - safely subtract 1 from - RExC_parse in the next clause. - If we have something on the - stack, we have parsed something - */ - && UCHARAT(RExC_parse - 1) == '(' - && RExC_parse < RExC_end) + case '(': + + if ( RExC_parse < RExC_end - 1 + && (UCHARAT(RExC_parse + 1) == '?')) { /* If is a '(?', could be an embedded '(?flags:(?[...])'. * This happens when we have some thing like @@ -12878,14 +14809,18 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, * interpolated expression evaluates to. We use the flags * from the interpolated pattern. */ U32 save_flags = RExC_flags; - const char * const save_parse = ++RExC_parse; + const char * save_parse; + + RExC_parse += 2; /* Skip past the '(?' */ + save_parse = RExC_parse; + /* Parse any flags for the '(?' */ parse_lparen_question_flags(pRExC_state); if (RExC_parse == save_parse /* Makes sure there was at - least one flag (or this - embedding wasn't compiled) - */ + least one flag (or else + this embedding wasn't + compiled) */ || RExC_parse >= RExC_end - 4 || UCHARAT(RExC_parse) != ':' || UCHARAT(++RExC_parse) != '(' @@ -12905,36 +14840,68 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, } vFAIL("Expecting '(?flags:(?[...'"); } + + /* Recurse, with the meat of the embedded expression */ RExC_parse++; (void) handle_regex_sets(pRExC_state, ¤t, flagp, depth+1, oregcomp_parse); /* Here, 'current' contains the embedded expression's * inversion list, and RExC_parse points to the trailing - * ']'; the next character should be the ')' which will be - * paired with the '(' that has been put on the stack, so - * the whole embedded expression reduces to '(operand)' */ + * ']'; the next character should be the ')' */ RExC_parse++; + assert(UCHARAT(RExC_parse) == ')'); + /* Then the ')' matching the original '(' handled by this + * case: statement */ + RExC_parse++; + assert(UCHARAT(RExC_parse) == ')'); + + RExC_parse++; RExC_flags = save_flags; goto handle_operand; } - /* FALL THROUGH */ - default: - RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; - vFAIL("Unexpected character"); + /* A regular '('. Look behind for illegal syntax */ + if (top_index - fence >= 0) { + /* If the top entry on the stack is an operator, it had + * better be a '!', otherwise the entry below the top + * operand should be an operator */ + if ( ! (top_ptr = av_fetch(stack, top_index, FALSE)) + || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!') + || ( IS_OPERAND(*top_ptr) + && ( top_index - fence < 1 + || ! (stacked_ptr = av_fetch(stack, + top_index - 1, + FALSE)) + || ! IS_OPERATOR(*stacked_ptr)))) + { + RExC_parse++; + vFAIL("Unexpected '(' with no preceding operator"); + } + } + + /* Stack the position of this undealt-with left paren */ + fence = top_index + 1; + av_push(fence_stack, newSViv(fence)); + break; case '\\': - /* regclass() can only return RESTART_UTF8 if multi-char - folds are allowed. */ + /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if + * multi-char folds are allowed. */ if (!regclass(pRExC_state, flagp,depth+1, TRUE, /* means parse just the next thing */ FALSE, /* don't allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ - ¤t)) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", - (UV) *flagp); + TRUE, /* strict */ + FALSE, /* Require return to be an ANYOF */ + ¤t, + NULL)) + { + FAIL2("panic: regclass returned NULL to handle_sets, " + "flags=%#"UVxf"", (UV) *flagp); + } + /* regclass() will return with parsing just the \ sequence, * leaving the parse pointer at the next thing to parse */ RExC_parse--; @@ -12942,22 +14909,38 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, case '[': /* Is a bracketed character class */ { - bool is_posix_class = could_it_be_a_POSIX_class(pRExC_state); - + /* See if this is a [:posix:] class. */ + bool is_posix_class = (OOB_NAMEDCLASS + < handle_possible_posix(pRExC_state, + RExC_parse + 1, + NULL, + NULL, + TRUE /* checking only */)); + /* If it is a posix class, leave the parse pointer at the '[' + * to fool regclass() into thinking it is part of a + * '[[:posix:]]'. */ if (! is_posix_class) { RExC_parse++; } - /* regclass() can only return RESTART_UTF8 if multi-char - folds are allowed. */ - if(!regclass(pRExC_state, flagp,depth+1, - is_posix_class, /* parse the whole char class - only if not a posix class */ - FALSE, /* don't allow multi-char folds */ - FALSE, /* don't silence non-portable warnings. */ - ¤t)) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", - (UV) *flagp); + /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if + * multi-char folds are allowed. */ + if (!regclass(pRExC_state, flagp,depth+1, + is_posix_class, /* parse the whole char + class only if not a + posix class */ + FALSE, /* don't allow multi-char folds */ + TRUE, /* silence non-portable warnings. */ + TRUE, /* strict */ + FALSE, /* Require return to be an ANYOF */ + ¤t, + NULL + )) + { + FAIL2("panic: regclass returned NULL to handle_sets, " + "flags=%#"UVxf"", (UV) *flagp); + } + /* function call leaves parse pointing to the ']', except if we * faked it */ if (is_posix_class) { @@ -12967,154 +14950,274 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, goto handle_operand; } + case ']': + if (top_index >= 1) { + goto join_operators; + } + + /* Only a single operand on the stack: are done */ + goto done; + + case ')': + if (av_tindex_nomg(fence_stack) < 0) { + RExC_parse++; + vFAIL("Unexpected ')'"); + } + + /* If at least two thing on the stack, treat this as an + * operator */ + if (top_index - fence >= 1) { + goto join_operators; + } + + /* Here only a single thing on the fenced stack, and there is a + * fence. Get rid of it */ + fence_ptr = av_pop(fence_stack); + assert(fence_ptr); + fence = SvIV(fence_ptr) - 1; + SvREFCNT_dec_NN(fence_ptr); + fence_ptr = NULL; + + if (fence < 0) { + fence = 0; + } + + /* Having gotten rid of the fence, we pop the operand at the + * stack top and process it as a newly encountered operand */ + current = av_pop(stack); + if (IS_OPERAND(current)) { + goto handle_operand; + } + + RExC_parse++; + goto bad_syntax; + case '&': case '|': case '+': case '-': case '^': - if (top_index < 0 + + /* These binary operators should have a left operand already + * parsed */ + if ( top_index - fence < 0 + || top_index - fence == 1 || ( ! (top_ptr = av_fetch(stack, top_index, FALSE))) || ! IS_OPERAND(*top_ptr)) { - RExC_parse++; - vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar); + goto unexpected_binary; } - av_push(stack, newSVuv(curchar)); - break; - case '!': - av_push(stack, newSVuv(curchar)); - break; + /* If only the one operand is on the part of the stack visible + * to us, we just place this operator in the proper position */ + if (top_index - fence < 2) { - case '(': - if (top_index >= 0) { - top_ptr = av_fetch(stack, top_index, FALSE); - assert(top_ptr); - if (IS_OPERAND(*top_ptr)) { - RExC_parse++; - vFAIL("Unexpected '(' with no preceding operator"); - } + /* Place the operator before the operand */ + + SV* lhs = av_pop(stack); + av_push(stack, newSVuv(curchar)); + av_push(stack, lhs); + break; } - av_push(stack, newSVuv(curchar)); - break; - case ')': - { - SV* lparen; - if (top_index < 1 - || ! (current = av_pop(stack)) - || ! IS_OPERAND(current) - || ! (lparen = av_pop(stack)) - || IS_OPERAND(lparen) - || SvUV(lparen) != '(') + /* But if there is something else on the stack, we need to + * process it before this new operator if and only if the + * stacked operation has equal or higher precedence than the + * new one */ + + join_operators: + + /* The operator on the stack is supposed to be below both its + * operands */ + if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE)) + || IS_OPERAND(*stacked_ptr)) { - SvREFCNT_dec(current); + /* But if not, it's legal and indicates we are completely + * done if and only if we're currently processing a ']', + * which should be the final thing in the expression */ + if (curchar == ']') { + goto done; + } + + unexpected_binary: RExC_parse++; - vFAIL("Unexpected ')'"); + vFAIL2("Unexpected binary operator '%c' with no " + "preceding operand", curchar); } - top_index -= 2; - SvREFCNT_dec_NN(lparen); + stacked_operator = (char) SvUV(*stacked_ptr); - /* FALL THROUGH */ - } + if (regex_set_precedence(curchar) + > regex_set_precedence(stacked_operator)) + { + /* Here, the new operator has higher precedence than the + * stacked one. This means we need to add the new one to + * the stack to await its rhs operand (and maybe more + * stuff). We put it before the lhs operand, leaving + * untouched the stacked operator and everything below it + * */ + lhs = av_pop(stack); + assert(IS_OPERAND(lhs)); + + av_push(stack, newSVuv(curchar)); + av_push(stack, lhs); + break; + } - handle_operand: + /* Here, the new operator has equal or lower precedence than + * what's already there. This means the operation already + * there should be performed now, before the new one. */ - /* Here, we have an operand to process, in 'current' */ + rhs = av_pop(stack); + if (! IS_OPERAND(rhs)) { - if (top_index < 0) { /* Just push if stack is empty */ - av_push(stack, current); + /* This can happen when a ! is not followed by an operand, + * like in /(?[\t &!])/ */ + goto bad_syntax; } - else { - SV* top = av_pop(stack); - SV *prev = NULL; - char current_operator; - - if (IS_OPERAND(top)) { - SvREFCNT_dec_NN(top); - SvREFCNT_dec_NN(current); - vFAIL("Operand with no preceding operator"); + + lhs = av_pop(stack); + + if (! IS_OPERAND(lhs)) { + + /* This can happen when there is an empty (), like in + * /(?[[0]+()+])/ */ + goto bad_syntax; + } + + switch (stacked_operator) { + case '&': + _invlist_intersection(lhs, rhs, &rhs); + break; + + case '|': + case '+': + _invlist_union(lhs, rhs, &rhs); + break; + + case '-': + _invlist_subtract(lhs, rhs, &rhs); + break; + + case '^': /* The union minus the intersection */ + { + SV* i = NULL; + SV* u = NULL; + SV* element; + + _invlist_union(lhs, rhs, &u); + _invlist_intersection(lhs, rhs, &i); + /* _invlist_subtract will overwrite rhs + without freeing what it already contains */ + element = rhs; + _invlist_subtract(u, i, &rhs); + SvREFCNT_dec_NN(i); + SvREFCNT_dec_NN(u); + SvREFCNT_dec_NN(element); + break; } - current_operator = (char) SvUV(top); - switch (current_operator) { - case '(': /* Push the '(' back on followed by the new - operand */ - av_push(stack, top); - av_push(stack, current); - SvREFCNT_inc(top); /* Counters the '_dec' done - just after the 'break', so - it doesn't get wrongly freed - */ - break; + } + SvREFCNT_dec(lhs); + + /* Here, the higher precedence operation has been done, and the + * result is in 'rhs'. We overwrite the stacked operator with + * the result. Then we redo this code to either push the new + * operator onto the stack or perform any higher precedence + * stacked operation */ + only_to_avoid_leaks = av_pop(stack); + SvREFCNT_dec(only_to_avoid_leaks); + av_push(stack, rhs); + goto redo_curchar; + + case '!': /* Highest priority, right associative */ + + /* If what's already at the top of the stack is another '!", + * they just cancel each other out */ + if ( (top_ptr = av_fetch(stack, top_index, FALSE)) + && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!')) + { + only_to_avoid_leaks = av_pop(stack); + SvREFCNT_dec(only_to_avoid_leaks); + } + else { /* Otherwise, since it's right associative, just push + onto the stack */ + av_push(stack, newSVuv(curchar)); + } + break; - case '!': - _invlist_invert(current); - - /* Unlike binary operators, the top of the stack, - * now that this unary one has been popped off, may - * legally be an operator, and we now have operand - * for it. */ - top_index--; - SvREFCNT_dec_NN(top); - goto handle_operand; - - case '&': - prev = av_pop(stack); - _invlist_intersection(prev, - current, - ¤t); - av_push(stack, current); - break; + default: + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + vFAIL("Unexpected character"); - case '|': - case '+': - prev = av_pop(stack); - _invlist_union(prev, current, ¤t); - av_push(stack, current); - break; + handle_operand: + + /* Here 'current' is the operand. If something is already on the + * stack, we have to check if it is a !. But first, the code above + * may have altered the stack in the time since we earlier set + * 'top_index'. */ + + top_index = av_tindex_nomg(stack); + if (top_index - fence >= 0) { + /* If the top entry on the stack is an operator, it had better + * be a '!', otherwise the entry below the top operand should + * be an operator */ + top_ptr = av_fetch(stack, top_index, FALSE); + assert(top_ptr); + if (IS_OPERATOR(*top_ptr)) { + + /* The only permissible operator at the top of the stack is + * '!', which is applied immediately to this operand. */ + curchar = (char) SvUV(*top_ptr); + if (curchar != '!') { + SvREFCNT_dec(current); + vFAIL2("Unexpected binary operator '%c' with no " + "preceding operand", curchar); + } - case '-': - prev = av_pop(stack);; - _invlist_subtract(prev, current, ¤t); - av_push(stack, current); - break; + _invlist_invert(current); - case '^': /* The union minus the intersection */ - { - SV* i = NULL; - SV* u = NULL; - SV* element; - - prev = av_pop(stack); - _invlist_union(prev, current, &u); - _invlist_intersection(prev, current, &i); - /* _invlist_subtract will overwrite current - without freeing what it already contains */ - element = current; - _invlist_subtract(u, i, ¤t); - av_push(stack, current); - SvREFCNT_dec_NN(i); - SvREFCNT_dec_NN(u); - SvREFCNT_dec_NN(element); - break; - } + only_to_avoid_leaks = av_pop(stack); + SvREFCNT_dec(only_to_avoid_leaks); - default: - Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack"); + /* And we redo with the inverted operand. This allows + * handling multiple ! in a row */ + goto handle_operand; + } + /* Single operand is ok only for the non-binary ')' + * operator */ + else if ((top_index - fence == 0 && curchar != ')') + || (top_index - fence > 0 + && (! (stacked_ptr = av_fetch(stack, + top_index - 1, + FALSE)) + || IS_OPERAND(*stacked_ptr)))) + { + SvREFCNT_dec(current); + vFAIL("Operand with no preceding operator"); } - SvREFCNT_dec_NN(top); - SvREFCNT_dec(prev); } - } + + /* Here there was nothing on the stack or the top element was + * another operand. Just add this new one */ + av_push(stack, current); + + } /* End of switch on next parse token */ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + } /* End of loop parsing through the construct */ + + done: + if (av_tindex_nomg(fence_stack) >= 0) { + vFAIL("Unmatched ("); } - if (av_tindex(stack) < 0 /* Was empty */ + if (av_tindex_nomg(stack) < 0 /* Was empty */ || ((final = av_pop(stack)) == NULL) || ! IS_OPERAND(final) - || av_tindex(stack) >= 0) /* More left on stack */ + || SvTYPE(final) != SVt_INVLIST + || av_tindex_nomg(stack) >= 0) /* More left on stack */ { + bad_syntax: + SvREFCNT_dec(final); vFAIL("Incomplete expression within '(?[ ])'"); } @@ -13139,6 +15242,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, } } + /* About to generate an ANYOF (or similar) node from the inversion list we + * have calculated */ save_parse = RExC_parse; RExC_parse = SvPV(result_string, len); save_end = RExC_end; @@ -13148,21 +15253,53 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, * already has all folding taken into consideration, and we don't want * regclass() to add to that */ RExC_flags &= ~RXf_PMf_FOLD; - /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed. - */ + /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char + * folds are allowed. */ node = regclass(pRExC_state, flagp,depth+1, FALSE, /* means parse the whole char class */ FALSE, /* don't allow multi-char folds */ TRUE, /* silence non-portable warnings. The above may very well have generated non-portable code points, but they're valid on this machine */ - NULL); + FALSE, /* similarly, no need for strict */ + FALSE, /* Require return to be an ANYOF */ + NULL, + NULL + ); if (!node) FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf, PTR2UV(flagp)); + + /* Fix up the node type if we are in locale. (We have pretended we are + * under /u for the purposes of regclass(), as this construct will only + * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so + * as to cause any warnings about bad locales to be output in regexec.c), + * and add the flag that indicates to check if not in a UTF-8 locale. The + * reason we above forbid optimization into something other than an ANYOF + * node is simply to minimize the number of code changes in regexec.c. + * Otherwise we would have to create new EXACTish node types and deal with + * them. This decision could be revisited should this construct become + * popular. + * + * (One might think we could look at the resulting ANYOF node and suppress + * the flag if everything is above 255, as those would be UTF-8 only, + * but this isn't true, as the components that led to that result could + * have been locale-affected, and just happen to cancel each other out + * under UTF-8 locales.) */ + if (in_locale) { + set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); + + assert(OP(node) == ANYOF); + + OP(node) = ANYOFL; + ANYOF_FLAGS(node) + |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; + } + if (save_fold) { RExC_flags |= RXf_PMf_FOLD; } + RExC_parse = save_parse + 1; RExC_end = save_end; SvREFCNT_dec_NN(final); @@ -13172,14 +15309,193 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ return node; } +#undef IS_OPERATOR #undef IS_OPERAND +STATIC void +S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) +{ + /* This hard-codes the Latin1/above-Latin1 folding rules, so that an + * innocent-looking character class, like /[ks]/i won't have to go out to + * disk to find the possible matches. + * + * This should be called only for a Latin1-range code points, cp, which is + * known to be involved in a simple fold with other code points above + * Latin1. It would give false results if /aa has been specified. + * Multi-char folds are outside the scope of this, and must be handled + * specially. + * + * XXX It would be better to generate these via regen, in case a new + * version of the Unicode standard adds new mappings, though that is not + * really likely, and may be caught by the default: case of the switch + * below. */ + + PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS; + + assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp)); + + switch (cp) { + case 'k': + case 'K': + *invlist = + add_cp_to_invlist(*invlist, KELVIN_SIGN); + break; + case 's': + case 'S': + *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S); + break; + case MICRO_SIGN: + *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU); + *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *invlist = add_cp_to_invlist(*invlist, + LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); + break; + +#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */ + + case LATIN_SMALL_LETTER_SHARP_S: + *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S); + break; + +#endif + +#if UNICODE_MAJOR_VERSION < 3 \ + || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0) + + /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did + * U+0131. */ + case 'i': + case 'I': + *invlist = + add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); +# if UNICODE_DOT_DOT_VERSION == 1 + *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I); +# endif + break; +#endif + + default: + /* Use deprecated warning to increase the chances of this being + * output */ + if (PASS2) { + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp); + } + break; + } +} + +STATIC void +S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings) +{ + /* If the final parameter is NULL, output the elements of the array given + * by '*posix_warnings' as REGEXP warnings. Otherwise, the elements are + * pushed onto it, (creating if necessary) */ + + SV * msg; + const bool first_is_fatal = ! return_posix_warnings + && ckDEAD(packWARN(WARN_REGEXP)); + + PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS; + + while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) { + if (return_posix_warnings) { + if (! *return_posix_warnings) { /* mortalize to not leak if + warnings are fatal */ + *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV()); + } + av_push(*return_posix_warnings, msg); + } + else { + if (first_is_fatal) { /* Avoid leaking this */ + av_undef(posix_warnings); /* This isn't necessary if the + array is mortal, but is a + fail-safe */ + (void) sv_2mortal(msg); + if (PASS2) { + SAVEFREESV(RExC_rx_sv); + } + } + Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg)); + SvREFCNT_dec_NN(msg); + } + } +} + +STATIC AV * +S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count) +{ + /* This adds the string scalar <multi_string> to the array + * <multi_char_matches>. <multi_string> is known to have exactly + * <cp_count> code points in it. This is used when constructing a + * bracketed character class and we find something that needs to match more + * than a single character. + * + * <multi_char_matches> is actually an array of arrays. Each top-level + * element is an array that contains all the strings known so far that are + * the same length. And that length (in number of code points) is the same + * as the index of the top-level array. Hence, the [2] element is an + * array, each element thereof is a string containing TWO code points; + * while element [3] is for strings of THREE characters, and so on. Since + * this is for multi-char strings there can never be a [0] nor [1] element. + * + * When we rewrite the character class below, we will do so such that the + * longest strings are written first, so that it prefers the longest + * matching strings first. This is done even if it turns out that any + * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom + * Christiansen has agreed that this is ok. This makes the test for the + * ligature 'ffi' come before the test for 'ff', for example */ + + AV* this_array; + AV** this_array_ptr; + + PERL_ARGS_ASSERT_ADD_MULTI_MATCH; + + if (! multi_char_matches) { + multi_char_matches = newAV(); + } + + if (av_exists(multi_char_matches, cp_count)) { + this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE); + this_array = *this_array_ptr; + } + else { + this_array = newAV(); + av_store(multi_char_matches, cp_count, + (SV*) this_array); + } + av_push(this_array, multi_string); + + return multi_char_matches; +} + /* The names of properties whose definitions are not known at compile time are * stored in this SV, after a constant heading. So if the length has been * changed since initialization, then there is a run-time definition. */ #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \ (SvCUR(listsv) != initial_listsv_len) +/* There is a restricted set of white space characters that are legal when + * ignoring white space in a bracketed character class. This generates the + * code to skip them. + * + * There is a line below that uses the same white space criteria but is outside + * this macro. Both here and there must use the same definition */ +#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \ + STMT_START { \ + if (do_skip) { \ + while (isBLANK_A(UCHARAT(p))) \ + { \ + p++; \ + } \ + } \ + } STMT_END + STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool stop_at_1, /* Just parse the next thing, don't @@ -13188,7 +15504,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool silence_non_portable, /* Don't output warnings about too large characters */ - SV** ret_invlist) /* Return an inversion list, not a node */ + const bool strict, + bool optimizable, /* ? Allow a non-ANYOF return + node */ + SV** ret_invlist, /* Return an inversion list, not a node */ + AV** return_posix_warnings + ) { /* parse a bracketed class specification. Most of these will produce an * ANYOF node; but something like [a] will produce an EXACT node; [aA], an @@ -13205,23 +15526,23 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * ignored in the recursion by means of a flag: * <RExC_in_multi_char_class>.) * - * ANYOF nodes contain a bit map for the first 256 characters, with the - * corresponding bit set if that character is in the list. For characters - * above 255, a range list or swash is used. There are extra bits for \w, - * etc. in locale ANYOFs, as what these match is not determinable at - * compile time + * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS + * characters, with the corresponding bit set if that character is in the + * list. For characters above this, a range list or swash is used. There + * are extra bits for \w, etc. in locale ANYOFs, as what these match is not + * determinable at compile time * - * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs - * to be restarted. This can only happen if ret_invlist is non-NULL. + * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs + * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded + * to UTF-8. This can only happen if ret_invlist is non-NULL. */ - dVAR; UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; IV range = 0; UV value = OOB_UNICODE, save_value = OOB_UNICODE; regnode *ret; STRLEN numlen; - IV namedclass = OOB_NAMEDCLASS; + int namedclass = OOB_NAMEDCLASS; char *rangebegin = NULL; bool need_class = 0; SV *listsv = NULL; @@ -13238,6 +15559,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, separate for a while from the non-complemented versions because of complications with /d matching */ + SV* simple_posixes = NULL; /* But under some conditions, the classes can be + treated more simply than the general case, + leading to less compilation and execution + work */ UV element_count = 0; /* Number of distinct elements in the class. Optimizations may be possible if this is tiny */ AV * multi_char_matches = NULL; /* Code points that fold to more than one @@ -13246,7 +15571,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, char * stop_ptr = RExC_end; /* where to stop parsing */ const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white space? */ - const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */ /* Unicode properties are stored in a swash; this holds the current one * being parsed. If this swash is the only above-latin1 component of the @@ -13261,8 +15585,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, bool has_user_defined_property = FALSE; /* inversion list of code points this node matches only when the target - * string is in UTF-8. (Because is under /d) */ - SV* depends_list = NULL; + * string is in UTF-8. These are all non-ASCII, < 256. (Because is under + * /d) */ + SV* has_upper_latin1_only_utf8_matches = NULL; /* Inversion list of code points this node matches regardless of things * like locale, folding, utf8ness of the target string */ @@ -13276,11 +15601,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * runtime locale is UTF-8 */ SV* only_utf8_locale_list = NULL; -#ifdef EBCDIC - /* In a range, counts how many 0-2 of the ends of it came from literals, - * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ - UV literal_endpoint = 0; -#endif + /* In a range, if one of the endpoints is non-character-set portable, + * meaning that it hard-codes a code point that may mean a different + * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a + * mnemonic '\t' which each mean the same character no matter which + * character set the platform is on. */ + unsigned int non_portable_endpoint = 0; + + /* Is the range unicode? which means on a platform that isn't 1-1 native + * to Unicode (i.e. non-ASCII), each code point in it should be considered + * to be a Unicode value. */ + bool unicode_range = FALSE; bool invert = FALSE; /* Is this class to be complemented */ bool warn_super = ALWAYS_WARN_SUPER; @@ -13290,6 +15621,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const char * orig_parse = RExC_parse; const SSize_t orig_size = RExC_size; bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ + + /* This variable is used to mark where the end in the input is of something + * that looks like a POSIX construct but isn't. During the parse, when + * something looks like it could be such a construct is encountered, it is + * checked for being one, but not if we've already checked this area of the + * input. Only after this position is reached do we check again */ + char *not_posix_region_end = RExC_parse - 1; + + AV* posix_warnings = NULL; + const bool do_posix_warnings = return_posix_warnings + || (PASS2 && ckWARN(WARN_REGEXP)); + GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCLASS; @@ -13299,8 +15642,18 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, DEBUG_PARSE("clas"); +#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \ + || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \ + && UNICODE_DOT_DOT_VERSION == 0) + allow_multi_folds = FALSE; +#endif + /* Assume we are going to generate an ANYOF node. */ - ret = reganode(pRExC_state, ANYOF, 0); + ret = reganode(pRExC_state, + (LOC) + ? ANYOFL + : ANYOF, + 0); if (SIZE_ONLY) { RExC_size += ANYOF_SKIP; @@ -13315,36 +15668,38 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ } - if (skip_white) { - RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); - } + SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); + + assert(RExC_parse <= RExC_end); - if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ + if (UCHARAT(RExC_parse) == '^') { /* Complement the class */ RExC_parse++; invert = TRUE; allow_multi_folds = FALSE; - RExC_naughty++; - if (skip_white) { - RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); - } + MARK_NAUGHTY(1); + SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); } /* Check that they didn't say [:posix:] instead of [[:posix:]] */ - if (!SIZE_ONLY && RExC_parse < RExC_end && POSIXCC(UCHARAT(RExC_parse))) { - const char *s = RExC_parse; - const char c = *s++; - - while (isWORDCHAR(*s)) - s++; - if (*s && c == *s && s[1] == ']') { - SAVEFREESV(RExC_rx_sv); - ckWARN3reg(s+2, - "POSIX syntax [%c %c] belongs inside character classes", - c, c); - (void)ReREFCNT_inc(RExC_rx_sv); - } + if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) { + int maybe_class = handle_possible_posix(pRExC_state, + RExC_parse, + ¬_posix_region_end, + NULL, + TRUE /* checking only */); + if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) { + SAVEFREESV(RExC_rx_sv); + ckWARN4reg(not_posix_region_end, + "POSIX syntax [%c %c] belongs inside character classes%s", + *RExC_parse, *RExC_parse, + (maybe_class == OOB_NAMEDCLASS) + ? ((POSIXCC_NOTYET(*RExC_parse)) + ? " (but this one isn't implemented)" + : " (but this one isn't fully valid)") + : "" + ); + (void)ReREFCNT_inc(RExC_rx_sv); + } } /* If the caller wants us to just parse a single element, accomplish this @@ -13357,22 +15712,33 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (UCHARAT(RExC_parse) == ']') goto charclassloop; -parseit: while (1) { + + if ( posix_warnings + && av_tindex_nomg(posix_warnings) >= 0 + && RExC_parse > not_posix_region_end) + { + /* Warnings about posix class issues are considered tentative until + * we are far enough along in the parse that we can no longer + * change our mind, at which point we either output them or add + * them, if it has so specified, to what gets returned to the + * caller. This is done each time through the loop so that a later + * class won't zap them before they have been dealt with. */ + output_or_return_posix_warnings(pRExC_state, posix_warnings, + return_posix_warnings); + } + if (RExC_parse >= stop_ptr) { break; } - if (skip_white) { - RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); - } + SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); if (UCHARAT(RExC_parse) == ']') { break; } - charclassloop: + charclassloop: namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ save_value = value; @@ -13381,8 +15747,9 @@ parseit: if (!range) { rangebegin = RExC_parse; element_count++; + non_portable_endpoint = 0; } - if (UTF) { + if (UTF && ! UTF8_IS_INVARIANT(* RExC_parse)) { value = utf8n_to_uvchr((U8*)RExC_parse, RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT); @@ -13391,14 +15758,54 @@ parseit: else value = UCHARAT(RExC_parse++); - if (value == '[' - && RExC_parse < RExC_end - && POSIXCC(UCHARAT(RExC_parse))) + if (value == '[') { + char * posix_class_end; + namedclass = handle_possible_posix(pRExC_state, + RExC_parse, + &posix_class_end, + do_posix_warnings ? &posix_warnings : NULL, + FALSE /* die if error */); + if (namedclass > OOB_NAMEDCLASS) { + + /* If there was an earlier attempt to parse this particular + * posix class, and it failed, it was a false alarm, as this + * successful one proves */ + if ( posix_warnings + && av_tindex_nomg(posix_warnings) >= 0 + && not_posix_region_end >= RExC_parse + && not_posix_region_end <= posix_class_end) + { + av_undef(posix_warnings); + } + + RExC_parse = posix_class_end; + } + else if (namedclass == OOB_NAMEDCLASS) { + not_posix_region_end = posix_class_end; + } + else { + namedclass = OOB_NAMEDCLASS; + } + } + else if ( RExC_parse - 1 > not_posix_region_end + && MAYBE_POSIXCC(value)) { - namedclass = regpposixcc(pRExC_state, value, strict); + (void) handle_possible_posix( + pRExC_state, + RExC_parse - 1, /* -1 because parse has already been + advanced */ + ¬_posix_region_end, + do_posix_warnings ? &posix_warnings : NULL, + TRUE /* checking only */); } else if (value == '\\') { - if (UTF) { + /* Is a backslash; get the code point of the char after it */ + + if (RExC_parse >= RExC_end) { + vFAIL("Unmatched ["); + } + + if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) { value = utf8n_to_uvchr((U8*)RExC_parse, RExC_end - RExC_parse, &numlen, UTF8_ALLOW_DEFAULT); @@ -13417,7 +15824,7 @@ parseit: * skipped, it means that that white space is wanted literally, and * is already in 'value'. Otherwise, need to translate the escape * into what it signifies. */ - if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) { + if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) { case 'w': namedclass = ANYOF_WORDCHAR; break; case 'W': namedclass = ANYOF_NWORDCHAR; break; @@ -13431,19 +15838,67 @@ parseit: case 'H': namedclass = ANYOF_NHORIZWS; break; case 'N': /* Handle \N{NAME} in class */ { - /* We only pay attention to the first char of - multichar strings being returned. I kinda wonder - if this makes sense as it does change the behaviour - from earlier versions, OTOH that behaviour was broken - as well. */ - if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth, - TRUE, /* => charclass */ - strict)) - { - if (*flagp & RESTART_UTF8) - FAIL("panic: grok_bslash_N set RESTART_UTF8"); - goto parseit; + const char * const backslash_N_beg = RExC_parse - 2; + int cp_count; + + if (! grok_bslash_N(pRExC_state, + NULL, /* No regnode */ + &value, /* Yes single value */ + &cp_count, /* Multiple code pt count */ + flagp, + strict, + depth) + ) { + + if (*flagp & NEED_UTF8) + FAIL("panic: grok_bslash_N set NEED_UTF8"); + if (*flagp & RESTART_PASS1) + return NULL; + + if (cp_count < 0) { + vFAIL("\\N in a character class must be a named character: \\N{...}"); + } + else if (cp_count == 0) { + if (PASS2) { + ckWARNreg(RExC_parse, + "Ignoring zero length \\N{} in character class"); + } + } + else { /* cp_count > 1 */ + if (! RExC_in_multi_char_class) { + if (invert || range || *RExC_parse == '-') { + if (strict) { + RExC_parse--; + vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character"); + } + else if (PASS2) { + ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class"); + } + break; /* <value> contains the first code + point. Drop out of the switch to + process it */ + } + else { + SV * multi_char_N = newSVpvn(backslash_N_beg, + RExC_parse - backslash_N_beg); + multi_char_matches + = add_multi_match(multi_char_matches, + multi_char_N, + cp_count); + } + } + } /* End of cp_count != 1 */ + + /* This element should not be processed further in this + * class */ + element_count--; + value = save_value; + prevvalue = save_prevvalue; + continue; /* Back to top of loop to get next char */ } + + /* Here, is a single code point, and <value> contains it */ + unicode_range = TRUE; /* \N{} are Unicode */ } break; case 'p': @@ -13460,96 +15915,163 @@ parseit: |_CORE_SWASH_INIT_ACCEPT_INVLIST; if (RExC_parse >= RExC_end) - vFAIL2("Empty \\%c{}", (U8)value); + vFAIL2("Empty \\%c", (U8)value); if (*RExC_parse == '{') { const U8 c = (U8)value; - e = strchr(RExC_parse++, '}'); - if (!e) + e = strchr(RExC_parse, '}'); + if (!e) { + RExC_parse++; vFAIL2("Missing right brace on \\%c{}", c); - while (isSPACE(UCHARAT(RExC_parse))) - RExC_parse++; + } + + RExC_parse++; + while (isSPACE(*RExC_parse)) { + RExC_parse++; + } + + if (UCHARAT(RExC_parse) == '^') { + + /* toggle. (The rhs xor gets the single bit that + * differs between P and p; the other xor inverts just + * that bit) */ + value ^= 'P' ^ 'p'; + + RExC_parse++; + while (isSPACE(*RExC_parse)) { + RExC_parse++; + } + } + if (e == RExC_parse) vFAIL2("Empty \\%c{}", c); + n = e - RExC_parse; - while (isSPACE(UCHARAT(RExC_parse + n - 1))) + while (isSPACE(*(RExC_parse + n - 1))) n--; - } - else { + } /* The \p isn't immediately followed by a '{' */ + else if (! isALPHA(*RExC_parse)) { + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + vFAIL2("Character following \\%c must be '{' or a " + "single-character Unicode property name", + (U8) value); + } + else { e = RExC_parse; n = 1; } if (!SIZE_ONLY) { SV* invlist; - char* formatted; char* name; + char* base_name; /* name after any packages are stripped */ + char* lookup_name = NULL; + const char * const colon_colon = "::"; - if (UCHARAT(RExC_parse) == '^') { - RExC_parse++; - n--; - /* toggle. (The rhs xor gets the single bit that - * differs between P and p; the other xor inverts just - * that bit) */ - value ^= 'P' ^ 'p'; - - while (isSPACE(UCHARAT(RExC_parse))) { - RExC_parse++; - n--; - } - } /* Try to get the definition of the property into * <invlist>. If /i is in effect, the effective property * will have its name be <__NAME_i>. The design is * discussed in commit * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ - formatted = Perl_form(aTHX_ - "%s%.*s%s\n", - (FOLD) ? "__" : "", - (int)n, - RExC_parse, - (FOLD) ? "_i" : "" - ); - name = savepvn(formatted, strlen(formatted)); + name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse)); + SAVEFREEPV(name); + if (FOLD) { + lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name)); + + /* The function call just below that uses this can fail + * to return, leaking memory if we don't do this */ + SAVEFREEPV(lookup_name); + } /* Look up the property name, and get its swash and * inversion list, if the property is found */ - if (swash) { - SvREFCNT_dec_NN(swash); - } - swash = _core_swash_init("utf8", name, &PL_sv_undef, + SvREFCNT_dec(swash); /* Free any left-overs */ + swash = _core_swash_init("utf8", + (lookup_name) + ? lookup_name + : name, + &PL_sv_undef, 1, /* binary */ 0, /* not tr/// */ NULL, /* No inversion list */ &swash_init_flags ); if (! swash || ! (invlist = _get_swash_invlist(swash))) { - if (swash) { + HV* curpkg = (IN_PERL_COMPILETIME) + ? PL_curstash + : CopSTASH(PL_curcop); + UV final_n = n; + bool has_pkg; + + if (swash) { /* Got a swash but no inversion list. + Something is likely wrong that will + be sorted-out later */ SvREFCNT_dec_NN(swash); swash = NULL; } - /* Here didn't find it. It could be a user-defined - * property that will be available at run-time. If we - * accept only compile-time properties, is an error; - * otherwise add it to the list for run-time look up */ - if (ret_invlist) { + /* Here didn't find it. It could be a an error (like a + * typo) in specifying a Unicode property, or it could + * be a user-defined property that will be available at + * run-time. The names of these must begin with 'In' + * or 'Is' (after any packages are stripped off). So + * if not one of those, or if we accept only + * compile-time properties, is an error; otherwise add + * it to the list for run-time look up. */ + if ((base_name = rninstr(name, name + n, + colon_colon, colon_colon + 2))) + { /* Has ::. We know this must be a user-defined + property */ + base_name += 2; + final_n -= base_name - name; + has_pkg = TRUE; + } + else { + base_name = name; + has_pkg = FALSE; + } + + if ( final_n < 3 + || base_name[0] != 'I' + || (base_name[1] != 's' && base_name[1] != 'n') + || ret_invlist) + { + const char * const msg + = (has_pkg) + ? "Illegal user-defined property name" + : "Can't find Unicode property definition"; RExC_parse = e + 1; - vFAIL2utf8f( - "Property '%"UTF8f"' is unknown", - UTF8fARG(UTF, n, name)); + + /* diag_listed_as: Can't find Unicode property definition "%s" */ + vFAIL3utf8f("%s \"%"UTF8f"\"", + msg, UTF8fARG(UTF, n, name)); } - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", + + /* If the property name doesn't already have a package + * name, add the current one to it so that it can be + * referred to outside it. [perl #121777] */ + if (! has_pkg && curpkg) { + char* pkgname = HvNAME(curpkg); + if (strNE(pkgname, "main")) { + char* full_name = Perl_form(aTHX_ + "%s::%s", + pkgname, + name); + n = strlen(full_name); + name = savepvn(full_name, n); + SAVEFREEPV(name); + } + } + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%"UTF8f"%s\n", (value == 'p' ? '+' : '!'), - UTF8fARG(UTF, n, name)); + (FOLD) ? "__" : "", + UTF8fARG(UTF, n, name), + (FOLD) ? "_i" : ""); has_user_defined_property = TRUE; + optimizable = FALSE; /* Will have to leave this an + ANYOF node */ - /* We don't know yet, so have to assume that the - * property could match something in the Latin1 range, - * hence something that isn't utf8. Note that this - * would cause things in <depends_list> to match - * inappropriately, except that any \p{}, including - * this one forces Unicode semantics, which means there - * is no <depends_list> */ - ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; + /* We don't know yet what this matches, so have to flag + * it */ + ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP; } else { @@ -13590,14 +16112,13 @@ parseit: _invlist_union(properties, invlist, &properties); } } - Safefree(name); } RExC_parse = e + 1; namedclass = ANYOF_UNIPROP; /* no official name, but it's named */ /* \p means they want Unicode semantics */ - RExC_uni_semantics = 1; + REQUIRE_UNI_RULES(flagp, NULL); } break; case 'n': value = '\n'; break; @@ -13605,7 +16126,7 @@ parseit: case 't': value = '\t'; break; case 'f': value = '\f'; break; case 'b': value = '\b'; break; - case 'e': value = ASCII_TO_NATIVE('\033');break; + case 'e': value = ESC_NATIVE; break; case 'a': value = '\a'; break; case 'o': RExC_parse--; /* function expects to be pointed at the 'o' */ @@ -13614,8 +16135,8 @@ parseit: bool valid = grok_bslash_o(&RExC_parse, &value, &error_msg, - SIZE_ONLY, /* warnings in pass - 1 only */ + PASS2, /* warnings only in + pass 2 */ strict, silence_non_portable, UTF); @@ -13623,7 +16144,8 @@ parseit: vFAIL(error_msg); } } - if (PL_encoding && value < 0x100) { + non_portable_endpoint++; + if (IN_ENCODING && value < 0x100) { goto recode_encoding; } break; @@ -13634,7 +16156,7 @@ parseit: bool valid = grok_bslash_x(&RExC_parse, &value, &error_msg, - TRUE, /* Output warnings */ + PASS2, /* Output warnings */ strict, silence_non_portable, UTF); @@ -13642,11 +16164,13 @@ parseit: vFAIL(error_msg); } } - if (PL_encoding && value < 0x100) + non_portable_endpoint++; + if (IN_ENCODING && value < 0x100) goto recode_encoding; break; case 'c': - value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); + value = grok_bslash_c(*RExC_parse++, PASS2); + non_portable_endpoint++; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': @@ -13674,19 +16198,20 @@ parseit: (void)ReREFCNT_inc(RExC_rx_sv); } } - if (PL_encoding && value < 0x100) + non_portable_endpoint++; + if (IN_ENCODING && value < 0x100) goto recode_encoding; break; } - recode_encoding: + recode_encoding: if (! RExC_override_recoding) { - SV* enc = PL_encoding; - value = reg_recode((const char)(U8)value, &enc); + SV* enc = _get_encoding(); + value = reg_recode((U8)value, &enc); if (!enc) { if (strict) { vFAIL("Invalid escape in the specified encoding"); } - else if (SIZE_ONLY) { + else if (PASS2) { ckWARNreg(RExC_parse, "Invalid escape in the specified encoding"); } @@ -13711,10 +16236,6 @@ parseit: break; } /* End of switch on char following backslash */ } /* end of handling backslash escape sequences */ -#ifdef EBCDIC - else - literal_endpoint++; -#endif /* Here, we have the current token in 'value' */ @@ -13772,13 +16293,24 @@ parseit: else { RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP; } - ANYOF_FLAGS(ret) |= ANYOF_POSIXL; + ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL; ANYOF_POSIXL_ZERO(ret); + + /* We can't change this into some other type of node + * (unless this is the only element, in which case there + * are nodes that mean exactly this) as has runtime + * dependencies */ + optimizable = FALSE; } + /* Coverity thinks it is possible for this to be negative; both + * jhi and khw think it's not, but be safer */ + assert(! (ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL) + || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0); + /* See if it already matches the complement of this POSIX * class */ - if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL) + if ((ANYOF_FLAGS(ret) & ANYOF_MATCHES_POSIXL) && ANYOF_POSIXL_TEST(ret, namedclass + ((namedclass % 2) ? -1 : 1))) @@ -13851,41 +16383,72 @@ parseit: &cp_list); } } - else { /* Garden variety class. If is NASCII, NDIGIT, ... + else if (UNI_SEMANTICS + || classnum == _CC_ASCII + || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT + || classnum == _CC_XDIGIT))) + { + /* We usually have to worry about /d and /a affecting what + * POSIX classes match, with special code needed for /d + * because we won't know until runtime what all matches. + * But there is no extra work needed under /u, and + * [:ascii:] is unaffected by /a and /d; and :digit: and + * :xdigit: don't have runtime differences under /d. So we + * can special case these, and avoid some extra work below, + * and at runtime. */ + _invlist_union_maybe_complement_2nd( + simple_posixes, + PL_XPosix_ptrs[classnum], + namedclass % 2 != 0, + &simple_posixes); + } + else { /* Garden variety class. If is NUPPER, NALPHA, ... complement and use nposixes */ SV** posixes_ptr = namedclass % 2 == 0 ? &posixes : &nposixes; - SV** source_ptr = &PL_XPosix_ptrs[classnum]; _invlist_union_maybe_complement_2nd( *posixes_ptr, - *source_ptr, + PL_XPosix_ptrs[classnum], namedclass % 2 != 0, posixes_ptr); } - continue; /* Go get next character */ } } /* end of namedclass \blah */ - /* Here, we have a single value. If 'range' is set, it is the ending - * of a range--check its validity. Later, we will handle each - * individual code point in the range. If 'range' isn't set, this - * could be the beginning of a range, so check for that by looking - * ahead to see if the next real character to be processed is the range - * indicator--the minus sign */ + SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse); - if (skip_white) { - RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); - } + /* If 'range' is set, 'value' is the ending of a range--check its + * validity. (If value isn't a single code point in the case of a + * range, we should have figured that out above in the code that + * catches false ranges). Later, we will handle each individual code + * point in the range. If 'range' isn't set, this could be the + * beginning of a range, so check for that by looking ahead to see if + * the next real character to be processed is the range indicator--the + * minus sign */ if (range) { +#ifdef EBCDIC + /* For unicode ranges, we have to test that the Unicode as opposed + * to the native values are not decreasing. (Above 255, there is + * no difference between native and Unicode) */ + if (unicode_range && prevvalue < 255 && value < 255) { + if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) { + goto backwards_range; + } + } + else +#endif if (prevvalue > value) /* b-a */ { - const int w = RExC_parse - rangebegin; + int w; +#ifdef EBCDIC + backwards_range: +#endif + w = RExC_parse - rangebegin; vFAIL2utf8f( "Invalid [] range \"%"UTF8f"\"", UTF8fARG(UTF, w, rangebegin)); - range = 0; /* not a valid range */ + NOT_REACHED; /* NOTREACHED */ } } else { @@ -13894,12 +16457,9 @@ parseit: && *RExC_parse == '-') { char* next_char_ptr = RExC_parse + 1; - if (skip_white) { /* Get the next real char after the '-' */ - next_char_ptr = regpatws(pRExC_state, - RExC_parse + 1, - FALSE); /* means don't recognize - comments */ - } + + /* Get the next real char after the '-' */ + SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr); /* If the '-' is at the end of the class (just before the ']', * it is a literal minus; otherwise it is a range */ @@ -13908,15 +16468,15 @@ parseit: /* a bad range like \w-, [:word:]- ? */ if (namedclass > OOB_NAMEDCLASS) { - if (strict || ckWARN(WARN_REGEXP)) { - const int w = - RExC_parse >= rangebegin ? - RExC_parse - rangebegin : 0; + if (strict || (PASS2 && ckWARN(WARN_REGEXP))) { + const int w = RExC_parse >= rangebegin + ? RExC_parse - rangebegin + : 0; if (strict) { vFAIL4("False [] range \"%*.*s\"", w, w, rangebegin); } - else { + else if (PASS2) { vWARN4(RExC_parse, "False [] range \"%*.*s\"", w, w, rangebegin); @@ -13933,13 +16493,18 @@ parseit: } } - /* Here, <prevvalue> is the beginning of the range, if any; or <value> - * if not */ + if (namedclass > OOB_NAMEDCLASS) { + continue; + } + + /* Here, we have a single value this time through the loop, and + * <prevvalue> is the beginning of the range, if any; or <value> if + * not. */ /* non-Latin1 code point implies unicode semantics. Must be set in * pass1 so is there for the whole of pass 2 */ if (value > 255) { - RExC_uni_semantics = 1; + REQUIRE_UNI_RULES(flagp, NULL); } /* Ready to process either the single value, or the completed range. @@ -13982,44 +16547,17 @@ parseit: * again. Otherwise add this character to the list of * multi-char folds. */ if (! RExC_in_multi_char_class) { - AV** this_array_ptr; - AV* this_array; STRLEN cp_count = utf8_length(foldbuf, foldbuf + foldlen); - SV* multi_fold = sv_2mortal(newSVpvn("", 0)); + SV* multi_fold = sv_2mortal(newSVpvs("")); Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); + multi_char_matches + = add_multi_match(multi_char_matches, + multi_fold, + cp_count); - if (! multi_char_matches) { - multi_char_matches = newAV(); - } - - /* <multi_char_matches> is actually an array of arrays. - * There will be one or two top-level elements: [2], - * and/or [3]. The [2] element is an array, each - * element thereof is a character which folds to TWO - * characters; [3] is for folds to THREE characters. - * (Unicode guarantees a maximum of 3 characters in any - * fold.) When we rewrite the character class below, - * we will do so such that the longest folds are - * written first, so that it prefers the longest - * matching strings first. This is done even if it - * turns out that any quantifier is non-greedy, out of - * programmer laziness. Tom Christiansen has agreed - * that this is ok. This makes the test for the - * ligature 'ffi' come before the test for 'ff' */ - if (av_exists(multi_char_matches, cp_count)) { - this_array_ptr = (AV**) av_fetch(multi_char_matches, - cp_count, FALSE); - this_array = *this_array_ptr; - } - else { - this_array = newAV(); - av_store(multi_char_matches, cp_count, - (SV*) this_array); - } - av_push(this_array, multi_fold); } /* This element should not be processed further in this @@ -14032,43 +16570,159 @@ parseit: } } + if (strict && PASS2 && ckWARN(WARN_REGEXP)) { + if (range) { + + /* If the range starts above 255, everything is portable and + * likely to be so for any forseeable character set, so don't + * warn. */ + if (unicode_range && non_portable_endpoint && prevvalue < 256) { + vWARN(RExC_parse, "Both or neither range ends should be Unicode"); + } + else if (prevvalue != value) { + + /* Under strict, ranges that stop and/or end in an ASCII + * printable should have each end point be a portable value + * for it (preferably like 'A', but we don't warn if it is + * a (portable) Unicode name or code point), and the range + * must be be all digits or all letters of the same case. + * Otherwise, the range is non-portable and unclear as to + * what it contains */ + if ((isPRINT_A(prevvalue) || isPRINT_A(value)) + && (non_portable_endpoint + || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value)) + || (isLOWER_A(prevvalue) && isLOWER_A(value)) + || (isUPPER_A(prevvalue) && isUPPER_A(value))))) + { + vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\""); + } + else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */ + + /* But the nature of Unicode and languages mean we + * can't do the same checks for above-ASCII ranges, + * except in the case of digit ones. These should + * contain only digits from the same group of 10. The + * ASCII case is handled just above. 0x660 is the + * first digit character beyond ASCII. Hence here, the + * range could be a range of digits. Find out. */ + IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT], + prevvalue); + IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT], + value); + + /* If the range start and final points are in the same + * inversion list element, it means that either both + * are not digits, or both are digits in a consecutive + * sequence of digits. (So far, Unicode has kept all + * such sequences as distinct groups of 10, but assert + * to make sure). If the end points are not in the + * same element, neither should be a digit. */ + if (index_start == index_final) { + assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start) + || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1] + - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] + == 10) + /* But actually Unicode did have one group of 11 + * 'digits' in 5.2, so in case we are operating + * on that version, let that pass */ + || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1] + - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] + == 11 + && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] + == 0x19D0) + ); + } + else if ((index_start >= 0 + && ELEMENT_RANGE_MATCHES_INVLIST(index_start)) + || (index_final >= 0 + && ELEMENT_RANGE_MATCHES_INVLIST(index_final))) + { + vWARN(RExC_parse, "Ranges of digits should be from the same group of 10"); + } + } + } + } + if ((! range || prevvalue == value) && non_portable_endpoint) { + if (isPRINT_A(value)) { + char literal[3]; + unsigned d = 0; + if (isBACKSLASHED_PUNCT(value)) { + literal[d++] = '\\'; + } + literal[d++] = (char) value; + literal[d++] = '\0'; + + vWARN4(RExC_parse, + "\"%.*s\" is more clearly written simply as \"%s\"", + (int) (RExC_parse - rangebegin), + rangebegin, + literal + ); + } + else if isMNEMONIC_CNTRL(value) { + vWARN4(RExC_parse, + "\"%.*s\" is more clearly written simply as \"%s\"", + (int) (RExC_parse - rangebegin), + rangebegin, + cntrl_to_mnemonic((U8) value) + ); + } + } + } + /* Deal with this element of the class */ if (! SIZE_ONLY) { + #ifndef EBCDIC cp_foldable_list = _add_range_to_invlist(cp_foldable_list, prevvalue, value); #else - SV* this_range = _new_invlist(1); - _append_range_to_invlist(this_range, prevvalue, value); - - /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous. - * If this range was specified using something like 'i-j', we want - * to include only the 'i' and the 'j', and not anything in - * between, so exclude non-ASCII, non-alphabetics from it. - * However, if the range was specified with something like - * [\x89-\x91] or [\x89-j], all code points within it should be - * included. literal_endpoint==2 means both ends of the range used - * a literal character, not \x{foo} */ - if (literal_endpoint == 2 - && ((prevvalue >= 'a' && value <= 'z') - || (prevvalue >= 'A' && value <= 'Z'))) + /* On non-ASCII platforms, for ranges that span all of 0..255, and + * ones that don't require special handling, we can just add the + * range like we do for ASCII platforms */ + if ((UNLIKELY(prevvalue == 0) && value >= 255) + || ! (prevvalue < 256 + && (unicode_range + || (! non_portable_endpoint + && ((isLOWER_A(prevvalue) && isLOWER_A(value)) + || (isUPPER_A(prevvalue) + && isUPPER_A(value))))))) { - _invlist_intersection(this_range, PL_ASCII, - &this_range); - - /* Since this above only contains ascii, the intersection of it - * with anything will still yield only ascii */ - _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA], - &this_range); + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + prevvalue, value); + } + else { + /* Here, requires special handling. This can be because it is + * a range whose code points are considered to be Unicode, and + * so must be individually translated into native, or because + * its a subrange of 'A-Z' or 'a-z' which each aren't + * contiguous in EBCDIC, but we have defined them to include + * only the "expected" upper or lower case ASCII alphabetics. + * Subranges above 255 are the same in native and Unicode, so + * can be added as a range */ + U8 start = NATIVE_TO_LATIN1(prevvalue); + unsigned j; + U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255; + for (j = start; j <= end; j++) { + cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j)); + } + if (value > 255) { + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + 256, value); + } } - _invlist_union(cp_foldable_list, this_range, &cp_foldable_list); - literal_endpoint = 0; #endif } range = 0; /* this range (if it was one) is done now */ } /* End of loop through all the text within the brackets */ + + if ( posix_warnings && av_tindex_nomg(posix_warnings) >= 0) { + output_or_return_posix_warnings(pRExC_state, posix_warnings, + return_posix_warnings); + } + /* If anything in the class expands to more than one character, we have to * deal with them by building up a substitute parse string, and recursively * calling reg() on it, instead of proceeding */ @@ -14078,11 +16732,17 @@ parseit: STRLEN len; char *save_end = RExC_end; char *save_parse = RExC_parse; + char *save_start = RExC_start; + STRLEN prefix_end = 0; /* We copy the character class after a + prefix supplied here. This is the size + + 1 of that prefix */ bool first_time = TRUE; /* First multi-char occurrence doesn't get a "|" */ I32 reg_flags; assert(! invert); + assert(RExC_precomp_adj == 0); /* Only one level of recursion allowed */ + #if 0 /* Have decided not to deal with multi-char folds in inverted classes, because too confusing */ if (invert) { @@ -14091,7 +16751,10 @@ parseit: #endif /* Look at the longest folds first */ - for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) { + for (cp_count = av_tindex_nomg(multi_char_matches); + cp_count > 0; + cp_count--) + { if (av_exists(multi_char_matches, cp_count)) { AV** this_array_ptr; @@ -14116,8 +16779,14 @@ parseit: * multi-character folds, have to include it in recursive parsing */ if (element_count) { sv_catpv(substitute_parse, "|["); + prefix_end = SvCUR(substitute_parse); sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse); - sv_catpv(substitute_parse, "]"); + + /* Put in a closing ']' only if not going off the end, as otherwise + * we are adding something that really isn't there */ + if (RExC_parse < RExC_end) { + sv_catpv(substitute_parse, "]"); + } } sv_catpv(substitute_parse, ")"); @@ -14130,18 +16799,28 @@ parseit: } #endif - RExC_parse = SvPV(substitute_parse, len); + /* Set up the data structure so that any errors will be properly + * reported. See the comments at the definition of + * REPORT_LOCATION_ARGS for details */ + RExC_precomp_adj = orig_parse - RExC_precomp; + RExC_start = RExC_parse = SvPV(substitute_parse, len); + RExC_adjusted_start = RExC_start + prefix_end; RExC_end = RExC_parse + len; RExC_in_multi_char_class = 1; + RExC_override_recoding = 1; RExC_emit = (regnode *)orig_emit; ret = reg(pRExC_state, 1, ®_flags, depth+1); - *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8); + *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8); - RExC_parse = save_parse; + /* And restore so can parse the rest of the pattern */ + RExC_parse = save_parse; + RExC_start = RExC_adjusted_start = save_start; + RExC_precomp_adj = 0; RExC_end = save_end; RExC_in_multi_char_class = 0; + RExC_override_recoding = 0; SvREFCNT_dec_NN(multi_char_matches); return ret; } @@ -14156,8 +16835,9 @@ parseit: * 2) if the character class contains only a single element (including a * single range), we see if there is an equivalent node for it. * Other checks are possible */ - if (! ret_invlist /* Can't optimize if returning the constructed - inversion list */ + if ( optimizable + && ! ret_invlist /* Can't optimize if returning the constructed + inversion list */ && (UNLIKELY(posixl_matches_all) || element_count == 1)) { U8 op = END; @@ -14166,9 +16846,9 @@ parseit: if (UNLIKELY(posixl_matches_all)) { op = SANY; } - else if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like - \w or [:digit:] or \p{foo} - */ + else if (namedclass > OOB_NAMEDCLASS) { /* this is a single named + class, like \w or [:digit:] + or \p{foo} */ /* All named classes are mapped into POSIXish nodes, with its FLAG * argument giving which class it is */ @@ -14190,7 +16870,7 @@ parseit: /* The actual POSIXish node for all the rest depends on the * charset modifier. The ones in the first set depend only on - * ASCII or, if available on this platform, locale */ + * ASCII or, if available on this platform, also locale */ case ANYOF_ASCII: case ANYOF_NASCII: #ifdef HAS_ISASCII @@ -14200,19 +16880,27 @@ parseit: #endif goto join_posix; - case ANYOF_NCASED: + /* The following don't have any matches in the upper Latin1 + * range, hence /d is equivalent to /u for them. Making it /u + * saves some branches at runtime */ + case ANYOF_DIGIT: + case ANYOF_NDIGIT: + case ANYOF_XDIGIT: + case ANYOF_NXDIGIT: + if (! DEPENDS_SEMANTICS) { + goto treat_as_default; + } + + op = POSIXU; + goto join_posix; + + /* The following change to CASED under /i */ case ANYOF_LOWER: case ANYOF_NLOWER: case ANYOF_UPPER: case ANYOF_NUPPER: - /* under /a could be alpha */ if (FOLD) { - if (ASCII_RESTRICTED) { - namedclass = ANYOF_ALPHA + (namedclass % 2); - } - else if (! LOC) { - break; - } + namedclass = ANYOF_CASED + (namedclass % 2); } /* FALLTHROUGH */ @@ -14220,12 +16908,13 @@ parseit: * We take advantage of the enum ordering of the charset * modifiers to get the exact node type, */ default: + treat_as_default: op = POSIXD + get_regex_charset(RExC_flags); if (op > POSIXA) { /* /aa is same as /a */ op = POSIXA; } - join_posix: + join_posix: /* The odd numbered ones are the complements of the * next-lower even number one */ if (namedclass % 2 == 1) { @@ -14244,7 +16933,7 @@ parseit: if (! LOC && value == '\n') { op = REG_ANY; /* Optimize [^\n] */ *flagp |= HASWIDTH|SIMPLE; - RExC_naughty++; + MARK_NAUGHTY(1); } } else if (value < 256 || UTF) { @@ -14261,6 +16950,30 @@ parseit: op = POSIXA; } } + else if (! FOLD || ASCII_FOLD_RESTRICTED) { + /* We can optimize A-Z or a-z, but not if they could match + * something like the KELVIN SIGN under /i. */ + if (prevvalue == 'A') { + if (value == 'Z' +#ifdef EBCDIC + && ! non_portable_endpoint +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_UPPER; + op = POSIXA; + } + } + else if (prevvalue == 'a') { + if (value == 'z' +#ifdef EBCDIC + && ! non_portable_endpoint +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_LOWER; + op = POSIXA; + } + } + } } /* Here, we have changed <op> away from its initial value iff we found @@ -14312,6 +17025,7 @@ parseit: SvREFCNT_dec(posixes); SvREFCNT_dec(nposixes); + SvREFCNT_dec(simple_posixes); SvREFCNT_dec(cp_list); SvREFCNT_dec(cp_foldable_list); return ret; @@ -14334,7 +17048,7 @@ parseit: /* Our calculated list will be for Unicode rules. For locale * matching, we have to keep a separate list that is consulted at * runtime only when the locale indicates Unicode rules. For - * non-locale, we just use to the general list */ + * non-locale, we just use the general list */ if (LOC) { use_list = &only_utf8_locale_list; } @@ -14356,18 +17070,7 @@ parseit: /* This is a hash that for a particular fold gives all * characters that are involved in it */ if (! PL_utf8_foldclosures) { - - /* If the folds haven't been read in, call a fold function - * to force that */ - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES_CASE+1]; - - /* This string is just a short named one above \xff */ - to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); - assert(PL_utf8_tofold); /* Verify that worked */ - } - PL_utf8_foldclosures - = _swash_inversion_hash(PL_utf8_tofold); + _load_PL_utf8_foldclosures(); } } @@ -14384,15 +17087,6 @@ parseit: if (j < 256) { - /* We have the latin1 folding rules hard-coded here so - * that an innocent-looking character class, like - * /[ks]/i won't have to go out to disk to find the - * possible matches. XXX It would be better to - * generate these via regen, in case a new version of - * the Unicode standard adds new mappings, though that - * is not really likely, and may be caught by the - * default: case of the switch below. */ - if (IS_IN_SOME_FOLD_L1(j)) { /* ASCII is always matched; non-ASCII is matched @@ -14403,78 +17097,19 @@ parseit: PL_fold_latin1[j]); } else { - depends_list = - add_cp_to_invlist(depends_list, - PL_fold_latin1[j]); + has_upper_latin1_only_utf8_matches + = add_cp_to_invlist( + has_upper_latin1_only_utf8_matches, + PL_fold_latin1[j]); } } - if (HAS_NONLATIN1_FOLD_CLOSURE(j) + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j) && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) { - /* Certain Latin1 characters have matches outside - * Latin1. To get here, <j> is one of those - * characters. None of these matches is valid for - * ASCII characters under /aa, which is why the 'if' - * just above excludes those. These matches only - * happen when the target string is utf8. The code - * below adds the single fold closures for <j> to the - * inversion list. */ - - switch (j) { - case 'k': - case 'K': - *use_list = - add_cp_to_invlist(*use_list, KELVIN_SIGN); - break; - case 's': - case 'S': - *use_list = add_cp_to_invlist(*use_list, - LATIN_SMALL_LETTER_LONG_S); - break; - case MICRO_SIGN: - *use_list = add_cp_to_invlist(*use_list, - GREEK_CAPITAL_LETTER_MU); - *use_list = add_cp_to_invlist(*use_list, - GREEK_SMALL_LETTER_MU); - break; - case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: - case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: - *use_list = - add_cp_to_invlist(*use_list, ANGSTROM_SIGN); - break; - case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: - *use_list = add_cp_to_invlist(*use_list, - LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); - break; - case LATIN_SMALL_LETTER_SHARP_S: - *use_list = add_cp_to_invlist(*use_list, - LATIN_CAPITAL_LETTER_SHARP_S); - break; - case 'F': case 'f': - case 'I': case 'i': - case 'L': case 'l': - case 'T': case 't': - case 'A': case 'a': - case 'H': case 'h': - case 'J': case 'j': - case 'N': case 'n': - case 'W': case 'w': - case 'Y': case 'y': - /* These all are targets of multi-character - * folds from code points that require UTF8 - * to express, so they can't match unless - * the target string is in UTF-8, so no - * action here is necessary, as regexec.c - * properly handles the general case for - * UTF-8 matching and multi-char folds */ - break; - default: - /* Use deprecated warning to increase the - * chances of this being output */ - ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); - break; - } + add_above_Latin1_folds(pRExC_state, + (U8) j, + use_list); } continue; } @@ -14500,12 +17135,11 @@ parseit: { AV* list = (AV*) *listp; IV k; - for (k = 0; k <= av_tindex(list); k++) { + for (k = 0; k <= av_tindex_nomg(list); k++) { SV** c_p = av_fetch(list, k, FALSE); UV c; - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } + assert(c_p); + c = SvUV(*c_p); /* /aa doesn't allow folds between ASCII and non- */ @@ -14530,8 +17164,10 @@ parseit: else { /* Similarly folds involving non-ascii Latin1 * characters under /d are added to their list */ - depends_list = add_cp_to_invlist(depends_list, - c); + has_upper_latin1_only_utf8_matches + = add_cp_to_invlist( + has_upper_latin1_only_utf8_matches, + c); } } } @@ -14550,6 +17186,10 @@ parseit: * classes. The lists are kept separate up to now because we don't want to * fold the classes (folding of those is automatically handled by the swash * fetching code) */ + if (simple_posixes) { + _invlist_union(cp_list, simple_posixes, &cp_list); + SvREFCNT_dec_NN(simple_posixes); + } if (posixes || nposixes) { if (posixes && AT_LEAST_ASCII_RESTRICTED) { /* Under /a and /aa, nothing above ASCII matches these */ @@ -14561,7 +17201,7 @@ parseit: if (DEPENDS_SEMANTICS) { /* Under /d, everything in the upper half of the Latin1 range * matches these complements */ - ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_NON_ASCII_ALL; + ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; } else if (AT_LEAST_ASCII_RESTRICTED) { /* Under /a and /aa, everything above ASCII matches these @@ -14603,13 +17243,15 @@ parseit: cp_list = posixes; } - if (depends_list) { - _invlist_union(depends_list, nonascii_but_latin1_properties, - &depends_list); + if (has_upper_latin1_only_utf8_matches) { + _invlist_union(has_upper_latin1_only_utf8_matches, + nonascii_but_latin1_properties, + &has_upper_latin1_only_utf8_matches); SvREFCNT_dec_NN(nonascii_but_latin1_properties); } else { - depends_list = nonascii_but_latin1_properties; + has_upper_latin1_only_utf8_matches + = nonascii_but_latin1_properties; } } } @@ -14623,15 +17265,15 @@ parseit: * class that isn't a Unicode property, and which matches above Unicode, \W * or [\x{110000}] for example. * (Note that in this case, unlike the Posix one above, there is no - * <depends_list>, because having a Unicode property forces Unicode - * semantics */ + * <has_upper_latin1_only_utf8_matches>, because having a Unicode property + * forces Unicode semantics */ if (properties) { if (cp_list) { /* If it matters to the final outcome, see if a non-property * component of the class matches above Unicode. If so, the * warning gets suppressed. This is true even if just a single - * such code point is specified, as though not strictly correct if + * such code point is specified, as, though not strictly correct if * another such code point is matched against, the fact that they * are using above-Unicode code points indicates they should know * the issues involved */ @@ -14648,7 +17290,12 @@ parseit: } if (warn_super) { - ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; + ANYOF_FLAGS(ret) + |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; + + /* Because an ANYOF node is the only one that warns, this node + * can't be optimized into something else */ + optimizable = FALSE; } } @@ -14667,28 +17314,115 @@ parseit: * fetching). We know to set the flag if we have a non-NULL list for UTF-8 * locales, or the class matches at least one 0-255 range code point */ if (LOC && FOLD) { + + /* Some things on the list might be unconditionally included because of + * other components. Remove them, and clean up the list if it goes to + * 0 elements */ + if (only_utf8_locale_list && cp_list) { + _invlist_subtract(only_utf8_locale_list, cp_list, + &only_utf8_locale_list); + + if (_invlist_len(only_utf8_locale_list) == 0) { + SvREFCNT_dec_NN(only_utf8_locale_list); + only_utf8_locale_list = NULL; + } + } if (only_utf8_locale_list) { - ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + ANYOF_FLAGS(ret) + |= ANYOFL_FOLD + |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; } - else if (cp_list) { /* Look to see if there a 0-255 code point is in - the list */ + else if (cp_list) { /* Look to see if a 0-255 code point is in list */ UV start, end; invlist_iterinit(cp_list); if (invlist_iternext(cp_list, &start, &end) && start < 256) { - ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + ANYOF_FLAGS(ret) |= ANYOFL_FOLD; } invlist_iterfinish(cp_list); } } +#define MATCHES_ALL_NON_UTF8_NON_ASCII(ret) \ + ( DEPENDS_SEMANTICS \ + && (ANYOF_FLAGS(ret) \ + & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) + + /* See if we can simplify things under /d */ + if ( has_upper_latin1_only_utf8_matches + || MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) + { + /* But not if we are inverting, as that screws it up */ + if (! invert) { + if (has_upper_latin1_only_utf8_matches) { + if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) { + + /* Here, we have both the flag and inversion list. Any + * character in 'has_upper_latin1_only_utf8_matches' + * matches when UTF-8 is in effect, but it also matches + * when UTF-8 is not in effect because of + * MATCHES_ALL_NON_UTF8_NON_ASCII. Therefore it matches + * unconditionally, so can be added to the regular list, + * and 'has_upper_latin1_only_utf8_matches' cleared */ + _invlist_union(cp_list, + has_upper_latin1_only_utf8_matches, + &cp_list); + SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); + has_upper_latin1_only_utf8_matches = NULL; + } + else if (cp_list) { + + /* Here, 'cp_list' gives chars that always match, and + * 'has_upper_latin1_only_utf8_matches' gives chars that + * were specified to match only if the target string is in + * UTF-8. It may be that these overlap, so we can subtract + * the unconditionally matching from the conditional ones, + * to make the conditional list as small as possible, + * perhaps even clearing it, in which case more + * optimizations are possible later */ + _invlist_subtract(has_upper_latin1_only_utf8_matches, + cp_list, + &has_upper_latin1_only_utf8_matches); + if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) { + SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); + has_upper_latin1_only_utf8_matches = NULL; + } + } + } + + /* Similarly, if the unconditional matches include every upper + * latin1 character, we can clear that flag to permit later + * optimizations */ + if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) { + SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1); + _invlist_subtract(only_non_utf8_list, cp_list, + &only_non_utf8_list); + if (_invlist_len(only_non_utf8_list) == 0) { + ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; + } + SvREFCNT_dec_NN(only_non_utf8_list); + only_non_utf8_list = NULL;; + } + } + + /* If we haven't gotten rid of all conditional matching, we change the + * regnode type to indicate that */ + if ( has_upper_latin1_only_utf8_matches + || MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) + { + OP(ret) = ANYOFD; + optimizable = FALSE; + } + } +#undef MATCHES_ALL_NON_UTF8_NON_ASCII + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known * at compile time. Besides not inverting folded locale now, we can't * invert if there are things such as \w, which aren't known until runtime * */ if (cp_list && invert + && OP(ret) != ANYOFD && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) - && ! depends_list && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) { _invlist_invert(cp_list); @@ -14704,6 +17438,8 @@ parseit: } if (ret_invlist) { + assert(cp_list); + *ret_invlist = cp_list; SvREFCNT_dec(swash); @@ -14728,32 +17464,28 @@ parseit: * adjacent such nodes. And if the class is equivalent to things like /./, * expensive run-time swashes can be avoided. Now that we have more * complete information, we can find things necessarily missed by the - * earlier code. I (khw) am not sure how much to look for here. It would - * be easy, but perhaps too slow, to check any candidates against all the - * node types they could possibly match using _invlistEQ(). */ - - if (cp_list - && ! invert - && ! depends_list - && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION - - /* We don't optimize if we are supposed to make sure all non-Unicode - * code points raise a warning, as only ANYOF nodes have this check. - * */ - && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) - { + * earlier code. Another possible "optimization" that isn't done is that + * something like [Ee] could be changed into an EXACTFU. khw tried this + * and found that the ANYOF is faster, including for code points not in the + * bitmap. This still might make sense to do, provided it got joined with + * an adjacent node(s) to create a longer EXACTFU one. This could be + * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join + * routine would know is joinable. If that didn't happen, the node type + * could then be made a straight ANYOF */ + + if (optimizable && cp_list && ! invert) { UV start, end; U8 op = END; /* The optimzation node-type */ + int posix_class = -1; /* Illegal value */ const char * cur_parse= RExC_parse; invlist_iterinit(cp_list); if (! invlist_iternext(cp_list, &start, &end)) { /* Here, the list is empty. This happens, for example, when a - * Unicode property is the only thing in the character class, and - * it doesn't match anything. (perluniprops.pod notes such - * properties) */ + * Unicode property that doesn't match anything is the only element + * in the character class (perluniprops.pod notes such properties). + * */ op = OPFAIL; *flagp |= HASWIDTH|SIMPLE; } @@ -14770,7 +17502,9 @@ parseit: value = start; if (! FOLD) { - op = EXACT; + op = (LOC) + ? EXACTL + : EXACT; } else if (LOC) { @@ -14807,12 +17541,12 @@ parseit: } } } - } + } /* End of first range contains just a single code point */ else if (start == 0) { if (end == UV_MAX) { op = SANY; *flagp |= HASWIDTH|SIMPLE; - RExC_naughty++; + MARK_NAUGHTY(1); } else if (end == '\n' - 1 && invlist_iternext(cp_list, &start, &end) @@ -14820,16 +17554,63 @@ parseit: { op = REG_ANY; *flagp |= HASWIDTH|SIMPLE; - RExC_naughty++; + MARK_NAUGHTY(1); } } invlist_iterfinish(cp_list); + if (op == END) { + const UV cp_list_len = _invlist_len(cp_list); + const UV* cp_list_array = invlist_array(cp_list); + + /* Here, didn't find an optimization. See if this matches any of + * the POSIX classes. These run slightly faster for above-Unicode + * code points, so don't bother with POSIXA ones nor the 2 that + * have no above-Unicode matches. We can avoid these checks unless + * the ANYOF matches at least as high as the lowest POSIX one + * (which was manually found to be \v. The actual code point may + * increase in later Unicode releases, if a higher code point is + * assigned to be \v, but this code will never break. It would + * just mean we could execute the checks for posix optimizations + * unnecessarily) */ + + if (cp_list_array[cp_list_len-1] > 0x2029) { + for (posix_class = 0; + posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC; + posix_class++) + { + int try_inverted; + if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) { + continue; + } + for (try_inverted = 0; try_inverted < 2; try_inverted++) { + + /* Check if matches normal or inverted */ + if (_invlistEQ(cp_list, + PL_XPosix_ptrs[posix_class], + try_inverted)) + { + op = (try_inverted) + ? NPOSIXU + : POSIXU; + *flagp |= HASWIDTH|SIMPLE; + goto found_posix; + } + } + } + found_posix: ; + } + } + if (op != END) { RExC_parse = (char *)orig_parse; RExC_emit = (regnode *)orig_emit; - ret = reg_node(pRExC_state, op); + if (regarglen[op]) { + ret = reganode(pRExC_state, op, 0); + } else { + ret = reg_node(pRExC_state, op); + } RExC_parse = (char *)cur_parse; @@ -14838,6 +17619,9 @@ parseit: TRUE /* downgradable to EXACT */ ); } + else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) { + FLAGS(ret) = posix_class; + } SvREFCNT_dec_NN(cp_list); return ret; @@ -14858,16 +17642,19 @@ parseit: /* Here, the bitmap has been populated with all the Latin1 code points that * always match. Can now add to the overall list those that match only - * when the target string is UTF-8 (<depends_list>). */ - if (depends_list) { + * when the target string is UTF-8 (<has_upper_latin1_only_utf8_matches>). + * */ + if (has_upper_latin1_only_utf8_matches) { if (cp_list) { - _invlist_union(cp_list, depends_list, &cp_list); - SvREFCNT_dec_NN(depends_list); + _invlist_union(cp_list, + has_upper_latin1_only_utf8_matches, + &cp_list); + SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); } else { - cp_list = depends_list; + cp_list = has_upper_latin1_only_utf8_matches; } - ANYOF_FLAGS(ret) |= ANYOF_UTF8; + ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP; } /* If there is a swash and more than one element, we can't use the swash in @@ -14877,6 +17664,10 @@ parseit: swash = NULL; } + /* Note that the optimization of using 'swash' if it is the only thing in + * the class doesn't have us change swash at all, so it can include things + * that are also in the bitmap; otherwise we have purposely deleted that + * duplicate information */ set_ANYOF_arg(pRExC_state, ret, cp_list, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) ? listsv : NULL, @@ -14905,7 +17696,7 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, { /* Sets the arg field of an ANYOF-type node 'node', using information about * the node passed-in. If there is nothing outside the node's bitmap, the - * arg is set to ANYOF_NONBITMAP_EMPTY. Otherwise, it sets the argument to + * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to * the count returned by add_data(), having allocated and stored an array, * av, that that count references, as follows: * av[0] stores the character class description in its textual form. @@ -14931,19 +17722,17 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { assert(! (ANYOF_FLAGS(node) - & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); - ARG_SET(node, ANYOF_NONBITMAP_EMPTY); + & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)); + ARG_SET(node, ANYOF_ONLY_HAS_BITMAP); } else { AV * const av = newAV(); SV *rv; - assert(ANYOF_FLAGS(node) - & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); - av_store(av, 0, (runtime_defns) ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); if (swash) { + assert(cp_list); av_store(av, 1, swash); SvREFCNT_dec_NN(cp_list); } @@ -14969,118 +17758,394 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, } } +#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) +SV * +Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, + const regnode* node, + bool doinit, + SV** listsvp, + SV** only_utf8_locale_ptr, + SV** output_invlist) -/* reg_skipcomment() +{ + /* For internal core use only. + * Returns the swash for the input 'node' in the regex 'prog'. + * If <doinit> is 'true', will attempt to create the swash if not already + * done. + * If <listsvp> is non-null, will return the printable contents of the + * swash. This can be used to get debugging information even before the + * swash exists, by calling this function with 'doinit' set to false, in + * which case the components that will be used to eventually create the + * swash are returned (in a printable form). + * If <only_utf8_locale_ptr> is not NULL, it is where this routine is to + * store an inversion list of code points that should match only if the + * execution-time locale is a UTF-8 one. + * If <output_invlist> is not NULL, it is where this routine is to store an + * inversion list of the code points that would be instead returned in + * <listsvp> if this were NULL. Thus, what gets output in <listsvp> + * when this parameter is used, is just the non-code point data that + * will go into creating the swash. This currently should be just + * user-defined properties whose definitions were not known at compile + * time. Using this parameter allows for easier manipulation of the + * swash's data by the caller. It is illegal to call this function with + * this parameter set, but not <listsvp> + * + * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note + * that, in spite of this function's name, the swash it returns may include + * the bitmap data as well */ - Absorbs an /x style # comments from the input stream. - Returns true if there is more text remaining in the stream. - Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment - terminates the pattern without including a newline. + SV *sw = NULL; + SV *si = NULL; /* Input swash initialization string */ + SV* invlist = NULL; - Note its the callers responsibility to ensure that we are - actually in /x mode + RXi_GET_DECL(prog,progi); + const struct reg_data * const data = prog ? progi->data : NULL; -*/ + PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; + assert(! output_invlist || listsvp); -STATIC bool -S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) -{ - bool ended = 0; + if (data && data->count) { + const U32 n = ARG(node); - PERL_ARGS_ASSERT_REG_SKIPCOMMENT; + if (data->what[n] == 's') { + SV * const rv = MUTABLE_SV(data->data[n]); + AV * const av = MUTABLE_AV(SvRV(rv)); + SV **const ary = AvARRAY(av); + U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; - while (RExC_parse < RExC_end) - if (*RExC_parse++ == '\n') { - ended = 1; - break; + si = *ary; /* ary[0] = the string to initialize the swash with */ + + if (av_tindex_nomg(av) >= 2) { + if (only_utf8_locale_ptr + && ary[2] + && ary[2] != &PL_sv_undef) + { + *only_utf8_locale_ptr = ary[2]; + } + else { + assert(only_utf8_locale_ptr); + *only_utf8_locale_ptr = NULL; + } + + /* Elements 3 and 4 are either both present or both absent. [3] + * is any inversion list generated at compile time; [4] + * indicates if that inversion list has any user-defined + * properties in it. */ + if (av_tindex_nomg(av) >= 3) { + invlist = ary[3]; + if (SvUV(ary[4])) { + swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + } + } + else { + invlist = NULL; + } + } + + /* Element [1] is reserved for the set-up swash. If already there, + * return it; if not, create it and store it there */ + if (ary[1] && SvROK(ary[1])) { + sw = ary[1]; + } + else if (doinit && ((si && si != &PL_sv_undef) + || (invlist && invlist != &PL_sv_undef))) { + assert(si); + sw = _core_swash_init("utf8", /* the utf8 package */ + "", /* nameless */ + si, + 1, /* binary */ + 0, /* not from tr/// */ + invlist, + &swash_init_flags); + (void)av_store(av, 1, sw); + } + } + } + + /* If requested, return a printable version of what this swash matches */ + if (listsvp) { + SV* matches_string = NULL; + + /* The swash should be used, if possible, to get the data, as it + * contains the resolved data. But this function can be called at + * compile-time, before everything gets resolved, in which case we + * return the currently best available information, which is the string + * that will eventually be used to do that resolving, 'si' */ + if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) + && (si && si != &PL_sv_undef)) + { + /* Here, we only have 'si' (and possibly some passed-in data in + * 'invlist', which is handled below) If the caller only wants + * 'si', use that. */ + if (! output_invlist) { + matches_string = newSVsv(si); + } + else { + /* But if the caller wants an inversion list of the node, we + * need to parse 'si' and place as much as possible in the + * desired output inversion list, making 'matches_string' only + * contain the currently unresolvable things */ + const char *si_string = SvPVX(si); + STRLEN remaining = SvCUR(si); + UV prev_cp = 0; + U8 count = 0; + + /* Ignore everything before the first new-line */ + while (*si_string != '\n' && remaining > 0) { + si_string++; + remaining--; + } + assert(remaining > 0); + + si_string++; + remaining--; + + while (remaining > 0) { + + /* The data consists of just strings defining user-defined + * property names, but in prior incarnations, and perhaps + * somehow from pluggable regex engines, it could still + * hold hex code point definitions. Each component of a + * range would be separated by a tab, and each range by a + * new-line. If these are found, instead add them to the + * inversion list */ + I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT + |PERL_SCAN_SILENT_NON_PORTABLE; + STRLEN len = remaining; + UV cp = grok_hex(si_string, &len, &grok_flags, NULL); + + /* If the hex decode routine found something, it should go + * up to the next \n */ + if ( *(si_string + len) == '\n') { + if (count) { /* 2nd code point on line */ + *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp); + } + else { + *output_invlist = add_cp_to_invlist(*output_invlist, cp); + } + count = 0; + goto prepare_for_next_iteration; + } + + /* If the hex decode was instead for the lower range limit, + * save it, and go parse the upper range limit */ + if (*(si_string + len) == '\t') { + assert(count == 0); + + prev_cp = cp; + count = 1; + prepare_for_next_iteration: + si_string += len + 1; + remaining -= len + 1; + continue; + } + + /* Here, didn't find a legal hex number. Just add it from + * here to the next \n */ + + remaining -= len; + while (*(si_string + len) != '\n' && remaining > 0) { + remaining--; + len++; + } + if (*(si_string + len) == '\n') { + len++; + remaining--; + } + if (matches_string) { + sv_catpvn(matches_string, si_string, len - 1); + } + else { + matches_string = newSVpvn(si_string, len - 1); + } + si_string += len; + sv_catpvs(matches_string, " "); + } /* end of loop through the text */ + + assert(matches_string); + if (SvCUR(matches_string)) { /* Get rid of trailing blank */ + SvCUR_set(matches_string, SvCUR(matches_string) - 1); + } + } /* end of has an 'si' but no swash */ + } + + /* If we have a swash in place, its equivalent inversion list was above + * placed into 'invlist'. If not, this variable may contain a stored + * inversion list which is information beyond what is in 'si' */ + if (invlist) { + + /* Again, if the caller doesn't want the output inversion list, put + * everything in 'matches-string' */ + if (! output_invlist) { + if ( ! matches_string) { + matches_string = newSVpvs("\n"); + } + sv_catsv(matches_string, invlist_contents(invlist, + TRUE /* traditional style */ + )); + } + else if (! *output_invlist) { + *output_invlist = invlist_clone(invlist); + } + else { + _invlist_union(*output_invlist, invlist, output_invlist); + } } - if (!ended) { - /* we ran off the end of the pattern without ending - the comment, so we have to add an \n when wrapping */ - RExC_seen |= REG_RUN_ON_COMMENT_SEEN; - return 0; - } else - return 1; -} -/* nextchar() + *listsvp = matches_string; + } + + return sw; +} +#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ - Advances the parse position, and optionally absorbs - "whitespace" from the inputstream. +/* reg_skipcomment() - Without /x "whitespace" means (?#...) style comments only, - with /x this means (?#...) and # comments and whitespace proper. + Absorbs an /x style # comment from the input stream, + returning a pointer to the first character beyond the comment, or if the + comment terminates the pattern without anything following it, this returns + one past the final character of the pattern (in other words, RExC_end) and + sets the REG_RUN_ON_COMMENT_SEEN flag. - Returns the RExC_parse point from BEFORE the scan occurs. + Note it's the callers responsibility to ensure that we are + actually in /x mode - This is the /x friendly way of saying RExC_parse++. */ -STATIC char* -S_nextchar(pTHX_ RExC_state_t *pRExC_state) +PERL_STATIC_INLINE char* +S_reg_skipcomment(RExC_state_t *pRExC_state, char* p) { - char* const retval = RExC_parse++; + PERL_ARGS_ASSERT_REG_SKIPCOMMENT; - PERL_ARGS_ASSERT_NEXTCHAR; + assert(*p == '#'); + + while (p < RExC_end) { + if (*(++p) == '\n') { + return p+1; + } + } + + /* we ran off the end of the pattern without ending the comment, so we have + * to add an \n when wrapping */ + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; + return p; +} + +STATIC void +S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state, + char ** p, + const bool force_to_xmod + ) +{ + /* If the text at the current parse position '*p' is a '(?#...)' comment, + * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p' + * is /x whitespace, advance '*p' so that on exit it points to the first + * byte past all such white space and comments */ + + const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED); + + PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT; + + assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p)); for (;;) { - if (RExC_end - RExC_parse >= 3 - && *RExC_parse == '(' - && RExC_parse[1] == '?' - && RExC_parse[2] == '#') + if (RExC_end - (*p) >= 3 + && *(*p) == '(' + && *(*p + 1) == '?' + && *(*p + 2) == '#') { - while (*RExC_parse != ')') { - if (RExC_parse == RExC_end) + while (*(*p) != ')') { + if ((*p) == RExC_end) FAIL("Sequence (?#... not terminated"); - RExC_parse++; + (*p)++; } - RExC_parse++; + (*p)++; continue; } - if (RExC_flags & RXf_PMf_EXTENDED) { - if (isSPACE(*RExC_parse)) { - RExC_parse++; - continue; - } - else if (*RExC_parse == '#') { - if ( reg_skipcomment( pRExC_state ) ) - continue; - } + + if (use_xmod) { + const char * save_p = *p; + while ((*p) < RExC_end) { + STRLEN len; + if ((len = is_PATWS_safe((*p), RExC_end, UTF))) { + (*p) += len; + } + else if (*(*p) == '#') { + (*p) = reg_skipcomment(pRExC_state, (*p)); + } + else { + break; + } + } + if (*p != save_p) { + continue; + } } - return retval; + + break; } + + return; } -/* -- reg_node - emit a node +/* nextchar() + + Advances the parse position by one byte, unless that byte is the beginning + of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In + those two cases, the parse position is advanced beyond all such comments and + white space. + + This is the UTF, (?#...), and /x friendly way of saying RExC_parse++. */ -STATIC regnode * /* Location. */ -S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) + +STATIC void +S_nextchar(pTHX_ RExC_state_t *pRExC_state) { - dVAR; - regnode *ptr; + PERL_ARGS_ASSERT_NEXTCHAR; + + if (RExC_parse < RExC_end) { + assert( ! UTF + || UTF8_IS_INVARIANT(*RExC_parse) + || UTF8_IS_START(*RExC_parse)); + + RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't assume /x */ ); + } +} + +STATIC regnode * +S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name) +{ + /* Allocate a regnode for 'op' and returns it, with 'extra_size' extra + * space. In pass1, it aligns and increments RExC_size; in pass2, + * RExC_emit */ + regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; - PERL_ARGS_ASSERT_REG_NODE; + PERL_ARGS_ASSERT_REGNODE_GUTS; + + assert(extra_size >= regarglen[op]); if (SIZE_ONLY) { SIZE_ALIGN(RExC_size); - RExC_size += 1; + RExC_size += 1 + extra_size; return(ret); } if (RExC_emit >= RExC_emit_bound) Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", - op, RExC_emit, RExC_emit_bound); + op, (void*)RExC_emit, (void*)RExC_emit_bound); NODE_ALIGN_FILL(ret); - ptr = ret; - FILL_ADVANCE_NODE(ptr, op); -#ifdef RE_TRACK_PATTERN_OFFSETS +#ifndef RE_TRACK_PATTERN_OFFSETS + PERL_UNUSED_ARG(name); +#else if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG( ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", - "reg_node", __LINE__, + name, __LINE__, PL_reg_name[op], (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", @@ -15090,79 +18155,66 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); } #endif - RExC_emit = ptr; return(ret); } /* -- reganode - emit a node with an argument +- reg_node - emit a node */ STATIC regnode * /* Location. */ -S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) +S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) { - dVAR; - regnode *ptr; - regnode * const ret = RExC_emit; - GET_RE_DEBUG_FLAGS_DECL; + regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node"); - PERL_ARGS_ASSERT_REGANODE; + PERL_ARGS_ASSERT_REG_NODE; - if (SIZE_ONLY) { - SIZE_ALIGN(RExC_size); - RExC_size += 2; - /* - We can't do this: + assert(regarglen[op] == 0); - assert(2==regarglen[op]+1); + if (PASS2) { + regnode *ptr = ret; + FILL_ADVANCE_NODE(ptr, op); + RExC_emit = ptr; + } + return(ret); +} - Anything larger than this has to allocate the extra amount. - If we changed this to be: +/* +- reganode - emit a node with an argument +*/ +STATIC regnode * /* Location. */ +S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) +{ + regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode"); - RExC_size += (1 + regarglen[op]); + PERL_ARGS_ASSERT_REGANODE; - then it wouldn't matter. Its not clear what side effect - might come from that so its not done so far. - -- dmq - */ - return(ret); - } - if (RExC_emit >= RExC_emit_bound) - Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", - op, RExC_emit, RExC_emit_bound); + assert(regarglen[op] == 1); - NODE_ALIGN_FILL(ret); - ptr = ret; - FILL_ADVANCE_NODE_ARG(ptr, op, arg); -#ifdef RE_TRACK_PATTERN_OFFSETS - if (RExC_offsets) { /* MJD */ - MJD_OFFSET_DEBUG( - ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", - "reganode", - __LINE__, - PL_reg_name[op], - (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? - "Overwriting end of array!\n" : "OK", - (UV)(RExC_emit - RExC_emit_start), - (UV)(RExC_parse - RExC_start), - (UV)RExC_offsets[0])); - Set_Cur_Node_Offset; + if (PASS2) { + regnode *ptr = ret; + FILL_ADVANCE_NODE_ARG(ptr, op, arg); + RExC_emit = ptr; } -#endif - RExC_emit = ptr; return(ret); } -/* -- reguni - emit (if appropriate) a Unicode character -*/ -PERL_STATIC_INLINE STRLEN -S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) +STATIC regnode * +S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const I32 arg2) { - dVAR; + /* emit a node with U32 and I32 arguments */ + + regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode"); + + PERL_ARGS_ASSERT_REG2LANODE; - PERL_ARGS_ASSERT_REGUNI; + assert(regarglen[op] == 2); - return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); + if (PASS2) { + regnode *ptr = ret; + FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2); + RExC_emit = ptr; + } + return(ret); } /* @@ -15173,7 +18225,6 @@ S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) STATIC void S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) { - dVAR; regnode *src; regnode *dst; regnode *place; @@ -15182,6 +18233,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGINSERT; + PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(depth); /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); @@ -15196,6 +18248,9 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) if (RExC_open_parens) { int paren; /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ + /* remember that RExC_npar is rex->nparens + 1, + * iow it is 1 more than the number of parens seen in + * the pattern so far. */ for ( paren=0 ; paren < RExC_npar ; paren++ ) { if ( RExC_open_parens[paren] >= opnd ) { /*DEBUG_PARSE_FMT("open"," - %d",size);*/ @@ -15211,6 +18266,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) } } } + if (RExC_end_op) + RExC_end_op += size; while (src > opnd) { StructCopy(--src, --dst, regnode); @@ -15259,12 +18316,12 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) - regtail - set the next-pointer at the end of a node chain of p to val. - SEE ALSO: regtail_study */ -/* TODO: All three parms should be const */ STATIC void -S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, - const regnode *val,U32 depth) +S_regtail(pTHX_ RExC_state_t * pRExC_state, + const regnode * const p, + const regnode * const val, + const U32 depth) { - dVAR; regnode *scan; GET_RE_DEBUG_FLAGS_DECL; @@ -15277,15 +18334,14 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, return; /* Find last node. */ - scan = p; + scan = (regnode *) p; for (;;) { regnode * const temp = regnext(scan); DEBUG_PARSE_r({ - SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tail" : "")); - regprop(RExC_rx, mysv, scan, NULL); - PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", - SvPV_nolen_const(mysv), REG_NODE_NUM(scan), + regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); + Perl_re_printf( aTHX_ "~ %s (%d) %s %s\n", + SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan), (temp == NULL ? "->" : ""), (temp == NULL ? PL_reg_name[OP(val)] : "") ); @@ -15324,7 +18380,6 @@ STATIC U8 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) { - dVAR; regnode *scan; U8 exact = PSEUDO; #ifdef EXPERIMENTAL_INPLACESCAN @@ -15354,10 +18409,12 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, if ( exact ) { switch (OP(scan)) { case EXACT: + case EXACTL: case EXACTF: case EXACTFA_NO_TRIE: case EXACTFA: case EXACTFU: + case EXACTFLU8: case EXACTFU_SS: case EXACTFL: if( exact == PSEUDO ) @@ -15371,11 +18428,10 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, } } DEBUG_PARSE_r({ - SV * const mysv=sv_newmortal(); DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); - regprop(RExC_rx, mysv, scan, NULL); - PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", - SvPV_nolen_const(mysv), + regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); + Perl_re_printf( aTHX_ "~ %s (%d) -> %s\n", + SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan), PL_reg_name[exact]); }); @@ -15384,12 +18440,11 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, scan = temp; } DEBUG_PARSE_r({ - SV * const mysv_val=sv_newmortal(); DEBUG_PARSE_MSG(""); - regprop(RExC_rx, mysv_val, val, NULL); - PerlIO_printf(Perl_debug_log, + regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state); + Perl_re_printf( aTHX_ "~ attach to %s (%"IVdf") offset to %"IVdf"\n", - SvPV_nolen_const(mysv_val), + SvPV_nolen_const(RExC_mysv), (IV)REG_NODE_NUM(val), (IV)(val - scan) ); @@ -15421,15 +18476,15 @@ S_regdump_intflags(pTHX_ const char *lead, const U32 flags) for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) { if (flags & (1<<bit)) { if (!set++ && lead) - PerlIO_printf(Perl_debug_log, "%s",lead); - PerlIO_printf(Perl_debug_log, "%s ",PL_reg_intflags_name[bit]); + Perl_re_printf( aTHX_ "%s",lead); + Perl_re_printf( aTHX_ "%s ",PL_reg_intflags_name[bit]); } } if (lead) { if (set) - PerlIO_printf(Perl_debug_log, "\n"); + Perl_re_printf( aTHX_ "\n"); else - PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead); + Perl_re_printf( aTHX_ "%s[none-set]\n",lead); } } @@ -15448,37 +18503,37 @@ S_regdump_extflags(pTHX_ const char *lead, const U32 flags) continue; } if (!set++ && lead) - PerlIO_printf(Perl_debug_log, "%s",lead); - PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]); + Perl_re_printf( aTHX_ "%s",lead); + Perl_re_printf( aTHX_ "%s ",PL_reg_extflags_name[bit]); } } if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) { if (!set++ && lead) { - PerlIO_printf(Perl_debug_log, "%s",lead); + Perl_re_printf( aTHX_ "%s",lead); } switch (cs) { case REGEX_UNICODE_CHARSET: - PerlIO_printf(Perl_debug_log, "UNICODE"); + Perl_re_printf( aTHX_ "UNICODE"); break; case REGEX_LOCALE_CHARSET: - PerlIO_printf(Perl_debug_log, "LOCALE"); + Perl_re_printf( aTHX_ "LOCALE"); break; case REGEX_ASCII_RESTRICTED_CHARSET: - PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED"); + Perl_re_printf( aTHX_ "ASCII-RESTRICTED"); break; case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED"); + Perl_re_printf( aTHX_ "ASCII-MORE_RESTRICTED"); break; default: - PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET"); + Perl_re_printf( aTHX_ "UNKNOWN CHARACTER SET"); break; } } if (lead) { if (set) - PerlIO_printf(Perl_debug_log, "\n"); + Perl_re_printf( aTHX_ "\n"); else - PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead); + Perl_re_printf( aTHX_ "%s[none-set]\n",lead); } } #endif @@ -15487,7 +18542,6 @@ void Perl_regdump(pTHX_ const regexp *r) { #ifdef DEBUGGING - dVAR; SV * const sv = sv_newmortal(); SV *dsv= sv_newmortal(); RXi_GET_DECL(r,ri); @@ -15501,14 +18555,14 @@ Perl_regdump(pTHX_ const regexp *r) if (r->anchored_substr) { RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), RE_SV_DUMPLEN(r->anchored_substr), 30); - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "anchored %s%s at %"IVdf" ", s, RE_SV_TAIL(r->anchored_substr), (IV)r->anchored_offset); } else if (r->anchored_utf8) { RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), RE_SV_DUMPLEN(r->anchored_utf8), 30); - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "anchored utf8 %s%s at %"IVdf" ", s, RE_SV_TAIL(r->anchored_utf8), (IV)r->anchored_offset); @@ -15516,57 +18570,55 @@ Perl_regdump(pTHX_ const regexp *r) if (r->float_substr) { RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), RE_SV_DUMPLEN(r->float_substr), 30); - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "floating %s%s at %"IVdf"..%"UVuf" ", s, RE_SV_TAIL(r->float_substr), (IV)r->float_min_offset, (UV)r->float_max_offset); } else if (r->float_utf8) { RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), RE_SV_DUMPLEN(r->float_utf8), 30); - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "floating utf8 %s%s at %"IVdf"..%"UVuf" ", s, RE_SV_TAIL(r->float_utf8), (IV)r->float_min_offset, (UV)r->float_max_offset); } if (r->check_substr || r->check_utf8) - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ (const char *) (r->check_substr == r->float_substr && r->check_utf8 == r->float_utf8 ? "(checking floating" : "(checking anchored")); if (r->intflags & PREGf_NOSCAN) - PerlIO_printf(Perl_debug_log, " noscan"); + Perl_re_printf( aTHX_ " noscan"); if (r->extflags & RXf_CHECK_ALL) - PerlIO_printf(Perl_debug_log, " isall"); + Perl_re_printf( aTHX_ " isall"); if (r->check_substr || r->check_utf8) - PerlIO_printf(Perl_debug_log, ") "); + Perl_re_printf( aTHX_ ") "); if (ri->regstclass) { - regprop(r, sv, ri->regstclass, NULL); - PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); + regprop(r, sv, ri->regstclass, NULL, NULL); + Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv)); } if (r->intflags & PREGf_ANCH) { - PerlIO_printf(Perl_debug_log, "anchored"); - if (r->intflags & PREGf_ANCH_BOL) - PerlIO_printf(Perl_debug_log, "(BOL)"); + Perl_re_printf( aTHX_ "anchored"); if (r->intflags & PREGf_ANCH_MBOL) - PerlIO_printf(Perl_debug_log, "(MBOL)"); + Perl_re_printf( aTHX_ "(MBOL)"); if (r->intflags & PREGf_ANCH_SBOL) - PerlIO_printf(Perl_debug_log, "(SBOL)"); + Perl_re_printf( aTHX_ "(SBOL)"); if (r->intflags & PREGf_ANCH_GPOS) - PerlIO_printf(Perl_debug_log, "(GPOS)"); - PerlIO_putc(Perl_debug_log, ' '); + Perl_re_printf( aTHX_ "(GPOS)"); + Perl_re_printf( aTHX_ " "); } if (r->intflags & PREGf_GPOS_SEEN) - PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); + Perl_re_printf( aTHX_ "GPOS:%"UVuf" ", (UV)r->gofs); if (r->intflags & PREGf_SKIP) - PerlIO_printf(Perl_debug_log, "plus "); + Perl_re_printf( aTHX_ "plus "); if (r->intflags & PREGf_IMPLICIT) - PerlIO_printf(Perl_debug_log, "implicit "); - PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen); + Perl_re_printf( aTHX_ "implicit "); + Perl_re_printf( aTHX_ "minlen %"IVdf" ", (IV)r->minlen); if (r->extflags & RXf_EVAL_SEEN) - PerlIO_printf(Perl_debug_log, "with eval "); - PerlIO_printf(Perl_debug_log, "\n"); + Perl_re_printf( aTHX_ "with eval "); + Perl_re_printf( aTHX_ "\n"); DEBUG_FLAGS_r({ regdump_extflags("r->extflags: ",r->extflags); regdump_intflags("r->intflags: ",r->intflags); @@ -15578,68 +18630,68 @@ Perl_regdump(pTHX_ const regexp *r) #endif /* DEBUGGING */ } +/* Should be synchronized with ANYOF_ #defines in regcomp.h */ +#ifdef DEBUGGING + +# if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \ + || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \ + || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \ + || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \ + || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \ + || _CC_VERTSPACE != 15 +# error Need to adjust order of anyofs[] +# endif +static const char * const anyofs[] = { + "\\w", + "\\W", + "\\d", + "\\D", + "[:alpha:]", + "[:^alpha:]", + "[:lower:]", + "[:^lower:]", + "[:upper:]", + "[:^upper:]", + "[:punct:]", + "[:^punct:]", + "[:print:]", + "[:^print:]", + "[:alnum:]", + "[:^alnum:]", + "[:graph:]", + "[:^graph:]", + "[:cased:]", + "[:^cased:]", + "\\s", + "\\S", + "[:blank:]", + "[:^blank:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:cntrl:]", + "[:^cntrl:]", + "[:ascii:]", + "[:^ascii:]", + "\\v", + "\\V" +}; +#endif + /* - regprop - printable representation of opcode, with run time support */ void -Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo) +Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state) { #ifdef DEBUGGING - dVAR; int k; - - /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ - static const char * const anyofs[] = { -#if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \ - || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \ - || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \ - || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \ - || _CC_PSXSPC != 13 || _CC_CNTRL != 14 || _CC_ASCII != 15 \ - || _CC_VERTSPACE != 16 - #error Need to adjust order of anyofs[] -#endif - "\\w", - "\\W", - "\\d", - "\\D", - "[:alpha:]", - "[:^alpha:]", - "[:lower:]", - "[:^lower:]", - "[:upper:]", - "[:^upper:]", - "[:punct:]", - "[:^punct:]", - "[:print:]", - "[:^print:]", - "[:alnum:]", - "[:^alnum:]", - "[:graph:]", - "[:^graph:]", - "[:cased:]", - "[:^cased:]", - "\\s", - "\\S", - "[:blank:]", - "[:^blank:]", - "[:xdigit:]", - "[:^xdigit:]", - "[:space:]", - "[:^space:]", - "[:cntrl:]", - "[:^cntrl:]", - "[:ascii:]", - "[:^ascii:]", - "\\v", - "\\V" - }; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGPROP; - sv_setpvs(sv, ""); + sv_setpvn(sv, "", 0); if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ /* It would be nice to FAIL() here, but this may be called from @@ -15689,35 +18741,51 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ ); if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { sv_catpvs(sv, "["); - (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op) - ? ANYOF_BITMAP(o) - : TRIE_BITMAP(trie)); + (void) put_charclass_bitmap_innards(sv, + ((IS_ANYOF_TRIE(op)) + ? ANYOF_BITMAP(o) + : TRIE_BITMAP(trie)), + NULL, + NULL, + NULL + ); sv_catpvs(sv, "]"); } } else if (k == CURLY) { + U32 lo = ARG1(o), hi = ARG2(o); if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ - Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); + Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo); + if (hi == REG_INFTY) + sv_catpvs(sv, "INFTY"); + else + Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi); + sv_catpvs(sv, "}"); } else if (k == WHILEM && o->flags) /* Ordinal/of */ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { - Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ + AV *name_list= NULL; + U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o); + Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno); /* Parenth number */ if ( RXp_PAREN_NAMES(prog) ) { + name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); + } else if ( pRExC_state ) { + name_list= RExC_paren_name_list; + } + if (name_list) { if ( k != REF || (OP(o) < NREF)) { - AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); - SV **name= av_fetch(list, ARG(o), 0 ); + SV **name= av_fetch(name_list, parno, 0 ); if (name) Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); } else { - AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]); - SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]); + SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]); I32 *nums=(I32*)SvPVX(sv_dat); - SV **name= av_fetch(list, nums[0], 0 ); + SV **name= av_fetch(name_list, nums[0], 0 ); I32 n; if (name) { for ( n=0; n<SvIVX(sv_dat); n++ ) { @@ -15742,146 +18810,155 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE ); } } - } else if (k == GOSUB) + } else if (k == GOSUB) { + AV *name_list= NULL; + if ( RXp_PAREN_NAMES(prog) ) { + name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); + } else if ( pRExC_state ) { + name_list= RExC_paren_name_list; + } + /* Paren and offset */ - Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); - else if (k == VERB) { - if (!o->flags) - Perl_sv_catpvf(aTHX_ sv, ":%"SVf, - SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); - } else if (k == LOGICAL) + Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o), + (int)((o + (int)ARG2L(o)) - progi->program) ); + if (name_list) { + SV **name= av_fetch(name_list, ARG(o), 0 ); + if (name) + Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); + } + } + else if (k == LOGICAL) /* 2: embedded, otherwise 1 */ Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); else if (k == ANYOF) { const U8 flags = ANYOF_FLAGS(o); - int do_sep = 0; + bool do_sep = FALSE; /* Do we need to separate various components of + the output? */ + /* Set if there is still an unresolved user-defined property */ + SV *unresolved = NULL; + /* Things that are ignored except when the runtime locale is UTF-8 */ + SV *only_utf8_locale_invlist = NULL; - if (flags & ANYOF_LOCALE_FLAGS) - sv_catpvs(sv, "{loc}"); - if (flags & ANYOF_LOC_FOLD) - sv_catpvs(sv, "{i}"); - Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); - if (flags & ANYOF_INVERT) - sv_catpvs(sv, "^"); + /* Code points that don't fit in the bitmap */ + SV *nonbitmap_invlist = NULL; - /* output what the standard cp 0-255 bitmap matches */ - do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o)); + /* And things that aren't in the bitmap, but are small enough to be */ + SV* bitmap_range_not_in_bitmap = NULL; - /* output any special charclass tests (used entirely under use - * locale) * */ - if (ANYOF_POSIXL_TEST_ANY_SET(o)) { - int i; - for (i = 0; i < ANYOF_POSIXL_MAX; i++) { - if (ANYOF_POSIXL_TEST(o,i)) { - sv_catpv(sv, anyofs[i]); - do_sep = 1; - } + if (OP(o) == ANYOFL) { + if (ANYOFL_UTF8_LOCALE_REQD(flags)) { + sv_catpvs(sv, "{utf8-locale-reqd}"); + } + if (flags & ANYOFL_FOLD) { + sv_catpvs(sv, "{i}"); } } - if ((flags & (ANYOF_ABOVE_LATIN1_ALL - |ANYOF_UTF8 - |ANYOF_NONBITMAP_NON_UTF8 - |ANYOF_LOC_FOLD))) - { + /* If there is stuff outside the bitmap, get it */ + if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) { + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &unresolved, + &only_utf8_locale_invlist, + &nonbitmap_invlist); + /* The non-bitmap data may contain stuff that could fit in the + * bitmap. This could come from a user-defined property being + * finally resolved when this call was done; or much more likely + * because there are matches that require UTF-8 to be valid, and so + * aren't in the bitmap. This is teased apart later */ + _invlist_intersection(nonbitmap_invlist, + PL_InBitmap, + &bitmap_range_not_in_bitmap); + /* Leave just the things that don't fit into the bitmap */ + _invlist_subtract(nonbitmap_invlist, + PL_InBitmap, + &nonbitmap_invlist); + } + + /* Obey this flag to add all above-the-bitmap code points */ + if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { + nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, + NUM_ANYOF_CODE_POINTS, + UV_MAX); + } + + /* Ready to start outputting. First, the initial left bracket */ + Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); + + /* Then all the things that could fit in the bitmap */ + do_sep = put_charclass_bitmap_innards(sv, + ANYOF_BITMAP(o), + bitmap_range_not_in_bitmap, + only_utf8_locale_invlist, + o); + SvREFCNT_dec(bitmap_range_not_in_bitmap); + + /* If there are user-defined properties which haven't been defined yet, + * output them, in a separate [] from the bitmap range stuff */ + if (unresolved) { if (do_sep) { Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); - if (flags & ANYOF_INVERT) - /*make sure the invert info is in each */ - sv_catpvs(sv, "^"); } - - if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL) { - sv_catpvs(sv, "{non-utf8-latin1-all}"); + if (flags & ANYOF_INVERT) { + sv_catpvs(sv, "^"); } + sv_catsv(sv, unresolved); + do_sep = TRUE; + SvREFCNT_dec_NN(unresolved); + } - /* output information about the unicode matching */ - if (flags & ANYOF_ABOVE_LATIN1_ALL) - sv_catpvs(sv, "{unicode_all}"); - else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { - SV *lv; /* Set if there is something outside the bit map. */ - bool byte_output = FALSE; /* If something in the bitmap has - been output */ - SV *only_utf8_locale; - - /* Get the stuff that wasn't in the bitmap */ - (void) _get_regclass_nonbitmap_data(prog, o, FALSE, - &lv, &only_utf8_locale); - if (lv && lv != &PL_sv_undef) { - char *s = savesvpv(lv); - char * const origs = s; - - while (*s && *s != '\n') - s++; - - if (*s == '\n') { - const char * const t = ++s; - - if (flags & ANYOF_NONBITMAP_NON_UTF8) { - sv_catpvs(sv, "{outside bitmap}"); - } - else { - sv_catpvs(sv, "{utf8}"); - } + /* And, finally, add the above-the-bitmap stuff */ + if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) { + SV* contents; - if (byte_output) { - sv_catpvs(sv, " "); - } + /* See if truncation size is overridden */ + const STRLEN dump_len = (PL_dump_re_max_len) + ? PL_dump_re_max_len + : 256; - while (*s) { - if (*s == '\n') { + /* This is output in a separate [] */ + if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); + } - /* Truncate very long output */ - if (s - origs > 256) { - Perl_sv_catpvf(aTHX_ sv, - "%.*s...", - (int) (s - origs - 1), - t); - goto out_dump; - } - *s = ' '; - } - else if (*s == '\t') { - *s = '-'; - } - s++; - } - if (s[-1] == ' ') - s[-1] = 0; + /* And, for easy of understanding, it is always output not-shown as + * complemented */ + if (flags & ANYOF_INVERT) { + _invlist_invert(nonbitmap_invlist); + _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist); + } - sv_catpv(sv, t); - } + contents = invlist_contents(nonbitmap_invlist, + FALSE /* output suitable for catsv */ + ); - out_dump: + /* If the output is shorter than the permissible maximum, just do it. */ + if (SvCUR(contents) <= dump_len) { + sv_catsv(sv, contents); + } + else { + const char * contents_string = SvPVX(contents); + STRLEN i = dump_len; - Safefree(origs); - SvREFCNT_dec_NN(lv); + /* Otherwise, start at the permissible max and work back to the + * first break possibility */ + while (i > 0 && contents_string[i] != ' ') { + i--; } - - if ((flags & ANYOF_LOC_FOLD) - && only_utf8_locale - && only_utf8_locale != &PL_sv_undef) - { - UV start, end; - int max_entries = 256; - - sv_catpvs(sv, "{utf8 locale}"); - invlist_iterinit(only_utf8_locale); - while (invlist_iternext(only_utf8_locale, - &start, &end)) { - put_range(sv, start, end); - max_entries --; - if (max_entries < 0) { - sv_catpvs(sv, "..."); - break; - } - } - invlist_iterfinish(only_utf8_locale); + if (i == 0) { /* Fail-safe. Use the max if we couldn't + find a legal break */ + i = dump_len; } + + sv_catpvn(sv, contents_string, i); + sv_catpvs(sv, "..."); } - } + SvREFCNT_dec_NN(contents); + SvREFCNT_dec_NN(nonbitmap_invlist); + } + + /* And finally the matching, closing ']' */ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); } else if (k == POSIXD || k == NPOSIXD) { @@ -15899,14 +18976,35 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); } } + else if (k == BOUND || k == NBOUND) { + /* Must be synced with order of 'bound_type' in regcomp.h */ + const char * const bounds[] = { + "", /* Traditional */ + "{gcb}", + "{lb}", + "{sb}", + "{wb}" + }; + assert(FLAGS(o) < C_ARRAY_LENGTH(bounds)); + sv_catpv(sv, bounds[FLAGS(o)]); + } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); + else if (OP(o) == SBOL) + Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^"); + + /* add on the verb argument if there is one */ + if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) { + Perl_sv_catpvf(aTHX_ sv, ":%"SVf, + SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); + } #else PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(sv); PERL_UNUSED_ARG(o); PERL_UNUSED_ARG(prog); PERL_UNUSED_ARG(reginfo); + PERL_UNUSED_ARG(pRExC_state); #endif /* DEBUGGING */ } @@ -15915,7 +19013,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ SV * Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ - dVAR; struct regexp *const prog = ReANY(r); GET_RE_DEBUG_FLAGS_DECL; @@ -15924,21 +19021,22 @@ Perl_re_intuit_string(pTHX_ REGEXP * const r) DEBUG_COMPILE_r( { - const char * const s = SvPV_nolen_const(prog->check_substr - ? prog->check_substr : prog->check_utf8); + const char * const s = SvPV_nolen_const(RX_UTF8(r) + ? prog->check_utf8 : prog->check_substr); if (!PL_colorset) reginitcolors(); - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", PL_colors[4], - prog->check_substr ? "" : "utf8 ", + RX_UTF8(r) ? "utf8 " : "", PL_colors[5],PL_colors[0], s, PL_colors[1], (strlen(s) > 60 ? "..." : "")); } ); - return prog->check_substr ? prog->check_substr : prog->check_utf8; + /* use UTF8 check substring if regexp pattern itself is in UTF8 */ + return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr; } /* @@ -15963,7 +19061,6 @@ Perl_pregfree(pTHX_ REGEXP *r) void Perl_pregfree2(pTHX_ REGEXP *rx) { - dVAR; struct regexp *const r = ReANY(rx); GET_RE_DEBUG_FLAGS_DECL; @@ -15989,6 +19086,8 @@ Perl_pregfree2(pTHX_ REGEXP *rx) #endif Safefree(r->offs); SvREFCNT_dec(r->qr_anoncv); + if (r->recurse_locinput) + Safefree(r->recurse_locinput); rx->sv_u.svu_rx = 0; } @@ -16072,6 +19171,8 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) #endif ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx); SvREFCNT_inc_void(ret->qr_anoncv); + if (r->recurse_locinput) + Newxz(ret->recurse_locinput,r->nparens + 1,char *); return ret_x; } @@ -16092,7 +19193,6 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) void Perl_regfree_internal(pTHX_ REGEXP * const rx) { - dVAR; struct regexp *const r = ReANY(rx); RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; @@ -16106,7 +19206,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) SV *dsv= sv_newmortal(); RE_PV_QUOTED_DECL(s, RX_UTF8(rx), dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); - PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", + Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n", PL_colors[4],PL_colors[5],s); } }); @@ -16145,6 +19245,9 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) Used in stclass optimization only */ U32 refcount; reg_ac_data *aho=(reg_ac_data*)ri->data->data[n]; +#ifdef USE_ITHREADS + dVAR; +#endif OP_REFCNT_LOCK; refcount = --aho->refcount; OP_REFCNT_UNLOCK; @@ -16153,7 +19256,16 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) PerlMemShared_free(aho->fail); /* do this last!!!! */ PerlMemShared_free(ri->data->data[n]); - PerlMemShared_free(ri->regstclass); + /* we should only ever get called once, so + * assert as much, and also guard the free + * which /might/ happen twice. At the least + * it will make code anlyzers happy and it + * doesn't cost much. - Yves */ + assert(ri->regstclass); + if (ri->regstclass) { + PerlMemShared_free(ri->regstclass); + ri->regstclass = 0; + } } } break; @@ -16162,6 +19274,9 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) /* trie structure. */ U32 refcount; reg_trie_data *trie=(reg_trie_data*)ri->data->data[n]; +#ifdef USE_ITHREADS + dVAR; +#endif OP_REFCNT_LOCK; refcount = --trie->refcount; OP_REFCNT_UNLOCK; @@ -16196,7 +19311,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) /* - re_dup - duplicate a regexp. + re_dup_guts - duplicate a regexp. This routine is expected to clone a given regexp structure. It is only compiled under USE_ITHREADS. @@ -16264,6 +19379,8 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param)); + if (r->recurse_locinput) + Newxz(ret->recurse_locinput,r->nparens + 1,char *); if (ret->pprivate) RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); @@ -16318,6 +19435,7 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); + reti->num_code_blocks = ri->num_code_blocks; if (ri->code_blocks) { int n; @@ -16367,18 +19485,18 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) * when the corresponding reg_ac_data struct is freed. */ reti->regstclass= ri->regstclass; - /* Fall through */ + /* FALLTHROUGH */ case 't': OP_REFCNT_LOCK; ((reg_trie_data*)ri->data->data[i])->refcount++; OP_REFCNT_UNLOCK; - /* Fall through */ + /* FALLTHROUGH */ case 'l': case 'L': d->data[i] = ri->data->data[i]; break; default: - Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", + Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'", ri->data->what[i]); } } @@ -16412,7 +19530,6 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) regnode * Perl_regnext(pTHX_ regnode *p) { - dVAR; I32 offset; if (!p) @@ -16468,7 +19585,6 @@ S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) void Perl_save_re_context(pTHX) { - dVAR; I32 nparens = -1; I32 i; @@ -16507,113 +19623,583 @@ Perl_save_re_context(pTHX) #ifdef DEBUGGING STATIC void -S_put_byte(pTHX_ SV *sv, int c) +S_put_code_point(pTHX_ SV *sv, UV c) { - PERL_ARGS_ASSERT_PUT_BYTE; + PERL_ARGS_ASSERT_PUT_CODE_POINT; - if (!isPRINT(c)) { - switch (c) { - case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break; - case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break; - case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break; - case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break; - case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break; - - default: - Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c); - break; - } + if (c > 255) { + Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c); } - else { - const char string = c; - if (c == '-' || c == ']' || c == '\\' || c == '^') + else if (isPRINT(c)) { + const char string = (char) c; + + /* We use {phrase} as metanotation in the class, so also escape literal + * braces */ + if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}') sv_catpvs(sv, "\\"); sv_catpvn(sv, &string, 1); } + else if (isMNEMONIC_CNTRL(c)) { + Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c)); + } + else { + Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c); + } } +#define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C + STATIC void -S_put_range(pTHX_ SV *sv, UV start, UV end) +S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) { - /* Appends to 'sv' a displayable version of the range of code points from - * 'start' to 'end' */ + * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls + * that have them, when they occur at the beginning or end of the range. + * It uses hex to output the remaining code points, unless 'allow_literals' + * is true, in which case the printable ASCII ones are output as-is (though + * some of these will be escaped by put_code_point()). + * + * NOTE: This is designed only for printing ranges of code points that fit + * inside an ANYOF bitmap. Higher code points are simply suppressed + */ + + const unsigned int min_range_count = 3; assert(start <= end); PERL_ARGS_ASSERT_PUT_RANGE; - if (end - start < 3) { /* Individual chars in short ranges */ - for (; start <= end; start++) - put_byte(sv, start); - } - else if ( end > 255 - || ! isALPHANUMERIC(start) - || ! isALPHANUMERIC(end) - || isDIGIT(start) != isDIGIT(end) - || isUPPER(start) != isUPPER(end) - || isLOWER(start) != isLOWER(end) - - /* This final test should get optimized out except on EBCDIC - * platforms, where it causes ranges that cross discontinuities - * like i/j to be shown as hex instead of the misleading, - * e.g. H-K (since that range includes more than H, I, J, K). - * */ - || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start)) - { - Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}", - start, - (end < 256) ? end : 255); + while (start <= end) { + UV this_end; + const char * format; + + if (end - start < min_range_count) { + + /* Output chars individually when they occur in short ranges */ + for (; start <= end; start++) { + put_code_point(sv, start); + } + break; + } + + /* If permitted by the input options, and there is a possibility that + * this range contains a printable literal, look to see if there is + * one. */ + if (allow_literals && start <= MAX_PRINT_A) { + + /* If the character at the beginning of the range isn't an ASCII + * printable, effectively split the range into two parts: + * 1) the portion before the first such printable, + * 2) the rest + * and output them separately. */ + if (! isPRINT_A(start)) { + UV temp_end = start + 1; + + /* There is no point looking beyond the final possible + * printable, in MAX_PRINT_A */ + UV max = MIN(end, MAX_PRINT_A); + + while (temp_end <= max && ! isPRINT_A(temp_end)) { + temp_end++; + } + + /* Here, temp_end points to one beyond the first printable if + * found, or to one beyond 'max' if not. If none found, make + * sure that we use the entire range */ + if (temp_end > MAX_PRINT_A) { + temp_end = end + 1; + } + + /* Output the first part of the split range: the part that + * doesn't have printables, with the parameter set to not look + * for literals (otherwise we would infinitely recurse) */ + put_range(sv, start, temp_end - 1, FALSE); + + /* The 2nd part of the range (if any) starts here. */ + start = temp_end; + + /* We do a continue, instead of dropping down, because even if + * the 2nd part is non-empty, it could be so short that we want + * to output it as individual characters, as tested for at the + * top of this loop. */ + continue; + } + + /* Here, 'start' is a printable ASCII. If it is an alphanumeric, + * output a sub-range of just the digits or letters, then process + * the remaining portion as usual. */ + if (isALPHANUMERIC_A(start)) { + UV mask = (isDIGIT_A(start)) + ? _CC_DIGIT + : isUPPER_A(start) + ? _CC_UPPER + : _CC_LOWER; + UV temp_end = start + 1; + + /* Find the end of the sub-range that includes just the + * characters in the same class as the first character in it */ + while (temp_end <= end && _generic_isCC_A(temp_end, mask)) { + temp_end++; + } + temp_end--; + + /* For short ranges, don't duplicate the code above to output + * them; just call recursively */ + if (temp_end - start < min_range_count) { + put_range(sv, start, temp_end, FALSE); + } + else { /* Output as a range */ + put_code_point(sv, start); + sv_catpvs(sv, "-"); + put_code_point(sv, temp_end); + } + start = temp_end + 1; + continue; + } + + /* We output any other printables as individual characters */ + if (isPUNCT_A(start) || isSPACE_A(start)) { + while (start <= end && (isPUNCT_A(start) + || isSPACE_A(start))) + { + put_code_point(sv, start); + start++; + } + continue; + } + } /* End of looking for literals */ + + /* Here is not to output as a literal. Some control characters have + * mnemonic names. Split off any of those at the beginning and end of + * the range to print mnemonically. It isn't possible for many of + * these to be in a row, so this won't overwhelm with output */ + while (isMNEMONIC_CNTRL(start) && start <= end) { + put_code_point(sv, start); + start++; + } + if (start < end && isMNEMONIC_CNTRL(end)) { + + /* Here, the final character in the range has a mnemonic name. + * Work backwards from the end to find the final non-mnemonic */ + UV temp_end = end - 1; + while (isMNEMONIC_CNTRL(temp_end)) { + temp_end--; + } + + /* And separately output the interior range that doesn't start or + * end with mnemonics */ + put_range(sv, start, temp_end, FALSE); + + /* Then output the mnemonic trailing controls */ + start = temp_end + 1; + while (start <= end) { + put_code_point(sv, start); + start++; + } + break; + } + + /* As a final resort, output the range or subrange as hex. */ + + this_end = (end < NUM_ANYOF_CODE_POINTS) + ? end + : NUM_ANYOF_CODE_POINTS - 1; +#if NUM_ANYOF_CODE_POINTS > 256 + format = (this_end < 256) + ? "\\x%02"UVXf"-\\x%02"UVXf"" + : "\\x{%04"UVXf"}-\\x{%04"UVXf"}"; +#else + format = "\\x%02"UVXf"-\\x%02"UVXf""; +#endif + GCC_DIAG_IGNORE(-Wformat-nonliteral); + Perl_sv_catpvf(aTHX_ sv, format, start, this_end); + GCC_DIAG_RESTORE; + break; + } +} + +STATIC void +S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist) +{ + /* Concatenate onto the PV in 'sv' a displayable form of the inversion list + * 'invlist' */ + + UV start, end; + bool allow_literals = TRUE; + + PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST; + + /* Generally, it is more readable if printable characters are output as + * literals, but if a range (nearly) spans all of them, it's best to output + * it as a single range. This code will use a single range if all but 2 + * ASCII printables are in it */ + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + + /* If the range starts beyond the final printable, it doesn't have any + * in it */ + if (start > MAX_PRINT_A) { + break; + } + + /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span + * all but two, the range must start and end no later than 2 from + * either end */ + if (start < ' ' + 2 && end > MAX_PRINT_A - 2) { + if (end > MAX_PRINT_A) { + end = MAX_PRINT_A; + } + if (start < ' ') { + start = ' '; + } + if (end - start >= MAX_PRINT_A - ' ' - 2) { + allow_literals = FALSE; + } + break; + } } - else { /* Here, the ends of the range are both digits, or both uppercase, - or both lowercase; and there's no discontinuity in the range - (which could happen on EBCDIC platforms) */ - put_byte(sv, start); - sv_catpvs(sv, "-"); - put_byte(sv, end); + invlist_iterfinish(invlist); + + /* Here we have figured things out. Output each range */ + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (start >= NUM_ANYOF_CODE_POINTS) { + break; + } + put_range(sv, start, end, allow_literals); } + invlist_iterfinish(invlist); + + return; +} + +STATIC SV* +S_put_charclass_bitmap_innards_common(pTHX_ + SV* invlist, /* The bitmap */ + SV* posixes, /* Under /l, things like [:word:], \S */ + SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */ + SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */ + SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */ + const bool invert /* Is the result to be inverted? */ +) +{ + /* Create and return an SV containing a displayable version of the bitmap + * and associated information determined by the input parameters. */ + + SV * output; + + PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON; + + if (invert) { + output = newSVpvs("^"); + } + else { + output = newSVpvs(""); + } + + /* First, the code points in the bitmap that are unconditionally there */ + put_charclass_bitmap_innards_invlist(output, invlist); + + /* Traditionally, these have been placed after the main code points */ + if (posixes) { + sv_catsv(output, posixes); + } + + if (only_utf8 && _invlist_len(only_utf8)) { + Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]); + put_charclass_bitmap_innards_invlist(output, only_utf8); + } + + if (not_utf8 && _invlist_len(not_utf8)) { + Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]); + put_charclass_bitmap_innards_invlist(output, not_utf8); + } + + if (only_utf8_locale && _invlist_len(only_utf8_locale)) { + Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]); + put_charclass_bitmap_innards_invlist(output, only_utf8_locale); + + /* This is the only list in this routine that can legally contain code + * points outside the bitmap range. The call just above to + * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so + * output them here. There's about a half-dozen possible, and none in + * contiguous ranges longer than 2 */ + if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { + UV start, end; + SV* above_bitmap = NULL; + + _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap); + + invlist_iterinit(above_bitmap); + while (invlist_iternext(above_bitmap, &start, &end)) { + UV i; + + for (i = start; i <= end; i++) { + put_code_point(output, i); + } + } + invlist_iterfinish(above_bitmap); + SvREFCNT_dec_NN(above_bitmap); + } + } + + /* If the only thing we output is the '^', clear it */ + if (invert && SvCUR(output) == 1) { + SvCUR_set(output, 0); + } + + return output; } STATIC bool -S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap) +S_put_charclass_bitmap_innards(pTHX_ SV *sv, + char *bitmap, + SV *nonbitmap_invlist, + SV *only_utf8_locale_invlist, + const regnode * const node) { /* Appends to 'sv' a displayable version of the innards of the bracketed - * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually - * output anything */ + * character class defined by the other arguments: + * 'bitmap' points to the bitmap. + * 'nonbitmap_invlist' is an inversion list of the code points that are in + * the bitmap range, but for some reason aren't in the bitmap; NULL if + * none. The reasons for this could be that they require some + * condition such as the target string being or not being in UTF-8 + * (under /d), or because they came from a user-defined property that + * was not resolved at the time of the regex compilation (under /u) + * 'only_utf8_locale_invlist' is an inversion list of the code points that + * are valid only if the runtime locale is a UTF-8 one; NULL if none + * 'node' is the regex pattern node. It is needed only when the above two + * parameters are not null, and is passed so that this routine can + * tease apart the various reasons for them. + * + * It returns TRUE if there was actually something output. (It may be that + * the bitmap, etc is empty.) + * + * When called for outputting the bitmap of a non-ANYOF node, just pass the + * bitmap, with the succeeding parameters set to NULL. + * + */ + + /* In general, it tries to display the 'cleanest' representation of the + * innards, choosing whether to display them inverted or not, regardless of + * whether the class itself is to be inverted. However, there are some + * cases where it can't try inverting, as what actually matches isn't known + * until runtime, and hence the inversion isn't either. */ + bool inverting_allowed = TRUE; int i; - bool has_output_anything = FALSE; + STRLEN orig_sv_cur = SvCUR(sv); - PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS; + SV* invlist; /* Inversion list we accumulate of code points that + are unconditionally matched */ + SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is + UTF-8 */ + SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8 + */ + SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */ + SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale + is UTF-8 */ - for (i = 0; i < 256; i++) { - if (BITMAP_TEST((U8 *) bitmap,i)) { + SV* as_is_display; /* The output string when we take the inputs + literally */ + SV* inverted_display; /* The output string when we invert the inputs */ - /* The character at index i should be output. Find the next - * character that should NOT be output */ - int j; - for (j = i + 1; j < 256; j++) { - if (! BITMAP_TEST((U8 *) bitmap, j)) { - break; + U8 flags = (node) ? ANYOF_FLAGS(node) : 0; + + bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted + to match? */ + /* We are biased in favor of displaying things without them being inverted, + * as that is generally easier to understand */ + const int bias = 5; + + PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS; + + /* Start off with whatever code points are passed in. (We clone, so we + * don't change the caller's list) */ + if (nonbitmap_invlist) { + assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS); + invlist = invlist_clone(nonbitmap_invlist); + } + else { /* Worst case size is every other code point is matched */ + invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2); + } + + if (flags) { + if (OP(node) == ANYOFD) { + + /* This flag indicates that the code points below 0x100 in the + * nonbitmap list are precisely the ones that match only when the + * target is UTF-8 (they should all be non-ASCII). */ + if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) + { + _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8); + _invlist_subtract(invlist, only_utf8, &invlist); + } + + /* And this flag for matching all non-ASCII 0xFF and below */ + if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER) + { + not_utf8 = invlist_clone(PL_UpperLatin1); + } + } + else if (OP(node) == ANYOFL) { + + /* If either of these flags are set, what matches isn't + * determinable except during execution, so don't know enough here + * to invert */ + if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) { + inverting_allowed = FALSE; + } + + /* What the posix classes match also varies at runtime, so these + * will be output symbolically. */ + if (ANYOF_POSIXL_TEST_ANY_SET(node)) { + int i; + + posixes = newSVpvs(""); + for (i = 0; i < ANYOF_POSIXL_MAX; i++) { + if (ANYOF_POSIXL_TEST(node,i)) { + sv_catpv(posixes, anyofs[i]); + } } } + } + } - /* Everything between them is a single range that should be output - * */ - put_range(sv, i, j - 1); - has_output_anything = TRUE; - i = j; + /* Accumulate the bit map into the unconditional match list */ + for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { + if (BITMAP_TEST(bitmap, i)) { + int start = i++; + for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) { + /* empty */ + } + invlist = _add_range_to_invlist(invlist, start, i-1); } } - return has_output_anything; + /* Make sure that the conditional match lists don't have anything in them + * that match unconditionally; otherwise the output is quite confusing. + * This could happen if the code that populates these misses some + * duplication. */ + if (only_utf8) { + _invlist_subtract(only_utf8, invlist, &only_utf8); + } + if (not_utf8) { + _invlist_subtract(not_utf8, invlist, ¬_utf8); + } + + if (only_utf8_locale_invlist) { + + /* Since this list is passed in, we have to make a copy before + * modifying it */ + only_utf8_locale = invlist_clone(only_utf8_locale_invlist); + + _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale); + + /* And, it can get really weird for us to try outputting an inverted + * form of this list when it has things above the bitmap, so don't even + * try */ + if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { + inverting_allowed = FALSE; + } + } + + /* Calculate what the output would be if we take the input as-is */ + as_is_display = put_charclass_bitmap_innards_common(invlist, + posixes, + only_utf8, + not_utf8, + only_utf8_locale, + invert); + + /* If have to take the output as-is, just do that */ + if (! inverting_allowed) { + sv_catsv(sv, as_is_display); + } + else { /* But otherwise, create the output again on the inverted input, and + use whichever version is shorter */ + + int inverted_bias, as_is_bias; + + /* We will apply our bias to whichever of the the results doesn't have + * the '^' */ + if (invert) { + invert = FALSE; + as_is_bias = bias; + inverted_bias = 0; + } + else { + invert = TRUE; + as_is_bias = 0; + inverted_bias = bias; + } + + /* Now invert each of the lists that contribute to the output, + * excluding from the result things outside the possible range */ + + /* For the unconditional inversion list, we have to add in all the + * conditional code points, so that when inverted, they will be gone + * from it */ + _invlist_union(only_utf8, invlist, &invlist); + _invlist_union(not_utf8, invlist, &invlist); + _invlist_union(only_utf8_locale, invlist, &invlist); + _invlist_invert(invlist); + _invlist_intersection(invlist, PL_InBitmap, &invlist); + + if (only_utf8) { + _invlist_invert(only_utf8); + _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8); + } + + if (not_utf8) { + _invlist_invert(not_utf8); + _invlist_intersection(not_utf8, PL_UpperLatin1, ¬_utf8); + } + + if (only_utf8_locale) { + _invlist_invert(only_utf8_locale); + _invlist_intersection(only_utf8_locale, + PL_InBitmap, + &only_utf8_locale); + } + + inverted_display = put_charclass_bitmap_innards_common( + invlist, + posixes, + only_utf8, + not_utf8, + only_utf8_locale, invert); + + /* Use the shortest representation, taking into account our bias + * against showing it inverted */ + if (SvCUR(inverted_display) + inverted_bias + < SvCUR(as_is_display) + as_is_bias) + { + sv_catsv(sv, inverted_display); + } + else { + sv_catsv(sv, as_is_display); + } + + SvREFCNT_dec_NN(as_is_display); + SvREFCNT_dec_NN(inverted_display); + } + + SvREFCNT_dec_NN(invlist); + SvREFCNT_dec(only_utf8); + SvREFCNT_dec(not_utf8); + SvREFCNT_dec(posixes); + SvREFCNT_dec(only_utf8_locale); + + return SvCUR(sv) > orig_sv_cur; } -#define CLEAR_OPTSTART \ +#define CLEAR_OPTSTART \ if (optstart) STMT_START { \ - DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \ + DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \ " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ - optstart=NULL; \ + optstart=NULL; \ } STMT_END #define DUMPUNTIL(b,e) \ @@ -16625,7 +20211,6 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, const regnode *last, const regnode *plast, SV* sv, I32 indent, U32 depth) { - dVAR; U8 op = PSEUDO; /* Arbitrary non-END op. */ const regnode *next; const regnode *optstart= NULL; @@ -16636,7 +20221,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, PERL_ARGS_ASSERT_DUMPUNTIL; #ifdef DEBUG_DUMPUNTIL - PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, + Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n",indent,node-start, last ? last-start : 0,plast ? plast-start : 0); #endif @@ -16644,6 +20229,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, last= plast; while (PL_regkind[op] != END && (!last || node < last)) { + assert(node); /* While that wasn't END last time... */ NODE_ALIGN(node); op = OP(node); @@ -16660,19 +20246,19 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else CLEAR_OPTSTART; - regprop(r, sv, node, NULL); - PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), + regprop(r, sv, node, NULL, NULL); + Perl_re_printf( aTHX_ "%4"IVdf":%*s%s", (IV)(node - start), (int)(2*indent + 1), "", SvPVX_const(sv)); if (OP(node) != OPTIMIZED) { if (next == NULL) /* Next ptr. */ - PerlIO_printf(Perl_debug_log, " (0)"); + Perl_re_printf( aTHX_ " (0)"); else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) - PerlIO_printf(Perl_debug_log, " (FAIL)"); + Perl_re_printf( aTHX_ " (FAIL)"); else - PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); - (void)PerlIO_putc(Perl_debug_log, '\n'); + Perl_re_printf( aTHX_ " (%"IVdf")", (IV)(next - start)); + Perl_re_printf( aTHX_ "\n"); } after_print: @@ -16710,8 +20296,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); - PerlIO_printf(Perl_debug_log, "%*s%s ", - (int)(2*(indent+3)), "", + Perl_re_indentf( aTHX_ "%s ", + indent+3, elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, @@ -16726,7 +20312,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, ); if (trie->jump) { U16 dist= trie->jump[word_idx+1]; - PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", + Perl_re_printf( aTHX_ "(%"UVuf")\n", (UV)((dist ? this_trie + dist : next) - start)); if (dist) { if (!nextbranch) @@ -16736,7 +20322,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) nextbranch= regnext((regnode *)nextbranch); } else { - PerlIO_printf(Perl_debug_log, "\n"); + Perl_re_printf( aTHX_ "\n"); } } if (last && next > last) @@ -16757,7 +20343,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else if (PL_regkind[(U8)op] == ANYOF) { /* arglen 1 + class block */ - node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_MATCHES_POSIXL) ? ANYOF_POSIXL_SKIP : ANYOF_SKIP); node = NEXTOPER(node); @@ -16776,7 +20362,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } CLEAR_OPTSTART; #ifdef DEBUG_DUMPUNTIL - PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent); + Perl_re_printf( aTHX_ "--- %d\n", (int)indent); #endif return node; } @@ -16784,11 +20370,5 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, #endif /* DEBUGGING */ /* - * Local variables: - * c-indentation-style: bsd - * c-basic-offset: 4 - * indent-tabs-mode: nil - * End: - * * ex: set ts=8 sts=4 sw=4 et: */ |