diff options
Diffstat (limited to 'gnu/usr.bin/perl/regcomp_internal.h')
-rw-r--r-- | gnu/usr.bin/perl/regcomp_internal.h | 1261 |
1 files changed, 1261 insertions, 0 deletions
diff --git a/gnu/usr.bin/perl/regcomp_internal.h b/gnu/usr.bin/perl/regcomp_internal.h new file mode 100644 index 00000000000..6ea42640edf --- /dev/null +++ b/gnu/usr.bin/perl/regcomp_internal.h @@ -0,0 +1,1261 @@ +#ifndef REGCOMP_INTERNAL_H +#define REGCOMP_INTERNAL_H +#ifndef STATIC +#define STATIC static +#endif +#ifndef RE_OPTIMIZE_CURLYX_TO_CURLYM +#define RE_OPTIMIZE_CURLYX_TO_CURLYM 1 +#endif +#ifndef RE_OPTIMIZE_CURLYX_TO_CURLYN +#define RE_OPTIMIZE_CURLYX_TO_CURLYN 1 +#endif + +/* this is a chain of data about sub patterns we are processing that + need to be handled separately/specially in study_chunk. Its so + we can simulate recursion without losing state. */ +struct scan_frame; +typedef struct scan_frame { + regnode *last_regnode; /* last node to process in this frame */ + regnode *next_regnode; /* next node to process when last is reached */ + U32 prev_recursed_depth; + I32 stopparen; /* what stopparen do we use */ + bool in_gosub; /* this or an outer frame is for GOSUB */ + + struct scan_frame *this_prev_frame; /* this previous frame */ + struct scan_frame *prev_frame; /* previous frame */ + struct scan_frame *next_frame; /* next frame */ +} scan_frame; + +/* Certain characters are output as a sequence with the first being a + * backslash. */ +#define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c) + + +struct RExC_state_t { + U32 flags; /* RXf_* are we folding, multilining? */ + U32 pm_flags; /* PMf_* stuff from the calling PMOP */ + char *precomp; /* uncompiled string. */ + char *precomp_end; /* pointer to end of uncompiled string. */ + REGEXP *rx_sv; /* The SV that is the regexp. */ + regexp *rx; /* perl core regexp structure */ + regexp_internal *rxi; /* internal data for regexp object + pprivate field */ + char *start; /* Start of input for compile */ + char *end; /* End of input for compile */ + char *parse; /* Input-scan pointer. */ + char *copy_start; /* start of copy of input within + constructed parse string */ + char *save_copy_start; /* Provides one level of saving + and restoring 'copy_start' */ + char *copy_start_in_input; /* Position in input string + corresponding to copy_start */ + SSize_t whilem_seen; /* number of WHILEM in this expr */ + regnode *emit_start; /* Start of emitted-code area */ + regnode_offset emit; /* Code-emit pointer */ + I32 naughty; /* How bad is this pattern? */ + I32 sawback; /* Did we see \1, ...? */ + SSize_t size; /* Number of regnode equivalents in + pattern */ + Size_t sets_depth; /* Counts recursion depth of already- + compiled regex set patterns */ + U32 seen; + + I32 parens_buf_size; /* #slots malloced open/close_parens */ + regnode_offset *open_parens; /* offsets to open parens */ + regnode_offset *close_parens; /* offsets to close parens */ + HV *paren_names; /* Paren names */ + + /* position beyond 'precomp' of the warning message furthest away from + * 'precomp'. During the parse, no warnings are raised for any problems + * earlier in the parse than this position. This works if warnings are + * raised the first time a given spot is parsed, and if only one + * independent warning is raised for any given spot */ + Size_t latest_warn_offset; + + /* Branch reset /(?|...|...)/ gives us two concepts of capture buffer id. + * "Logical Parno" is the user visible view with branch reset taken into + * account. "Parno" (or physical parno) is the actual capture buffers in + * the pattern *NOT* taking into account branch reset. We also maintain + * a map of "next" pointers which allow us to skip to the next physical + * capture buffer with the same logical id, with 0 representing "none". + * + * As we compile we keep track of the two different counts using the + * 'logical_npar' and 'npar' members, and we keep track of the upper bound + * of both in 'total_par' and 'logical_total_par', we also populate + * the 'logical_to_parno' map, which gives us the first physical parno + * for a given logical parno, and the `parno_to_logical` array which gives + * us the logical id for each physical parno. When compilation is + * completed we construct the 'parno_to_logical_next' array from the + * 'parno_to_logical' array. (We do not bother constructing it during + * compilation as we do not need it, and we can construct it in O(N) time + * once we are done, but would need more complicated logic during the + * compile, because we want the next pointers to go from smallest to + * largest, eg, left to right.) + * + * Logical: $1 $2 $3 $4 $2 $3 $2 $5 + * Physical: 1 2 3 4 5 6 7 8 + * Next: 0 5 6 0 7 0 0 0 + * Pattern /(a) (?| (b) (c) (d) | (e) (f) | (g) ) (h)/ + * + * As much as possible the internals use and store the physical id of + * of capture buffers. We decode the physical to the logical only when + * we need to, for instance when someone use $2. + * + * Note that when branch reset is not used logical and physical are the + * same and the next data would be all zero. So when branch reset is not + * used we do not need to populate this data into the final regexp. + * + */ + I32 *logical_to_parno; /* logical_parno to parno */ + I32 *parno_to_logical; /* parno to logical_parno */ + I32 *parno_to_logical_next; /* parno to next (greater value) + parno with the same + logical_parno as parno.*/ + + I32 npar; /* Capture buffer count so far in the + parse, (OPEN) plus one. ("par" 0 is + the whole pattern)*/ + I32 logical_npar; /* Logical version of npar */ + I32 total_par; /* During initial parse, is either 0, + or -1; the latter indicating a + reparse is needed. After that pass, + it is what 'npar' became after the + pass. Hence, it being > 0 indicates + we are in a reparse situation */ + I32 logical_total_par; /* Logical version to total par */ + I32 nestroot; /* root parens we are in - used by + accept */ + I32 seen_zerolen; + regnode *end_op; /* END node in program */ + I32 utf8; /* whether the pattern is utf8 or not */ + I32 orig_utf8; /* whether the pattern was originally in utf8 */ + /* XXX use this for future optimisation of case + * where pattern must be upgraded to utf8. */ + I32 uni_semantics; /* If a d charset modifier should use unicode + rules, even if the pattern is not in + utf8 */ + + I32 recurse_count; /* Number of recurse regops we have generated */ + regnode **recurse; /* Recurse regops */ + U8 *study_chunk_recursed; /* bitmap of which subs we have moved + through */ + U32 study_chunk_recursed_bytes; /* bytes in bitmap */ + I32 in_lookaround; + I32 contains_locale; + I32 override_recoding; + I32 recode_x_to_native; + I32 in_multi_char_class; + int code_index; /* next code_blocks[] slot */ + struct reg_code_blocks *code_blocks;/* positions of literal (?{}) + within pattern */ + SSize_t maxlen; /* mininum possible number of chars in string to match */ + scan_frame *frame_head; + scan_frame *frame_last; + U32 frame_count; + AV *warn_text; + HV *unlexed_names; + SV *runtime_code_qr; /* qr with the runtime code blocks */ +#ifdef DEBUGGING + const char *lastparse; + I32 lastnum; + U32 study_chunk_recursed_count; + AV *paren_name_list; /* idx -> name */ + SV *mysv1; + SV *mysv2; +#endif + bool seen_d_op; + bool strict; + bool study_started; + bool in_script_run; + bool use_BRANCHJ; + bool sWARN_EXPERIMENTAL__VLB; + bool sWARN_EXPERIMENTAL__REGEX_SETS; +}; + +#ifdef DEBUGGING +#define RExC_lastparse (pRExC_state->lastparse) +#define RExC_lastnum (pRExC_state->lastnum) +#define RExC_paren_name_list (pRExC_state->paren_name_list) +#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count) +#define RExC_mysv (pRExC_state->mysv1) +#define RExC_mysv1 (pRExC_state->mysv1) +#define RExC_mysv2 (pRExC_state->mysv2) +#endif + +#define RExC_flags (pRExC_state->flags) +#define RExC_pm_flags (pRExC_state->pm_flags) +#define RExC_precomp (pRExC_state->precomp) +#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input) +#define RExC_copy_start_in_constructed (pRExC_state->copy_start) +#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start) +#define RExC_precomp_end (pRExC_state->precomp_end) +#define RExC_rx_sv (pRExC_state->rx_sv) +#define RExC_rx (pRExC_state->rx) +#define RExC_rxi (pRExC_state->rxi) +#define RExC_start (pRExC_state->start) +#define RExC_end (pRExC_state->end) +#define RExC_parse (pRExC_state->parse) +#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset ) +#define RExC_whilem_seen (pRExC_state->whilem_seen) +#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs + under /d from /u ? */ + +#define RExC_emit (pRExC_state->emit) +#define RExC_emit_start (pRExC_state->emit_start) +#define RExC_sawback (pRExC_state->sawback) +#define RExC_seen (pRExC_state->seen) +#define RExC_size (pRExC_state->size) +#define RExC_maxlen (pRExC_state->maxlen) +#define RExC_logical_npar (pRExC_state->logical_npar) +#define RExC_logical_total_parens (pRExC_state->logical_total_par) +#define RExC_logical_to_parno (pRExC_state->logical_to_parno) +#define RExC_parno_to_logical (pRExC_state->parno_to_logical) +#define RExC_parno_to_logical_next (pRExC_state->parno_to_logical_next) +#define RExC_npar (pRExC_state->npar) +#define RExC_total_parens (pRExC_state->total_par) +#define RExC_parens_buf_size (pRExC_state->parens_buf_size) +#define RExC_nestroot (pRExC_state->nestroot) +#define RExC_seen_zerolen (pRExC_state->seen_zerolen) +#define RExC_utf8 (pRExC_state->utf8) +#define RExC_uni_semantics (pRExC_state->uni_semantics) +#define RExC_orig_utf8 (pRExC_state->orig_utf8) +#define RExC_open_parens (pRExC_state->open_parens) +#define RExC_close_parens (pRExC_state->close_parens) +#define RExC_end_op (pRExC_state->end_op) +#define RExC_paren_names (pRExC_state->paren_names) +#define RExC_recurse (pRExC_state->recurse) +#define RExC_recurse_count (pRExC_state->recurse_count) +#define RExC_sets_depth (pRExC_state->sets_depth) +#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed) +#define RExC_study_chunk_recursed_bytes \ + (pRExC_state->study_chunk_recursed_bytes) +#define RExC_in_lookaround (pRExC_state->in_lookaround) +#define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) + +#ifdef EBCDIC +# define SET_recode_x_to_native(x) \ + STMT_START { RExC_recode_x_to_native = (x); } STMT_END +#else +# define SET_recode_x_to_native(x) NOOP +#endif + +#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) +#define RExC_frame_head (pRExC_state->frame_head) +#define RExC_frame_last (pRExC_state->frame_last) +#define RExC_frame_count (pRExC_state->frame_count) +#define RExC_strict (pRExC_state->strict) +#define RExC_study_started (pRExC_state->study_started) +#define RExC_warn_text (pRExC_state->warn_text) +#define RExC_in_script_run (pRExC_state->in_script_run) +#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ) +#define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB) +#define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS) +#define RExC_unlexed_names (pRExC_state->unlexed_names) + + +/***********************************************************************/ +/* UTILITY MACROS FOR ADVANCING OR SETTING THE PARSE "CURSOR" RExC_parse + * + * All of these macros depend on the above RExC_ accessor macros, which + * in turns depend on a variable pRExC_state being in scope where they + * are used. This is the standard regexp parser context variable which is + * passed into every non-trivial parse function in this file. + * + * Note that the UTF macro is itself a wrapper around RExC_utf8, so all + * of the macros which do not take an argument will operate on the + * pRExC_state structure *only*. + * + * Please do NOT modify RExC_parse without using these macros. In the + * future these macros will be extended for enhanced debugging and trace + * output during the parse process. + */ + +/* RExC_parse_incf(flag) + * + * Increment RExC_parse to point at the next codepoint, while doing + * the right thing depending on whether we are parsing UTF-8 strings + * or not. The 'flag' argument determines if content is UTF-8 or not, + * intended for cases where this is NOT governed by the UTF macro. + * + * Use RExC_parse_inc() if UTF-8ness is controlled by the UTF macro. + * + * WARNING: Does NOT take into account RExC_end; it is the callers + * responsibility to make sure there are enough octets left in + * RExC_parse to ensure that when processing UTF-8 we would not read + * past the end of the string. + */ +#define RExC_parse_incf(flag) STMT_START { \ + RExC_parse += (flag) ? UTF8SKIP(RExC_parse) : 1; \ +} STMT_END + +/* RExC_parse_inc_safef(flag) + * + * Safely increment RExC_parse to point at the next codepoint, + * doing the right thing depending on whether we are parsing + * UTF-8 strings or not and NOT reading past the end of the buffer. + * The 'flag' argument determines if content is UTF-8 or not, + * intended for cases where this is NOT governed by the UTF macro. + * + * Use RExC_parse_safe() if UTF-8ness is controlled by the UTF macro. + * + * NOTE: Will NOT read past RExC_end when content is UTF-8. + */ +#define RExC_parse_inc_safef(flag) STMT_START { \ + RExC_parse += (flag) ? UTF8_SAFE_SKIP(RExC_parse,RExC_end) : 1; \ +} STMT_END + +/* RExC_parse_inc() + * + * Increment RExC_parse to point at the next codepoint, + * doing the right thing depending on whether we are parsing + * UTF-8 strings or not. + * + * WARNING: Does NOT take into account RExC_end, it is the callers + * responsibility to make sure there are enough octets left in + * RExC_parse to ensure that when processing UTF-8 we would not read + * past the end of the string. + * + * NOTE: whether we are parsing UTF-8 or not is determined by the + * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this + * macro operates on the pRExC_state structure only. + */ +#define RExC_parse_inc() RExC_parse_incf(UTF) + +/* RExC_parse_inc_safe() + * + * Safely increment RExC_parse to point at the next codepoint, + * doing the right thing depending on whether we are parsing + * UTF-8 strings or not and NOT reading past the end of the buffer. + * + * NOTE: whether we are parsing UTF-8 or not is determined by the + * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this + * macro operates on the pRExC_state structure only. + */ +#define RExC_parse_inc_safe() RExC_parse_inc_safef(UTF) + +/* RExC_parse_inc_utf8() + * + * Increment RExC_parse to point at the next utf8 codepoint, + * assumes content is UTF-8. + * + * WARNING: Does NOT take into account RExC_end; it is the callers + * responsibility to make sure there are enough octets left in RExC_parse + * to ensure that when processing UTF-8 we would not read past the end + * of the string. + */ +#define RExC_parse_inc_utf8() STMT_START { \ + RExC_parse += UTF8SKIP(RExC_parse); \ +} STMT_END + +/* RExC_parse_inc_if_char() + * + * Increment RExC_parse to point at the next codepoint, if and only + * if the current parse point is NOT a NULL, while doing the right thing + * depending on whether we are parsing UTF-8 strings or not. + * + * WARNING: Does NOT take into account RExC_end, it is the callers + * responsibility to make sure there are enough octets left in RExC_parse + * to ensure that when processing UTF-8 we would not read past the end + * of the string. + * + * NOTE: whether we are parsing UTF-8 or not is determined by the + * UTF macro which is defined as cBOOL(RExC_parse_utf8), thus this + * macro operates on the pRExC_state structure only. + */ +#define RExC_parse_inc_if_char() STMT_START { \ + RExC_parse += SKIP_IF_CHAR(RExC_parse,RExC_end); \ +} STMT_END + +/* RExC_parse_inc_by(n_octets) + * + * Increment the parse cursor by the number of octets specified by + * the 'n_octets' argument. + * + * NOTE: Does NOT check ANY constraints. It is the callers responsibility + * that this will not move past the end of the string, or leave the + * pointer in the middle of a UTF-8 sequence. + * + * Typically used to advanced past previously analyzed content. + */ +#define RExC_parse_inc_by(n_octets) STMT_START { \ + RExC_parse += (n_octets); \ +} STMT_END + +/* RExC_parse_set(to_ptr) + * + * Sets the RExC_parse pointer to the pointer specified by the 'to' + * argument. No validation whatsoever is performed on the to pointer. + */ +#define RExC_parse_set(to_ptr) STMT_START { \ + RExC_parse = (to_ptr); \ +} STMT_END + +/**********************************************************************/ + +/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set + * a flag to disable back-off on the fixed/floating substrings - if it's + * a high complexity pattern we assume the benefit of avoiding a full match + * is worth the cost of checking for the substrings even if they rarely help. + */ +#define RExC_naughty (pRExC_state->naughty) +#define TOO_NAUGHTY (10) +#define MARK_NAUGHTY(add) \ + if (RExC_naughty < TOO_NAUGHTY) \ + RExC_naughty += (add) +#define MARK_NAUGHTY_EXP(exp, add) \ + if (RExC_naughty < TOO_NAUGHTY) \ + RExC_naughty += RExC_naughty / (exp) + (add) + +#define isNON_BRACE_QUANTIFIER(c) ((c) == '*' || (c) == '+' || (c) == '?') +#define isQUANTIFIER(s,e) ( isNON_BRACE_QUANTIFIER(*s) \ + || ((*s) == '{' && regcurly(s, e, NULL))) + +/* + * Flags to be passed up. + */ +#define HASWIDTH 0x01 /* Known to not match null strings, could match + non-null ones. */ +#define SIMPLE 0x02 /* Exactly one character wide */ + /* (or LNBREAK as a special case) */ +#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */ +#define TRYAGAIN 0x10 /* Weeded out a declaration. */ +#define RESTART_PARSE 0x20 /* Need to redo the parse */ +#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to + calcuate sizes as UTF-8 */ + +#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) + +/* whether trie related optimizations are enabled */ +#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION +#define TRIE_STUDY_OPT +#define FULL_TRIE_STUDY +#define TRIE_STCLASS +#endif + +/* About the term "restudy" and the var "restudied" and the defines + * "SCF_TRIE_RESTUDY" and "SCF_TRIE_DOING_RESTUDY": All of these relate to + * doing multiple study_chunk() calls over the same set of opcodes for* the + * purpose of enhanced TRIE optimizations. + * + * Specifically, when TRIE_STUDY_OPT is defined, and it is defined in normal + * builds, (see above), during compilation SCF_TRIE_RESTUDY may be enabled + * which then causes the Perl_re_op_compile() to then call the optimizer + * S_study_chunk() a second time to perform additional optimizations, + * including the aho_corasick startclass optimization. + * This additional pass will only happen once, which is managed by the + * 'restudied' variable in Perl_re_op_compile(). + * + * When this second pass is under way the flags passed into study_chunk() will + * include SCF_TRIE_DOING_RESTUDY and this flag is and must be cascaded down + * to any recursive calls to S_study_chunk(). + * + * IMPORTANT: Any logic in study_chunk() that emits warnings should check that + * the SCF_TRIE_DOING_RESTUDY flag is NOT set in 'flags', or the warning may + * be produced twice. + * + * See commit 07be1b83a6b2d24b492356181ddf70e1c7917ae3 and + * 688e03912e3bff2d2419c457d8b0e1bab3eb7112 for more details. + */ + + +#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3] +#define PBITVAL(paren) (1 << ((paren) & 7)) +#define PAREN_OFFSET(depth) \ + (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes) +#define PAREN_TEST(depth, paren) \ + (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren)) +#define PAREN_SET(depth, paren) \ + (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren)) +#define PAREN_UNSET(depth, paren) \ + (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren)) + +#define REQUIRE_UTF8(flagp) STMT_START { \ + if (!UTF) { \ + *flagp = RESTART_PARSE|NEED_UTF8; \ + return 0; \ + } \ + } STMT_END + +/* /u is to be chosen if we are supposed to use Unicode rules, or if the + * pattern is in UTF-8. This latter condition is in case the outermost rules + * are locale. See GH #17278 */ +#define toUSE_UNI_CHARSET_NOT_DEPENDS (RExC_uni_semantics || UTF) + +/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is + * a flag that indicates we need to override /d with /u as a result of + * something in the pattern. It should only be used in regards to calling + * set_regex_charset() or get_regex_charset() */ +#define REQUIRE_UNI_RULES(flagp, restart_retval) \ + STMT_START { \ + if (DEPENDS_SEMANTICS) { \ + set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ + RExC_uni_semantics = 1; \ + if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \ + /* No need to restart the parse if we haven't seen \ + * anything that differs between /u and /d, and no need \ + * to restart immediately if we're going to reparse \ + * anyway to count parens */ \ + *flagp |= RESTART_PARSE; \ + return restart_retval; \ + } \ + } \ + } STMT_END + +#define REQUIRE_BRANCHJ(flagp, restart_retval) \ + STMT_START { \ + RExC_use_BRANCHJ = 1; \ + *flagp |= RESTART_PARSE; \ + return restart_retval; \ + } STMT_END + +/* Until we have completed the parse, we leave RExC_total_parens at 0 or + * less. After that, it must always be positive, because the whole re is + * considered to be surrounded by virtual parens. Setting it to negative + * indicates there is some construct that needs to know the actual number of + * parens to be properly handled. And that means an extra pass will be + * required after we've counted them all */ +#define ALL_PARENS_COUNTED (RExC_total_parens > 0) +#define REQUIRE_PARENS_PASS \ + STMT_START { /* No-op if have completed a pass */ \ + if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \ + } STMT_END +#define IN_PARENS_PASS (RExC_total_parens < 0) + + +/* This is used to return failure (zero) early from the calling function if + * various flags in 'flags' are set. Two flags always cause a return: + * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any + * additional flags that should cause a return; 0 if none. If the return will + * be done, '*flagp' is first set to be all of the flags that caused the + * return. */ +#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \ + STMT_START { \ + if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \ + *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \ + return 0; \ + } \ + } STMT_END + +#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE)) + +#define RETURN_FAIL_ON_RESTART(flags,flagp) \ + RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0) +#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \ + if (MUST_RESTART(*(flagp))) return 0 + +/* This converts the named class defined in regcomp.h to its equivalent class + * number defined in handy.h. */ +#define namedclass_to_classnum(class) ((int) ((class) / 2)) +#define classnum_to_namedclass(classnum) ((classnum) * 2) + +#define _invlist_union_complement_2nd(a, b, output) \ + _invlist_union_maybe_complement_2nd(a, b, TRUE, output) +#define _invlist_intersection_complement_2nd(a, b, output) \ + _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output) + +/* We add a marker if we are deferring expansion of a property that is both + * 1) potentiallly user-defined; and + * 2) could also be an official Unicode property. + * + * Without this marker, any deferred expansion can only be for a user-defined + * one. This marker shouldn't conflict with any that could be in a legal name, + * and is appended to its name to indicate this. There is a string and + * character form */ +#define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~" +#define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~' + +/* What is infinity for optimization purposes */ +#define OPTIMIZE_INFTY SSize_t_MAX + +/* About scan_data_t. + + During optimisation we recurse through the regexp program performing + various inplace (keyhole style) optimisations. In addition study_chunk + and scan_commit populate this data structure with information about + what strings MUST appear in the pattern. We look for the longest + string that must appear at a fixed location, and we look for the + longest string that may appear at a floating location. So for instance + in the pattern: + + /FOO[xX]A.*B[xX]BAR/ + + Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating + strings (because they follow a .* construct). study_chunk will identify + both FOO and BAR as being the longest fixed and floating strings respectively. + + The strings can be composites, for instance + + /(f)(o)(o)/ + + will result in a composite fixed substring 'foo'. + + For each string some basic information is maintained: + + - min_offset + This is the position the string must appear at, or not before. + It also implicitly (when combined with minlenp) tells us how many + characters must match before the string we are searching for. + Likewise when combined with minlenp and the length of the string it + tells us how many characters must appear after the string we have + found. + + - max_offset + Only used for floating strings. This is the rightmost point that + the string can appear at. If set to OPTIMIZE_INFTY it indicates that the + string can occur infinitely far to the right. + For fixed strings, it is equal to min_offset. + + - minlenp + A pointer to the minimum number of characters of the pattern that the + string was found inside. This is important as in the case of positive + lookahead or positive lookbehind we can have multiple patterns + involved. Consider + + /(?=FOO).*F/ + + The minimum length of the pattern overall is 3, the minimum length + of the lookahead part is 3, but the minimum length of the part that + will actually match is 1. So 'FOO's minimum length is 3, but the + minimum length for the F is 1. This is important as the minimum length + is used to determine offsets in front of and behind the string being + looked for. Since strings can be composites this is the length of the + pattern at the time it was committed with a scan_commit. Note that + the length is calculated by study_chunk, so that the minimum lengths + are not known until the full pattern has been compiled, thus the + pointer to the value. + + - lookbehind + + In the case of lookbehind the string being searched for can be + offset past the start point of the final matching string. + If this value was just blithely removed from the min_offset it would + invalidate some of the calculations for how many chars must match + before or after (as they are derived from min_offset and minlen and + the length of the string being searched for). + When the final pattern is compiled and the data is moved from the + scan_data_t structure into the regexp structure the information + about lookbehind is factored in, with the information that would + have been lost precalculated in the end_shift field for the + associated string. + + The fields pos_min and pos_delta are used to store the minimum offset + and the delta to the maximum offset at the current point in the pattern. + +*/ + +struct scan_data_substrs { + SV *str; /* longest substring found in pattern */ + SSize_t min_offset; /* earliest point in string it can appear */ + SSize_t max_offset; /* latest point in string it can appear */ + SSize_t *minlenp; /* pointer to the minlen relevant to the string */ + SSize_t lookbehind; /* is the pos of the string modified by LB */ + I32 flags; /* per substring SF_* and SCF_* flags */ +}; + +/* this is typedef'ed in perl.h */ +struct scan_data_t { + /*I32 len_min; unused */ + /*I32 len_delta; unused */ + SSize_t pos_min; + SSize_t pos_delta; + SV *last_found; + SSize_t last_end; /* min value, <0 unless valid. */ + SSize_t last_start_min; + SSize_t last_start_max; + U8 cur_is_floating; /* whether the last_* values should be set as + * the next fixed (0) or floating (1) + * substring */ + + /* [0] is longest fixed substring so far, [1] is longest float so far */ + struct scan_data_substrs substrs[2]; + + I32 flags; /* common SF_* and SCF_* flags */ + I32 whilem_c; + SSize_t *last_closep; + regnode **last_close_opp; /* pointer to pointer to last CLOSE regop + seen. DO NOT DEREFERENCE the regnode + pointer - the op may have been optimized + away */ + regnode_ssc *start_class; +}; + +/* + * Forward declarations for pregcomp()'s friends. + */ + +static const scan_data_t zero_scan_data = { + 0, 0, NULL, 0, 0, 0, 0, + { + { NULL, 0, 0, 0, 0, 0 }, + { NULL, 0, 0, 0, 0, 0 }, + }, + 0, 0, NULL, NULL, NULL +}; + +/* study flags */ + +#define SF_BEFORE_SEOL 0x0001 +#define SF_BEFORE_MEOL 0x0002 +#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) + +#define SF_IS_INF 0x0040 +#define SF_HAS_PAR 0x0080 +#define SF_IN_PAR 0x0100 +#define SF_HAS_EVAL 0x0200 + + +/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the + * longest substring in the pattern. When it is not set the optimiser keeps + * track of position, but does not keep track of the actual strings seen, + * + * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but + * /foo/i will not. + * + * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble" + * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be + * turned off because of the alternation (BRANCH). */ +#define SCF_DO_SUBSTR 0x0400 + +#define SCF_DO_STCLASS_AND 0x0800 +#define SCF_DO_STCLASS_OR 0x1000 +#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) +#define SCF_WHILEM_VISITED_POS 0x2000 + +#define SCF_TRIE_RESTUDY 0x4000 /* Need to do restudy in study_chunk()? + Search for "restudy" in this file + to find a detailed explanation.*/ +#define SCF_SEEN_ACCEPT 0x8000 +#define SCF_TRIE_DOING_RESTUDY 0x10000 /* Are we in restudy right now? + Search for "restudy" in this file + to find a detailed explanation. */ +#define SCF_IN_DEFINE 0x20000 + + + +#define UTF cBOOL(RExC_utf8) + +/* The enums for all these are ordered so things work out correctly */ +#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) +#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \ + == REGEX_DEPENDS_CHARSET) +#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) +#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \ + >= REGEX_UNICODE_CHARSET) +#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_RESTRICTED_CHARSET) +#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \ + >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \ + == REGEX_ASCII_MORE_RESTRICTED_CHARSET) + +#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) + +/* For programs that want to be strictly Unicode compatible by dying if any + * attempt is made to match a non-Unicode code point against a Unicode + * property. */ +#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE)) + +#define OOB_NAMEDCLASS -1 + +/* There is no code point that is out-of-bounds, so this is problematic. But + * its only current use is to initialize a variable that is always set before + * looked at. */ +#define OOB_UNICODE 0xDEADBEEF + +#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) + + +/* length of regex to show in messages that don't mark a position within */ +#define RegexLengthToShowInErrorMessages 127 + +/* + * If MARKER[12] are adjusted, be sure to adjust the constants at the top + * of t/op/regmesg.t, the tests in t/op/re_tests, and those in + * op/pragma/warn/regcomp. + */ +#define MARKER1 "<-- HERE" /* marker as it appears in the description */ +#define MARKER2 " <-- HERE " /* marker as it appears within the regex */ + +#define REPORT_LOCATION " in regex; marked by " MARKER1 \ + " in m/%" UTF8f MARKER2 "%" UTF8f "/" + +/* The code in this file in places uses one level of recursion with parsing + * rebased to an alternate string constructed by us in memory. This can take + * the form of something that is completely different from the input, or + * something that uses the input as part of the alternate. In the first case, + * there should be no possibility of an error, as we are in complete control of + * the alternate string. But in the second case we don't completely control + * the input portion, so there may be errors in that. Here's an example: + * /[abc\x{DF}def]/ui + * is handled specially because \x{df} folds to a sequence of more than one + * character: 'ss'. What is done is to create and parse an alternate string, + * which looks like this: + * /(?:\x{DF}|[abc\x{DF}def])/ui + * where it uses the input unchanged in the middle of something it constructs, + * which is a branch for the DF outside the character class, and clustering + * parens around the whole thing. (It knows enough to skip the DF inside the + * class while in this substitute parse.) 'abc' and 'def' may have errors that + * need to be reported. The general situation looks like this: + * + * |<------- identical ------>| + * sI tI xI eI + * Input: --------------------------------------------------------------- + * Constructed: --------------------------------------------------- + * sC tC xC eC EC + * |<------- identical ------>| + * + * sI..eI is the portion of the input pattern we are concerned with here. + * sC..EC is the constructed substitute parse string. + * sC..tC is constructed by us + * tC..eC is an exact duplicate of the portion of the input pattern tI..eI. + * In the diagram, these are vertically aligned. + * eC..EC is also constructed by us. + * xC is the position in the substitute parse string where we found a + * problem. + * xI is the position in the original pattern corresponding to xC. + * + * We want to display a message showing the real input string. Thus we need to + * translate from xC to xI. We know that xC >= tC, since the portion of the + * string sC..tC has been constructed by us, and so shouldn't have errors. We + * get: + * xI = tI + (xC - tC) + * + * When the substitute parse is constructed, the code needs to set: + * RExC_start (sC) + * RExC_end (eC) + * RExC_copy_start_in_input (tI) + * RExC_copy_start_in_constructed (tC) + * and restore them when done. + * + * During normal processing of the input pattern, both + * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to + * sI, so that xC equals xI. + */ + +#define sI RExC_precomp +#define eI RExC_precomp_end +#define sC RExC_start +#define eC RExC_end +#define tI RExC_copy_start_in_input +#define tC RExC_copy_start_in_constructed +#define xI(xC) (tI + (xC - tC)) +#define xI_offset(xC) (xI(xC) - sI) + +#define REPORT_LOCATION_ARGS(xC) \ + UTF8fARG(UTF, \ + (xI(xC) > eI) /* Don't run off end */ \ + ? eI - sI /* Length before the <--HERE */ \ + : ((xI_offset(xC) >= 0) \ + ? xI_offset(xC) \ + : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \ + IVdf " trying to output message for " \ + " pattern %.*s", \ + __FILE__, __LINE__, (IV) xI_offset(xC), \ + ((int) (eC - sC)), sC), 0)), \ + sI), /* The input pattern printed up to the <--HERE */ \ + UTF8fARG(UTF, \ + (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \ + (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */ + +/* Used to point after bad bytes for an error message, but avoid skipping + * past a nul byte. */ +#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1) + +/* Set up to clean up after our imminent demise */ +#define PREPARE_TO_DIE \ + STMT_START { \ + if (RExC_rx_sv) \ + SAVEFREESV(RExC_rx_sv); \ + if (RExC_open_parens) \ + SAVEFREEPV(RExC_open_parens); \ + if (RExC_close_parens) \ + SAVEFREEPV(RExC_close_parens); \ + if (RExC_logical_to_parno) \ + SAVEFREEPV(RExC_logical_to_parno); \ + if (RExC_parno_to_logical) \ + SAVEFREEPV(RExC_parno_to_logical); \ + } STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given + * arg. Show regex, up to a maximum length. If it's too long, chop and add + * "...". + */ +#define _FAIL(code) STMT_START { \ + const char *ellipses = ""; \ + IV len = RExC_precomp_end - RExC_precomp; \ + \ + PREPARE_TO_DIE; \ + if (len > RegexLengthToShowInErrorMessages) { \ + /* chop 10 shorter than the max, to ensure meaning of "..." */ \ + len = RegexLengthToShowInErrorMessages - 10; \ + ellipses = "..."; \ + } \ + code; \ +} STMT_END + +#define FAIL(msg) _FAIL( \ + Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \ + msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) + +#define FAIL2(msg,arg) _FAIL( \ + Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ + arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) + +#define FAIL3(msg,arg1,arg2) _FAIL( \ + Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ + arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses)) + +/* + * Simple_vFAIL -- like FAIL, but marks the current location in the scan + */ +#define Simple_vFAIL(m) STMT_START { \ + Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ + m, REPORT_LOCATION_ARGS(RExC_parse)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() + */ +#define vFAIL(m) STMT_START { \ + PREPARE_TO_DIE; \ + Simple_vFAIL(m); \ +} STMT_END + +/* + * Like Simple_vFAIL(), but accepts two arguments. + */ +#define Simple_vFAIL2(m,a1) STMT_START { \ + S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). + */ +#define vFAIL2(m,a1) STMT_START { \ + PREPARE_TO_DIE; \ + Simple_vFAIL2(m, a1); \ +} STMT_END + + +/* + * Like Simple_vFAIL(), but accepts three arguments. + */ +#define Simple_vFAIL3(m, a1, a2) STMT_START { \ + S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ +} STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). + */ +#define vFAIL3(m,a1,a2) STMT_START { \ + PREPARE_TO_DIE; \ + Simple_vFAIL3(m, a1, a2); \ +} STMT_END + +/* + * Like Simple_vFAIL(), but accepts four arguments. + */ +#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \ + S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ +} STMT_END + +#define vFAIL4(m,a1,a2,a3) STMT_START { \ + PREPARE_TO_DIE; \ + Simple_vFAIL4(m, a1, a2, a3); \ +} STMT_END + +/* A specialized version of vFAIL2 that works with UTF8f */ +#define vFAIL2utf8f(m, a1) STMT_START { \ + PREPARE_TO_DIE; \ + S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ +} STMT_END + +#define vFAIL3utf8f(m, a1, a2) STMT_START { \ + PREPARE_TO_DIE; \ + S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \ + REPORT_LOCATION_ARGS(RExC_parse)); \ +} STMT_END + +/* Setting this to NULL is a signal to not output warnings */ +#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \ + STMT_START { \ + RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\ + RExC_copy_start_in_constructed = NULL; \ + } STMT_END +#define RESTORE_WARNINGS \ + RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed + +/* Since a warning can be generated multiple times as the input is reparsed, we + * output it the first time we come to that point in the parse, but suppress it + * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not + * generate any warnings */ +#define TO_OUTPUT_WARNINGS(loc) \ + ( RExC_copy_start_in_constructed \ + && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset) + +/* After we've emitted a warning, we save the position in the input so we don't + * output it again */ +#define UPDATE_WARNINGS_LOC(loc) \ + STMT_START { \ + if (TO_OUTPUT_WARNINGS(loc)) { \ + RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \ + - RExC_precomp; \ + } \ + } STMT_END + +/* 'warns' is the output of the packWARNx macro used in 'code' */ +#define _WARN_HELPER(loc, warns, code) \ + STMT_START { \ + if (! RExC_copy_start_in_constructed) { \ + Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \ + " expected at '%s'", \ + __FILE__, __LINE__, loc); \ + } \ + if (TO_OUTPUT_WARNINGS(loc)) { \ + if (ckDEAD(warns)) \ + PREPARE_TO_DIE; \ + code; \ + UPDATE_WARNINGS_LOC(loc); \ + } \ + } STMT_END + +/* m is not necessarily a "literal string", in this macro */ +#define warn_non_literal_string(loc, packed_warn, m) \ + _WARN_HELPER(loc, packed_warn, \ + Perl_warner(aTHX_ packed_warn, \ + "%s" REPORT_LOCATION, \ + m, REPORT_LOCATION_ARGS(loc))) +#define reg_warn_non_literal_string(loc, m) \ + warn_non_literal_string(loc, packWARN(WARN_REGEXP), m) + +#define ckWARN2_non_literal_string(loc, packwarn, m, a1) \ + STMT_START { \ + char * format; \ + Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\ + Newx(format, format_size, char); \ + my_strlcpy(format, m, format_size); \ + my_strlcat(format, REPORT_LOCATION, format_size); \ + SAVEFREEPV(format); \ + _WARN_HELPER(loc, packwarn, \ + Perl_ck_warner(aTHX_ packwarn, \ + format, \ + a1, REPORT_LOCATION_ARGS(loc))); \ + } STMT_END + +#define ckWARNreg(loc,m) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc))) + +#define vWARN(loc, m) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc))) \ + +#define vWARN_dep(loc,category,m) \ + _WARN_HELPER(loc, packWARN(category), \ + Perl_warner(aTHX_ packWARN(category), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc))) + +#define ckWARNdep(loc,category,m) \ + _WARN_HELPER(loc, packWARN(category), \ + Perl_ck_warner_d(aTHX_ packWARN(category), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc))) + +#define ckWARNregdep(loc,category,m) \ + _WARN_HELPER(loc, packWARN2(category, WARN_REGEXP), \ + Perl_ck_warner_d(aTHX_ packWARN2(category, \ + WARN_REGEXP), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc))) + +#define ckWARN2reg_d(loc,m, a1) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(loc))) + +#define ckWARN2reg(loc, m, a1) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, REPORT_LOCATION_ARGS(loc))) + +#define vWARN3(loc, m, a1, a2) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, REPORT_LOCATION_ARGS(loc))) + +#define ckWARN3reg(loc, m, a1, a2) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, \ + REPORT_LOCATION_ARGS(loc))) + +#define vWARN4(loc, m, a1, a2, a3) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, a3, \ + REPORT_LOCATION_ARGS(loc))) + +#define ckWARN4reg(loc, m, a1, a2, a3) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, a3, \ + REPORT_LOCATION_ARGS(loc))) + +#define vWARN5(loc, m, a1, a2, a3, a4) \ + _WARN_HELPER(loc, packWARN(WARN_REGEXP), \ + Perl_warner(aTHX_ packWARN(WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, a2, a3, a4, \ + REPORT_LOCATION_ARGS(loc))) + +#define ckWARNexperimental(loc, class, m) \ + STMT_START { \ + if (! RExC_warned_ ## class) { /* warn once per compilation */ \ + RExC_warned_ ## class = 1; \ + _WARN_HELPER(loc, packWARN(class), \ + Perl_ck_warner_d(aTHX_ packWARN(class), \ + m REPORT_LOCATION, \ + REPORT_LOCATION_ARGS(loc)));\ + } \ + } STMT_END + +#define ckWARNexperimental_with_arg(loc, class, m, arg) \ + STMT_START { \ + if (! RExC_warned_ ## class) { /* warn once per compilation */ \ + RExC_warned_ ## class = 1; \ + _WARN_HELPER(loc, packWARN(class), \ + Perl_ck_warner_d(aTHX_ packWARN(class), \ + m REPORT_LOCATION, \ + arg, REPORT_LOCATION_ARGS(loc)));\ + } \ + } STMT_END + +/* Convert between a pointer to a node and its offset from the beginning of the + * program */ +#define REGNODE_p(offset) (RExC_emit_start + (offset)) +#define REGNODE_OFFSET(node) (__ASSERT_((node) >= RExC_emit_start) \ + (SSize_t) ((node) - RExC_emit_start)) + +#define ProgLen(ri) ri->proglen +#define SetProgLen(ri,x) ri->proglen = x + +#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS +#define EXPERIMENTAL_INPLACESCAN +#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ + +#define DEBUG_RExC_seen() \ + DEBUG_OPTIMISE_MORE_r({ \ + Perl_re_printf( aTHX_ "RExC_seen: "); \ + \ + if (RExC_seen & REG_ZERO_LEN_SEEN) \ + Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \ + \ + if (RExC_seen & REG_LOOKBEHIND_SEEN) \ + Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \ + \ + if (RExC_seen & REG_GPOS_SEEN) \ + Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \ + \ + if (RExC_seen & REG_RECURSE_SEEN) \ + Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \ + \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \ + \ + if (RExC_seen & REG_VERBARG_SEEN) \ + Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \ + \ + if (RExC_seen & REG_CUTGROUP_SEEN) \ + Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \ + \ + if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ + Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \ + \ + if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ + Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \ + \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \ + \ + if (RExC_seen & REG_PESSIMIZE_SEEN) \ + Perl_re_printf( aTHX_ "REG_PESSIMIZE_SEEN "); \ + \ + Perl_re_printf( aTHX_ "\n"); \ + }); + +#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \ + if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag) + + +#ifdef DEBUGGING +# define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) \ + debug_studydata(where, data, depth, is_inf, min, stopmin, delta) + +# define DEBUG_PEEP(str, scan, depth, flags) \ + debug_peep(str, pRExC_state, scan, depth, flags) +#else +# define DEBUG_STUDYDATA(where, data, depth, is_inf, min, stopmin, delta) NOOP +# define DEBUG_PEEP(str, scan, depth, flags) NOOP +#endif + +#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1) +#ifdef DEBUGGING +#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1) +#else +#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) +#endif + +#define MADE_TRIE 1 +#define MADE_JUMP_TRIE 2 +#define MADE_EXACT_TRIE 4 + +#define INVLIST_INDEX 0 +#define ONLY_LOCALE_MATCHES_INDEX 1 +#define DEFERRED_USER_DEFINED_INDEX 2 + +/* These two functions currently do the exact same thing */ +#define ssc_init_zero ssc_init + +#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp)) +#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX) + +#ifdef DEBUGGING +#define REGNODE_GUTS(state,op,extra_size) \ + regnode_guts_debug(state,op,extra_size) +#else +#define REGNODE_GUTS(state,op,extra_size) \ + regnode_guts(state,extra_size) +#endif + +#define CLEAR_OPTSTART \ + if (optstart) STMT_START { \ + DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \ + " (%" IVdf " nodes)\n", (IV)(node - optstart))); \ + optstart=NULL; \ + } STMT_END + +#define DUMPUNTIL(b,e) \ + CLEAR_OPTSTART; \ + node = dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1); + +#define REGNODE_STEP_OVER(ret,t1,t2) \ + NEXT_OFF(REGNODE_p(ret)) = ((sizeof(t1)+sizeof(t2))/sizeof(regnode)) + +#endif /* REGCOMP_INTERNAL_H */ |