summaryrefslogtreecommitdiff
path: root/gnu/usr.bin/perl/regcomp.c
diff options
context:
space:
mode:
authorTodd C. Miller <millert@cvs.openbsd.org>2002-10-27 22:25:41 +0000
committerTodd C. Miller <millert@cvs.openbsd.org>2002-10-27 22:25:41 +0000
commitd85c2f57f17d991a6ca78d3e1c9f3308a2bbb271 (patch)
tree8c9a359433cbb3488b0a848e99bd869c76295dfd /gnu/usr.bin/perl/regcomp.c
parent74cfb115ac810480c0000dc742b20383c1578bac (diff)
Resolve conflicts, remove old files, merge local changes
Diffstat (limited to 'gnu/usr.bin/perl/regcomp.c')
-rw-r--r--gnu/usr.bin/perl/regcomp.c3135
1 files changed, 1812 insertions, 1323 deletions
diff --git a/gnu/usr.bin/perl/regcomp.c b/gnu/usr.bin/perl/regcomp.c
index b0d238f168d..3055876890c 100644
--- a/gnu/usr.bin/perl/regcomp.c
+++ b/gnu/usr.bin/perl/regcomp.c
@@ -40,10 +40,10 @@
/* *These* symbols are masked to allow static link. */
# define Perl_regnext my_regnext
# define Perl_save_re_context my_save_re_context
-# define Perl_reginitcolors my_reginitcolors
+# define Perl_reginitcolors my_reginitcolors
# define PERL_NO_GET_CONTEXT
-#endif
+#endif
/*SUPPRESS 112*/
/*
@@ -69,7 +69,7 @@
*
**** Alterations to Henry's code are...
****
- **** Copyright (c) 1991-2001, Larry Wall
+ **** Copyright (c) 1991-2002, Larry Wall
****
**** You may distribute under the terms of either the GNU General Public
**** License or the Artistic License, as specified in the README file.
@@ -83,11 +83,7 @@
#define PERL_IN_REGCOMP_C
#include "perl.h"
-#ifdef PERL_IN_XSUB_RE
-# if defined(PERL_CAPI) || defined(PERL_OBJECT)
-# include "XSUB.h"
-# endif
-#else
+#ifndef PERL_IN_XSUB_RE
# include "INTERN.h"
#endif
@@ -111,6 +107,51 @@
#define STATIC static
#endif
+typedef struct RExC_state_t {
+ U32 flags; /* are we folding, multilining? */
+ char *precomp; /* uncompiled string. */
+ regexp *rx;
+ char *start; /* Start of input for compile */
+ char *end; /* End of input for compile */
+ char *parse; /* Input-scan pointer. */
+ I32 whilem_seen; /* number of WHILEM in this expr */
+ regnode *emit_start; /* Start of emitted-code area */
+ regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
+ I32 naughty; /* How bad is this pattern? */
+ I32 sawback; /* Did we see \1, ...? */
+ U32 seen;
+ I32 size; /* Code size. */
+ I32 npar; /* () count. */
+ I32 extralen;
+ I32 seen_zerolen;
+ I32 seen_evals;
+ I32 utf8;
+#if ADD_TO_REGEXEC
+ char *starttry; /* -Dr: where regtry was called. */
+#define RExC_starttry (pRExC_state->starttry)
+#endif
+} RExC_state_t;
+
+#define RExC_flags (pRExC_state->flags)
+#define RExC_precomp (pRExC_state->precomp)
+#define RExC_rx (pRExC_state->rx)
+#define RExC_start (pRExC_state->start)
+#define RExC_end (pRExC_state->end)
+#define RExC_parse (pRExC_state->parse)
+#define RExC_whilem_seen (pRExC_state->whilem_seen)
+#define RExC_offsets (pRExC_state->rx->offsets) /* I am not like the others */
+#define RExC_emit (pRExC_state->emit)
+#define RExC_emit_start (pRExC_state->emit_start)
+#define RExC_naughty (pRExC_state->naughty)
+#define RExC_sawback (pRExC_state->sawback)
+#define RExC_seen (pRExC_state->seen)
+#define RExC_size (pRExC_state->size)
+#define RExC_npar (pRExC_state->npar)
+#define RExC_extralen (pRExC_state->extralen)
+#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
+#define RExC_seen_evals (pRExC_state->seen_evals)
+#define RExC_utf8 (pRExC_state->utf8)
+
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
((*s) == '{' && regcurly(s)))
@@ -154,7 +195,7 @@ typedef struct scan_data_t {
* Forward declarations for pregcomp()'s friends.
*/
-static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0};
#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
@@ -186,13 +227,11 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
#define SCF_WHILEM_VISITED_POS 0x2000
-#define RF_utf8 8
-#define UTF (PL_reg_flags & RF_utf8)
-#define LOC (PL_regflags & PMf_LOCALE)
-#define FOLD (PL_regflags & PMf_FOLD)
+#define UTF (RExC_utf8 != 0)
+#define LOC ((RExC_flags & PMf_LOCALE) != 0)
+#define FOLD ((RExC_flags & PMf_FOLD) != 0)
-#define OOB_CHAR8 1234
-#define OOB_UTF8 123456
+#define OOB_UNICODE 12345678
#define OOB_NAMEDCLASS -1
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
@@ -207,10 +246,10 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
* of t/op/regmesg.t, the tests in t/op/re_tests, and those in
* op/pragma/warn/regcomp.
*/
-#define MARKER1 "HERE" /* marker as it appears in the description */
-#define MARKER2 " << HERE " /* marker as it appears within the regex */
-
-#define REPORT_LOCATION " before " MARKER1 " mark in regex m/%.*s" MARKER2 "%s/"
+#define MARKER1 "<-- HERE" /* marker as it appears in the description */
+#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
+
+#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
/*
* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
@@ -220,10 +259,10 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#define FAIL(msg) \
STMT_START { \
char *ellipses = ""; \
- unsigned len = strlen(PL_regprecomp); \
+ IV len = RExC_end - RExC_precomp; \
\
if (!SIZE_ONLY) \
- SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \
+ SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
\
if (len > RegexLengthToShowInErrorMessages) { \
/* chop 10 shorter than the max, to ensure meaning of "..." */ \
@@ -231,7 +270,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
ellipses = "..."; \
} \
Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
- msg, (int)len, PL_regprecomp, ellipses); \
+ msg, (int)len, RExC_precomp, ellipses); \
} STMT_END
/*
@@ -242,10 +281,10 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#define FAIL2(pat,msg) \
STMT_START { \
char *ellipses = ""; \
- unsigned len = strlen(PL_regprecomp); \
+ IV len = RExC_end - RExC_precomp; \
\
if (!SIZE_ONLY) \
- SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \
+ SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
\
if (len > RegexLengthToShowInErrorMessages) { \
/* chop 10 shorter than the max, to ensure meaning of "..." */ \
@@ -253,7 +292,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
ellipses = "..."; \
} \
S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \
- msg, (int)len, PL_regprecomp, ellipses); \
+ msg, (int)len, RExC_precomp, ellipses); \
} STMT_END
@@ -262,10 +301,10 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
*/
#define Simple_vFAIL(m) \
STMT_START { \
- unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
- m, (int)offset, PL_regprecomp, PL_regprecomp + offset); \
+ m, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
/*
@@ -274,7 +313,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#define vFAIL(m) \
STMT_START { \
if (!SIZE_ONLY) \
- SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \
+ SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
Simple_vFAIL(m); \
} STMT_END
@@ -283,10 +322,10 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
*/
#define Simple_vFAIL2(m,a1) \
STMT_START { \
- unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
- (int)offset, PL_regprecomp, PL_regprecomp + offset); \
+ (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
/*
@@ -295,7 +334,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#define vFAIL2(m,a1) \
STMT_START { \
if (!SIZE_ONLY) \
- SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \
+ SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
Simple_vFAIL2(m, a1); \
} STMT_END
@@ -305,10 +344,10 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
*/
#define Simple_vFAIL3(m, a1, a2) \
STMT_START { \
- unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
- (int)offset, PL_regprecomp, PL_regprecomp + offset); \
+ (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
/*
@@ -317,7 +356,7 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#define vFAIL3(m,a1,a2) \
STMT_START { \
if (!SIZE_ONLY) \
- SAVEDESTRUCTOR_X(clear_re,(void*)PL_regcomp_rx); \
+ SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
Simple_vFAIL3(m, a1, a2); \
} STMT_END
@@ -326,10 +365,10 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
*/
#define Simple_vFAIL4(m, a1, a2, a3) \
STMT_START { \
- unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
\
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\
- (int)offset, PL_regprecomp, PL_regprecomp + offset); \
+ (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
/*
@@ -337,78 +376,139 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
*/
#define Simple_vFAIL5(m, a1, a2, a3, a4) \
STMT_START { \
- unsigned offset = strlen(PL_regprecomp)-(PL_regxend-PL_regcomp_parse); \
+ IV offset = RExC_parse - RExC_precomp; \
S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\
- (int)offset, PL_regprecomp, PL_regprecomp + offset); \
+ (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define vWARN(loc,m) \
STMT_START { \
- unsigned offset = strlen(PL_regprecomp)-(PL_regxend-(loc)); \
- Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\
- m, (int)offset, PL_regprecomp, PL_regprecomp + offset); \
+ IV offset = loc - RExC_precomp; \
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION,\
+ m, (int)offset, RExC_precomp, RExC_precomp + offset); \
+ } STMT_END \
+
+#define vWARNdep(loc,m) \
+ STMT_START { \
+ IV offset = loc - RExC_precomp; \
+ Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), "%s" REPORT_LOCATION,\
+ m, (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END \
#define vWARN2(loc, m, a1) \
STMT_START { \
- unsigned offset = strlen(PL_regprecomp)-(PL_regxend-(loc)); \
- Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
+ IV offset = loc - RExC_precomp; \
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
a1, \
- (int)offset, PL_regprecomp, PL_regprecomp + offset); \
+ (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define vWARN3(loc, m, a1, a2) \
STMT_START { \
- unsigned offset = strlen(PL_regprecomp) - (PL_regxend - (loc)); \
- Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \
+ IV offset = loc - RExC_precomp; \
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
a1, a2, \
- (int)offset, PL_regprecomp, PL_regprecomp + offset); \
+ (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
#define vWARN4(loc, m, a1, a2, a3) \
STMT_START { \
- unsigned offset = strlen(PL_regprecomp)-(PL_regxend-(loc)); \
- Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\
+ IV offset = loc - RExC_precomp; \
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION,\
a1, a2, a3, \
- (int)offset, PL_regprecomp, PL_regprecomp + offset); \
+ (int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
+/* used for the parse_flags section for (?c) -- japhy */
+#define vWARN5(loc, m, a1, a2, a3, a4) \
+ STMT_START { \
+ IV offset = loc - RExC_precomp; \
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
+ a1, a2, a3, a4, \
+ (int)offset, RExC_precomp, RExC_precomp + offset); \
+ } STMT_END
/* Allow for side effects in s */
-#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END
+#define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (void)(s);} STMT_END
+
+/* Macros for recording node offsets. 20001227 mjd@plover.com
+ * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
+ * element 2*n-1 of the array. Element #2n holds the byte length node #n.
+ * Element 0 holds the number n.
+ */
-static void clear_re(pTHXo_ void *r);
+#define MJD_OFFSET_DEBUG(x)
+/* #define MJD_OFFSET_DEBUG(x) fprintf x */
+
+
+# define Set_Node_Offset_To_R(node,byte) \
+ STMT_START { \
+ if (! SIZE_ONLY) { \
+ if((node) < 0) { \
+ Perl_croak(aTHX_ "value of node is %d in Offset macro", node); \
+ } else { \
+ RExC_offsets[2*(node)-1] = (byte); \
+ } \
+ } \
+ } STMT_END
+
+# define Set_Node_Offset(node,byte) Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
+# define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
+
+# define Set_Node_Length_To_R(node,len) \
+ STMT_START { \
+ if (! SIZE_ONLY) { \
+ MJD_OFFSET_DEBUG((stderr, "** (%d) size of node %d is %d.\n", __LINE__, (node), (len))); \
+ if((node) < 0) { \
+ Perl_croak(aTHX_ "value of node is %d in Length macro", node); \
+ } else { \
+ RExC_offsets[2*(node)] = (len); \
+ } \
+ } \
+ } STMT_END
+
+# define Set_Node_Length(node,len) Set_Node_Length_To_R((node)-RExC_emit_start, len)
+# define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
+# define Set_Node_Cur_Length(node) Set_Node_Length(node, RExC_parse - parse_start)
+
+/* Get offsets and lengths */
+#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
+#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
+
+static void clear_re(pTHX_ void *r);
/* Mark that we cannot extend a found fixed substring at this point.
Updata the longest found anchored substring and the longest found
floating substrings if needed. */
STATIC void
-S_scan_commit(pTHX_ scan_data_t *data)
+S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
{
STRLEN l = CHR_SVLEN(data->last_found);
STRLEN old_l = CHR_SVLEN(*data->longest);
-
+
if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
sv_setsv(*data->longest, data->last_found);
if (*data->longest == data->longest_fixed) {
data->offset_fixed = l ? data->last_start_min : data->pos_min;
if (data->flags & SF_BEFORE_EOL)
- data->flags
+ data->flags
|= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
else
data->flags &= ~SF_FIX_BEFORE_EOL;
}
else {
data->offset_float_min = l ? data->last_start_min : data->pos_min;
- data->offset_float_max = (l
- ? data->last_start_max
+ data->offset_float_max = (l
+ ? data->last_start_max
: data->pos_min + data->pos_delta);
+ if ((U32)data->offset_float_max > (U32)I32_MAX)
+ data->offset_float_max = I32_MAX;
if (data->flags & SF_BEFORE_EOL)
- data->flags
+ data->flags
|= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
else
data->flags &= ~SF_FL_BEFORE_EOL;
@@ -421,14 +521,11 @@ S_scan_commit(pTHX_ scan_data_t *data)
/* Can match anything (initialization) */
STATIC void
-S_cl_anything(pTHX_ struct regnode_charclass_class *cl)
+S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
{
- int value;
-
ANYOF_CLASS_ZERO(cl);
- for (value = 0; value < 256; ++value)
- ANYOF_BITMAP_SET(cl, value);
- cl->flags = ANYOF_EOS;
+ ANYOF_BITMAP_SETALL(cl);
+ cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
if (LOC)
cl->flags |= ANYOF_LOCALE;
}
@@ -442,27 +539,28 @@ S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl)
for (value = 0; value <= ANYOF_MAX; value += 2)
if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
return 1;
- for (value = 0; value < 256; ++value)
- if (!ANYOF_BITMAP_TEST(cl, value))
- return 0;
+ if (!(cl->flags & ANYOF_UNICODE_ALL))
+ return 0;
+ if (!ANYOF_BITMAP_TESTALLSET(cl))
+ return 0;
return 1;
}
/* Can match anything (initialization) */
STATIC void
-S_cl_init(pTHX_ struct regnode_charclass_class *cl)
+S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
{
Zero(cl, 1, struct regnode_charclass_class);
cl->type = ANYOF;
- cl_anything(cl);
+ cl_anything(pRExC_state, cl);
}
STATIC void
-S_cl_init_zero(pTHX_ struct regnode_charclass_class *cl)
+S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
{
Zero(cl, 1, struct regnode_charclass_class);
cl->type = ANYOF;
- cl_anything(cl);
+ cl_anything(pRExC_state, cl);
if (LOC)
cl->flags |= ANYOF_LOCALE;
}
@@ -489,12 +587,22 @@ S_cl_and(pTHX_ struct regnode_charclass_class *cl,
} /* XXXX: logic is complicated otherwise, leave it along for a moment. */
if (!(and_with->flags & ANYOF_EOS))
cl->flags &= ~ANYOF_EOS;
+
+ if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
+ cl->flags &= ~ANYOF_UNICODE_ALL;
+ cl->flags |= ANYOF_UNICODE;
+ ARG_SET(cl, ARG(and_with));
+ }
+ if (!(and_with->flags & ANYOF_UNICODE_ALL))
+ cl->flags &= ~ANYOF_UNICODE_ALL;
+ if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
+ cl->flags &= ~ANYOF_UNICODE;
}
/* 'OR' a given class with another one. Can create false positives */
/* We assume that cl is not inverted */
STATIC void
-S_cl_or(pTHX_ struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
+S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with)
{
if (or_with->flags & ANYOF_INVERT) {
/* We do not use
@@ -515,12 +623,12 @@ S_cl_or(pTHX_ struct regnode_charclass_class *cl, struct regnode_charclass_class
cl->bitmap[i] |= ~or_with->bitmap[i];
} /* XXXX: logic is complicated otherwise */
else {
- cl_anything(cl);
+ cl_anything(pRExC_state, cl);
}
} else {
/* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && (!(or_with->flags & ANYOF_FOLD)
+ && (!(or_with->flags & ANYOF_FOLD)
|| (cl->flags & ANYOF_FOLD)) ) {
int i;
@@ -534,13 +642,34 @@ S_cl_or(pTHX_ struct regnode_charclass_class *cl, struct regnode_charclass_class
}
}
else { /* XXXX: logic is complicated, leave it along for a moment. */
- cl_anything(cl);
+ cl_anything(pRExC_state, cl);
}
}
if (or_with->flags & ANYOF_EOS)
cl->flags |= ANYOF_EOS;
+
+ if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
+ ARG(cl) != ARG(or_with)) {
+ cl->flags |= ANYOF_UNICODE_ALL;
+ cl->flags &= ~ANYOF_UNICODE;
+ }
+ if (or_with->flags & ANYOF_UNICODE_ALL) {
+ cl->flags |= ANYOF_UNICODE_ALL;
+ cl->flags &= ~ANYOF_UNICODE;
+ }
}
+/*
+ * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
+ * These need to be revisited when a newer toolchain becomes available.
+ */
+#if defined(__sparc64__) && defined(__GNUC__)
+# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
+# undef SPARC64_GCC_WORKAROUND
+# define SPARC64_GCC_WORKAROUND 1
+# endif
+#endif
+
/* REx optimizer. Converts nodes into quickier variants "in place".
Finds fixed substrings. */
@@ -548,7 +677,7 @@ S_cl_or(pTHX_ struct regnode_charclass_class *cl, struct regnode_charclass_class
to the position after last scanned or to NULL. */
STATIC I32
-S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
+S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags)
/* scanp: Start here (read-write). */
/* deltap: Write maxlen-minlen here. */
/* last: Stop before this one. */
@@ -561,7 +690,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
scan_data_t data_fake;
struct regnode_charclass_class and_with; /* Valid if flags & SCF_DO_STCLASS_OR */
-
+
while (scan && OP(scan) != END && scan < last) {
/* Peephole optimizer: */
@@ -571,12 +700,12 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
U32 stringok = 1;
#ifdef DEBUGGING
regnode *stop = scan;
-#endif
+#endif
next = scan + NODE_SZ_STR(scan);
/* Skip NOTHING, merge EXACT*. */
while (n &&
- ( PL_regkind[(U8)OP(n)] == NOTHING ||
+ ( PL_regkind[(U8)OP(n)] == NOTHING ||
(stringok && (OP(n) == OP(scan))))
&& NEXT_OFF(n)
&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
@@ -588,14 +717,14 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
#ifdef DEBUGGING
if (stringok)
stop = n;
-#endif
+#endif
n = regnext(n);
}
else if (stringok) {
int oldl = STR_LEN(scan);
regnode *nnext = regnext(n);
- if (oldl + STR_LEN(n) > U8_MAX)
+ if (oldl + STR_LEN(n) > U8_MAX)
break;
NEXT_OFF(scan) += NEXT_OFF(n);
STR_LEN(scan) += STR_LEN(n);
@@ -604,10 +733,54 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
#ifdef DEBUGGING
stop = next - 1;
-#endif
+#endif
n = nnext;
}
}
+
+ if (UTF && OP(scan) == EXACTF && STR_LEN(scan) >= 6) {
+/*
+ Two problematic code points in Unicode casefolding of EXACT nodes:
+
+ U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
+ U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
+
+ which casefold to
+
+ Unicode UTF-8
+
+ U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
+ U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
+
+ This means that in case-insensitive matching (or "loose matching",
+ as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
+ length of the above casefolded versions) can match a target string
+ of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
+ This would rather mess up the minimum length computation.
+
+ What we'll do is to look for the tail four bytes, and then peek
+ at the preceding two bytes to see whether we need to decrease
+ the minimum length by four (six minus two).
+
+ Thanks to the design of UTF-8, there cannot be false matches:
+ A sequence of valid UTF-8 bytes cannot be a subsequence of
+ another valid sequence of UTF-8 bytes.
+
+*/
+ char *s0 = STRING(scan), *s, *t;
+ char *s1 = s0 + STR_LEN(scan) - 1, *s2 = s1 - 4;
+ char *t0 = "\xcc\x88\xcc\x81";
+ char *t1 = t0 + 3;
+
+ for (s = s0 + 2;
+ s < s2 && (t = ninstr(s, s1, t0, t1));
+ s = t + 4) {
+ if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
+ ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
+ min -= 4;
+ }
+ }
+
#ifdef DEBUGGING
/* Allow dumping */
n = scan + NODE_SZ_STR(scan);
@@ -630,7 +803,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
int noff;
regnode *n = scan;
-
+
/* Skip NOTHING and LONGJMP. */
while ((n = regnext(n))
&& ((PL_regkind[(U8)OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
@@ -639,31 +812,31 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
off += noff;
if (reg_off_by_arg[OP(scan)])
ARG(scan) = off;
- else
+ else
NEXT_OFF(scan) = off;
}
/* The principal pseudo-switch. Cannot be a switch, since we
look into several different things. */
- if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
+ if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
|| OP(scan) == IFTHEN || OP(scan) == SUSPEND) {
next = regnext(scan);
code = OP(scan);
-
- if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
+
+ if (OP(next) == code || code == IFTHEN || code == SUSPEND) {
I32 max1 = 0, min1 = I32_MAX, num = 0;
struct regnode_charclass_class accum;
if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
- scan_commit(data); /* Cannot merge strings after this. */
+ scan_commit(pRExC_state, data); /* Cannot merge strings after this. */
if (flags & SCF_DO_STCLASS)
- cl_init_zero(&accum);
+ cl_init_zero(pRExC_state, &accum);
while (OP(scan) == code) {
- I32 deltanext, minnext, f = 0, fake = 0;
+ I32 deltanext, minnext, f = 0, fake;
struct regnode_charclass_class this_class;
num++;
data_fake.flags = 0;
- if (data) {
+ if (data) {
data_fake.whilem_c = data->whilem_c;
data_fake.last_closep = data->last_closep;
}
@@ -674,16 +847,16 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
if (code != BRANCH)
scan = NEXTOPER(scan);
if (flags & SCF_DO_STCLASS) {
- cl_init(&this_class);
+ cl_init(pRExC_state, &this_class);
data_fake.start_class = &this_class;
f = SCF_DO_STCLASS_AND;
- }
+ }
if (flags & SCF_WHILEM_VISITED_POS)
f |= SCF_WHILEM_VISITED_POS;
/* we suppose the run is continuous, last=next...*/
- minnext = study_chunk(&scan, &deltanext, next,
- &data_fake, f);
- if (min1 > minnext)
+ minnext = study_chunk(pRExC_state, &scan, &deltanext,
+ next, &data_fake, f);
+ if (min1 > minnext)
min1 = minnext;
if (max1 < minnext + deltanext)
max1 = minnext + deltanext;
@@ -697,8 +870,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
if (data)
data->whilem_c = data_fake.whilem_c;
if (flags & SCF_DO_STCLASS)
- cl_or(&accum, &this_class);
- if (code == SUSPEND)
+ cl_or(pRExC_state, &accum, &this_class);
+ if (code == SUSPEND)
break;
}
if (code == IFTHEN && num < 2) /* Empty ELSE branch */
@@ -712,7 +885,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
min += min1;
delta += max1 - min1;
if (flags & SCF_DO_STCLASS_OR) {
- cl_or(data->start_class, &accum);
+ cl_or(pRExC_state, data->start_class, &accum);
if (min1) {
cl_and(data->start_class, &and_with);
flags &= ~SCF_DO_STCLASS;
@@ -724,7 +897,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
flags &= ~SCF_DO_STCLASS;
}
else {
- /* Switch to OR mode: cache the old value of
+ /* Switch to OR mode: cache the old value of
* data->start_class */
StructCopy(data->start_class, &and_with,
struct regnode_charclass_class);
@@ -744,15 +917,11 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
}
else if (OP(scan) == EXACT) {
I32 l = STR_LEN(scan);
+ UV uc = *((U8*)STRING(scan));
if (UTF) {
- unsigned char *s = (unsigned char *)STRING(scan);
- unsigned char *e = s + l;
- I32 newl = 0;
- while (s < e) {
- newl++;
- s += UTF8SKIP(s);
- }
- l = newl;
+ U8 *s = (U8*)STRING(scan);
+ l = utf8_length(s, s + l);
+ uc = utf8_to_uvchr(s, NULL);
}
min += l;
if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
@@ -761,9 +930,11 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
if (data->last_end == -1) { /* Update the start info. */
data->last_start_min = data->pos_min;
data->last_start_max = is_inf
- ? I32_MAX : data->pos_min + data->pos_delta;
+ ? I32_MAX : data->pos_min + data->pos_delta;
}
sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
+ if (UTF)
+ SvUTF8_on(data->last_found);
data->last_end = data->pos_min + l;
data->pos_min += l; /* As in the first entry. */
data->flags &= ~SF_BEFORE_EOL;
@@ -772,21 +943,27 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
/* Check whether it is compatible with what we know already! */
int compat = 1;
- if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
- && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan))
+ if (uc >= 0x100 ||
+ (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+ && !ANYOF_BITMAP_TEST(data->start_class, uc)
&& (!(data->start_class->flags & ANYOF_FOLD)
- || !ANYOF_BITMAP_TEST(data->start_class,
- PL_fold[*(U8*)STRING(scan)])))
+ || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
+ )
compat = 0;
ANYOF_CLASS_ZERO(data->start_class);
ANYOF_BITMAP_ZERO(data->start_class);
if (compat)
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
+ if (uc < 0x100)
+ data->start_class->flags &= ~ANYOF_UNICODE_ALL;
}
else if (flags & SCF_DO_STCLASS_OR) {
/* false positive possible if the class is case-folded */
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ if (uc < 0x100)
+ ANYOF_BITMAP_SET(data->start_class, uc);
+ else
+ data->start_class->flags |= ANYOF_UNICODE_ALL;
data->start_class->flags &= ~ANYOF_EOS;
cl_and(data->start_class, &and_with);
}
@@ -794,19 +971,15 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
}
else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
I32 l = STR_LEN(scan);
+ UV uc = *((U8*)STRING(scan));
/* Search for fixed substrings supports EXACT only. */
- if (flags & SCF_DO_SUBSTR)
- scan_commit(data);
+ if (flags & SCF_DO_SUBSTR)
+ scan_commit(pRExC_state, data);
if (UTF) {
- unsigned char *s = (unsigned char *)STRING(scan);
- unsigned char *e = s + l;
- I32 newl = 0;
- while (s < e) {
- newl++;
- s += UTF8SKIP(s);
- }
- l = newl;
+ U8 *s = (U8 *)STRING(scan);
+ l = utf8_length(s, s + l);
+ uc = utf8_to_uvchr(s, NULL);
}
min += l;
if (data && (flags & SCF_DO_SUBSTR))
@@ -815,15 +988,15 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
/* Check whether it is compatible with what we know already! */
int compat = 1;
- if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
- && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan))
- && !ANYOF_BITMAP_TEST(data->start_class,
- PL_fold[*(U8*)STRING(scan)]))
+ if (uc >= 0x100 ||
+ (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+ && !ANYOF_BITMAP_TEST(data->start_class, uc)
+ && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
compat = 0;
ANYOF_CLASS_ZERO(data->start_class);
ANYOF_BITMAP_ZERO(data->start_class);
if (compat) {
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
data->start_class->flags |= ANYOF_FOLD;
if (OP(scan) == EXACTFL)
@@ -834,7 +1007,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
if (data->start_class->flags & ANYOF_FOLD) {
/* false positive possible if the class is case-folded.
Assume that the locale settings are the same... */
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ if (uc < 0x100)
+ ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
}
cl_and(data->start_class, &and_with);
@@ -842,11 +1016,12 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
flags &= ~SCF_DO_STCLASS;
}
else if (strchr((char*)PL_varies,OP(scan))) {
- I32 mincount, maxcount, minnext, deltanext, fl;
+ I32 mincount, maxcount, minnext, deltanext, fl = 0;
I32 f = flags, pos_before = 0;
regnode *oscan = scan;
struct regnode_charclass_class this_class;
struct regnode_charclass_class *oclass = NULL;
+ I32 next_is_eval = 0;
switch (PL_regkind[(U8)OP(scan)]) {
case WHILEM: /* End of (?:...)* . */
@@ -856,8 +1031,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
next = NEXTOPER(scan);
if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
- mincount = 1;
- maxcount = REG_INFTY;
+ mincount = 1;
+ maxcount = REG_INFTY;
next = regnext(scan);
scan = NEXTOPER(scan);
goto do_curly;
@@ -870,20 +1045,20 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
case STAR:
if (flags & SCF_DO_STCLASS) {
mincount = 0;
- maxcount = REG_INFTY;
+ maxcount = REG_INFTY;
next = regnext(scan);
scan = NEXTOPER(scan);
goto do_curly;
}
- is_inf = is_inf_internal = 1;
+ is_inf = is_inf_internal = 1;
scan = regnext(scan);
if (flags & SCF_DO_SUBSTR) {
- scan_commit(data); /* Cannot extend fixed substrings */
+ scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */
data->longest = &(data->longest_float);
}
goto optimize_curly_tail;
case CURLY:
- mincount = ARG1(scan);
+ mincount = ARG1(scan);
maxcount = ARG2(scan);
next = regnext(scan);
if (OP(scan) == CURLYX) {
@@ -892,9 +1067,10 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX);
}
scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
+ next_is_eval = (OP(scan) == EVAL);
do_curly:
if (flags & SCF_DO_SUBSTR) {
- if (mincount == 0) scan_commit(data); /* Cannot extend fixed substrings */
+ if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */
pos_before = data->pos_min;
}
if (data) {
@@ -904,7 +1080,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
data->flags |= SF_IS_INF;
}
if (flags & SCF_DO_STCLASS) {
- cl_init(&this_class);
+ cl_init(pRExC_state, &this_class);
oclass = data->start_class;
data->start_class = &this_class;
f |= SCF_DO_STCLASS_AND;
@@ -913,25 +1089,25 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
/* These are the cases when once a subexpression
fails at a particular position, it cannot succeed
even after backtracking at the enclosing scope.
-
+
XXXX what if minimal match and we are at the
initial run of {n,m}? */
if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
f &= ~SCF_WHILEM_VISITED_POS;
/* This will finish on WHILEM, setting scan, or on NULL: */
- minnext = study_chunk(&scan, &deltanext, last, data,
- mincount == 0
+ minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data,
+ mincount == 0
? (f & ~SCF_DO_SUBSTR) : f);
if (flags & SCF_DO_STCLASS)
data->start_class = oclass;
if (mincount == 0 || minnext == 0) {
if (flags & SCF_DO_STCLASS_OR) {
- cl_or(data->start_class, &this_class);
+ cl_or(pRExC_state, data->start_class, &this_class);
}
else if (flags & SCF_DO_STCLASS_AND) {
- /* Switch to OR mode: cache the old value of
+ /* Switch to OR mode: cache the old value of
* data->start_class */
StructCopy(data->start_class, &and_with,
struct regnode_charclass_class);
@@ -943,7 +1119,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
}
} else { /* Non-zero len */
if (flags & SCF_DO_STCLASS_OR) {
- cl_or(data->start_class, &this_class);
+ cl_or(pRExC_state, data->start_class, &this_class);
cl_and(data->start_class, &and_with);
}
else if (flags & SCF_DO_STCLASS_AND)
@@ -952,42 +1128,50 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
}
if (!scan) /* It was not CURLYX, but CURLY. */
scan = next;
- if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0)
- && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
+ if (ckWARN(WARN_REGEXP)
+ /* ? quantifier ok, except for (?{ ... }) */
+ && (next_is_eval || !(mincount == 0 && maxcount == 1))
+ && (minnext == 0) && (deltanext == 0)
+ && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
&& maxcount <= REG_INFTY/3) /* Complement check for big count */
{
- vWARN(PL_regcomp_parse,
+ vWARN(RExC_parse,
"Quantifier unexpected on zero-length expression");
}
min += minnext * mincount;
- is_inf_internal |= ((maxcount == REG_INFTY
+ is_inf_internal |= ((maxcount == REG_INFTY
&& (minnext + deltanext) > 0)
|| deltanext == I32_MAX);
is_inf |= is_inf_internal;
delta += (minnext + deltanext) * maxcount - minnext * mincount;
/* Try powerful optimization CURLYX => CURLYN. */
- if ( OP(oscan) == CURLYX && data
+ if ( OP(oscan) == CURLYX && data
&& data->flags & SF_IN_PAR
&& !(data->flags & SF_HAS_EVAL)
&& !deltanext && minnext == 1 ) {
/* Try to optimize to CURLYN. */
regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
- regnode *nxt1 = nxt, *nxt2;
+ regnode *nxt1 = nxt;
+#ifdef DEBUGGING
+ regnode *nxt2;
+#endif
/* Skip open. */
nxt = regnext(nxt);
if (!strchr((char*)PL_simple,OP(nxt))
&& !(PL_regkind[(U8)OP(nxt)] == EXACT
- && STR_LEN(nxt) == 1))
+ && STR_LEN(nxt) == 1))
goto nogo;
+#ifdef DEBUGGING
nxt2 = nxt;
+#endif
nxt = regnext(nxt);
- if (OP(nxt) != CLOSE)
+ if (OP(nxt) != CLOSE)
goto nogo;
/* Now we know that nxt2 is the only contents: */
- oscan->flags = ARG(nxt);
+ oscan->flags = (U8)ARG(nxt);
OP(oscan) = CURLYN;
OP(nxt1) = NOTHING; /* was OPEN. */
#ifdef DEBUGGING
@@ -997,12 +1181,12 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
OP(nxt) = OPTIMIZED; /* was CLOSE. */
OP(nxt + 1) = OPTIMIZED; /* was count. */
NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
-#endif
+#endif
}
nogo:
/* Try optimization CURLYX => CURLYM. */
- if ( OP(oscan) == CURLYX && data
+ if ( OP(oscan) == CURLYX && data
&& !(data->flags & SF_HAS_PAR)
&& !(data->flags & SF_HAS_EVAL)
&& !deltanext ) {
@@ -1013,7 +1197,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
OP(oscan) = CURLYM;
while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
- && (OP(nxt2) != WHILEM))
+ && (OP(nxt2) != WHILEM))
nxt = nxt2;
OP(nxt2) = SUCCEED; /* Whas WHILEM */
/* Need to optimize away parenths. */
@@ -1021,9 +1205,9 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
/* Set the parenth number. */
regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
- if (OP(nxt) != CLOSE)
+ if (OP(nxt) != CLOSE)
FAIL("Panic opt close");
- oscan->flags = ARG(nxt);
+ oscan->flags = (U8)ARG(nxt);
OP(nxt1) = OPTIMIZED; /* was OPEN. */
OP(nxt) = OPTIMIZED; /* was CLOSE. */
#ifdef DEBUGGING
@@ -1031,11 +1215,11 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
OP(nxt + 1) = OPTIMIZED; /* was count. */
NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
-#endif
+#endif
#if 0
while ( nxt1 && (OP(nxt1) != WHILEM)) {
regnode *nnxt = regnext(nxt1);
-
+
if (nnxt == nxt) {
if (reg_off_by_arg[OP(nxt1)])
ARG_SET(nxt1, nxt2 - nxt1);
@@ -1048,7 +1232,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
}
#endif
/* Optimize again: */
- study_chunk(&nxt1, &deltanext, nxt, NULL, 0);
+ study_chunk(pRExC_state, &nxt1, &deltanext, nxt,
+ NULL, 0);
}
else
oscan->flags = 0;
@@ -1066,21 +1251,38 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
nxt += ARG(nxt);
- PREVOPER(nxt)->flags = data->whilem_c
- | (PL_reg_whilem_seen << 4); /* On WHILEM */
+ PREVOPER(nxt)->flags = (U8)(data->whilem_c
+ | (RExC_whilem_seen << 4)); /* On WHILEM */
}
- if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
+ if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
pars++;
if (flags & SCF_DO_SUBSTR) {
SV *last_str = Nullsv;
int counted = mincount != 0;
if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
- I32 b = pos_before >= data->last_start_min
+#if defined(SPARC64_GCC_WORKAROUND)
+ I32 b = 0;
+ STRLEN l = 0;
+ char *s = NULL;
+ I32 old = 0;
+
+ if (pos_before >= data->last_start_min)
+ b = pos_before;
+ else
+ b = data->last_start_min;
+
+ l = 0;
+ s = SvPV(data->last_found, l);
+ old = b - data->last_start_min;
+
+#else
+ I32 b = pos_before >= data->last_start_min
? pos_before : data->last_start_min;
STRLEN l;
char *s = SvPV(data->last_found, l);
I32 old = b - data->last_start_min;
+#endif
if (UTF)
old = utf8_hop((U8*)s, old) - (U8*)s;
@@ -1092,11 +1294,11 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
/* What was added is a constant string */
if (mincount > 1) {
SvGROW(last_str, (mincount * l) + 1);
- repeatcpy(SvPVX(last_str) + l,
+ repeatcpy(SvPVX(last_str) + l,
SvPVX(last_str), l, mincount - 1);
SvCUR(last_str) *= mincount;
/* Add additional parts. */
- SvCUR_set(data->last_found,
+ SvCUR_set(data->last_found,
SvCUR(data->last_found) - l);
sv_catsv(data->last_found, last_str);
data->last_end += l * (mincount - 1);
@@ -1115,14 +1317,14 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
if (mincount != maxcount) {
/* Cannot extend fixed substrings found inside
the group. */
- scan_commit(data);
+ scan_commit(pRExC_state,data);
if (mincount && last_str) {
sv_setsv(data->last_found, last_str);
data->last_end = data->pos_min;
- data->last_start_min =
+ data->last_start_min =
data->pos_min - CHR_SVLEN(last_str);
- data->last_start_max = is_inf
- ? I32_MAX
+ data->last_start_max = is_inf
+ ? I32_MAX
: data->pos_min + data->pos_delta
- CHR_SVLEN(last_str);
}
@@ -1141,21 +1343,21 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
continue;
default: /* REF and CLUMP only? */
if (flags & SCF_DO_SUBSTR) {
- scan_commit(data); /* Cannot expect anything... */
+ scan_commit(pRExC_state,data); /* Cannot expect anything... */
data->longest = &(data->longest_float);
}
is_inf = is_inf_internal = 1;
if (flags & SCF_DO_STCLASS_OR)
- cl_anything(data->start_class);
+ cl_anything(pRExC_state, data->start_class);
flags &= ~SCF_DO_STCLASS;
break;
}
}
- else if (strchr((char*)PL_simple,OP(scan)) || PL_regkind[(U8)OP(scan)] == ANYUTF8) {
- int value;
+ else if (strchr((char*)PL_simple,OP(scan))) {
+ int value = 0;
if (flags & SCF_DO_SUBSTR) {
- scan_commit(data);
+ scan_commit(pRExC_state,data);
data->pos_min++;
}
min++;
@@ -1165,25 +1367,12 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
/* Some of the logic below assumes that switching
locale on will only add false positives. */
switch (PL_regkind[(U8)OP(scan)]) {
- case ANYUTF8:
case SANY:
- case SANYUTF8:
- case ALNUMUTF8:
- case ANYOFUTF8:
- case ALNUMLUTF8:
- case NALNUMUTF8:
- case NALNUMLUTF8:
- case SPACEUTF8:
- case NSPACEUTF8:
- case SPACELUTF8:
- case NSPACELUTF8:
- case DIGITUTF8:
- case NDIGITUTF8:
default:
do_default:
/* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
- cl_anything(data->start_class);
+ cl_anything(pRExC_state, data->start_class);
break;
case REG_ANY:
if (OP(scan) == SANY)
@@ -1191,7 +1380,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
|| (data->start_class->flags & ANYOF_CLASS));
- cl_anything(data->start_class);
+ cl_anything(pRExC_state, data->start_class);
}
if (flags & SCF_DO_STCLASS_AND || !value)
ANYOF_BITMAP_CLEAR(data->start_class,'\n');
@@ -1201,7 +1390,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
cl_and(data->start_class,
(struct regnode_charclass_class*)scan);
else
- cl_or(data->start_class,
+ cl_or(pRExC_state, data->start_class,
(struct regnode_charclass_class*)scan);
break;
case ALNUM:
@@ -1219,7 +1408,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
else {
for (value = 0; value < 256; value++)
if (isALNUM(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
@@ -1248,7 +1437,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
else {
for (value = 0; value < 256; value++)
if (!isALNUM(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
@@ -1277,7 +1466,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
else {
for (value = 0; value < 256; value++)
if (isSPACE(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
@@ -1306,7 +1495,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
else {
for (value = 0; value < 256; value++)
if (!isSPACE(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
@@ -1337,7 +1526,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
else {
for (value = 0; value < 256; value++)
if (isDIGIT(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
@@ -1354,7 +1543,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
else {
for (value = 0; value < 256; value++)
if (!isDIGIT(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ ANYOF_BITMAP_SET(data->start_class, value);
}
}
break;
@@ -1380,7 +1569,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
int f = 0;
data_fake.flags = 0;
- if (data) {
+ if (data) {
data_fake.whilem_c = data->whilem_c;
data_fake.last_closep = data->last_closep;
}
@@ -1388,7 +1577,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
data_fake.last_closep = &fake;
if ( flags & SCF_DO_STCLASS && !scan->flags
&& OP(scan) == IFMATCH ) { /* Lookahead */
- cl_init(&intrnl);
+ cl_init(pRExC_state, &intrnl);
data_fake.start_class = &intrnl;
f |= SCF_DO_STCLASS_AND;
}
@@ -1396,7 +1585,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
f |= SCF_WHILEM_VISITED_POS;
next = regnext(scan);
nscan = NEXTOPER(NEXTOPER(scan));
- minnext = study_chunk(&nscan, &deltanext, last, &data_fake, f);
+ minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f);
if (scan->flags) {
if (deltanext) {
vFAIL("Variable length lookbehind not implemented");
@@ -1404,7 +1593,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
else if (minnext > U8_MAX) {
vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
}
- scan->flags = minnext;
+ scan->flags = (U8)minnext;
}
if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
pars++;
@@ -1424,7 +1613,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
pars++;
}
else if (OP(scan) == CLOSE) {
- if (ARG(scan) == is_par) {
+ if ((I32)ARG(scan) == is_par) {
next = regnext(scan);
if ( next && (OP(next) != WHILEM) && next < last)
@@ -1439,12 +1628,12 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
}
else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */
if (flags & SCF_DO_SUBSTR) {
- scan_commit(data);
+ scan_commit(pRExC_state,data);
data->longest = &(data->longest_float);
}
is_inf = is_inf_internal = 1;
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
- cl_anything(data->start_class);
+ cl_anything(pRExC_state, data->start_class);
flags &= ~SCF_DO_STCLASS;
}
/* Else: zero-length, ignore. */
@@ -1454,7 +1643,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
finish:
*scanp = scan;
*deltap = is_inf_internal ? I32_MAX : delta;
- if (flags & SCF_DO_SUBSTR && is_inf)
+ if (flags & SCF_DO_SUBSTR && is_inf)
data->pos_delta = I32_MAX - data->pos_min;
if (is_par > U8_MAX)
is_par = 0;
@@ -1472,23 +1661,23 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da
}
STATIC I32
-S_add_data(pTHX_ I32 n, char *s)
+S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
{
- if (PL_regcomp_rx->data) {
- Renewc(PL_regcomp_rx->data,
- sizeof(*PL_regcomp_rx->data) + sizeof(void*) * (PL_regcomp_rx->data->count + n - 1),
+ if (RExC_rx->data) {
+ Renewc(RExC_rx->data,
+ sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
char, struct reg_data);
- Renew(PL_regcomp_rx->data->what, PL_regcomp_rx->data->count + n, U8);
- PL_regcomp_rx->data->count += n;
+ Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8);
+ RExC_rx->data->count += n;
}
else {
- Newc(1207, PL_regcomp_rx->data, sizeof(*PL_regcomp_rx->data) + sizeof(void*) * (n - 1),
+ Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1),
char, struct reg_data);
- New(1208, PL_regcomp_rx->data->what, n, U8);
- PL_regcomp_rx->data->count = n;
+ New(1208, RExC_rx->data->what, n, U8);
+ RExC_rx->data->count = n;
}
- Copy(s, PL_regcomp_rx->data->what + PL_regcomp_rx->data->count - n, n, U8);
- return PL_regcomp_rx->data->count - n;
+ Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8);
+ return RExC_rx->data->count - n;
}
void
@@ -1496,7 +1685,7 @@ Perl_reginitcolors(pTHX)
{
int i = 0;
char *s = PerlEnv_getenv("PERL_RE_COLORS");
-
+
if (s) {
PL_colors[0] = s = savepv(s);
while (++i < 6) {
@@ -1509,7 +1698,7 @@ Perl_reginitcolors(pTHX)
PL_colors[i] = s = "";
}
} else {
- while (i < 6)
+ while (i < 6)
PL_colors[i++] = "";
}
PL_colorset = 1;
@@ -1542,99 +1731,111 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
I32 sawplus = 0;
I32 sawopen = 0;
scan_data_t data;
+ RExC_state_t RExC_state;
+ RExC_state_t *pRExC_state = &RExC_state;
if (exp == NULL)
FAIL("NULL regexp argument");
- if (pm->op_pmdynflags & PMdf_UTF8) {
- PL_reg_flags |= RF_utf8;
- }
- else
- PL_reg_flags = 0;
+ RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
- PL_regprecomp = exp;
- DEBUG_r(if (!PL_colorset) reginitcolors());
- DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
- PL_colors[4],PL_colors[5],PL_colors[0],
- (int)(xend - exp), PL_regprecomp, PL_colors[1]));
- PL_regflags = pm->op_pmflags;
- PL_regsawback = 0;
+ RExC_precomp = exp;
+ DEBUG_r({
+ if (!PL_colorset) reginitcolors();
+ PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
+ PL_colors[4],PL_colors[5],PL_colors[0],
+ (int)(xend - exp), RExC_precomp, PL_colors[1]);
+ });
+ RExC_flags = pm->op_pmflags;
+ RExC_sawback = 0;
- PL_regseen = 0;
- PL_seen_zerolen = *exp == '^' ? -1 : 0;
- PL_seen_evals = 0;
- PL_extralen = 0;
+ RExC_seen = 0;
+ RExC_seen_zerolen = *exp == '^' ? -1 : 0;
+ RExC_seen_evals = 0;
+ RExC_extralen = 0;
/* First pass: determine size, legality. */
- PL_regcomp_parse = exp;
- PL_regxend = xend;
- PL_regnaughty = 0;
- PL_regnpar = 1;
- PL_regsize = 0L;
- PL_regcode = &PL_regdummy;
- PL_reg_whilem_seen = 0;
+ RExC_parse = exp;
+ RExC_start = exp;
+ RExC_end = xend;
+ RExC_naughty = 0;
+ RExC_npar = 1;
+ RExC_size = 0L;
+ RExC_emit = &PL_regdummy;
+ RExC_whilem_seen = 0;
#if 0 /* REGC() is (currently) a NOP at the first pass.
* Clever compilers notice this and complain. --jhi */
- REGC((U8)REG_MAGIC, (char*)PL_regcode);
+ REGC((U8)REG_MAGIC, (char*)RExC_emit);
#endif
- if (reg(0, &flags) == NULL) {
- PL_regprecomp = Nullch;
+ if (reg(pRExC_state, 0, &flags) == NULL) {
+ RExC_precomp = Nullch;
return(NULL);
}
- DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)PL_regsize));
+ DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
/* Small enough for pointer-storage convention?
If extralen==0, this means that we will not need long jumps. */
- if (PL_regsize >= 0x10000L && PL_extralen)
- PL_regsize += PL_extralen;
+ if (RExC_size >= 0x10000L && RExC_extralen)
+ RExC_size += RExC_extralen;
else
- PL_extralen = 0;
- if (PL_reg_whilem_seen > 15)
- PL_reg_whilem_seen = 15;
+ RExC_extralen = 0;
+ if (RExC_whilem_seen > 15)
+ RExC_whilem_seen = 15;
/* Allocate space and initialize. */
- Newc(1001, r, sizeof(regexp) + (unsigned)PL_regsize * sizeof(regnode),
+ Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode),
char, regexp);
if (r == NULL)
FAIL("Regexp out of space");
#ifdef DEBUGGING
/* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
- Zero(r, sizeof(regexp) + (unsigned)PL_regsize * sizeof(regnode), char);
+ Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char);
#endif
r->refcnt = 1;
r->prelen = xend - exp;
- r->precomp = savepvn(PL_regprecomp, r->prelen);
+ r->precomp = savepvn(RExC_precomp, r->prelen);
r->subbeg = NULL;
r->reganch = pm->op_pmflags & PMf_COMPILETIME;
- r->nparens = PL_regnpar - 1; /* set early to validate backrefs */
+ r->nparens = RExC_npar - 1; /* set early to validate backrefs */
r->substrs = 0; /* Useful during FAIL. */
r->startp = 0; /* Useful during FAIL. */
r->endp = 0; /* Useful during FAIL. */
- PL_regcomp_rx = r;
+ Newz(1304, r->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
+ if (r->offsets) {
+ r->offsets[0] = RExC_size;
+ }
+ DEBUG_r(PerlIO_printf(Perl_debug_log,
+ "%s %"UVuf" bytes for offset annotations.\n",
+ r->offsets ? "Got" : "Couldn't get",
+ (UV)((2*RExC_size+1) * sizeof(U32))));
+
+ RExC_rx = r;
/* Second pass: emit code. */
- PL_regcomp_parse = exp;
- PL_regxend = xend;
- PL_regnaughty = 0;
- PL_regnpar = 1;
- PL_regcode = r->program;
+ RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
+ RExC_parse = exp;
+ RExC_end = xend;
+ RExC_naughty = 0;
+ RExC_npar = 1;
+ RExC_emit_start = r->program;
+ RExC_emit = r->program;
/* Store the count of eval-groups for security checks: */
- PL_regcode->next_off = ((PL_seen_evals > U16_MAX) ? U16_MAX : PL_seen_evals);
- REGC((U8)REG_MAGIC, (char*) PL_regcode++);
+ RExC_emit->next_off = (U16)((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals);
+ REGC((U8)REG_MAGIC, (char*) RExC_emit++);
r->data = 0;
- if (reg(0, &flags) == NULL)
+ if (reg(pRExC_state, 0, &flags) == NULL)
return(NULL);
/* Dig out information for optimizations. */
r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */
- pm->op_pmflags = PL_regflags;
+ pm->op_pmflags = RExC_flags;
if (UTF)
- r->reganch |= ROPT_UTF8;
+ r->reganch |= ROPT_UTF8; /* Unicode in it? */
r->regstclass = NULL;
- if (PL_regnaughty >= 10) /* Probably an expensive pattern. */
+ if (RExC_naughty >= 10) /* Probably an expensive pattern. */
r->reganch |= ROPT_NAUGHTY;
scan = r->program + 1; /* First BRANCH. */
@@ -1670,9 +1871,9 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
/* Starting-point info. */
again:
if (PL_regkind[(U8)OP(first)] == EXACT) {
- if (OP(first) == EXACT); /* Empty, get anchored substr later. */
- else if ((OP(first) == EXACTF || OP(first) == EXACTFL)
- && !UTF)
+ if (OP(first) == EXACT)
+ ; /* Empty, get anchored substr later. */
+ else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
r->regstclass = first;
}
else if (strchr((char*)PL_simple,OP(first)))
@@ -1694,14 +1895,14 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
first = NEXTOPER(first);
goto again;
}
- else if ((OP(first) == STAR &&
+ else if (!sawopen && (OP(first) == STAR &&
PL_regkind[(U8)OP(NEXTOPER(first))] == REG_ANY) &&
!(r->reganch & ROPT_ANCH) )
{
/* turn .* into ^.* with an implied $*=1 */
int type = OP(NEXTOPER(first));
- if (type == REG_ANY || type == ANYUTF8)
+ if (type == REG_ANY)
type = ROPT_ANCH_MBOL;
else
type = ROPT_ANCH_SBOL;
@@ -1710,13 +1911,13 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
first = NEXTOPER(first);
goto again;
}
- if (sawplus && (!sawopen || !PL_regsawback)
- && !(PL_regseen & REG_SEEN_EVAL)) /* May examine pos and $& */
+ if (sawplus && (!sawopen || !RExC_sawback)
+ && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
/* x+ must match at the 1st pos of run of x's */
r->reganch |= ROPT_SKIP;
/* Scan is after the zeroth branch, first is atomic matcher. */
- DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
+ DEBUG_r(PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
(IV)(first - scan + 1)));
/*
* If there's something expensive in the r.e., find the
@@ -1737,28 +1938,28 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
data.longest = &(data.longest_fixed);
first = scan;
if (!r->regstclass) {
- cl_init(&ch_class);
+ cl_init(pRExC_state, &ch_class);
data.start_class = &ch_class;
stclass_flag = SCF_DO_STCLASS_AND;
} else /* XXXX Check for BOUND? */
stclass_flag = 0;
data.last_closep = &last_close;
- minlen = study_chunk(&first, &fake, scan + PL_regsize, /* Up to end */
+ minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */
&data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag);
- if ( PL_regnpar == 1 && data.longest == &(data.longest_fixed)
- && data.last_start_min == 0 && data.last_end > 0
- && !PL_seen_zerolen
- && (!(PL_regseen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
+ if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
+ && data.last_start_min == 0 && data.last_end > 0
+ && !RExC_seen_zerolen
+ && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS)))
r->reganch |= ROPT_CHECK_ALL;
- scan_commit(&data);
+ scan_commit(pRExC_state, &data);
SvREFCNT_dec(data.last_found);
longest_float_length = CHR_SVLEN(data.longest_float);
if (longest_float_length
|| (data.flags & SF_FL_BEFORE_EOL
&& (!(data.flags & SF_FL_BEFORE_MEOL)
- || (PL_regflags & PMf_MULTILINE)))) {
+ || (RExC_flags & PMf_MULTILINE)))) {
int t;
if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
@@ -1766,17 +1967,23 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
&& SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
goto remove_float; /* As in (a)+. */
- r->float_substr = data.longest_float;
+ if (SvUTF8(data.longest_float)) {
+ r->float_utf8 = data.longest_float;
+ r->float_substr = Nullsv;
+ } else {
+ r->float_substr = data.longest_float;
+ r->float_utf8 = Nullsv;
+ }
r->float_min_offset = data.offset_float_min;
r->float_max_offset = data.offset_float_max;
t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
&& (!(data.flags & SF_FL_BEFORE_MEOL)
- || (PL_regflags & PMf_MULTILINE)));
- fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0);
+ || (RExC_flags & PMf_MULTILINE)));
+ fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
}
else {
remove_float:
- r->float_substr = Nullsv;
+ r->float_substr = r->float_utf8 = Nullsv;
SvREFCNT_dec(data.longest_float);
longest_float_length = 0;
}
@@ -1785,61 +1992,71 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
if (longest_fixed_length
|| (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
&& (!(data.flags & SF_FIX_BEFORE_MEOL)
- || (PL_regflags & PMf_MULTILINE)))) {
+ || (RExC_flags & PMf_MULTILINE)))) {
int t;
- r->anchored_substr = data.longest_fixed;
+ if (SvUTF8(data.longest_fixed)) {
+ r->anchored_utf8 = data.longest_fixed;
+ r->anchored_substr = Nullsv;
+ } else {
+ r->anchored_substr = data.longest_fixed;
+ r->anchored_utf8 = Nullsv;
+ }
r->anchored_offset = data.offset_fixed;
t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
&& (!(data.flags & SF_FIX_BEFORE_MEOL)
- || (PL_regflags & PMf_MULTILINE)));
- fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0);
+ || (RExC_flags & PMf_MULTILINE)));
+ fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
}
else {
- r->anchored_substr = Nullsv;
+ r->anchored_substr = r->anchored_utf8 = Nullsv;
SvREFCNT_dec(data.longest_fixed);
longest_fixed_length = 0;
}
- if (r->regstclass
- && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == ANYUTF8
- || OP(r->regstclass) == SANYUTF8 || OP(r->regstclass) == SANY))
+ if (r->regstclass
+ && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
r->regstclass = NULL;
- if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
+ if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
+ && stclass_flag
&& !(data.start_class->flags & ANYOF_EOS)
- && !cl_is_anything(data.start_class)) {
- SV *sv;
- I32 n = add_data(1, "f");
+ && !cl_is_anything(data.start_class))
+ {
+ I32 n = add_data(pRExC_state, 1, "f");
- New(1006, PL_regcomp_rx->data->data[n], 1,
+ New(1006, RExC_rx->data->data[n], 1,
struct regnode_charclass_class);
StructCopy(data.start_class,
- (struct regnode_charclass_class*)PL_regcomp_rx->data->data[n],
+ (struct regnode_charclass_class*)RExC_rx->data->data[n],
struct regnode_charclass_class);
- r->regstclass = (regnode*)PL_regcomp_rx->data->data[n];
+ r->regstclass = (regnode*)RExC_rx->data->data[n];
r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
- DEBUG_r((sv = sv_newmortal(),
- regprop(sv, (regnode*)data.start_class),
- PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
- SvPVX(sv))));
+ PL_regdata = r->data; /* for regprop() */
+ DEBUG_r({ SV *sv = sv_newmortal();
+ regprop(sv, (regnode*)data.start_class);
+ PerlIO_printf(Perl_debug_log,
+ "synthetic stclass `%s'.\n",
+ SvPVX(sv));});
}
/* A temporary algorithm prefers floated substr to fixed one to dig more info. */
if (longest_fixed_length > longest_float_length) {
r->check_substr = r->anchored_substr;
+ r->check_utf8 = r->anchored_utf8;
r->check_offset_min = r->check_offset_max = r->anchored_offset;
if (r->reganch & ROPT_ANCH_SINGLE)
r->reganch |= ROPT_NOSCAN;
}
else {
r->check_substr = r->float_substr;
+ r->check_utf8 = r->float_utf8;
r->check_offset_min = data.offset_float_min;
r->check_offset_max = data.offset_float_max;
}
/* XXXX Currently intuiting is not compatible with ANCH_GPOS.
This should be changed ASAP! */
- if (r->check_substr && !(r->reganch & ROPT_ANCH_GPOS)) {
+ if ((r->check_substr || r->check_utf8) && !(r->reganch & ROPT_ANCH_GPOS)) {
r->reganch |= RE_USE_INTUIT;
- if (SvTAIL(r->check_substr))
+ if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
r->reganch |= RE_INTUIT_TAIL;
}
}
@@ -1851,40 +2068,44 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
DEBUG_r(PerlIO_printf(Perl_debug_log, "\n"));
scan = r->program + 1;
- cl_init(&ch_class);
+ cl_init(pRExC_state, &ch_class);
data.start_class = &ch_class;
data.last_closep = &last_close;
- minlen = study_chunk(&scan, &fake, scan + PL_regsize, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
- r->check_substr = r->anchored_substr = r->float_substr = Nullsv;
+ minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS);
+ r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
+ = r->float_substr = r->float_utf8 = Nullsv;
if (!(data.start_class->flags & ANYOF_EOS)
- && !cl_is_anything(data.start_class)) {
- SV *sv;
- I32 n = add_data(1, "f");
+ && !cl_is_anything(data.start_class))
+ {
+ I32 n = add_data(pRExC_state, 1, "f");
- New(1006, PL_regcomp_rx->data->data[n], 1,
+ New(1006, RExC_rx->data->data[n], 1,
struct regnode_charclass_class);
StructCopy(data.start_class,
- (struct regnode_charclass_class*)PL_regcomp_rx->data->data[n],
+ (struct regnode_charclass_class*)RExC_rx->data->data[n],
struct regnode_charclass_class);
- r->regstclass = (regnode*)PL_regcomp_rx->data->data[n];
+ r->regstclass = (regnode*)RExC_rx->data->data[n];
r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
- DEBUG_r((sv = sv_newmortal(),
- regprop(sv, (regnode*)data.start_class),
- PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
- SvPVX(sv))));
+ DEBUG_r({ SV* sv = sv_newmortal();
+ regprop(sv, (regnode*)data.start_class);
+ PerlIO_printf(Perl_debug_log,
+ "synthetic stclass `%s'.\n",
+ SvPVX(sv));});
}
}
r->minlen = minlen;
- if (PL_regseen & REG_SEEN_GPOS)
+ if (RExC_seen & REG_SEEN_GPOS)
r->reganch |= ROPT_GPOS_SEEN;
- if (PL_regseen & REG_SEEN_LOOKBEHIND)
+ if (RExC_seen & REG_SEEN_LOOKBEHIND)
r->reganch |= ROPT_LOOKBEHIND_SEEN;
- if (PL_regseen & REG_SEEN_EVAL)
+ if (RExC_seen & REG_SEEN_EVAL)
r->reganch |= ROPT_EVAL_SEEN;
- Newz(1002, r->startp, PL_regnpar, I32);
- Newz(1002, r->endp, PL_regnpar, I32);
- PL_regdata = r->data; /* for regprop() ANYOFUTF8 */
+ if (RExC_seen & REG_SEEN_CANY)
+ r->reganch |= ROPT_CANY_SEEN;
+ Newz(1002, r->startp, RExC_npar, I32);
+ Newz(1002, r->endp, RExC_npar, I32);
+ PL_regdata = r->data; /* for regprop() */
DEBUG_r(regdump(r));
return(r);
}
@@ -1899,7 +2120,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
* follows makes it hard to avoid.
*/
STATIC regnode *
-S_reg(pTHX_ I32 paren, I32 *flagp)
+S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
{
register regnode *ret; /* Will be the head of the group. */
@@ -1907,261 +2128,321 @@ S_reg(pTHX_ I32 paren, I32 *flagp)
register regnode *lastbr;
register regnode *ender = 0;
register I32 parno = 0;
- I32 flags, oregflags = PL_regflags, have_branch = 0, open = 0;
- char *oregcomp_parse = PL_regcomp_parse;
+ I32 flags, oregflags = RExC_flags, have_branch = 0, open = 0;
+
+ /* for (?g), (?gc), and (?o) warnings; warning
+ about (?c) will warn about (?g) -- japhy */
+
+ I32 wastedflags = 0x00,
+ wasted_o = 0x01,
+ wasted_g = 0x02,
+ wasted_gc = 0x02 | 0x04,
+ wasted_c = 0x04;
+
+ char * parse_start = RExC_parse; /* MJD */
+ char *oregcomp_parse = RExC_parse;
char c;
*flagp = 0; /* Tentatively. */
+
/* Make an OPEN node, if parenthesized. */
if (paren) {
- if (*PL_regcomp_parse == '?') {
- U16 posflags = 0, negflags = 0;
- U16 *flagsp = &posflags;
+ if (*RExC_parse == '?') { /* (?...) */
+ U32 posflags = 0, negflags = 0;
+ U32 *flagsp = &posflags;
int logical = 0;
- char *seqstart = PL_regcomp_parse;
+ char *seqstart = RExC_parse;
- PL_regcomp_parse++;
- paren = *PL_regcomp_parse++;
+ RExC_parse++;
+ paren = *RExC_parse++;
ret = NULL; /* For look-ahead/behind. */
switch (paren) {
- case '<':
- PL_regseen |= REG_SEEN_LOOKBEHIND;
- if (*PL_regcomp_parse == '!')
+ case '<': /* (?<...) */
+ RExC_seen |= REG_SEEN_LOOKBEHIND;
+ if (*RExC_parse == '!')
paren = ',';
- if (*PL_regcomp_parse != '=' && *PL_regcomp_parse != '!')
+ if (*RExC_parse != '=' && *RExC_parse != '!')
goto unknown;
- PL_regcomp_parse++;
- case '=':
- case '!':
- PL_seen_zerolen++;
- case ':':
- case '>':
+ RExC_parse++;
+ case '=': /* (?=...) */
+ case '!': /* (?!...) */
+ RExC_seen_zerolen++;
+ case ':': /* (?:...) */
+ case '>': /* (?>...) */
break;
- case '$':
- case '@':
+ case '$': /* (?$...) */
+ case '@': /* (?@...) */
vFAIL2("Sequence (?%c...) not implemented", (int)paren);
break;
- case '#':
- while (*PL_regcomp_parse && *PL_regcomp_parse != ')')
- PL_regcomp_parse++;
- if (*PL_regcomp_parse != ')')
+ case '#': /* (?#...) */
+ while (*RExC_parse && *RExC_parse != ')')
+ RExC_parse++;
+ if (*RExC_parse != ')')
FAIL("Sequence (?#... not terminated");
- nextchar();
+ nextchar(pRExC_state);
*flagp = TRYAGAIN;
return NULL;
- case 'p':
- if (SIZE_ONLY)
- vWARN(PL_regcomp_parse, "(?p{}) is deprecated - use (??{})");
+ case 'p': /* (?p...) */
+ if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
+ vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
/* FALL THROUGH*/
- case '?':
+ case '?': /* (??...) */
logical = 1;
- paren = *PL_regcomp_parse++;
+ if (*RExC_parse != '{')
+ goto unknown;
+ paren = *RExC_parse++;
/* FALL THROUGH */
- case '{':
+ case '{': /* (?{...}) */
{
I32 count = 1, n = 0;
char c;
- char *s = PL_regcomp_parse;
+ char *s = RExC_parse;
SV *sv;
OP_4tree *sop, *rop;
- PL_seen_zerolen++;
- PL_regseen |= REG_SEEN_EVAL;
- while (count && (c = *PL_regcomp_parse)) {
- if (c == '\\' && PL_regcomp_parse[1])
- PL_regcomp_parse++;
- else if (c == '{')
+ RExC_seen_zerolen++;
+ RExC_seen |= REG_SEEN_EVAL;
+ while (count && (c = *RExC_parse)) {
+ if (c == '\\' && RExC_parse[1])
+ RExC_parse++;
+ else if (c == '{')
count++;
- else if (c == '}')
+ else if (c == '}')
count--;
- PL_regcomp_parse++;
+ RExC_parse++;
}
- if (*PL_regcomp_parse != ')')
+ if (*RExC_parse != ')')
{
- PL_regcomp_parse = s;
+ RExC_parse = s;
vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
}
if (!SIZE_ONLY) {
AV *av;
-
- if (PL_regcomp_parse - 1 - s)
- sv = newSVpvn(s, PL_regcomp_parse - 1 - s);
+
+ if (RExC_parse - 1 - s)
+ sv = newSVpvn(s, RExC_parse - 1 - s);
else
sv = newSVpvn("", 0);
ENTER;
Perl_save_re_context(aTHX);
rop = sv_compile_2op(sv, &sop, "re", &av);
+ sop->op_private |= OPpREFCOUNTED;
+ /* re_dup will OpREFCNT_inc */
+ OpREFCNT_set(sop, 1);
LEAVE;
- n = add_data(3, "nop");
- PL_regcomp_rx->data->data[n] = (void*)rop;
- PL_regcomp_rx->data->data[n+1] = (void*)sop;
- PL_regcomp_rx->data->data[n+2] = (void*)av;
+ n = add_data(pRExC_state, 3, "nop");
+ RExC_rx->data->data[n] = (void*)rop;
+ RExC_rx->data->data[n+1] = (void*)sop;
+ RExC_rx->data->data[n+2] = (void*)av;
SvREFCNT_dec(sv);
}
else { /* First pass */
- if (PL_reginterp_cnt < ++PL_seen_evals
+ if (PL_reginterp_cnt < ++RExC_seen_evals
&& PL_curcop != &PL_compiling)
/* No compiled RE interpolated, has runtime
components ===> unsafe. */
FAIL("Eval-group not allowed at runtime, use re 'eval'");
- if (PL_tainted)
+ if (PL_tainting && PL_tainted)
FAIL("Eval-group in insecure regular expression");
}
- nextchar();
+ nextchar(pRExC_state);
if (logical) {
- ret = reg_node(LOGICAL);
+ ret = reg_node(pRExC_state, LOGICAL);
if (!SIZE_ONLY)
ret->flags = 2;
- regtail(ret, reganode(EVAL, n));
+ regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
+ /* deal with the length of this later - MJD */
return ret;
}
- return reganode(EVAL, n);
+ return reganode(pRExC_state, EVAL, n);
}
- case '(':
+ case '(': /* (?(?{...})...) and (?(?=...)...) */
{
- if (PL_regcomp_parse[0] == '?') {
- if (PL_regcomp_parse[1] == '=' || PL_regcomp_parse[1] == '!'
- || PL_regcomp_parse[1] == '<'
- || PL_regcomp_parse[1] == '{') { /* Lookahead or eval. */
+ if (RExC_parse[0] == '?') { /* (?(?...)) */
+ if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
+ || RExC_parse[1] == '<'
+ || RExC_parse[1] == '{') { /* Lookahead or eval. */
I32 flag;
- ret = reg_node(LOGICAL);
+ ret = reg_node(pRExC_state, LOGICAL);
if (!SIZE_ONLY)
ret->flags = 1;
- regtail(ret, reg(1, &flag));
+ regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag));
goto insert_if;
- }
+ }
}
- else if (PL_regcomp_parse[0] >= '1' && PL_regcomp_parse[0] <= '9' ) {
- parno = atoi(PL_regcomp_parse++);
-
- while (isDIGIT(*PL_regcomp_parse))
- PL_regcomp_parse++;
- ret = reganode(GROUPP, parno);
- if ((c = *nextchar()) != ')')
+ else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
+ /* (?(1)...) */
+ parno = atoi(RExC_parse++);
+
+ while (isDIGIT(*RExC_parse))
+ RExC_parse++;
+ ret = reganode(pRExC_state, GROUPP, parno);
+
+ if ((c = *nextchar(pRExC_state)) != ')')
vFAIL("Switch condition not recognized");
insert_if:
- regtail(ret, reganode(IFTHEN, 0));
- br = regbranch(&flags, 1);
+ regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
+ br = regbranch(pRExC_state, &flags, 1);
if (br == NULL)
- br = reganode(LONGJMP, 0);
+ br = reganode(pRExC_state, LONGJMP, 0);
else
- regtail(br, reganode(LONGJMP, 0));
- c = *nextchar();
+ regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
+ c = *nextchar(pRExC_state);
if (flags&HASWIDTH)
*flagp |= HASWIDTH;
if (c == '|') {
- lastbr = reganode(IFTHEN, 0); /* Fake one for optimizer. */
- regbranch(&flags, 1);
- regtail(ret, lastbr);
+ lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
+ regbranch(pRExC_state, &flags, 1);
+ regtail(pRExC_state, ret, lastbr);
if (flags&HASWIDTH)
*flagp |= HASWIDTH;
- c = *nextchar();
+ c = *nextchar(pRExC_state);
}
else
lastbr = NULL;
if (c != ')')
vFAIL("Switch (?(condition)... contains too many branches");
- ender = reg_node(TAIL);
- regtail(br, ender);
+ ender = reg_node(pRExC_state, TAIL);
+ regtail(pRExC_state, br, ender);
if (lastbr) {
- regtail(lastbr, ender);
- regtail(NEXTOPER(NEXTOPER(lastbr)), ender);
+ regtail(pRExC_state, lastbr, ender);
+ regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
}
else
- regtail(ret, ender);
+ regtail(pRExC_state, ret, ender);
return ret;
}
else {
- vFAIL2("Unknown switch condition (?(%.2s", PL_regcomp_parse);
+ vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
}
}
case 0:
- PL_regcomp_parse--; /* for vFAIL to print correctly */
+ RExC_parse--; /* for vFAIL to print correctly */
vFAIL("Sequence (? incomplete");
break;
default:
- --PL_regcomp_parse;
- parse_flags:
- while (*PL_regcomp_parse && strchr("iogcmsx", *PL_regcomp_parse)) {
- if (*PL_regcomp_parse != 'o')
- pmflag(flagsp, *PL_regcomp_parse);
- ++PL_regcomp_parse;
+ --RExC_parse;
+ parse_flags: /* (?i) */
+ while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) {
+ /* (?g), (?gc) and (?o) are useless here
+ and must be globally applied -- japhy */
+
+ if (*RExC_parse == 'o' || *RExC_parse == 'g') {
+ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
+ I32 wflagbit = *RExC_parse == 'o' ? wasted_o : wasted_g;
+ if (! (wastedflags & wflagbit) ) {
+ wastedflags |= wflagbit;
+ vWARN5(
+ RExC_parse + 1,
+ "Useless (%s%c) - %suse /%c modifier",
+ flagsp == &negflags ? "?-" : "?",
+ *RExC_parse,
+ flagsp == &negflags ? "don't " : "",
+ *RExC_parse
+ );
+ }
+ }
+ }
+ else if (*RExC_parse == 'c') {
+ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
+ if (! (wastedflags & wasted_c) ) {
+ wastedflags |= wasted_gc;
+ vWARN3(
+ RExC_parse + 1,
+ "Useless (%sc) - %suse /gc modifier",
+ flagsp == &negflags ? "?-" : "?",
+ flagsp == &negflags ? "don't " : ""
+ );
+ }
+ }
+ }
+ else { pmflag(flagsp, *RExC_parse); }
+
+ ++RExC_parse;
}
- if (*PL_regcomp_parse == '-') {
+ if (*RExC_parse == '-') {
flagsp = &negflags;
- ++PL_regcomp_parse;
+ wastedflags = 0; /* reset so (?g-c) warns twice */
+ ++RExC_parse;
goto parse_flags;
}
- PL_regflags |= posflags;
- PL_regflags &= ~negflags;
- if (*PL_regcomp_parse == ':') {
- PL_regcomp_parse++;
+ RExC_flags |= posflags;
+ RExC_flags &= ~negflags;
+ if (*RExC_parse == ':') {
+ RExC_parse++;
paren = ':';
break;
}
unknown:
- if (*PL_regcomp_parse != ')') {
- PL_regcomp_parse++;
- vFAIL3("Sequence (%.*s...) not recognized", PL_regcomp_parse-seqstart, seqstart);
+ if (*RExC_parse != ')') {
+ RExC_parse++;
+ vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
}
- nextchar();
+ nextchar(pRExC_state);
*flagp = TRYAGAIN;
return NULL;
}
}
- else {
- parno = PL_regnpar;
- PL_regnpar++;
- ret = reganode(OPEN, parno);
+ else { /* (...) */
+ parno = RExC_npar;
+ RExC_npar++;
+ ret = reganode(pRExC_state, OPEN, parno);
+ Set_Node_Length(ret, 1); /* MJD */
+ Set_Node_Offset(ret, RExC_parse); /* MJD */
open = 1;
}
}
- else
+ else /* ! paren */
ret = NULL;
/* Pick up the branches, linking them together. */
- br = regbranch(&flags, 1);
+ parse_start = RExC_parse; /* MJD */
+ br = regbranch(pRExC_state, &flags, 1);
+ /* branch_len = (paren != 0); */
+
if (br == NULL)
return(NULL);
- if (*PL_regcomp_parse == '|') {
- if (!SIZE_ONLY && PL_extralen) {
- reginsert(BRANCHJ, br);
+ if (*RExC_parse == '|') {
+ if (!SIZE_ONLY && RExC_extralen) {
+ reginsert(pRExC_state, BRANCHJ, br);
}
- else
- reginsert(BRANCH, br);
+ else { /* MJD */
+ reginsert(pRExC_state, BRANCH, br);
+ Set_Node_Length(br, paren != 0);
+ Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
+ }
have_branch = 1;
if (SIZE_ONLY)
- PL_extralen += 1; /* For BRANCHJ-BRANCH. */
+ RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
}
else if (paren == ':') {
*flagp |= flags&SIMPLE;
}
if (open) { /* Starts with OPEN. */
- regtail(ret, br); /* OPEN -> first. */
+ regtail(pRExC_state, ret, br); /* OPEN -> first. */
}
else if (paren != '?') /* Not Conditional */
ret = br;
- if (flags&HASWIDTH)
- *flagp |= HASWIDTH;
- *flagp |= flags&SPSTART;
+ *flagp |= flags & (SPSTART | HASWIDTH);
lastbr = br;
- while (*PL_regcomp_parse == '|') {
- if (!SIZE_ONLY && PL_extralen) {
- ender = reganode(LONGJMP,0);
- regtail(NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
+ while (*RExC_parse == '|') {
+ if (!SIZE_ONLY && RExC_extralen) {
+ ender = reganode(pRExC_state, LONGJMP,0);
+ regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
}
if (SIZE_ONLY)
- PL_extralen += 2; /* Account for LONGJMP. */
- nextchar();
- br = regbranch(&flags, 0);
+ RExC_extralen += 2; /* Account for LONGJMP. */
+ nextchar(pRExC_state);
+ br = regbranch(pRExC_state, &flags, 0);
+
if (br == NULL)
return(NULL);
- regtail(lastbr, br); /* BRANCH -> BRANCH. */
+ regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
lastbr = br;
if (flags&HASWIDTH)
*flagp |= HASWIDTH;
@@ -2172,10 +2453,12 @@ S_reg(pTHX_ I32 paren, I32 *flagp)
/* Make a closing node, and hook it on the end. */
switch (paren) {
case ':':
- ender = reg_node(TAIL);
+ ender = reg_node(pRExC_state, TAIL);
break;
case 1:
- ender = reganode(CLOSE, parno);
+ ender = reganode(pRExC_state, CLOSE, parno);
+ Set_Node_Offset(ender,RExC_parse+1); /* MJD */
+ Set_Node_Length(ender,1); /* MJD */
break;
case '<':
case ',':
@@ -2184,18 +2467,18 @@ S_reg(pTHX_ I32 paren, I32 *flagp)
*flagp &= ~HASWIDTH;
/* FALL THROUGH */
case '>':
- ender = reg_node(SUCCEED);
+ ender = reg_node(pRExC_state, SUCCEED);
break;
case 0:
- ender = reg_node(END);
+ ender = reg_node(pRExC_state, END);
break;
}
- regtail(lastbr, ender);
+ regtail(pRExC_state, lastbr, ender);
if (have_branch) {
/* Hook the tails of the branches to the closing node. */
for (br = ret; br != NULL; br = regnext(br)) {
- regoptail(br, ender);
+ regoptail(pRExC_state, br, ender);
}
}
}
@@ -2205,28 +2488,28 @@ S_reg(pTHX_ I32 paren, I32 *flagp)
static char parens[] = "=!<,>";
if (paren && (p = strchr(parens, paren))) {
- int node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
+ U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
int flag = (p - parens) > 1;
if (paren == '>')
node = SUSPEND, flag = 0;
- reginsert(node,ret);
+ reginsert(pRExC_state, node,ret);
ret->flags = flag;
- regtail(ret, reg_node(TAIL));
+ regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL));
}
}
/* Check for proper termination. */
if (paren) {
- PL_regflags = oregflags;
- if (PL_regcomp_parse >= PL_regxend || *nextchar() != ')') {
- PL_regcomp_parse = oregcomp_parse;
+ RExC_flags = oregflags;
+ if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
+ RExC_parse = oregcomp_parse;
vFAIL("Unmatched (");
}
}
- else if (!paren && PL_regcomp_parse < PL_regxend) {
- if (*PL_regcomp_parse == ')') {
- PL_regcomp_parse++;
+ else if (!paren && RExC_parse < RExC_end) {
+ if (*RExC_parse == ')') {
+ RExC_parse++;
vFAIL("Unmatched )");
}
else
@@ -2243,32 +2526,34 @@ S_reg(pTHX_ I32 paren, I32 *flagp)
* Implements the concatenation operator.
*/
STATIC regnode *
-S_regbranch(pTHX_ I32 *flagp, I32 first)
+S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
{
register regnode *ret;
register regnode *chain = NULL;
register regnode *latest;
I32 flags = 0, c = 0;
- if (first)
+ if (first)
ret = NULL;
else {
- if (!SIZE_ONLY && PL_extralen)
- ret = reganode(BRANCHJ,0);
- else
- ret = reg_node(BRANCH);
+ if (!SIZE_ONLY && RExC_extralen)
+ ret = reganode(pRExC_state, BRANCHJ,0);
+ else {
+ ret = reg_node(pRExC_state, BRANCH);
+ Set_Node_Length(ret, 1);
+ }
}
- if (!first && SIZE_ONLY)
- PL_extralen += 1; /* BRANCHJ */
-
+ if (!first && SIZE_ONLY)
+ RExC_extralen += 1; /* BRANCHJ */
+
*flagp = WORST; /* Tentatively. */
- PL_regcomp_parse--;
- nextchar();
- while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != '|' && *PL_regcomp_parse != ')') {
+ RExC_parse--;
+ nextchar(pRExC_state);
+ while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
flags &= ~TRYAGAIN;
- latest = regpiece(&flags);
+ latest = regpiece(pRExC_state, &flags);
if (latest == NULL) {
if (flags & TRYAGAIN)
continue;
@@ -2280,14 +2565,14 @@ S_regbranch(pTHX_ I32 *flagp, I32 first)
if (chain == NULL) /* First piece. */
*flagp |= flags&SPSTART;
else {
- PL_regnaughty++;
- regtail(chain, latest);
+ RExC_naughty++;
+ regtail(pRExC_state, chain, latest);
}
chain = latest;
c++;
}
if (chain == NULL) { /* Loop ran zero times. */
- chain = reg_node(NOTHING);
+ chain = reg_node(pRExC_state, NOTHING);
if (ret == NULL)
ret = chain;
}
@@ -2308,28 +2593,30 @@ S_regbranch(pTHX_ I32 *flagp, I32 first)
* endmarker role is not redundant.
*/
STATIC regnode *
-S_regpiece(pTHX_ I32 *flagp)
+S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
{
register regnode *ret;
register char op;
register char *next;
I32 flags;
- char *origparse = PL_regcomp_parse;
+ char *origparse = RExC_parse;
char *maxpos;
I32 min;
I32 max = REG_INFTY;
+ char *parse_start;
- ret = regatom(&flags);
+ ret = regatom(pRExC_state, &flags);
if (ret == NULL) {
if (flags & TRYAGAIN)
*flagp |= TRYAGAIN;
return(NULL);
}
- op = *PL_regcomp_parse;
+ op = *RExC_parse;
- if (op == '{' && regcurly(PL_regcomp_parse)) {
- next = PL_regcomp_parse + 1;
+ if (op == '{' && regcurly(RExC_parse)) {
+ parse_start = RExC_parse; /* MJD */
+ next = RExC_parse + 1;
maxpos = Nullch;
while (isDIGIT(*next) || *next == ',') {
if (*next == ',') {
@@ -2343,42 +2630,49 @@ S_regpiece(pTHX_ I32 *flagp)
if (*next == '}') { /* got one */
if (!maxpos)
maxpos = next;
- PL_regcomp_parse++;
- min = atoi(PL_regcomp_parse);
+ RExC_parse++;
+ min = atoi(RExC_parse);
if (*maxpos == ',')
maxpos++;
else
- maxpos = PL_regcomp_parse;
+ maxpos = RExC_parse;
max = atoi(maxpos);
if (!max && *maxpos != '0')
max = REG_INFTY; /* meaning "infinity" */
else if (max >= REG_INFTY)
vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
- PL_regcomp_parse = next;
- nextchar();
+ RExC_parse = next;
+ nextchar(pRExC_state);
do_curly:
if ((flags&SIMPLE)) {
- PL_regnaughty += 2 + PL_regnaughty / 2;
- reginsert(CURLY, ret);
+ RExC_naughty += 2 + RExC_naughty / 2;
+ reginsert(pRExC_state, CURLY, ret);
+ Set_Node_Offset(ret, parse_start+1); /* MJD */
+ Set_Node_Cur_Length(ret);
}
else {
- regnode *w = reg_node(WHILEM);
+ regnode *w = reg_node(pRExC_state, WHILEM);
w->flags = 0;
- regtail(ret, w);
- if (!SIZE_ONLY && PL_extralen) {
- reginsert(LONGJMP,ret);
- reginsert(NOTHING,ret);
+ regtail(pRExC_state, ret, w);
+ if (!SIZE_ONLY && RExC_extralen) {
+ reginsert(pRExC_state, LONGJMP,ret);
+ reginsert(pRExC_state, NOTHING,ret);
NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
}
- reginsert(CURLYX,ret);
- if (!SIZE_ONLY && PL_extralen)
+ reginsert(pRExC_state, CURLYX,ret);
+ /* MJD hk */
+ Set_Node_Offset(ret, parse_start+1);
+ Set_Node_Length(ret,
+ op == '{' ? (RExC_parse - parse_start) : 1);
+
+ if (!SIZE_ONLY && RExC_extralen)
NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
- regtail(ret, reg_node(NOTHING));
+ regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
if (SIZE_ONLY)
- PL_reg_whilem_seen++, PL_extralen += 3;
- PL_regnaughty += 4 + PL_regnaughty; /* compound interest */
+ RExC_whilem_seen++, RExC_extralen += 3;
+ RExC_naughty += 4 + RExC_naughty; /* compound interest */
}
ret->flags = 0;
@@ -2389,8 +2683,8 @@ S_regpiece(pTHX_ I32 *flagp)
if (max && max < min)
vFAIL("Can't do {n,m} with n > m");
if (!SIZE_ONLY) {
- ARG1_SET(ret, min);
- ARG2_SET(ret, max);
+ ARG1_SET(ret, (U16)min);
+ ARG2_SET(ret, (U16)max);
}
goto nest_check;
@@ -2416,25 +2710,26 @@ S_regpiece(pTHX_ I32 *flagp)
if (!(flags&HASWIDTH) && op != '?')
vFAIL("Regexp *+ operand could be empty");
-#endif
+#endif
- nextchar();
+ parse_start = RExC_parse;
+ nextchar(pRExC_state);
*flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
if (op == '*' && (flags&SIMPLE)) {
- reginsert(STAR, ret);
+ reginsert(pRExC_state, STAR, ret);
ret->flags = 0;
- PL_regnaughty += 4;
+ RExC_naughty += 4;
}
else if (op == '*') {
min = 0;
goto do_curly;
}
else if (op == '+' && (flags&SIMPLE)) {
- reginsert(PLUS, ret);
+ reginsert(pRExC_state, PLUS, ret);
ret->flags = 0;
- PL_regnaughty += 3;
+ RExC_naughty += 3;
}
else if (op == '+') {
min = 1;
@@ -2446,19 +2741,19 @@ S_regpiece(pTHX_ I32 *flagp)
}
nest_check:
if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) {
- vWARN3(PL_regcomp_parse,
+ vWARN3(RExC_parse,
"%.*s matches null string many times",
- PL_regcomp_parse - origparse,
+ RExC_parse - origparse,
origparse);
}
- if (*PL_regcomp_parse == '?') {
- nextchar();
- reginsert(MINMOD, ret);
- regtail(ret, ret + NODE_STEP_REGNODE);
+ if (*RExC_parse == '?') {
+ nextchar(pRExC_state);
+ reginsert(pRExC_state, MINMOD, ret);
+ regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE);
}
- if (ISMULT2(PL_regcomp_parse)) {
- PL_regcomp_parse++;
+ if (ISMULT2(RExC_parse)) {
+ RExC_parse++;
vFAIL("Nested quantifiers");
}
@@ -2475,72 +2770,68 @@ S_regpiece(pTHX_ I32 *flagp)
*
* [Yes, it is worth fixing, some scripts can run twice the speed.] */
STATIC regnode *
-S_regatom(pTHX_ I32 *flagp)
+S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
{
register regnode *ret = 0;
I32 flags;
+ char *parse_start = 0;
*flagp = WORST; /* Tentatively. */
tryagain:
- switch (*PL_regcomp_parse) {
+ switch (*RExC_parse) {
case '^':
- PL_seen_zerolen++;
- nextchar();
- if (PL_regflags & PMf_MULTILINE)
- ret = reg_node(MBOL);
- else if (PL_regflags & PMf_SINGLELINE)
- ret = reg_node(SBOL);
+ RExC_seen_zerolen++;
+ nextchar(pRExC_state);
+ if (RExC_flags & PMf_MULTILINE)
+ ret = reg_node(pRExC_state, MBOL);
+ else if (RExC_flags & PMf_SINGLELINE)
+ ret = reg_node(pRExC_state, SBOL);
else
- ret = reg_node(BOL);
+ ret = reg_node(pRExC_state, BOL);
+ Set_Node_Length(ret, 1); /* MJD */
break;
case '$':
- nextchar();
- if (*PL_regcomp_parse)
- PL_seen_zerolen++;
- if (PL_regflags & PMf_MULTILINE)
- ret = reg_node(MEOL);
- else if (PL_regflags & PMf_SINGLELINE)
- ret = reg_node(SEOL);
+ nextchar(pRExC_state);
+ if (*RExC_parse)
+ RExC_seen_zerolen++;
+ if (RExC_flags & PMf_MULTILINE)
+ ret = reg_node(pRExC_state, MEOL);
+ else if (RExC_flags & PMf_SINGLELINE)
+ ret = reg_node(pRExC_state, SEOL);
else
- ret = reg_node(EOL);
+ ret = reg_node(pRExC_state, EOL);
+ Set_Node_Length(ret, 1); /* MJD */
break;
case '.':
- nextchar();
- if (UTF) {
- if (PL_regflags & PMf_SINGLELINE)
- ret = reg_node(SANYUTF8);
- else
- ret = reg_node(ANYUTF8);
- *flagp |= HASWIDTH;
- }
- else {
- if (PL_regflags & PMf_SINGLELINE)
- ret = reg_node(SANY);
- else
- ret = reg_node(REG_ANY);
- *flagp |= HASWIDTH|SIMPLE;
- }
- PL_regnaughty++;
+ nextchar(pRExC_state);
+ if (RExC_flags & PMf_SINGLELINE)
+ ret = reg_node(pRExC_state, SANY);
+ else
+ ret = reg_node(pRExC_state, REG_ANY);
+ *flagp |= HASWIDTH|SIMPLE;
+ RExC_naughty++;
+ Set_Node_Length(ret, 1); /* MJD */
break;
case '[':
{
- char *oregcomp_parse = ++PL_regcomp_parse;
- ret = (UTF ? regclassutf8() : regclass());
- if (*PL_regcomp_parse != ']') {
- PL_regcomp_parse = oregcomp_parse;
+ char *oregcomp_parse = ++RExC_parse;
+ ret = regclass(pRExC_state);
+ if (*RExC_parse != ']') {
+ RExC_parse = oregcomp_parse;
vFAIL("Unmatched [");
}
- nextchar();
+ nextchar(pRExC_state);
*flagp |= HASWIDTH|SIMPLE;
+ Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
break;
}
case '(':
- nextchar();
- ret = reg(1, &flags);
+ nextchar(pRExC_state);
+ ret = reg(pRExC_state, 1, &flags);
if (ret == NULL) {
if (flags & TRYAGAIN) {
- if (PL_regcomp_parse == PL_regxend) {
+ if (RExC_parse == RExC_end) {
/* Make parent create an empty node if needed. */
*flagp |= TRYAGAIN;
return(NULL);
@@ -2561,155 +2852,141 @@ tryagain:
/* Supposed to be caught earlier. */
break;
case '{':
- if (!regcurly(PL_regcomp_parse)) {
- PL_regcomp_parse++;
+ if (!regcurly(RExC_parse)) {
+ RExC_parse++;
goto defchar;
}
/* FALL THROUGH */
case '?':
case '+':
case '*':
- PL_regcomp_parse++;
+ RExC_parse++;
vFAIL("Quantifier follows nothing");
break;
case '\\':
- switch (*++PL_regcomp_parse) {
+ switch (*++RExC_parse) {
case 'A':
- PL_seen_zerolen++;
- ret = reg_node(SBOL);
+ RExC_seen_zerolen++;
+ ret = reg_node(pRExC_state, SBOL);
*flagp |= SIMPLE;
- nextchar();
+ nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'G':
- ret = reg_node(GPOS);
- PL_regseen |= REG_SEEN_GPOS;
+ ret = reg_node(pRExC_state, GPOS);
+ RExC_seen |= REG_SEEN_GPOS;
*flagp |= SIMPLE;
- nextchar();
+ nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'Z':
- ret = reg_node(SEOL);
+ ret = reg_node(pRExC_state, SEOL);
*flagp |= SIMPLE;
- nextchar();
+ RExC_seen_zerolen++; /* Do not optimize RE away */
+ nextchar(pRExC_state);
break;
case 'z':
- ret = reg_node(EOS);
+ ret = reg_node(pRExC_state, EOS);
*flagp |= SIMPLE;
- PL_seen_zerolen++; /* Do not optimize RE away */
- nextchar();
+ RExC_seen_zerolen++; /* Do not optimize RE away */
+ nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'C':
- ret = reg_node(SANY);
+ ret = reg_node(pRExC_state, CANY);
+ RExC_seen |= REG_SEEN_CANY;
*flagp |= HASWIDTH|SIMPLE;
- nextchar();
+ nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'X':
- ret = reg_node(CLUMP);
+ ret = reg_node(pRExC_state, CLUMP);
*flagp |= HASWIDTH;
- nextchar();
- if (UTF && !PL_utf8_mark)
- is_utf8_mark((U8*)"~"); /* preload table */
+ nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'w':
- ret = reg_node(
- UTF
- ? (LOC ? ALNUMLUTF8 : ALNUMUTF8)
- : (LOC ? ALNUML : ALNUM));
+ ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
*flagp |= HASWIDTH|SIMPLE;
- nextchar();
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
+ nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'W':
- ret = reg_node(
- UTF
- ? (LOC ? NALNUMLUTF8 : NALNUMUTF8)
- : (LOC ? NALNUML : NALNUM));
+ ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
*flagp |= HASWIDTH|SIMPLE;
- nextchar();
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
+ nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'b':
- PL_seen_zerolen++;
- PL_regseen |= REG_SEEN_LOOKBEHIND;
- ret = reg_node(
- UTF
- ? (LOC ? BOUNDLUTF8 : BOUNDUTF8)
- : (LOC ? BOUNDL : BOUND));
+ RExC_seen_zerolen++;
+ RExC_seen |= REG_SEEN_LOOKBEHIND;
+ ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
*flagp |= SIMPLE;
- nextchar();
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
+ nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'B':
- PL_seen_zerolen++;
- PL_regseen |= REG_SEEN_LOOKBEHIND;
- ret = reg_node(
- UTF
- ? (LOC ? NBOUNDLUTF8 : NBOUNDUTF8)
- : (LOC ? NBOUNDL : NBOUND));
+ RExC_seen_zerolen++;
+ RExC_seen |= REG_SEEN_LOOKBEHIND;
+ ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
*flagp |= SIMPLE;
- nextchar();
- if (UTF && !PL_utf8_alnum)
- is_utf8_alnum((U8*)"a"); /* preload table */
+ nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 's':
- ret = reg_node(
- UTF
- ? (LOC ? SPACELUTF8 : SPACEUTF8)
- : (LOC ? SPACEL : SPACE));
+ ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
*flagp |= HASWIDTH|SIMPLE;
- nextchar();
- if (UTF && !PL_utf8_space)
- is_utf8_space((U8*)" "); /* preload table */
+ nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'S':
- ret = reg_node(
- UTF
- ? (LOC ? NSPACELUTF8 : NSPACEUTF8)
- : (LOC ? NSPACEL : NSPACE));
+ ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
*flagp |= HASWIDTH|SIMPLE;
- nextchar();
- if (UTF && !PL_utf8_space)
- is_utf8_space((U8*)" "); /* preload table */
+ nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'd':
- ret = reg_node(UTF ? DIGITUTF8 : DIGIT);
+ ret = reg_node(pRExC_state, DIGIT);
*flagp |= HASWIDTH|SIMPLE;
- nextchar();
- if (UTF && !PL_utf8_digit)
- is_utf8_digit((U8*)"1"); /* preload table */
+ nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'D':
- ret = reg_node(UTF ? NDIGITUTF8 : NDIGIT);
+ ret = reg_node(pRExC_state, NDIGIT);
*flagp |= HASWIDTH|SIMPLE;
- nextchar();
- if (UTF && !PL_utf8_digit)
- is_utf8_digit((U8*)"1"); /* preload table */
+ nextchar(pRExC_state);
+ Set_Node_Length(ret, 2); /* MJD */
break;
case 'p':
case 'P':
- { /* a lovely hack--pretend we saw [\pX] instead */
- char* oldregxend = PL_regxend;
-
- if (PL_regcomp_parse[1] == '{') {
- PL_regxend = strchr(PL_regcomp_parse, '}');
- if (!PL_regxend) {
- PL_regcomp_parse += 2;
- PL_regxend = oldregxend;
- vFAIL("Missing right brace on \\p{}");
+ {
+ char* oldregxend = RExC_end;
+ char* parse_start = RExC_parse;
+
+ if (RExC_parse[1] == '{') {
+ /* a lovely hack--pretend we saw [\pX] instead */
+ RExC_end = strchr(RExC_parse, '}');
+ if (!RExC_end) {
+ U8 c = (U8)*RExC_parse;
+ RExC_parse += 2;
+ RExC_end = oldregxend;
+ vFAIL2("Missing right brace on \\%c{}", c);
}
- PL_regxend++;
+ RExC_end++;
}
- else
- PL_regxend = PL_regcomp_parse + 2;
- PL_regcomp_parse--;
+ else {
+ RExC_end = RExC_parse + 2;
+ if (RExC_end > oldregxend)
+ RExC_end = oldregxend;
+ }
+ RExC_parse--;
- ret = regclassutf8();
+ ret = regclass(pRExC_state);
- PL_regxend = oldregxend;
- PL_regcomp_parse--;
- nextchar();
+ RExC_end = oldregxend;
+ RExC_parse--;
+ Set_Node_Cur_Length(ret); /* MJD */
+ nextchar(pRExC_state);
*flagp |= HASWIDTH|SIMPLE;
}
break;
@@ -2726,28 +3003,33 @@ tryagain:
case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
{
- I32 num = atoi(PL_regcomp_parse);
+ I32 num = atoi(RExC_parse);
- if (num > 9 && num >= PL_regnpar)
+ if (num > 9 && num >= RExC_npar)
goto defchar;
else {
- while (isDIGIT(*PL_regcomp_parse))
- PL_regcomp_parse++;
+ char * parse_start = RExC_parse - 1; /* MJD */
+ while (isDIGIT(*RExC_parse))
+ RExC_parse++;
- if (!SIZE_ONLY && num > PL_regcomp_rx->nparens)
+ if (!SIZE_ONLY && num > (I32)RExC_rx->nparens)
vFAIL("Reference to nonexistent group");
- PL_regsawback = 1;
- ret = reganode(FOLD
- ? (LOC ? REFFL : REFF)
- : REF, num);
+ RExC_sawback = 1;
+ ret = reganode(pRExC_state,
+ (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
+ num);
*flagp |= HASWIDTH;
- PL_regcomp_parse--;
- nextchar();
+
+ /* override incorrect value set in reganode MJD */
+ Set_Node_Offset(ret, parse_start+1);
+ Set_Node_Cur_Length(ret); /* MJD */
+ RExC_parse--;
+ nextchar(pRExC_state);
}
}
break;
case '\0':
- if (PL_regcomp_parse >= PL_regxend)
+ if (RExC_parse >= RExC_end)
FAIL("Trailing \\");
/* FALL THROUGH */
default:
@@ -2758,9 +3040,9 @@ tryagain:
break;
case '#':
- if (PL_regflags & PMf_EXTENDED) {
- while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != '\n') PL_regcomp_parse++;
- if (PL_regcomp_parse < PL_regxend)
+ if (RExC_flags & PMf_EXTENDED) {
+ while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++;
+ if (RExC_parse < RExC_end)
goto tryagain;
}
/* FALL THROUGH */
@@ -2771,22 +3053,26 @@ tryagain:
register char *p;
char *oldp, *s;
STRLEN numlen;
+ STRLEN foldlen;
+ U8 tmpbuf[UTF8_MAXLEN_FOLD+1], *foldbuf;
+
+ parse_start = RExC_parse - 1;
- PL_regcomp_parse++;
+ RExC_parse++;
defchar:
- ret = reg_node(FOLD
- ? (LOC ? EXACTFL : EXACTF)
- : EXACT);
+ ender = 0;
+ ret = reg_node(pRExC_state,
+ (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
s = STRING(ret);
- for (len = 0, p = PL_regcomp_parse - 1;
- len < 127 && p < PL_regxend;
+ for (len = 0, p = RExC_parse - 1;
+ len < 127 && p < RExC_end;
len++)
{
oldp = p;
- if (PL_regflags & PMf_EXTENDED)
- p = regwhite(p, PL_regxend);
+ if (RExC_flags & PMf_EXTENDED)
+ p = regwhite(p, RExC_end);
switch (*p) {
case '^':
case '$':
@@ -2799,6 +3085,8 @@ tryagain:
case '\\':
switch (*++p) {
case 'A':
+ case 'C':
+ case 'X':
case 'G':
case 'Z':
case 'z':
@@ -2831,32 +3119,28 @@ tryagain:
p++;
break;
case 'e':
-#ifdef ASCIIish
- ender = '\033';
-#else
- ender = '\047';
-#endif
+ ender = ASCII_TO_NATIVE('\033');
p++;
break;
case 'a':
-#ifdef ASCIIish
- ender = '\007';
-#else
- ender = '\057';
-#endif
+ ender = ASCII_TO_NATIVE('\007');
p++;
break;
case 'x':
if (*++p == '{') {
char* e = strchr(p, '}');
-
+
if (!e) {
- PL_regcomp_parse = p + 1;
+ RExC_parse = p + 1;
vFAIL("Missing right brace on \\x{}");
}
else {
- numlen = 1; /* allow underscores */
- ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
+ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX;
+ numlen = e - p - 1;
+ ender = grok_hex(p + 1, &numlen, &flags, NULL);
+ if (ender > 0xff)
+ RExC_utf8 = 1;
/* numlen is generous */
if (numlen + len >= 127) {
p--;
@@ -2866,8 +3150,9 @@ tryagain:
}
}
else {
- numlen = 0; /* disallow underscores */
- ender = (UV)scan_hex(p, 2, &numlen);
+ I32 flags = PERL_SCAN_DISALLOW_PREFIX;
+ numlen = 2;
+ ender = grok_hex(p, &numlen, &flags, NULL);
p += numlen;
}
break;
@@ -2879,9 +3164,10 @@ tryagain:
case '0': case '1': case '2': case '3':case '4':
case '5': case '6': case '7': case '8':case '9':
if (*p == '0' ||
- (isDIGIT(p[1]) && atoi(p) >= PL_regnpar) ) {
- numlen = 0; /* disallow underscores */
- ender = (UV)scan_oct(p, 3, &numlen);
+ (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
+ I32 flags = 0;
+ numlen = 3;
+ ender = grok_oct(p, &numlen, &flags, NULL);
p += numlen;
}
else {
@@ -2890,63 +3176,111 @@ tryagain:
}
break;
case '\0':
- if (p >= PL_regxend)
+ if (p >= RExC_end)
FAIL("Trailing \\");
/* FALL THROUGH */
default:
if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p))
- vWARN2(p +1, "Unrecognized escape \\%c passed through", *p);
+ vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
goto normal_default;
}
break;
default:
normal_default:
if (UTF8_IS_START(*p) && UTF) {
- ender = utf8_to_uv((U8*)p, PL_regxend - p,
- &numlen, 0);
+ ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
+ &numlen, 0);
p += numlen;
}
else
ender = *p++;
break;
}
- if (PL_regflags & PMf_EXTENDED)
- p = regwhite(p, PL_regxend);
+ if (RExC_flags & PMf_EXTENDED)
+ p = regwhite(p, RExC_end);
if (UTF && FOLD) {
- if (LOC)
- ender = toLOWER_LC_uni(ender);
- else
- ender = toLOWER_uni(ender);
+ /* Prime the casefolded buffer. */
+ ender = toFOLD_uni(ender, tmpbuf, &foldlen);
}
if (ISMULT2(p)) { /* Back off on ?+*. */
if (len)
p = oldp;
- /* ender is a Unicode value so it can be > 0xff --
- * in other words, do not use UTF8_IS_CONTINUED(). */
- else if (ender >= 0x80 && UTF) {
- reguni(ender, s, &numlen);
- s += numlen;
- len += numlen;
+ else if (UTF) {
+ STRLEN unilen;
+
+ if (FOLD) {
+ /* Emit all the Unicode characters. */
+ for (foldbuf = tmpbuf;
+ foldlen;
+ foldlen -= numlen) {
+ ender = utf8_to_uvchr(foldbuf, &numlen);
+ if (numlen > 0) {
+ reguni(pRExC_state, ender, s, &unilen);
+ s += unilen;
+ len += unilen;
+ /* In EBCDIC the numlen
+ * and unilen can differ. */
+ foldbuf += numlen;
+ if (numlen >= foldlen)
+ break;
+ }
+ else
+ break; /* "Can't happen." */
+ }
+ }
+ else {
+ reguni(pRExC_state, ender, s, &unilen);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
+ }
+ }
}
else {
len++;
- REGC(ender, s++);
+ REGC((char)ender, s++);
}
break;
}
- /* ender is a Unicode value so it can be > 0xff --
- * in other words, do not use UTF8_IS_CONTINUED(). */
- if (ender >= 0x80 && UTF) {
- reguni(ender, s, &numlen);
- s += numlen;
- len += numlen - 1;
+ if (UTF) {
+ STRLEN unilen;
+
+ if (FOLD) {
+ /* Emit all the Unicode characters. */
+ for (foldbuf = tmpbuf;
+ foldlen;
+ foldlen -= numlen) {
+ ender = utf8_to_uvchr(foldbuf, &numlen);
+ if (numlen > 0) {
+ reguni(pRExC_state, ender, s, &unilen);
+ len += unilen;
+ s += unilen;
+ /* In EBCDIC the numlen
+ * and unilen can differ. */
+ foldbuf += numlen;
+ if (numlen >= foldlen)
+ break;
+ }
+ else
+ break;
+ }
+ }
+ else {
+ reguni(pRExC_state, ender, s, &unilen);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
+ }
+ }
+ len--;
}
else
- REGC(ender, s++);
+ REGC((char)ender, s++);
}
loopdone:
- PL_regcomp_parse = p - 1;
- nextchar();
+ RExC_parse = p - 1;
+ Set_Node_Cur_Length(ret); /* MJD */
+ nextchar(pRExC_state);
{
/* len is STRLEN which is unsigned, need to copy to signed */
IV iv = len;
@@ -2960,13 +3294,37 @@ tryagain:
if (!SIZE_ONLY)
STR_LEN(ret) = len;
if (SIZE_ONLY)
- PL_regsize += STR_SZ(len);
+ RExC_size += STR_SZ(len);
else
- PL_regcode += STR_SZ(len);
+ RExC_emit += STR_SZ(len);
}
break;
}
+ /* If the encoding pragma is in effect recode the text of
+ * any EXACT-kind nodes. */
+ if (PL_encoding && PL_regkind[(U8)OP(ret)] == EXACT) {
+ STRLEN oldlen = STR_LEN(ret);
+ SV *sv = sv_2mortal(newSVpvn(STRING(ret), oldlen));
+
+ if (RExC_utf8)
+ SvUTF8_on(sv);
+ if (sv_utf8_downgrade(sv, TRUE)) {
+ char *s = sv_recode_to_utf8(sv, PL_encoding);
+ STRLEN newlen = SvCUR(sv);
+
+ if (!SIZE_ONLY) {
+ DEBUG_r(PerlIO_printf(Perl_debug_log, "recode %*s to %*s\n",
+ (int)oldlen, STRING(ret),
+ (int)newlen, s));
+ Copy(s, STRING(ret), newlen, char);
+ STR_LEN(ret) += newlen - oldlen;
+ RExC_emit += STR_SZ(newlen) - STR_SZ(oldlen);
+ } else
+ RExC_size += STR_SZ(newlen) - STR_SZ(oldlen);
+ }
+ }
+
return(ret);
}
@@ -2991,31 +3349,34 @@ S_regwhite(pTHX_ char *p, char *e)
Character classes ([:foo:]) can also be negated ([:^foo:]).
Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
- but trigger warnings because they are currently unimplemented. */
+ but trigger failures because they are currently unimplemented. */
+
+#define POSIXCC_DONE(c) ((c) == ':')
+#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
+#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
+
STATIC I32
-S_regpposixcc(pTHX_ I32 value)
+S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
{
char *posixcc = 0;
I32 namedclass = OOB_NAMEDCLASS;
- if (value == '[' && PL_regcomp_parse + 1 < PL_regxend &&
+ if (value == '[' && RExC_parse + 1 < RExC_end &&
/* I smell either [: or [= or [. -- POSIX has been here, right? */
- (*PL_regcomp_parse == ':' ||
- *PL_regcomp_parse == '=' ||
- *PL_regcomp_parse == '.')) {
- char c = *PL_regcomp_parse;
- char* s = PL_regcomp_parse++;
-
- while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != c)
- PL_regcomp_parse++;
- if (PL_regcomp_parse == PL_regxend)
+ POSIXCC(UCHARAT(RExC_parse))) {
+ char c = UCHARAT(RExC_parse);
+ char* s = RExC_parse++;
+
+ while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
+ RExC_parse++;
+ if (RExC_parse == RExC_end)
/* Grandfather lone [:, [=, [. */
- PL_regcomp_parse = s;
+ RExC_parse = s;
else {
- char* t = PL_regcomp_parse++; /* skip over the c */
+ char* t = RExC_parse++; /* skip over the c */
- if (*PL_regcomp_parse == ']') {
- PL_regcomp_parse++; /* skip over the ending ] */
+ if (UCHARAT(RExC_parse) == ']') {
+ RExC_parse++; /* skip over the ending ] */
posixcc = s + 1;
if (*s == ':') {
I32 complement = *posixcc == '^' ? *posixcc++ : 0;
@@ -3101,16 +3462,16 @@ S_regpposixcc(pTHX_ I32 value)
} else if (!SIZE_ONLY) {
/* [[=foo=]] and [[.foo.]] are still future. */
- /* adjust PL_regcomp_parse so the warning shows after
+ /* adjust RExC_parse so the warning shows after
the class closes */
- while (*PL_regcomp_parse && *PL_regcomp_parse != ']')
- PL_regcomp_parse++;
+ while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
+ RExC_parse++;
Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
}
} else {
/* Maternal grandfather:
* "[:" ending in ":" but not in ":]" */
- PL_regcomp_parse = s;
+ RExC_parse = s;
}
}
}
@@ -3119,26 +3480,25 @@ S_regpposixcc(pTHX_ I32 value)
}
STATIC void
-S_checkposixcc(pTHX)
+S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
{
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP) &&
- (*PL_regcomp_parse == ':' ||
- *PL_regcomp_parse == '=' ||
- *PL_regcomp_parse == '.')) {
- char *s = PL_regcomp_parse;
+ if (!SIZE_ONLY && POSIXCC(UCHARAT(RExC_parse))) {
+ char *s = RExC_parse;
char c = *s++;
while(*s && isALNUM(*s))
s++;
if (*s && c == *s && s[1] == ']') {
- vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c);
+ if (ckWARN(WARN_REGEXP))
+ vWARN3(s+2,
+ "POSIX syntax [%c %c] belongs inside character classes",
+ c, c);
/* [[=foo=]] and [[.foo.]] are still future. */
- if (c == '=' || c == '.')
- {
- /* adjust PL_regcomp_parse so the error shows after
+ if (POSIXCC_NOTYET(c)) {
+ /* adjust RExC_parse so the error shows after
the class closes */
- while (*PL_regcomp_parse && *PL_regcomp_parse++ != ']')
+ while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
;
Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
}
@@ -3147,113 +3507,230 @@ S_checkposixcc(pTHX)
}
STATIC regnode *
-S_regclass(pTHX)
+S_regclass(pTHX_ RExC_state_t *pRExC_state)
{
- register U32 value;
- register I32 lastvalue = OOB_CHAR8;
- register I32 range = 0;
+ register UV value;
+ register UV nextvalue;
+ register IV prevvalue = OOB_UNICODE;
+ register IV range = 0;
register regnode *ret;
STRLEN numlen;
- I32 namedclass;
- char *rangebegin;
+ IV namedclass;
+ char *rangebegin = 0;
bool need_class = 0;
+ SV *listsv = Nullsv;
+ register char *e;
+ UV n;
+ bool optimize_invert = TRUE;
+ AV* unicode_alternate = 0;
+#ifdef EBCDIC
+ UV literal_endpoint = 0;
+#endif
+
+ ret = reganode(pRExC_state, ANYOF, 0);
+
+ if (!SIZE_ONLY)
+ ANYOF_FLAGS(ret) = 0;
+
+ if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
+ RExC_naughty++;
+ RExC_parse++;
+ if (!SIZE_ONLY)
+ ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+ }
- ret = reg_node(ANYOF);
if (SIZE_ONLY)
- PL_regsize += ANYOF_SKIP;
+ RExC_size += ANYOF_SKIP;
else {
- ret->flags = 0;
- ANYOF_BITMAP_ZERO(ret);
- PL_regcode += ANYOF_SKIP;
+ RExC_emit += ANYOF_SKIP;
if (FOLD)
ANYOF_FLAGS(ret) |= ANYOF_FOLD;
if (LOC)
ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
- }
- if (*PL_regcomp_parse == '^') { /* Complement of range. */
- PL_regnaughty++;
- PL_regcomp_parse++;
- if (!SIZE_ONLY)
- ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+ ANYOF_BITMAP_ZERO(ret);
+ listsv = newSVpvn("# comment\n", 10);
}
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
- checkposixcc();
+ nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
+
+ if (!SIZE_ONLY && POSIXCC(nextvalue))
+ checkposixcc(pRExC_state);
+
+ /* allow 1st char to be ] (allowing it to be - is dealt with later) */
+ if (UCHARAT(RExC_parse) == ']')
+ goto charclassloop;
+
+ while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
+
+ charclassloop:
+
+ namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
- if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-')
- goto skipcond; /* allow 1st char to be ] or - */
- while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') {
- skipcond:
- namedclass = OOB_NAMEDCLASS;
if (!range)
- rangebegin = PL_regcomp_parse;
- value = UCHARAT(PL_regcomp_parse++);
- if (value == '[')
- namedclass = regpposixcc(value);
+ rangebegin = RExC_parse;
+ if (UTF) {
+ value = utf8n_to_uvchr((U8*)RExC_parse,
+ RExC_end - RExC_parse,
+ &numlen, 0);
+ RExC_parse += numlen;
+ }
+ else
+ value = UCHARAT(RExC_parse++);
+ nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
+ if (value == '[' && POSIXCC(nextvalue))
+ namedclass = regpposixcc(pRExC_state, value);
else if (value == '\\') {
- value = UCHARAT(PL_regcomp_parse++);
+ if (UTF) {
+ value = utf8n_to_uvchr((U8*)RExC_parse,
+ RExC_end - RExC_parse,
+ &numlen, 0);
+ RExC_parse += numlen;
+ }
+ else
+ value = UCHARAT(RExC_parse++);
/* Some compilers cannot handle switching on 64-bit integer
- * values, therefore the 'value' cannot be an UV. --jhi */
- switch (value) {
+ * values, therefore value cannot be an UV. Yes, this will
+ * be a problem later if we want switch on Unicode.
+ * A similar issue a little bit later when switching on
+ * namedclass. --jhi */
+ switch ((I32)value) {
case 'w': namedclass = ANYOF_ALNUM; break;
case 'W': namedclass = ANYOF_NALNUM; break;
case 's': namedclass = ANYOF_SPACE; break;
case 'S': namedclass = ANYOF_NSPACE; break;
case 'd': namedclass = ANYOF_DIGIT; break;
case 'D': namedclass = ANYOF_NDIGIT; break;
+ case 'p':
+ case 'P':
+ if (RExC_parse >= RExC_end)
+ vFAIL2("Empty \\%c{}", (U8)value);
+ if (*RExC_parse == '{') {
+ U8 c = (U8)value;
+ e = strchr(RExC_parse++, '}');
+ if (!e)
+ vFAIL2("Missing right brace on \\%c{}", c);
+ while (isSPACE(UCHARAT(RExC_parse)))
+ RExC_parse++;
+ if (e == RExC_parse)
+ vFAIL2("Empty \\%c{}", c);
+ n = e - RExC_parse;
+ while (isSPACE(UCHARAT(RExC_parse + n - 1)))
+ n--;
+ }
+ else {
+ e = RExC_parse;
+ n = 1;
+ }
+ if (!SIZE_ONLY) {
+ if (UCHARAT(RExC_parse) == '^') {
+ RExC_parse++;
+ n--;
+ value = value == 'p' ? 'P' : 'p'; /* toggle */
+ while (isSPACE(UCHARAT(RExC_parse))) {
+ RExC_parse++;
+ n--;
+ }
+ }
+ if (value == 'p')
+ Perl_sv_catpvf(aTHX_ listsv,
+ "+utf8::%.*s\n", (int)n, RExC_parse);
+ else
+ Perl_sv_catpvf(aTHX_ listsv,
+ "!utf8::%.*s\n", (int)n, RExC_parse);
+ }
+ RExC_parse = e + 1;
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+ continue;
case 'n': value = '\n'; break;
case 'r': value = '\r'; break;
case 't': value = '\t'; break;
case 'f': value = '\f'; break;
case 'b': value = '\b'; break;
-#ifdef ASCIIish
- case 'e': value = '\033'; break;
- case 'a': value = '\007'; break;
-#else
- case 'e': value = '\047'; break;
- case 'a': value = '\057'; break;
-#endif
+ case 'e': value = ASCII_TO_NATIVE('\033');break;
+ case 'a': value = ASCII_TO_NATIVE('\007');break;
case 'x':
- numlen = 0; /* disallow underscores */
- value = (UV)scan_hex(PL_regcomp_parse, 2, &numlen);
- PL_regcomp_parse += numlen;
+ if (*RExC_parse == '{') {
+ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_DISALLOW_PREFIX;
+ e = strchr(RExC_parse++, '}');
+ if (!e)
+ vFAIL("Missing right brace on \\x{}");
+
+ numlen = e - RExC_parse;
+ value = grok_hex(RExC_parse, &numlen, &flags, NULL);
+ RExC_parse = e + 1;
+ }
+ else {
+ I32 flags = PERL_SCAN_DISALLOW_PREFIX;
+ numlen = 2;
+ value = grok_hex(RExC_parse, &numlen, &flags, NULL);
+ RExC_parse += numlen;
+ }
break;
case 'c':
- value = UCHARAT(PL_regcomp_parse++);
+ value = UCHARAT(RExC_parse++);
value = toCTRL(value);
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
- numlen = 0; /* disallow underscores */
- value = (UV)scan_oct(--PL_regcomp_parse, 3, &numlen);
- PL_regcomp_parse += numlen;
+ {
+ I32 flags = 0;
+ numlen = 3;
+ value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
+ RExC_parse += numlen;
break;
+ }
default:
if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
-
- vWARN2(PL_regcomp_parse, "Unrecognized escape \\%c in character class passed through", (int)value);
+ vWARN2(RExC_parse,
+ "Unrecognized escape \\%c in character class passed through",
+ (int)value);
break;
}
- }
- if (namedclass > OOB_NAMEDCLASS) {
- if (!need_class && !SIZE_ONLY)
+ } /* end of \blah */
+#ifdef EBCDIC
+ else
+ literal_endpoint++;
+#endif
+
+ if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
+
+ if (!SIZE_ONLY && !need_class)
ANYOF_CLASS_ZERO(ret);
+
need_class = 1;
- if (range) { /* a-\d, a-[:digit:] */
+
+ /* a bad range like a-\d, a-[:digit:] ? */
+ if (range) {
if (!SIZE_ONLY) {
if (ckWARN(WARN_REGEXP))
- vWARN4(PL_regcomp_parse,
+ vWARN4(RExC_parse,
"False [] range \"%*.*s\"",
- PL_regcomp_parse - rangebegin,
- PL_regcomp_parse - rangebegin,
+ RExC_parse - rangebegin,
+ RExC_parse - rangebegin,
rangebegin);
- ANYOF_BITMAP_SET(ret, lastvalue);
- ANYOF_BITMAP_SET(ret, '-');
+ if (prevvalue < 256) {
+ ANYOF_BITMAP_SET(ret, prevvalue);
+ ANYOF_BITMAP_SET(ret, '-');
+ }
+ else {
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+ Perl_sv_catpvf(aTHX_ listsv,
+ "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
+ }
}
- range = 0; /* this is not a true range */
+
+ range = 0; /* this was not a true range */
}
+
if (!SIZE_ONLY) {
- switch (namedclass) {
+ if (namedclass > OOB_NAMEDCLASS)
+ optimize_invert = FALSE;
+ /* Possible truncation here but in some 64-bit environments
+ * the compiler gets heartburn about switch on 64-bit values.
+ * A similar issue a little earlier when switching on value.
+ * --jhi */
+ switch ((I32)namedclass) {
case ANYOF_ALNUM:
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
@@ -3262,6 +3739,7 @@ S_regclass(pTHX)
if (isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
break;
case ANYOF_NALNUM:
if (LOC)
@@ -3271,42 +3749,17 @@ S_regclass(pTHX)
if (!isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
break;
- case ANYOF_SPACE:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_SPACE);
- else {
- for (value = 0; value < 256; value++)
- if (isSPACE(value))
- ANYOF_BITMAP_SET(ret, value);
- }
- break;
- case ANYOF_NSPACE:
+ case ANYOF_ALNUMC:
if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
+ ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
else {
for (value = 0; value < 256; value++)
- if (!isSPACE(value))
+ if (isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
- break;
- case ANYOF_DIGIT:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
- else {
- for (value = '0'; value <= '9'; value++)
- ANYOF_BITMAP_SET(ret, value);
- }
- break;
- case ANYOF_NDIGIT:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
- else {
- for (value = 0; value < '0'; value++)
- ANYOF_BITMAP_SET(ret, value);
- for (value = '9' + 1; value < 256; value++)
- ANYOF_BITMAP_SET(ret, value);
- }
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
break;
case ANYOF_NALNUMC:
if (LOC)
@@ -3316,15 +3769,7 @@ S_regclass(pTHX)
if (!isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
- break;
- case ANYOF_ALNUMC:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
- else {
- for (value = 0; value < 256; value++)
- if (isALNUMC(value))
- ANYOF_BITMAP_SET(ret, value);
- }
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
break;
case ANYOF_ALPHA:
if (LOC)
@@ -3334,6 +3779,7 @@ S_regclass(pTHX)
if (isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
break;
case ANYOF_NALPHA:
if (LOC)
@@ -3343,34 +3789,39 @@ S_regclass(pTHX)
if (!isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
break;
case ANYOF_ASCII:
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_ASCII);
else {
-#ifdef ASCIIish
+#ifndef EBCDIC
for (value = 0; value < 128; value++)
ANYOF_BITMAP_SET(ret, value);
#else /* EBCDIC */
- for (value = 0; value < 256; value++)
+ for (value = 0; value < 256; value++) {
if (isASCII(value))
- ANYOF_BITMAP_SET(ret, value);
+ ANYOF_BITMAP_SET(ret, value);
+ }
#endif /* EBCDIC */
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
break;
case ANYOF_NASCII:
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_NASCII);
else {
-#ifdef ASCIIish
+#ifndef EBCDIC
for (value = 128; value < 256; value++)
ANYOF_BITMAP_SET(ret, value);
#else /* EBCDIC */
- for (value = 0; value < 256; value++)
+ for (value = 0; value < 256; value++) {
if (!isASCII(value))
- ANYOF_BITMAP_SET(ret, value);
+ ANYOF_BITMAP_SET(ret, value);
+ }
#endif /* EBCDIC */
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
break;
case ANYOF_BLANK:
if (LOC)
@@ -3380,6 +3831,7 @@ S_regclass(pTHX)
if (isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
break;
case ANYOF_NBLANK:
if (LOC)
@@ -3389,6 +3841,7 @@ S_regclass(pTHX)
if (!isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
break;
case ANYOF_CNTRL:
if (LOC)
@@ -3398,7 +3851,7 @@ S_regclass(pTHX)
if (isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
- lastvalue = OOB_CHAR8;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
break;
case ANYOF_NCNTRL:
if (LOC)
@@ -3408,6 +3861,29 @@ S_regclass(pTHX)
if (!isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
+ break;
+ case ANYOF_DIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
+ else {
+ /* consecutive digits assumed */
+ for (value = '0'; value <= '9'; value++)
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
+ break;
+ case ANYOF_NDIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
+ else {
+ /* consecutive digits assumed */
+ for (value = 0; value < '0'; value++)
+ ANYOF_BITMAP_SET(ret, value);
+ for (value = '9' + 1; value < 256; value++)
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
break;
case ANYOF_GRAPH:
if (LOC)
@@ -3417,6 +3893,7 @@ S_regclass(pTHX)
if (isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
break;
case ANYOF_NGRAPH:
if (LOC)
@@ -3426,6 +3903,7 @@ S_regclass(pTHX)
if (!isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
break;
case ANYOF_LOWER:
if (LOC)
@@ -3435,6 +3913,7 @@ S_regclass(pTHX)
if (isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
break;
case ANYOF_NLOWER:
if (LOC)
@@ -3444,6 +3923,7 @@ S_regclass(pTHX)
if (!isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
break;
case ANYOF_PRINT:
if (LOC)
@@ -3453,6 +3933,7 @@ S_regclass(pTHX)
if (isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
break;
case ANYOF_NPRINT:
if (LOC)
@@ -3462,6 +3943,7 @@ S_regclass(pTHX)
if (!isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
break;
case ANYOF_PSXSPC:
if (LOC)
@@ -3471,6 +3953,7 @@ S_regclass(pTHX)
if (isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
break;
case ANYOF_NPSXSPC:
if (LOC)
@@ -3480,6 +3963,7 @@ S_regclass(pTHX)
if (!isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
break;
case ANYOF_PUNCT:
if (LOC)
@@ -3489,6 +3973,7 @@ S_regclass(pTHX)
if (isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
break;
case ANYOF_NPUNCT:
if (LOC)
@@ -3498,6 +3983,27 @@ S_regclass(pTHX)
if (!isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
+ break;
+ case ANYOF_SPACE:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_SPACE);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isSPACE(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
+ break;
+ case ANYOF_NSPACE:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isSPACE(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
break;
case ANYOF_UPPER:
if (LOC)
@@ -3507,6 +4013,7 @@ S_regclass(pTHX)
if (isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
break;
case ANYOF_NUPPER:
if (LOC)
@@ -3516,6 +4023,7 @@ S_regclass(pTHX)
if (!isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
break;
case ANYOF_XDIGIT:
if (LOC)
@@ -3525,6 +4033,7 @@ S_regclass(pTHX)
if (isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
break;
case ANYOF_NXDIGIT:
if (LOC)
@@ -3534,6 +4043,7 @@ S_regclass(pTHX)
if (!isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
break;
default:
vFAIL("Invalid [::] class");
@@ -3543,377 +4053,216 @@ S_regclass(pTHX)
ANYOF_FLAGS(ret) |= ANYOF_CLASS;
continue;
}
- }
+ } /* end of namedclass \blah */
+
if (range) {
- if (lastvalue > value) /* b-a */ {
+ if (prevvalue > (IV)value) /* b-a */ {
Simple_vFAIL4("Invalid [] range \"%*.*s\"",
- PL_regcomp_parse - rangebegin,
- PL_regcomp_parse - rangebegin,
+ RExC_parse - rangebegin,
+ RExC_parse - rangebegin,
rangebegin);
+ range = 0; /* not a valid range */
}
- range = 0;
}
else {
- lastvalue = value;
- if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend &&
- PL_regcomp_parse[1] != ']') {
- PL_regcomp_parse++;
- if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */
+ prevvalue = value; /* save the beginning of the range */
+ if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
+ RExC_parse[1] != ']') {
+ RExC_parse++;
+
+ /* a bad range like \w-, [:word:]- ? */
+ if (namedclass > OOB_NAMEDCLASS) {
if (ckWARN(WARN_REGEXP))
- vWARN4(PL_regcomp_parse,
+ vWARN4(RExC_parse,
"False [] range \"%*.*s\"",
- PL_regcomp_parse - rangebegin,
- PL_regcomp_parse - rangebegin,
+ RExC_parse - rangebegin,
+ RExC_parse - rangebegin,
rangebegin);
if (!SIZE_ONLY)
ANYOF_BITMAP_SET(ret, '-');
} else
- range = 1;
- continue; /* do it next time */
+ range = 1; /* yeah, it's a range! */
+ continue; /* but do it the next time */
}
}
+
/* now is the next time */
if (!SIZE_ONLY) {
-#ifndef ASCIIish /* EBCDIC, for example. */
- if ((isLOWER(lastvalue) && isLOWER(value)) ||
- (isUPPER(lastvalue) && isUPPER(value)))
- {
- I32 i;
- if (isLOWER(lastvalue)) {
- for (i = lastvalue; i <= value; i++)
- if (isLOWER(i))
- ANYOF_BITMAP_SET(ret, i);
- } else {
- for (i = lastvalue; i <= value; i++)
- if (isUPPER(i))
- ANYOF_BITMAP_SET(ret, i);
+ IV i;
+
+ if (prevvalue < 256) {
+ IV ceilvalue = value < 256 ? value : 255;
+
+#ifdef EBCDIC
+ /* In EBCDIC [\x89-\x91] should include
+ * the \x8e but [i-j] should not. */
+ if (literal_endpoint == 2 &&
+ ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
+ (isUPPER(prevvalue) && isUPPER(ceilvalue))))
+ {
+ if (isLOWER(prevvalue)) {
+ for (i = prevvalue; i <= ceilvalue; i++)
+ if (isLOWER(i))
+ ANYOF_BITMAP_SET(ret, i);
+ } else {
+ for (i = prevvalue; i <= ceilvalue; i++)
+ if (isUPPER(i))
+ ANYOF_BITMAP_SET(ret, i);
+ }
+ }
+ else
+#endif
+ for (i = prevvalue; i <= ceilvalue; i++)
+ ANYOF_BITMAP_SET(ret, i);
+ }
+ if (value > 255 || UTF) {
+ UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
+ UV natvalue = NATIVE_TO_UNI(value);
+
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+ if (prevnatvalue < natvalue) { /* what about > ? */
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
+ prevnatvalue, natvalue);
+ }
+ else if (prevnatvalue == natvalue) {
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
+ if (FOLD) {
+ U8 foldbuf[UTF8_MAXLEN_FOLD+1];
+ STRLEN foldlen;
+ UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
+
+ /* If folding and foldable and a single
+ * character, insert also the folded version
+ * to the charclass. */
+ if (f != value) {
+ if (foldlen == (STRLEN)UNISKIP(f))
+ Perl_sv_catpvf(aTHX_ listsv,
+ "%04"UVxf"\n", f);
+ else {
+ /* Any multicharacter foldings
+ * require the following transform:
+ * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
+ * where E folds into "pq" and F folds
+ * into "rst", all other characters
+ * fold to single characters. We save
+ * away these multicharacter foldings,
+ * to be later saved as part of the
+ * additional "s" data. */
+ SV *sv;
+
+ if (!unicode_alternate)
+ unicode_alternate = newAV();
+ sv = newSVpvn((char*)foldbuf, foldlen);
+ SvUTF8_on(sv);
+ av_push(unicode_alternate, sv);
+ }
+ }
+
+ /* If folding and the value is one of the Greek
+ * sigmas insert a few more sigmas to make the
+ * folding rules of the sigmas to work right.
+ * Note that not all the possible combinations
+ * are handled here: some of them are handled
+ * by the standard folding rules, and some of
+ * them (literal or EXACTF cases) are handled
+ * during runtime in regexec.c:S_find_byclass(). */
+ if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+ (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+ (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
+ }
+ else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+ (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
+ }
}
}
- else
+#ifdef EBCDIC
+ literal_endpoint = 0;
#endif
- for ( ; lastvalue <= value; lastvalue++)
- ANYOF_BITMAP_SET(ret, lastvalue);
}
- range = 0;
+
+ range = 0; /* this range (if it was one) is done now */
}
+
if (need_class) {
+ ANYOF_FLAGS(ret) |= ANYOF_LARGE;
if (SIZE_ONLY)
- PL_regsize += ANYOF_CLASS_ADD_SKIP;
+ RExC_size += ANYOF_CLASS_ADD_SKIP;
else
- PL_regcode += ANYOF_CLASS_ADD_SKIP;
+ RExC_emit += ANYOF_CLASS_ADD_SKIP;
}
+
/* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
if (!SIZE_ONLY &&
- (ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) {
+ /* If the only flag is folding (plus possibly inversion). */
+ ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
+ ) {
for (value = 0; value < 256; ++value) {
if (ANYOF_BITMAP_TEST(ret, value)) {
- I32 cf = PL_fold[value];
- ANYOF_BITMAP_SET(ret, cf);
+ UV fold = PL_fold[value];
+
+ if (fold != value)
+ ANYOF_BITMAP_SET(ret, fold);
}
}
ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
}
+
/* optimize inverted simple patterns (e.g. [^a-z]) */
- if (!SIZE_ONLY && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
+ if (!SIZE_ONLY && optimize_invert &&
+ /* If the only flag is inversion. */
+ (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
- ANYOF_FLAGS(ret) = 0;
+ ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
}
- return ret;
-}
-STATIC regnode *
-S_regclassutf8(pTHX)
-{
- register char *e;
- register U32 value;
- register U32 lastvalue = OOB_UTF8;
- register I32 range = 0;
- register regnode *ret;
- STRLEN numlen;
- I32 n;
- SV *listsv;
- U8 flags = 0;
- I32 namedclass;
- char *rangebegin;
-
- if (*PL_regcomp_parse == '^') { /* Complement of range. */
- PL_regnaughty++;
- PL_regcomp_parse++;
- if (!SIZE_ONLY)
- flags |= ANYOF_INVERT;
- }
if (!SIZE_ONLY) {
- if (FOLD)
- flags |= ANYOF_FOLD;
- if (LOC)
- flags |= ANYOF_LOCALE;
- listsv = newSVpvn("# comment\n", 10);
- }
-
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
- checkposixcc();
-
- if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-')
- goto skipcond; /* allow 1st char to be ] or - */
-
- while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') {
- skipcond:
- namedclass = OOB_NAMEDCLASS;
- if (!range)
- rangebegin = PL_regcomp_parse;
- value = utf8_to_uv((U8*)PL_regcomp_parse,
- PL_regxend - PL_regcomp_parse,
- &numlen, 0);
- PL_regcomp_parse += numlen;
- if (value == '[')
- namedclass = regpposixcc(value);
- else if (value == '\\') {
- value = (U32)utf8_to_uv((U8*)PL_regcomp_parse,
- PL_regxend - PL_regcomp_parse,
- &numlen, 0);
- PL_regcomp_parse += numlen;
- /* Some compilers cannot handle switching on 64-bit integer
- * values, therefore value cannot be an UV. Yes, this will
- * be a problem later if we want switch on Unicode. --jhi */
- switch (value) {
- case 'w': namedclass = ANYOF_ALNUM; break;
- case 'W': namedclass = ANYOF_NALNUM; break;
- case 's': namedclass = ANYOF_SPACE; break;
- case 'S': namedclass = ANYOF_NSPACE; break;
- case 'd': namedclass = ANYOF_DIGIT; break;
- case 'D': namedclass = ANYOF_NDIGIT; break;
- case 'p':
- case 'P':
- if (*PL_regcomp_parse == '{') {
- e = strchr(PL_regcomp_parse++, '}');
- if (!e)
- vFAIL("Missing right brace on \\p{}");
- n = e - PL_regcomp_parse;
- }
- else {
- e = PL_regcomp_parse;
- n = 1;
- }
- if (!SIZE_ONLY) {
- if (value == 'p')
- Perl_sv_catpvf(aTHX_ listsv,
- "+utf8::%.*s\n", (int)n, PL_regcomp_parse);
- else
- Perl_sv_catpvf(aTHX_ listsv,
- "!utf8::%.*s\n", (int)n, PL_regcomp_parse);
- }
- PL_regcomp_parse = e + 1;
- lastvalue = OOB_UTF8;
- continue;
- case 'n': value = '\n'; break;
- case 'r': value = '\r'; break;
- case 't': value = '\t'; break;
- case 'f': value = '\f'; break;
- case 'b': value = '\b'; break;
-#ifdef ASCIIish
- case 'e': value = '\033'; break;
- case 'a': value = '\007'; break;
-#else
- case 'e': value = '\047'; break;
- case 'a': value = '\057'; break;
-#endif
- case 'x':
- if (*PL_regcomp_parse == '{') {
- e = strchr(PL_regcomp_parse++, '}');
- if (!e)
- vFAIL("Missing right brace on \\x{}");
- numlen = 1; /* allow underscores */
- value = (UV)scan_hex(PL_regcomp_parse,
- e - PL_regcomp_parse,
- &numlen);
- PL_regcomp_parse = e + 1;
- }
- else {
- numlen = 0; /* disallow underscores */
- value = (UV)scan_hex(PL_regcomp_parse, 2, &numlen);
- PL_regcomp_parse += numlen;
- }
- break;
- case 'c':
- value = UCHARAT(PL_regcomp_parse++);
- value = toCTRL(value);
- break;
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- numlen = 0; /* disallow underscores */
- value = (UV)scan_oct(--PL_regcomp_parse, 3, &numlen);
- PL_regcomp_parse += numlen;
- break;
- default:
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
- vWARN2(PL_regcomp_parse,
- "Unrecognized escape \\%c in character class passed through",
- (int)value);
- break;
- }
- }
- if (namedclass > OOB_NAMEDCLASS) {
- if (range) { /* a-\d, a-[:digit:] */
- if (!SIZE_ONLY) {
- if (ckWARN(WARN_REGEXP))
- vWARN4(PL_regcomp_parse,
- "False [] range \"%*.*s\"",
- PL_regcomp_parse - rangebegin,
- PL_regcomp_parse - rangebegin,
- rangebegin);
- Perl_sv_catpvf(aTHX_ listsv,
- /* 0x002D is Unicode for '-' */
- "%04"UVxf"\n002D\n", (UV)lastvalue);
- }
- range = 0;
- }
- if (!SIZE_ONLY) {
- switch (namedclass) {
- case ANYOF_ALNUM:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break;
- case ANYOF_NALNUM:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break;
- case ANYOF_ALNUMC:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break;
- case ANYOF_NALNUMC:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break;
- case ANYOF_ALPHA:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break;
- case ANYOF_NALPHA:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break;
- case ANYOF_ASCII:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break;
- case ANYOF_NASCII:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break;
- case ANYOF_CNTRL:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break;
- case ANYOF_NCNTRL:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); break;
- case ANYOF_GRAPH:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break;
- case ANYOF_NGRAPH:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break;
- case ANYOF_DIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); break;
- case ANYOF_NDIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break;
- case ANYOF_LOWER:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break;
- case ANYOF_NLOWER:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break;
- case ANYOF_PRINT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break;
- case ANYOF_NPRINT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break;
- case ANYOF_PUNCT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break;
- case ANYOF_NPUNCT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break;
- case ANYOF_SPACE:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");break;
- case ANYOF_NSPACE:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");break;
- case ANYOF_BLANK:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); break;
- case ANYOF_NBLANK:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); break;
- case ANYOF_PSXSPC:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break;
- case ANYOF_NPSXSPC:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break;
- case ANYOF_UPPER:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break;
- case ANYOF_NUPPER:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break;
- case ANYOF_XDIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break;
- case ANYOF_NXDIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break;
- }
- continue;
- }
- }
- if (range) {
- if (lastvalue > value) { /* b-a */
- Simple_vFAIL4("Invalid [] range \"%*.*s\"",
- PL_regcomp_parse - rangebegin,
- PL_regcomp_parse - rangebegin,
- rangebegin);
- }
- range = 0;
- }
- else {
- lastvalue = value;
- if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend &&
- PL_regcomp_parse[1] != ']') {
- PL_regcomp_parse++;
- if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */
- if (ckWARN(WARN_REGEXP))
- vWARN4(PL_regcomp_parse,
- "False [] range \"%*.*s\"",
- PL_regcomp_parse - rangebegin,
- PL_regcomp_parse - rangebegin,
- rangebegin);
- if (!SIZE_ONLY)
- Perl_sv_catpvf(aTHX_ listsv,
- /* 0x002D is Unicode for '-' */
- "002D\n");
- } else
- range = 1;
- continue; /* do it next time */
- }
- }
- /* now is the next time */
- if (!SIZE_ONLY)
- Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
- (UV)lastvalue, (UV)value);
- range = 0;
- }
-
- ret = reganode(ANYOFUTF8, 0);
-
- if (!SIZE_ONLY) {
- SV *rv = swash_init("utf8", "", listsv, 1, 0);
-#ifdef DEBUGGING
AV *av = newAV();
- av_push(av, rv);
- av_push(av, listsv);
+ SV *rv;
+
+ /* The 0th element stores the character class description
+ * in its textual form: used later (regexec.c:Perl_regclass_swatch())
+ * to initialize the appropriate swash (which gets stored in
+ * the 1st element), and also useful for dumping the regnode.
+ * The 2nd element stores the multicharacter foldings,
+ * used later (regexec.c:s_reginclasslen()). */
+ av_store(av, 0, listsv);
+ av_store(av, 1, NULL);
+ av_store(av, 2, (SV*)unicode_alternate);
rv = newRV_noinc((SV*)av);
-#else
- SvREFCNT_dec(listsv);
-#endif
- n = add_data(1,"s");
- PL_regcomp_rx->data->data[n] = (void*)rv;
- ARG1_SET(ret, flags);
- ARG2_SET(ret, n);
+ n = add_data(pRExC_state, 1, "s");
+ RExC_rx->data->data[n] = (void*)rv;
+ ARG_SET(ret, n);
}
return ret;
}
STATIC char*
-S_nextchar(pTHX)
+S_nextchar(pTHX_ RExC_state_t *pRExC_state)
{
- char* retval = PL_regcomp_parse++;
+ char* retval = RExC_parse++;
for (;;) {
- if (*PL_regcomp_parse == '(' && PL_regcomp_parse[1] == '?' &&
- PL_regcomp_parse[2] == '#') {
- while (*PL_regcomp_parse && *PL_regcomp_parse != ')')
- PL_regcomp_parse++;
- PL_regcomp_parse++;
+ if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
+ RExC_parse[2] == '#') {
+ while (*RExC_parse && *RExC_parse != ')')
+ RExC_parse++;
+ RExC_parse++;
continue;
}
- if (PL_regflags & PMf_EXTENDED) {
- if (isSPACE(*PL_regcomp_parse)) {
- PL_regcomp_parse++;
+ if (RExC_flags & PMf_EXTENDED) {
+ if (isSPACE(*RExC_parse)) {
+ RExC_parse++;
continue;
}
- else if (*PL_regcomp_parse == '#') {
- while (*PL_regcomp_parse && *PL_regcomp_parse != '\n')
- PL_regcomp_parse++;
- PL_regcomp_parse++;
+ else if (*RExC_parse == '#') {
+ while (*RExC_parse && *RExC_parse != '\n')
+ RExC_parse++;
+ RExC_parse++;
continue;
}
}
@@ -3925,22 +4274,34 @@ S_nextchar(pTHX)
- reg_node - emit a node
*/
STATIC regnode * /* Location. */
-S_reg_node(pTHX_ U8 op)
+S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
{
register regnode *ret;
register regnode *ptr;
- ret = PL_regcode;
+ ret = RExC_emit;
if (SIZE_ONLY) {
- SIZE_ALIGN(PL_regsize);
- PL_regsize += 1;
+ SIZE_ALIGN(RExC_size);
+ RExC_size += 1;
return(ret);
}
NODE_ALIGN_FILL(ret);
ptr = ret;
FILL_ADVANCE_NODE(ptr, op);
- PL_regcode = ptr;
+ if (RExC_offsets) { /* MJD */
+ MJD_OFFSET_DEBUG((stderr, "%s:%u: (op %s) %s %u <- %u (len %u) (max %u).\n",
+ "reg_node", __LINE__,
+ reg_name[op],
+ RExC_emit - RExC_emit_start > RExC_offsets[0]
+ ? "Overwriting end of array!\n" : "OK",
+ RExC_emit - RExC_emit_start,
+ RExC_parse - RExC_start,
+ RExC_offsets[0]));
+ Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
+ }
+
+ RExC_emit = ptr;
return(ret);
}
@@ -3949,22 +4310,33 @@ S_reg_node(pTHX_ U8 op)
- reganode - emit a node with an argument
*/
STATIC regnode * /* Location. */
-S_reganode(pTHX_ U8 op, U32 arg)
+S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
{
register regnode *ret;
register regnode *ptr;
- ret = PL_regcode;
+ ret = RExC_emit;
if (SIZE_ONLY) {
- SIZE_ALIGN(PL_regsize);
- PL_regsize += 2;
+ SIZE_ALIGN(RExC_size);
+ RExC_size += 2;
return(ret);
}
NODE_ALIGN_FILL(ret);
ptr = ret;
FILL_ADVANCE_NODE_ARG(ptr, op, arg);
- PL_regcode = ptr;
+ if (RExC_offsets) { /* MJD */
+ MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
+ "reganode",
+ RExC_emit - RExC_emit_start > RExC_offsets[0] ?
+ "Overwriting end of array!\n" : "OK",
+ RExC_emit - RExC_emit_start,
+ RExC_parse - RExC_start,
+ RExC_offsets[0]));
+ Set_Cur_Node_Offset;
+ }
+
+ RExC_emit = ptr;
return(ret);
}
@@ -3973,9 +4345,9 @@ S_reganode(pTHX_ U8 op, U32 arg)
- reguni - emit (if appropriate) a Unicode character
*/
STATIC void
-S_reguni(pTHX_ UV uv, char* s, STRLEN* lenp)
+S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
{
- *lenp = SIZE_ONLY ? UNISKIP(uv) : (uv_to_utf8((U8*)s, uv) - (U8*)s);
+ *lenp = SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
}
/*
@@ -3984,27 +4356,50 @@ S_reguni(pTHX_ UV uv, char* s, STRLEN* lenp)
* Means relocating the operand.
*/
STATIC void
-S_reginsert(pTHX_ U8 op, regnode *opnd)
+S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
{
register regnode *src;
register regnode *dst;
register regnode *place;
register int offset = regarglen[(U8)op];
-
+
/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
if (SIZE_ONLY) {
- PL_regsize += NODE_STEP_REGNODE + offset;
+ RExC_size += NODE_STEP_REGNODE + offset;
return;
}
- src = PL_regcode;
- PL_regcode += NODE_STEP_REGNODE + offset;
- dst = PL_regcode;
- while (src > opnd)
+ src = RExC_emit;
+ RExC_emit += NODE_STEP_REGNODE + offset;
+ dst = RExC_emit;
+ while (src > opnd) {
StructCopy(--src, --dst, regnode);
+ if (RExC_offsets) { /* MJD 20010112 */
+ MJD_OFFSET_DEBUG((stderr, "%s: %s copy %u -> %u (max %u).\n",
+ "reg_insert",
+ dst - RExC_emit_start > RExC_offsets[0]
+ ? "Overwriting end of array!\n" : "OK",
+ src - RExC_emit_start,
+ dst - RExC_emit_start,
+ RExC_offsets[0]));
+ Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
+ Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
+ }
+ }
+
place = opnd; /* Op node, where operand used to be. */
+ if (RExC_offsets) { /* MJD */
+ MJD_OFFSET_DEBUG((stderr, "%s: %s %u <- %u (max %u).\n",
+ "reginsert",
+ place - RExC_emit_start > RExC_offsets[0]
+ ? "Overwriting end of array!\n" : "OK",
+ place - RExC_emit_start,
+ RExC_parse - RExC_start,
+ RExC_offsets[0]));
+ Set_Node_Offset(place, RExC_parse);
+ }
src = NEXTOPER(place);
FILL_ADVANCE_NODE(place, op);
Zero(src, offset, regnode);
@@ -4014,7 +4409,7 @@ S_reginsert(pTHX_ U8 op, regnode *opnd)
- regtail - set the next-pointer at the end of a node chain of p to val.
*/
STATIC void
-S_regtail(pTHX_ regnode *p, regnode *val)
+S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
{
register regnode *scan;
register regnode *temp;
@@ -4043,16 +4438,16 @@ S_regtail(pTHX_ regnode *p, regnode *val)
- regoptail - regtail on operand of first argument; nop if operandless
*/
STATIC void
-S_regoptail(pTHX_ regnode *p, regnode *val)
+S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
{
/* "Operandless" and "op != BRANCH" are synonymous in practice. */
if (p == NULL || SIZE_ONLY)
return;
if (PL_regkind[(U8)OP(p)] == BRANCH) {
- regtail(NEXTOPER(p), val);
+ regtail(pRExC_state, NEXTOPER(p), val);
}
else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) {
- regtail(NEXTOPER(NEXTOPER(p)), val);
+ regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val);
}
else
return;
@@ -4080,10 +4475,11 @@ S_regcurly(pTHX_ register char *s)
}
+#ifdef DEBUGGING
+
STATIC regnode *
S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
{
-#ifdef DEBUGGING
register U8 op = EXACT; /* Arbitrary non-END op. */
register regnode *next;
@@ -4103,13 +4499,13 @@ S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
(int)(2*l + 1), "", SvPVX(sv));
if (next == NULL) /* Next ptr. */
PerlIO_printf(Perl_debug_log, "(0)");
- else
+ else
PerlIO_printf(Perl_debug_log, "(%"IVdf")", (IV)(next - start));
(void)PerlIO_putc(Perl_debug_log, '\n');
after_print:
if (PL_regkind[(U8)op] == BRANCHJ) {
- register regnode *nnode = (OP(next) == LONGJMP
- ? regnext(next)
+ register regnode *nnode = (OP(next) == LONGJMP
+ ? regnext(next)
: next);
if (last && nnode > last)
nnode = last;
@@ -4130,8 +4526,10 @@ S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
node = dumpuntil(start, NEXTOPER(node), NEXTOPER(node) + 1, sv, l + 1);
}
else if (op == ANYOF) {
+ /* arglen 1 + class block */
+ node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
+ ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
node = NEXTOPER(node);
- node += ANYOF_SKIP;
}
else if (PL_regkind[(U8)op] == EXACT) {
/* Literal string, where present. */
@@ -4147,10 +4545,11 @@ S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l)
else if (op == WHILEM)
l--;
}
-#endif /* DEBUGGING */
return node;
}
+#endif /* DEBUGGING */
+
/*
- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
*/
@@ -4165,31 +4564,50 @@ Perl_regdump(pTHX_ regexp *r)
/* Header fields of interest. */
if (r->anchored_substr)
PerlIO_printf(Perl_debug_log,
- "anchored `%s%.*s%s'%s at %"IVdf" ",
+ "anchored `%s%.*s%s'%s at %"IVdf" ",
PL_colors[0],
(int)(SvCUR(r->anchored_substr) - (SvTAIL(r->anchored_substr)!=0)),
- SvPVX(r->anchored_substr),
+ SvPVX(r->anchored_substr),
PL_colors[1],
SvTAIL(r->anchored_substr) ? "$" : "",
(IV)r->anchored_offset);
+ else if (r->anchored_utf8)
+ PerlIO_printf(Perl_debug_log,
+ "anchored utf8 `%s%.*s%s'%s at %"IVdf" ",
+ PL_colors[0],
+ (int)(SvCUR(r->anchored_utf8) - (SvTAIL(r->anchored_utf8)!=0)),
+ SvPVX(r->anchored_utf8),
+ PL_colors[1],
+ SvTAIL(r->anchored_utf8) ? "$" : "",
+ (IV)r->anchored_offset);
if (r->float_substr)
PerlIO_printf(Perl_debug_log,
- "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
+ "floating `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
PL_colors[0],
- (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
+ (int)(SvCUR(r->float_substr) - (SvTAIL(r->float_substr)!=0)),
SvPVX(r->float_substr),
PL_colors[1],
SvTAIL(r->float_substr) ? "$" : "",
(IV)r->float_min_offset, (UV)r->float_max_offset);
- if (r->check_substr)
- PerlIO_printf(Perl_debug_log,
- r->check_substr == r->float_substr
+ else if (r->float_utf8)
+ PerlIO_printf(Perl_debug_log,
+ "floating utf8 `%s%.*s%s'%s at %"IVdf"..%"UVuf" ",
+ PL_colors[0],
+ (int)(SvCUR(r->float_utf8) - (SvTAIL(r->float_utf8)!=0)),
+ SvPVX(r->float_utf8),
+ PL_colors[1],
+ SvTAIL(r->float_utf8) ? "$" : "",
+ (IV)r->float_min_offset, (UV)r->float_max_offset);
+ if (r->check_substr || r->check_utf8)
+ PerlIO_printf(Perl_debug_log,
+ r->check_substr == r->float_substr
+ && r->check_utf8 == r->float_utf8
? "(checking floating" : "(checking anchored");
if (r->reganch & ROPT_NOSCAN)
PerlIO_printf(Perl_debug_log, " noscan");
if (r->reganch & ROPT_CHECK_ALL)
PerlIO_printf(Perl_debug_log, " isall");
- if (r->check_substr)
+ if (r->check_substr || r->check_utf8)
PerlIO_printf(Perl_debug_log, ") ");
if (r->regstclass) {
@@ -4218,13 +4636,25 @@ Perl_regdump(pTHX_ regexp *r)
if (r->reganch & ROPT_EVAL_SEEN)
PerlIO_printf(Perl_debug_log, "with eval ");
PerlIO_printf(Perl_debug_log, "\n");
+ if (r->offsets) {
+ U32 i;
+ U32 len = r->offsets[0];
+ PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)r->offsets[0]);
+ for (i = 1; i <= len; i++)
+ PerlIO_printf(Perl_debug_log, "%"UVuf"[%"UVuf"] ",
+ (UV)r->offsets[i*2-1],
+ (UV)r->offsets[i*2]);
+ PerlIO_printf(Perl_debug_log, "\n");
+ }
#endif /* DEBUGGING */
}
+#ifdef DEBUGGING
+
STATIC void
S_put_byte(pTHX_ SV *sv, int c)
{
- if (isCNTRL(c) || c == 127 || c == 255)
+ if (isCNTRL(c) || c == 255 || !isPRINT(c))
Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
else if (c == '-' || c == ']' || c == '\\' || c == '^')
Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
@@ -4232,6 +4662,8 @@ S_put_byte(pTHX_ SV *sv, int c)
Perl_sv_catpvf(aTHX_ sv, "%c", c);
}
+#endif /* DEBUGGING */
+
/*
- regprop - printable representation of opcode
*/
@@ -4243,14 +4675,31 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
sv_setpvn(sv, "", 0);
if (OP(o) >= reg_num) /* regnode.type is unsigned */
- FAIL("Corrupted regexp opcode");
+ /* It would be nice to FAIL() here, but this may be called from
+ regexec.c, and it would be hard to supply pRExC_state. */
+ Perl_croak(aTHX_ "Corrupted regexp opcode");
sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */
k = PL_regkind[(U8)OP(o)];
- if (k == EXACT)
- Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0],
- STR_LEN(o), STRING(o), PL_colors[1]);
+ if (k == EXACT) {
+ SV *dsv = sv_2mortal(newSVpvn("", 0));
+ /* Using is_utf8_string() is a crude hack but it may
+ * be the best for now since we have no flag "this EXACTish
+ * node was UTF-8" --jhi */
+ bool do_utf8 = is_utf8_string((U8*)STRING(o), STR_LEN(o));
+ char *s = do_utf8 ?
+ pv_uni_display(dsv, (U8*)STRING(o), STR_LEN(o), 60,
+ UNI_DISPLAY_REGEX) :
+ STRING(o);
+ int len = do_utf8 ?
+ strlen(s) :
+ STR_LEN(o);
+ Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>",
+ PL_colors[0],
+ len, s,
+ PL_colors[1]);
+ }
else if (k == CURLY) {
if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
@@ -4264,8 +4713,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
else if (k == ANYOF) {
int i, rangestart = -1;
- bool anyofutf8 = OP(o) == ANYOFUTF8;
- U8 flags = anyofutf8 ? ARG1(o) : o->flags;
+ U8 flags = ANYOF_FLAGS(o);
const char * const anyofs[] = { /* Should be syncronized with
* ANYOF_ #xdefines in regcomp.h */
"\\w",
@@ -4307,78 +4755,94 @@ Perl_regprop(pTHX_ SV *sv, regnode *o)
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
if (flags & ANYOF_INVERT)
sv_catpv(sv, "^");
- if (OP(o) == ANYOF) {
- for (i = 0; i <= 256; i++) {
- if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++)
- put_byte(sv, rangestart);
- else {
+ for (i = 0; i <= 256; i++) {
+ if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
+ if (rangestart == -1)
+ rangestart = i;
+ } else if (rangestart != -1) {
+ if (i <= rangestart + 3)
+ for (; rangestart < i; rangestart++)
put_byte(sv, rangestart);
- sv_catpv(sv, "-");
- put_byte(sv, i - 1);
- }
- rangestart = -1;
+ else {
+ put_byte(sv, rangestart);
+ sv_catpv(sv, "-");
+ put_byte(sv, i - 1);
}
+ rangestart = -1;
}
- if (o->flags & ANYOF_CLASS)
- for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
- if (ANYOF_CLASS_TEST(o,i))
- sv_catpv(sv, anyofs[i]);
}
- else {
- SV *rv = (SV*)PL_regdata->data[ARG2(o)];
- AV *av = (AV*)SvRV((SV*)rv);
- SV *sw = *av_fetch(av, 0, FALSE);
- SV *lv = *av_fetch(av, 1, FALSE);
- UV i;
- U8 s[UTF8_MAXLEN+1];
- for (i = 0; i <= 256; i++) { /* just the first 256 */
- U8 *e = uv_to_utf8(s, i);
- if (i < 256 && swash_fetch(sw, s)) {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- U8 *p;
-
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++) {
- for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
- put_byte(sv, *p);
+
+ if (o->flags & ANYOF_CLASS)
+ for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
+ if (ANYOF_CLASS_TEST(o,i))
+ sv_catpv(sv, anyofs[i]);
+
+ if (flags & ANYOF_UNICODE)
+ sv_catpv(sv, "{unicode}");
+ else if (flags & ANYOF_UNICODE_ALL)
+ sv_catpv(sv, "{unicode_all}");
+
+ {
+ SV *lv;
+ SV *sw = regclass_swash(o, FALSE, &lv, 0);
+
+ if (lv) {
+ if (sw) {
+ U8 s[UTF8_MAXLEN+1];
+
+ for (i = 0; i <= 256; i++) { /* just the first 256 */
+ U8 *e = uvchr_to_utf8(s, i);
+
+ if (i < 256 && swash_fetch(sw, s, TRUE)) {
+ if (rangestart == -1)
+ rangestart = i;
+ } else if (rangestart != -1) {
+ U8 *p;
+
+ if (i <= rangestart + 3)
+ for (; rangestart < i; rangestart++) {
+ for(e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
+ put_byte(sv, *p);
+ }
+ else {
+ for (e = uvchr_to_utf8(s, rangestart), p = s; p < e; p++)
+ put_byte(sv, *p);
+ sv_catpv(sv, "-");
+ for (e = uvchr_to_utf8(s, i - 1), p = s; p < e; p++)
+ put_byte(sv, *p);
+ }
+ rangestart = -1;
+ }
}
- else {
- for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
- put_byte(sv, *p);
- sv_catpv(sv, "-");
- for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++)
- put_byte(sv, *p);
- }
- rangestart = -1;
+
+ sv_catpv(sv, "..."); /* et cetera */
}
- }
- sv_catpv(sv, "...");
- {
- char *s = savepv(SvPVX(lv));
-
- while(*s && *s != '\n') s++;
- if (*s == '\n') {
- char *t = ++s;
- while (*s) {
- if (*s == '\n')
- *s = ' ';
- s++;
+ {
+ char *s = savepv(SvPVX(lv));
+ char *origs = s;
+
+ while(*s && *s != '\n') s++;
+
+ if (*s == '\n') {
+ char *t = ++s;
+
+ while (*s) {
+ if (*s == '\n')
+ *s = ' ';
+ s++;
+ }
+ if (s[-1] == ' ')
+ s[-1] = 0;
+
+ sv_catpv(sv, t);
}
- if (s[-1] == ' ')
- s[-1] = 0;
-
- sv_catpv(sv, t);
+
+ Safefree(origs);
}
}
}
+
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
}
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
@@ -4391,43 +4855,65 @@ Perl_re_intuit_string(pTHX_ regexp *prog)
{ /* Assume that RE_INTUIT is set */
DEBUG_r(
{ STRLEN n_a;
- char *s = SvPV(prog->check_substr,n_a);
+ char *s = SvPV(prog->check_substr
+ ? prog->check_substr : prog->check_utf8, n_a);
if (!PL_colorset) reginitcolors();
PerlIO_printf(Perl_debug_log,
- "%sUsing REx substr:%s `%s%.60s%s%s'\n",
- PL_colors[4],PL_colors[5],PL_colors[0],
+ "%sUsing REx %ssubstr:%s `%s%.60s%s%s'\n",
+ PL_colors[4],
+ prog->check_substr ? "" : "utf8 ",
+ PL_colors[5],PL_colors[0],
s,
PL_colors[1],
(strlen(s) > 60 ? "..." : ""));
} );
- return prog->check_substr;
+ return prog->check_substr ? prog->check_substr : prog->check_utf8;
}
void
Perl_pregfree(pTHX_ struct regexp *r)
{
- DEBUG_r(if (!PL_colorset) reginitcolors());
+#ifdef DEBUGGING
+ SV *dsv = PERL_DEBUG_PAD_ZERO(0);
+#endif
if (!r || (--r->refcnt > 0))
return;
- DEBUG_r(PerlIO_printf(Perl_debug_log,
- "%sFreeing REx:%s `%s%.60s%s%s'\n",
- PL_colors[4],PL_colors[5],PL_colors[0],
- r->precomp,
- PL_colors[1],
- (strlen(r->precomp) > 60 ? "..." : "")));
+ DEBUG_r({
+ int len;
+ char *s;
+
+ s = (r->reganch & ROPT_UTF8) ? pv_uni_display(dsv, (U8*)r->precomp,
+ r->prelen, 60, UNI_DISPLAY_REGEX)
+ : pv_display(dsv, r->precomp, r->prelen, 0, 60);
+ len = SvCUR(dsv);
+ if (!PL_colorset)
+ reginitcolors();
+ PerlIO_printf(Perl_debug_log,
+ "%sFreeing REx:%s `%s%*.*s%s%s'\n",
+ PL_colors[4],PL_colors[5],PL_colors[0],
+ len, len, s,
+ PL_colors[1],
+ len > 60 ? "..." : "");
+ });
if (r->precomp)
Safefree(r->precomp);
+ if (r->offsets) /* 20010421 MJD */
+ Safefree(r->offsets);
if (RX_MATCH_COPIED(r))
Safefree(r->subbeg);
if (r->substrs) {
if (r->anchored_substr)
SvREFCNT_dec(r->anchored_substr);
+ if (r->anchored_utf8)
+ SvREFCNT_dec(r->anchored_utf8);
if (r->float_substr)
SvREFCNT_dec(r->float_substr);
+ if (r->float_utf8)
+ SvREFCNT_dec(r->float_utf8);
Safefree(r->substrs);
}
if (r->data) {
@@ -4437,6 +4923,7 @@ Perl_pregfree(pTHX_ struct regexp *r)
SV** old_curpad;
while (--n >= 0) {
+ /* If you add a ->what type here, update the comment in regcomp.h */
switch (r->data->what[n]) {
case 's':
SvREFCNT_dec((SV*)r->data->data[n]);
@@ -4459,16 +4946,20 @@ Perl_pregfree(pTHX_ struct regexp *r)
}
else
PL_curpad = NULL;
- op_free((OP_4tree*)r->data->data[n]);
+
+ if (!OpREFCNT_dec((OP_4tree*)r->data->data[n])) {
+ op_free((OP_4tree*)r->data->data[n]);
+ }
+
PL_comppad = old_comppad;
PL_curpad = old_curpad;
SvREFCNT_dec((SV*)new_comppad);
new_comppad = NULL;
break;
case 'n':
- break;
+ break;
default:
- FAIL2("panic: regfree data code '%c'", r->data->what[n]);
+ Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]);
}
}
Safefree(r->data->what);
@@ -4538,12 +5029,23 @@ S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
void
Perl_save_re_context(pTHX)
-{
+{
+#if 0
+ SAVEPPTR(RExC_precomp); /* uncompiled string. */
+ SAVEI32(RExC_npar); /* () count. */
+ SAVEI32(RExC_size); /* Code size. */
+ SAVEI32(RExC_flags); /* are we folding, multilining? */
+ SAVEVPTR(RExC_rx); /* from regcomp.c */
+ SAVEI32(RExC_seen); /* from regcomp.c */
+ SAVEI32(RExC_sawback); /* Did we see \1, ...? */
+ SAVEI32(RExC_naughty); /* How bad is this pattern? */
+ SAVEVPTR(RExC_emit); /* Code-emit pointer; &regdummy = don't */
+ SAVEPPTR(RExC_end); /* End of input for compile */
+ SAVEPPTR(RExC_parse); /* Input-scan pointer. */
+#endif
+
+ SAVEI32(PL_reg_flags); /* from regexec.c */
SAVEPPTR(PL_bostr);
- SAVEPPTR(PL_regprecomp); /* uncompiled string. */
- SAVEI32(PL_regnpar); /* () count. */
- SAVEI32(PL_regsize); /* Code size. */
- SAVEI16(PL_regflags); /* are we folding, multilining? */
SAVEPPTR(PL_reginput); /* String-input pointer. */
SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */
SAVEPPTR(PL_regeol); /* End of input, for $ check. */
@@ -4551,48 +5053,35 @@ Perl_save_re_context(pTHX)
SAVEVPTR(PL_regendp); /* Ditto for endp. */
SAVEVPTR(PL_reglastparen); /* Similarly for lastparen. */
SAVEPPTR(PL_regtill); /* How far we are required to go. */
- SAVEI8(PL_regprev); /* char before regbol, \n if none */
- SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
+ SAVEGENERICPV(PL_reg_start_tmp); /* from regexec.c */
PL_reg_start_tmp = 0;
SAVEI32(PL_reg_start_tmpl); /* from regexec.c */
PL_reg_start_tmpl = 0;
SAVEVPTR(PL_regdata);
- SAVEI32(PL_reg_flags); /* from regexec.c */
SAVEI32(PL_reg_eval_set); /* from regexec.c */
SAVEI32(PL_regnarrate); /* from regexec.c */
SAVEVPTR(PL_regprogram); /* from regexec.c */
SAVEINT(PL_regindent); /* from regexec.c */
SAVEVPTR(PL_regcc); /* from regexec.c */
SAVEVPTR(PL_curcop);
- SAVEVPTR(PL_regcomp_rx); /* from regcomp.c */
- SAVEI32(PL_regseen); /* from regcomp.c */
- SAVEI32(PL_regsawback); /* Did we see \1, ...? */
- SAVEI32(PL_regnaughty); /* How bad is this pattern? */
- SAVEVPTR(PL_regcode); /* Code-emit pointer; &regdummy = don't */
- SAVEPPTR(PL_regxend); /* End of input for compile */
- SAVEPPTR(PL_regcomp_parse); /* Input-scan pointer. */
SAVEVPTR(PL_reg_call_cc); /* from regexec.c */
SAVEVPTR(PL_reg_re); /* from regexec.c */
SAVEPPTR(PL_reg_ganch); /* from regexec.c */
SAVESPTR(PL_reg_sv); /* from regexec.c */
+ SAVEI8(PL_reg_match_utf8); /* from regexec.c */
SAVEVPTR(PL_reg_magic); /* from regexec.c */
SAVEI32(PL_reg_oldpos); /* from regexec.c */
SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
SAVEVPTR(PL_reg_curpm); /* from regexec.c */
SAVEI32(PL_regnpar); /* () count. */
+ SAVEI32(PL_regsize); /* from regexec.c */
#ifdef DEBUGGING
- SAVEPPTR(PL_reg_starttry); /* from regexec.c */
+ SAVEPPTR(PL_reg_starttry); /* from regexec.c */
#endif
}
-#ifdef PERL_OBJECT
-#include "XSUB.h"
-#undef this
-#define this pPerl
-#endif
-
static void
-clear_re(pTHXo_ void *r)
+clear_re(pTHX_ void *r)
{
ReREFCNT_dec((regexp *)r);
}