/* regcomp.c */ /* * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee */ /* NOTE: this is derived from Henry Spencer's regexp code, and should not * confused with the original package (see point 3 below). Thanks, Henry! */ /* Additional note: this code is very heavily munged from Henry's version * in places. In some spots I've traded clarity for efficiency, so don't * blame Henry for some of the lack of readability. */ /* The names of the functions have been changed from regcomp and * regexec to pregcomp and pregexec in order to avoid conflicts * with the POSIX routines of the same names. */ /*SUPPRESS 112*/ /* * pregcomp and pregexec -- regsub and regerror are not used in perl * * Copyright (c) 1986 by University of Toronto. * Written by Henry Spencer. Not derived from licensed software. * * Permission is granted to anyone to use this software for any * purpose on any computer system, and to redistribute it freely, * subject to the following restrictions: * * 1. The author is not responsible for the consequences of use of * this software, no matter how awful, even if they arise * from defects in it. * * 2. The origin of this software must not be misrepresented, either * by explicit claim or by omission. * * 3. Altered versions must be plainly marked as such, and must not * be misrepresented as being the original software. * * **** Alterations to Henry's code are... **** **** Copyright (c) 1991-1997, Larry Wall **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. * * Beware that some of this code is subtly aware of the way operator * precedence is structured in regular expressions. Serious changes in * regular-expression syntax might require a total rethink. */ #include "EXTERN.h" #include "perl.h" #include "INTERN.h" #include "regcomp.h" #ifdef MSDOS # if defined(BUGGY_MSC6) /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */ # pragma optimize("a",off) /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/ # pragma optimize("w",on ) # endif /* BUGGY_MSC6 */ #endif /* MSDOS */ #ifndef STATIC #define STATIC static #endif #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ ((*s) == '{' && regcurly(s))) #ifdef atarist #define PERL_META "^$.[()|?+*\\" #else #define META "^$.[()|?+*\\" #endif #ifdef SPSTART #undef SPSTART /* dratted cpp namespace... */ #endif /* * Flags to be passed up and down. */ #define WORST 0 /* Worst case. */ #define HASWIDTH 0x1 /* Known never to match null string. */ #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */ #define SPSTART 0x4 /* Starts with * or +. */ #define TRYAGAIN 0x8 /* Weeded out a declaration. */ /* * Forward declarations for pregcomp()'s friends. */ static char *reg _((I32, I32 *)); static char *reganode _((char, unsigned short)); static char *regatom _((I32 *)); static char *regbranch _((I32 *)); static void regc _((char)); static char *regclass _((void)); STATIC I32 regcurly _((char *)); static char *regnode _((char)); static char *regpiece _((I32 *)); static void reginsert _((char, char *)); static void regoptail _((char *, char *)); static void regset _((char *, I32)); static void regtail _((char *, char *)); static char* regwhite _((char *, char *)); static char* nextchar _((void)); /* - pregcomp - compile a regular expression into internal code * * We can't allocate space until we know how big the compiled form will be, * but we can't compile it (and thus know how big it is) until we've got a * place to put the code. So we cheat: we compile it twice, once with code * generation turned off and size counting turned on, and once "for real". * This also means that we don't allocate space until we are sure that the * thing really will compile successfully, and we never have to move the * code and thus invalidate pointers into it. (Note that it has to be in * one piece because free() must be able to free it all.) [NB: not true in perl] * * Beware that the optimization-preparation code in here knows about some * of the structure of the compiled regexp. [I'll say.] */ regexp * pregcomp(exp,xend,pm) char* exp; char* xend; PMOP* pm; { register regexp *r; register char *scan; register SV *longish; SV *longest; register I32 len; register char *first; I32 flags; I32 backish; I32 backest; I32 curback; I32 minlen = 0; I32 sawplus = 0; I32 sawopen = 0; #define MAX_REPEAT_DEPTH 12 struct { char *opcode; I32 count; } repeat_stack[MAX_REPEAT_DEPTH]; I32 repeat_depth = 0; I32 repeat_count = 1; /* We start unmultiplied. */ if (exp == NULL) croak("NULL regexp argument"); regprecomp = savepvn(exp, xend - exp); regflags = pm->op_pmflags; regsawback = 0; /* First pass: determine size, legality. */ regparse = exp; regxend = xend; regnaughty = 0; regnpar = 1; regsize = 0L; regcode = ®dummy; regc((char)MAGIC); if (reg(0, &flags) == NULL) { Safefree(regprecomp); regprecomp = Nullch; return(NULL); } /* Small enough for pointer-storage convention? */ if (regsize >= 32767L) /* Probably could be 65535L. */ FAIL("regexp too big"); /* Allocate space and initialize. */ Newc(1001, r, sizeof(regexp) + (unsigned)regsize, char, regexp); if (r == NULL) FAIL("regexp out of space"); r->prelen = xend - exp; r->precomp = regprecomp; r->subbeg = r->subbase = NULL; /* Second pass: emit code. */ regparse = exp; regxend = xend; regnaughty = 0; regnpar = 1; regcode = r->program; regc((char)MAGIC); if (reg(0, &flags) == NULL) return(NULL); /* Dig out information for optimizations. */ pm->op_pmflags = regflags; r->regstart = Nullsv; /* Worst-case defaults. */ r->reganch = 0; r->regmust = Nullsv; r->regback = -1; r->regstclass = Nullch; r->naughty = regnaughty >= 10; /* Probably an expensive pattern. */ scan = r->program+1; /* First BRANCH. */ if (OP(regnext(scan)) == END) {/* Only one top-level choice. */ scan = NEXTOPER(scan); first = scan; while ((OP(first) == OPEN && (sawopen = 1)) || (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) || (OP(first) == PLUS) || (OP(first) == MINMOD) || (regkind[(U8)OP(first)] == CURLY && ARG1(first) > 0) ) { if (OP(first) == PLUS) sawplus = 1; else first += regarglen[(U8)OP(first)]; first = NEXTOPER(first); } /* Starting-point info. */ again: if (OP(first) == EXACT) { r->regstart = newSVpv(OPERAND(first)+1,*OPERAND(first)); if (SvCUR(r->regstart) > !sawstudy) fbm_compile(r->regstart); (void)SvUPGRADE(r->regstart, SVt_PVBM); } else if (strchr(simple+2,OP(first))) r->regstclass = first; else if (regkind[(U8)OP(first)] == BOUND || regkind[(U8)OP(first)] == NBOUND) r->regstclass = first; else if (regkind[(U8)OP(first)] == BOL) { r->reganch |= ROPT_ANCH_BOL; first = NEXTOPER(first); goto again; } else if (OP(first) == GPOS) { r->reganch |= ROPT_ANCH_GPOS; first = NEXTOPER(first); goto again; } else if ((OP(first) == STAR && regkind[(U8)OP(NEXTOPER(first))] == ANY) && !(r->reganch & ROPT_ANCH) ) { /* turn .* into ^.* with an implied $*=1 */ r->reganch |= ROPT_ANCH_BOL | ROPT_IMPLICIT; first = NEXTOPER(first); goto again; } if (sawplus && (!sawopen || !regsawback)) r->reganch |= ROPT_SKIP; /* x+ must match 1st of run */ DEBUG_r(PerlIO_printf(Perl_debug_log, "first %d next %d offset %ld\n", OP(first), OP(NEXTOPER(first)), (long)(first - scan))); /* * If there's something expensive in the r.e., find the * longest literal string that must appear and make it the * regmust. Resolve ties in favor of later strings, since * the regstart check works with the beginning of the r.e. * and avoiding duplication strengthens checking. Not a * strong reason, but sufficient in the absence of others. * [Now we resolve ties in favor of the earlier string if * it happens that curback has been invalidated, since the * earlier string may buy us something the later one won't.] */ longish = newSVpv("",0); longest = newSVpv("",0); len = 0; minlen = 0; curback = 0; backish = 0; backest = 0; while (OP(scan) != END) { if (OP(scan) == BRANCH) { if (OP(regnext(scan)) == BRANCH) { curback = -30000; while (OP(scan) == BRANCH) scan = regnext(scan); } else /* single branch is ok */ scan = NEXTOPER(scan); continue; } if (OP(scan) == UNLESSM) { curback = -30000; scan = regnext(scan); continue; } if (OP(scan) == EXACT) { char *t; first = scan; while ((t = regnext(scan)) && OP(t) == CLOSE) scan = t; minlen += *OPERAND(first) * repeat_count; if (curback - backish == len) { sv_catpvn(longish, OPERAND(first)+1, *OPERAND(first)); len += *OPERAND(first); curback += *OPERAND(first); first = regnext(scan); } else if (*OPERAND(first) >= len + (curback >= 0)) { len = *OPERAND(first); sv_setpvn(longish, OPERAND(first)+1,len); backish = curback; curback += len; first = regnext(scan); } else curback += *OPERAND(first); } else if (strchr(varies,OP(scan))) { int tcount; char *next; if (repeat_depth < MAX_REPEAT_DEPTH && ((OP(scan) == PLUS && (tcount = 1) && (next = NEXTOPER(scan))) || (regkind[(U8)OP(scan)] == CURLY && (tcount = ARG1(scan)) && (next = NEXTOPER(scan)+4)))) { /* We treat (abc)+ as (abc)(abc)*. */ /* Mark the place to return back. */ repeat_stack[repeat_depth].opcode = regnext(scan); repeat_stack[repeat_depth].count = repeat_count; repeat_depth++; repeat_count *= tcount; /* Go deeper: */ scan = next; continue; } else { curback = -30000; len = 0; if (SvCUR(longish) > SvCUR(longest)) { sv_setsv(longest,longish); backest = backish; } sv_setpvn(longish,"",0); } } else if (strchr(simple,OP(scan))) { curback++; minlen += repeat_count; len = 0; if (SvCUR(longish) > SvCUR(longest)) { sv_setsv(longest,longish); backest = backish; } sv_setpvn(longish,"",0); } scan = regnext(scan); if (!scan) { /* Go up PLUS or CURLY. */ if (!repeat_depth--) croak("panic: re scan"); scan = repeat_stack[repeat_depth].opcode; repeat_count = repeat_stack[repeat_depth].count; /* Need to submit the longest string found: */ curback = -30000; len = 0; if (SvCUR(longish) > SvCUR(longest)) { sv_setsv(longest,longish); backest = backish; } sv_setpvn(longish,"",0); } } /* Prefer earlier on tie, unless we can tail match latter */ if (SvCUR(longish) + (first && regkind[(U8)OP(first)] == EOL) > SvCUR(longest)) { sv_setsv(longest,longish); backest = backish; } else sv_setpvn(longish,"",0); if (SvCUR(longest) && (!r->regstart || !fbm_instr((unsigned char*) SvPVX(r->regstart), (unsigned char *) (SvPVX(r->regstart) + SvCUR(r->regstart)), longest))) { r->regmust = longest; if (backest < 0) backest = -1; r->regback = backest; if (SvCUR(longest) > !(sawstudy || (first && regkind[(U8)OP(first)] == EOL))) fbm_compile(r->regmust); (void)SvUPGRADE(r->regmust, SVt_PVBM); BmUSEFUL(r->regmust) = 100; if (first && regkind[(U8)OP(first)] == EOL && SvCUR(longish)) SvTAIL_on(r->regmust); } else { SvREFCNT_dec(longest); longest = Nullsv; } SvREFCNT_dec(longish); } r->nparens = regnpar - 1; r->minlen = minlen; Newz(1002, r->startp, regnpar, char*); Newz(1002, r->endp, regnpar, char*); DEBUG_r(regdump(r)); return(r); } /* - reg - regular expression, i.e. main body or parenthesized thing * * Caller must absorb opening parenthesis. * * Combining parenthesis handling with the base level of regular expression * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ static char * reg(paren, flagp) I32 paren; /* Parenthesized? */ I32 *flagp; { register char *ret; register char *br; register char *ender = 0; register I32 parno = 0; I32 flags; *flagp = HASWIDTH; /* Tentatively. */ /* Make an OPEN node, if parenthesized. */ if (paren) { if (*regparse == '?') { regparse++; paren = *regparse++; ret = NULL; switch (paren) { case ':': case '=': case '!': break; case '$': case '@': croak("Sequence (?%c...) not implemented", (int)paren); break; case '#': while (*regparse && *regparse != ')') regparse++; if (*regparse != ')') croak("Sequence (?#... not terminated"); nextchar(); *flagp = TRYAGAIN; return NULL; case 0: croak("Sequence (? incomplete"); break; default: --regparse; while (*regparse && strchr("iogcmsx", *regparse)) pmflag(®flags, *regparse++); if (*regparse != ')') croak("Sequence (?%c...) not recognized", *regparse); nextchar(); *flagp = TRYAGAIN; return NULL; } } else { parno = regnpar; regnpar++; ret = reganode(OPEN, parno); } } else ret = NULL; /* Pick up the branches, linking them together. */ br = regbranch(&flags); if (br == NULL) return(NULL); if (ret != NULL) regtail(ret, br); /* OPEN -> first. */ else ret = br; if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags&SPSTART; while (*regparse == '|') { nextchar(); br = regbranch(&flags); if (br == NULL) return(NULL); regtail(ret, br); /* BRANCH -> BRANCH. */ if (!(flags&HASWIDTH)) *flagp &= ~HASWIDTH; *flagp |= flags&SPSTART; } /* Make a closing node, and hook it on the end. */ switch (paren) { case ':': ender = regnode(NOTHING); break; case 1: ender = reganode(CLOSE, parno); break; case '=': case '!': ender = regnode(SUCCEED); *flagp &= ~HASWIDTH; break; case 0: ender = regnode(END); break; } regtail(ret, ender); /* Hook the tails of the branches to the closing node. */ for (br = ret; br != NULL; br = regnext(br)) regoptail(br, ender); if (paren == '=') { reginsert(IFMATCH,ret); regtail(ret, regnode(NOTHING)); } else if (paren == '!') { reginsert(UNLESSM,ret); regtail(ret, regnode(NOTHING)); } /* Check for proper termination. */ if (paren && (regparse >= regxend || *nextchar() != ')')) { FAIL("unmatched () in regexp"); } else if (!paren && regparse < regxend) { if (*regparse == ')') { FAIL("unmatched () in regexp"); } else FAIL("junk on end of regexp"); /* "Can't happen". */ /* NOTREACHED */ } return(ret); } /* - regbranch - one alternative of an | operator * * Implements the concatenation operator. */ static char * regbranch(flagp) I32 *flagp; { register char *ret; register char *chain; register char *latest; I32 flags = 0; *flagp = WORST; /* Tentatively. */ ret = regnode(BRANCH); chain = NULL; regparse--; nextchar(); while (regparse < regxend && *regparse != '|' && *regparse != ')') { flags &= ~TRYAGAIN; latest = regpiece(&flags); if (latest == NULL) { if (flags & TRYAGAIN) continue; return(NULL); } *flagp |= flags&HASWIDTH; if (chain == NULL) /* First piece. */ *flagp |= flags&SPSTART; else { regnaughty++; regtail(chain, latest); } chain = latest; } if (chain == NULL) /* Loop ran zero times. */ (void) regnode(NOTHING); return(ret); } /* - regpiece - something followed by possible [*+?] * * Note that the branching code sequences used for ? and the general cases * of * and + are somewhat optimized: they use the same NOTHING node as * both the endmarker for their branch list and the body of the last branch. * It might seem that this node could be dispensed with entirely, but the * endmarker role is not redundant. */ static char * regpiece(flagp) I32 *flagp; { register char *ret; register char op; register char *next; I32 flags; char *origparse = regparse; char *maxpos; I32 min; I32 max = 32767; ret = regatom(&flags); if (ret == NULL) { if (flags & TRYAGAIN) *flagp |= TRYAGAIN; return(NULL); } op = *regparse; if (op == '(' && regparse[1] == '?' && regparse[2] == '#') { while (op && op != ')') op = *++regparse; if (op) { nextchar(); op = *regparse; } } if (op == '{' && regcurly(regparse)) { next = regparse + 1; maxpos = Nullch; while (isDIGIT(*next) || *next == ',') { if (*next == ',') { if (maxpos) break; else maxpos = next; } next++; } if (*next == '}') { /* got one */ if (!maxpos) maxpos = next; regparse++; min = atoi(regparse); if (*maxpos == ',') maxpos++; else maxpos = regparse; max = atoi(maxpos); if (!max && *maxpos != '0') max = 32767; /* meaning "infinity" */ regparse = next; nextchar(); do_curly: if ((flags&SIMPLE)) { regnaughty += 2 + regnaughty / 2; reginsert(CURLY, ret); } else { regnaughty += 4 + regnaughty; /* compound interest */ regtail(ret, regnode(WHILEM)); reginsert(CURLYX,ret); regtail(ret, regnode(NOTHING)); } if (min > 0) *flagp = (WORST|HASWIDTH); if (max && max < min) croak("Can't do {n,m} with n > m"); if (regcode != ®dummy) { #ifdef REGALIGN *(unsigned short *)(ret+3) = min; *(unsigned short *)(ret+5) = max; #else ret[3] = min >> 8; ret[4] = min & 0377; ret[5] = max >> 8; ret[6] = max & 0377; #endif } goto nest_check; } } if (!ISMULT1(op)) { *flagp = flags; return(ret); } if (!(flags&HASWIDTH) && op != '?') FAIL("regexp *+ operand could be empty"); /* else may core dump */ nextchar(); *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); if (op == '*' && (flags&SIMPLE)) { reginsert(STAR, ret); regnaughty += 4; } else if (op == '*') { min = 0; goto do_curly; } else if (op == '+' && (flags&SIMPLE)) { reginsert(PLUS, ret); regnaughty += 3; } else if (op == '+') { min = 1; goto do_curly; } else if (op == '?') { min = 0; max = 1; goto do_curly; } nest_check: if (dowarn && regcode != ®dummy && !(flags&HASWIDTH) && max > 10000) { warn("%.*s matches null string many times", regparse - origparse, origparse); } if (*regparse == '?') { nextchar(); reginsert(MINMOD, ret); #ifdef REGALIGN regtail(ret, ret + 4); #else regtail(ret, ret + 3); #endif } if (ISMULT2(regparse)) FAIL("nested *?+ in regexp"); return(ret); } /* - regatom - the lowest level * * Optimization: gobbles an entire sequence of ordinary characters so that * it can turn them into a single node, which is smaller to store and * faster to run. Backslashed characters are exceptions, each becoming a * separate node; the code is simpler that way and it's not worth fixing. * * [Yes, it is worth fixing, some scripts can run twice the speed.] */ static char * regatom(flagp) I32 *flagp; { register char *ret = 0; I32 flags; *flagp = WORST; /* Tentatively. */ tryagain: switch (*regparse) { case '^': nextchar(); if (regflags & PMf_MULTILINE) ret = regnode(MBOL); else if (regflags & PMf_SINGLELINE) ret = regnode(SBOL); else ret = regnode(BOL); break; case '$': nextchar(); if (regflags & PMf_MULTILINE) ret = regnode(MEOL); else if (regflags & PMf_SINGLELINE) ret = regnode(SEOL); else ret = regnode(EOL); break; case '.': nextchar(); if (regflags & PMf_SINGLELINE) ret = regnode(SANY); else ret = regnode(ANY); regnaughty++; *flagp |= HASWIDTH|SIMPLE; break; case '[': regparse++; ret = regclass(); *flagp |= HASWIDTH|SIMPLE; break; case '(': nextchar(); ret = reg(1, &flags); if (ret == NULL) { if (flags & TRYAGAIN) goto tryagain; return(NULL); } *flagp |= flags&(HASWIDTH|SPSTART); break; case '|': case ')': if (flags & TRYAGAIN) { *flagp |= TRYAGAIN; return NULL; } croak("internal urp in regexp at /%s/", regparse); /* Supposed to be caught earlier. */ break; case '{': if (!regcurly(regparse)) { regparse++; goto defchar; } /* FALL THROUGH */ case '?': case '+': case '*': FAIL("?+*{} follows nothing in regexp"); break; case '\\': switch (*++regparse) { case 'A': ret = regnode(SBOL); *flagp |= SIMPLE; nextchar(); break; case 'G': ret = regnode(GPOS); *flagp |= SIMPLE; nextchar(); break; case 'Z': ret = regnode(SEOL); *flagp |= SIMPLE; nextchar(); break; case 'w': ret = regnode((regflags & PMf_LOCALE) ? ALNUML : ALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'W': ret = regnode((regflags & PMf_LOCALE) ? NALNUML : NALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'b': ret = regnode((regflags & PMf_LOCALE) ? BOUNDL : BOUND); *flagp |= SIMPLE; nextchar(); break; case 'B': ret = regnode((regflags & PMf_LOCALE) ? NBOUNDL : NBOUND); *flagp |= SIMPLE; nextchar(); break; case 's': ret = regnode((regflags & PMf_LOCALE) ? SPACEL : SPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'S': ret = regnode((regflags & PMf_LOCALE) ? NSPACEL : NSPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'd': ret = regnode(DIGIT); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'D': ret = regnode(NDIGIT); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'n': case 'r': case 't': case 'f': case 'e': case 'a': case 'x': case 'c': case '0': goto defchar; case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { I32 num = atoi(regparse); if (num > 9 && num >= regnpar) goto defchar; else { regsawback = 1; ret = reganode((regflags & PMf_FOLD) ? ((regflags & PMf_LOCALE) ? REFFL : REFF) : REF, num); *flagp |= HASWIDTH; while (isDIGIT(*regparse)) regparse++; regparse--; nextchar(); } } break; case '\0': if (regparse >= regxend) FAIL("trailing \\ in regexp"); /* FALL THROUGH */ default: goto defchar; } break; case '#': if (regflags & PMf_EXTENDED) { while (regparse < regxend && *regparse != '\n') regparse++; if (regparse < regxend) goto tryagain; } /* FALL THROUGH */ default: { register I32 len; register char ender; register char *p; char *oldp; I32 numlen; regparse++; defchar: ret = regnode((regflags & PMf_FOLD) ? ((regflags & PMf_LOCALE) ? EXACTFL : EXACTF) : EXACT); regc(0); /* save spot for len */ for (len = 0, p = regparse - 1; len < 127 && p < regxend; len++) { oldp = p; if (regflags & PMf_EXTENDED) p = regwhite(p, regxend); switch (*p) { case '^': case '$': case '.': case '[': case '(': case ')': case '|': goto loopdone; case '\\': switch (*++p) { case 'A': case 'G': case 'Z': case 'w': case 'W': case 'b': case 'B': case 's': case 'S': case 'd': case 'D': --p; goto loopdone; case 'n': ender = '\n'; p++; break; case 'r': ender = '\r'; p++; break; case 't': ender = '\t'; p++; break; case 'f': ender = '\f'; p++; break; case 'e': ender = '\033'; p++; break; case 'a': ender = '\007'; p++; break; case 'x': ender = scan_hex(++p, 2, &numlen); p += numlen; break; case 'c': p++; ender = UCHARAT(p++); ender = toCTRL(ender); break; case '0': case '1': case '2': case '3':case '4': case '5': case '6': case '7': case '8':case '9': if (*p == '0' || (isDIGIT(p[1]) && atoi(p) >= regnpar) ) { ender = scan_oct(p, 3, &numlen); p += numlen; } else { --p; goto loopdone; } break; case '\0': if (p >= regxend) FAIL("trailing \\ in regexp"); /* FALL THROUGH */ default: ender = *p++; break; } break; default: ender = *p++; break; } if (regflags & PMf_EXTENDED) p = regwhite(p, regxend); if (ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; else { len++; regc(ender); } break; } regc(ender); } loopdone: regparse = p - 1; nextchar(); if (len < 0) FAIL("internal disaster in regexp"); if (len > 0) *flagp |= HASWIDTH; if (len == 1) *flagp |= SIMPLE; if (regcode != ®dummy) *OPERAND(ret) = len; regc('\0'); } break; } return(ret); } static char * regwhite(p, e) char *p; char *e; { while (p < e) { if (isSPACE(*p)) ++p; else if (*p == '#') { do { p++; } while (p < e && *p != '\n'); } else break; } return p; } static void regset(opnd, c) char *opnd; register I32 c; { if (opnd == ®dummy) return; c &= 0xFF; opnd[1 + (c >> 3)] |= (1 << (c & 7)); } static char * regclass() { register char *opnd; register I32 class; register I32 lastclass = 1234; register I32 range = 0; register char *ret; register I32 def; I32 numlen; ret = regnode(ANYOF); opnd = regcode; for (class = 0; class < 33; class++) regc(0); if (*regparse == '^') { /* Complement of range. */ regnaughty++; regparse++; if (opnd != ®dummy) *opnd |= ANYOF_INVERT; } if (opnd != ®dummy) { if (regflags & PMf_FOLD) *opnd |= ANYOF_FOLD; if (regflags & PMf_LOCALE) *opnd |= ANYOF_LOCALE; } if (*regparse == ']' || *regparse == '-') goto skipcond; /* allow 1st char to be ] or - */ while (regparse < regxend && *regparse != ']') { skipcond: class = UCHARAT(regparse++); if (class == '\\') { class = UCHARAT(regparse++); switch (class) { case 'w': if (regflags & PMf_LOCALE) { if (opnd != ®dummy) *opnd |= ANYOF_ALNUML; } else { for (class = 0; class < 256; class++) if (isALNUM(class)) regset(opnd, class); } lastclass = 1234; continue; case 'W': if (regflags & PMf_LOCALE) { if (opnd != ®dummy) *opnd |= ANYOF_NALNUML; } else { for (class = 0; class < 256; class++) if (!isALNUM(class)) regset(opnd, class); } lastclass = 1234; continue; case 's': if (regflags & PMf_LOCALE) { if (opnd != ®dummy) *opnd |= ANYOF_SPACEL; } else { for (class = 0; class < 256; class++) if (isSPACE(class)) regset(opnd, class); } lastclass = 1234; continue; case 'S': if (regflags & PMf_LOCALE) { if (opnd != ®dummy) *opnd |= ANYOF_NSPACEL; } else { for (class = 0; class < 256; class++) if (!isSPACE(class)) regset(opnd, class); } lastclass = 1234; continue; case 'd': for (class = '0'; class <= '9'; class++) regset(opnd, class); lastclass = 1234; continue; case 'D': for (class = 0; class < '0'; class++) regset(opnd, class); for (class = '9' + 1; class < 256; class++) regset(opnd, class); lastclass = 1234; continue; case 'n': class = '\n'; break; case 'r': class = '\r'; break; case 't': class = '\t'; break; case 'f': class = '\f'; break; case 'b': class = '\b'; break; case 'e': class = '\033'; break; case 'a': class = '\007'; break; case 'x': class = scan_hex(regparse, 2, &numlen); regparse += numlen; break; case 'c': class = UCHARAT(regparse++); class = toCTRL(class); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': class = scan_oct(--regparse, 3, &numlen); regparse += numlen; break; } } if (range) { if (lastclass > class) FAIL("invalid [] range in regexp"); range = 0; } else { lastclass = class; if (*regparse == '-' && regparse+1 < regxend && regparse[1] != ']') { regparse++; range = 1; continue; /* do it next time */ } } for ( ; lastclass <= class; lastclass++) regset(opnd, lastclass); lastclass = class; } if (*regparse != ']') FAIL("unmatched [] in regexp"); nextchar(); return ret; } static char* nextchar() { char* retval = regparse++; for (;;) { if (*regparse == '(' && regparse[1] == '?' && regparse[2] == '#') { while (*regparse && *regparse != ')') regparse++; regparse++; continue; } if (regflags & PMf_EXTENDED) { if (isSPACE(*regparse)) { regparse++; continue; } else if (*regparse == '#') { while (*regparse && *regparse != '\n') regparse++; regparse++; continue; } } return retval; } } /* - regnode - emit a node */ #ifdef CAN_PROTOTYPE static char * /* Location. */ regnode(char op) #else static char * /* Location. */ regnode(op) char op; #endif { register char *ret; register char *ptr; ret = regcode; if (ret == ®dummy) { #ifdef REGALIGN if (!(regsize & 1)) regsize++; #endif regsize += 3; return(ret); } #ifdef REGALIGN #ifndef lint if (!((long)ret & 1)) *ret++ = 127; #endif #endif ptr = ret; *ptr++ = op; *ptr++ = '\0'; /* Null "next" pointer. */ *ptr++ = '\0'; regcode = ptr; return(ret); } /* - reganode - emit a node with an argument */ #ifdef CAN_PROTOTYPE static char * /* Location. */ reganode(char op, unsigned short arg) #else static char * /* Location. */ reganode(op, arg) char op; unsigned short arg; #endif { register char *ret; register char *ptr; ret = regcode; if (ret == ®dummy) { #ifdef REGALIGN if (!(regsize & 1)) regsize++; #endif regsize += 5; return(ret); } #ifdef REGALIGN #ifndef lint if (!((long)ret & 1)) *ret++ = 127; #endif #endif ptr = ret; *ptr++ = op; *ptr++ = '\0'; /* Null "next" pointer. */ *ptr++ = '\0'; #ifdef REGALIGN *(unsigned short *)(ret+3) = arg; #else ret[3] = arg >> 8; ret[4] = arg & 0377; #endif ptr += 2; regcode = ptr; return(ret); } /* - regc - emit (if appropriate) a byte of code */ #ifdef CAN_PROTOTYPE static void regc(char b) #else static void regc(b) char b; #endif { if (regcode != ®dummy) *regcode++ = b; else regsize++; } /* - reginsert - insert an operator in front of already-emitted operand * * Means relocating the operand. */ #ifdef CAN_PROTOTYPE static void reginsert(char op, char *opnd) #else static void reginsert(op, opnd) char op; char *opnd; #endif { register char *src; register char *dst; register char *place; register int offset = (regkind[(U8)op] == CURLY ? 4 : 0); if (regcode == ®dummy) { #ifdef REGALIGN regsize += 4 + offset; #else regsize += 3 + offset; #endif return; } src = regcode; #ifdef REGALIGN regcode += 4 + offset; #else regcode += 3 + offset; #endif dst = regcode; while (src > opnd) *--dst = *--src; place = opnd; /* Op node, where operand used to be. */ *place++ = op; *place++ = '\0'; *place++ = '\0'; while (offset-- > 0) *place++ = '\0'; #ifdef REGALIGN *place++ = '\177'; #endif } /* - regtail - set the next-pointer at the end of a node chain */ static void regtail(p, val) char *p; char *val; { register char *scan; register char *temp; register I32 offset; if (p == ®dummy) return; /* Find last node. */ scan = p; for (;;) { temp = regnext(scan); if (temp == NULL) break; scan = temp; } #ifdef REGALIGN offset = val - scan; #ifndef lint *(short*)(scan+1) = offset; #else offset = offset; #endif #else if (OP(scan) == BACK) offset = scan - val; else offset = val - scan; *(scan+1) = (offset>>8)&0377; *(scan+2) = offset&0377; #endif } /* - regoptail - regtail on operand of first argument; nop if operandless */ static void regoptail(p, val) char *p; char *val; { /* "Operandless" and "op != BRANCH" are synonymous in practice. */ if (p == NULL || p == ®dummy || regkind[(U8)OP(p)] != BRANCH) return; regtail(NEXTOPER(p), val); } /* - regcurly - a little FSA that accepts {\d+,?\d*} */ STATIC I32 regcurly(s) register char *s; { if (*s++ != '{') return FALSE; if (!isDIGIT(*s)) return FALSE; while (isDIGIT(*s)) s++; if (*s == ',') s++; while (isDIGIT(*s)) s++; if (*s != '}') return FALSE; return TRUE; } #ifdef DEBUGGING /* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ void regdump(r) regexp *r; { register char *s; register char op = EXACT; /* Arbitrary non-END op. */ register char *next; SV *sv = sv_newmortal(); s = r->program + 1; while (op != END) { /* While that wasn't END last time... */ #ifdef REGALIGN if (!((long)s & 1)) s++; #endif op = OP(s); /* where, what */ regprop(sv, s); PerlIO_printf(Perl_debug_log, "%2ld%s", (long)(s - r->program), SvPVX(sv)); next = regnext(s); s += regarglen[(U8)op]; if (next == NULL) /* Next ptr. */ PerlIO_printf(Perl_debug_log, "(0)"); else PerlIO_printf(Perl_debug_log, "(%ld)", (long)(s-r->program)+(next-s)); s += 3; if (op == ANYOF) { s += 33; } if (regkind[(U8)op] == EXACT) { /* Literal string, where present. */ s++; (void)PerlIO_putc(Perl_debug_log, ' '); (void)PerlIO_putc(Perl_debug_log, '<'); while (*s != '\0') { (void)PerlIO_putc(Perl_debug_log,*s); s++; } (void)PerlIO_putc(Perl_debug_log, '>'); s++; } (void)PerlIO_putc(Perl_debug_log, '\n'); } /* Header fields of interest. */ if (r->regstart) PerlIO_printf(Perl_debug_log, "start `%s' ", SvPVX(r->regstart)); if (r->regstclass) { regprop(sv, r->regstclass); PerlIO_printf(Perl_debug_log, "stclass `%s' ", SvPVX(sv)); } if (r->reganch & ROPT_ANCH) { PerlIO_printf(Perl_debug_log, "anchored"); if (r->reganch & ROPT_ANCH_BOL) PerlIO_printf(Perl_debug_log, "(BOL)"); if (r->reganch & ROPT_ANCH_GPOS) PerlIO_printf(Perl_debug_log, "(GPOS)"); PerlIO_putc(Perl_debug_log, ' '); } if (r->reganch & ROPT_SKIP) PerlIO_printf(Perl_debug_log, "plus "); if (r->reganch & ROPT_IMPLICIT) PerlIO_printf(Perl_debug_log, "implicit "); if (r->regmust != NULL) PerlIO_printf(Perl_debug_log, "must have \"%s\" back %ld ", SvPVX(r->regmust), (long) r->regback); PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen); PerlIO_printf(Perl_debug_log, "\n"); } /* - regprop - printable representation of opcode */ void regprop(sv, op) SV *sv; char *op; { register char *p = 0; sv_setpv(sv, ":"); switch (OP(op)) { case BOL: p = "BOL"; break; case MBOL: p = "MBOL"; break; case SBOL: p = "SBOL"; break; case EOL: p = "EOL"; break; case MEOL: p = "MEOL"; break; case SEOL: p = "SEOL"; break; case ANY: p = "ANY"; break; case SANY: p = "SANY"; break; case ANYOF: p = "ANYOF"; break; case BRANCH: p = "BRANCH"; break; case EXACT: p = "EXACT"; break; case EXACTF: p = "EXACTF"; break; case EXACTFL: p = "EXACTFL"; break; case NOTHING: p = "NOTHING"; break; case BACK: p = "BACK"; break; case END: p = "END"; break; case BOUND: p = "BOUND"; break; case BOUNDL: p = "BOUNDL"; break; case NBOUND: p = "NBOUND"; break; case NBOUNDL: p = "NBOUNDL"; break; case CURLY: sv_catpvf(sv, "CURLY {%d,%d}", ARG1(op), ARG2(op)); break; case CURLYX: sv_catpvf(sv, "CURLYX {%d,%d}", ARG1(op), ARG2(op)); break; case REF: sv_catpvf(sv, "REF%d", ARG1(op)); break; case REFF: sv_catpvf(sv, "REFF%d", ARG1(op)); break; case REFFL: sv_catpvf(sv, "REFFL%d", ARG1(op)); break; case OPEN: sv_catpvf(sv, "OPEN%d", ARG1(op)); break; case CLOSE: sv_catpvf(sv, "CLOSE%d", ARG1(op)); p = NULL; break; case STAR: p = "STAR"; break; case PLUS: p = "PLUS"; break; case MINMOD: p = "MINMOD"; break; case GPOS: p = "GPOS"; break; case UNLESSM: p = "UNLESSM"; break; case IFMATCH: p = "IFMATCH"; break; case SUCCEED: p = "SUCCEED"; break; case WHILEM: p = "WHILEM"; break; case DIGIT: p = "DIGIT"; break; case NDIGIT: p = "NDIGIT"; break; case ALNUM: p = "ALNUM"; break; case NALNUM: p = "NALNUM"; break; case SPACE: p = "SPACE"; break; case NSPACE: p = "NSPACE"; break; case ALNUML: p = "ALNUML"; break; case NALNUML: p = "NALNUML"; break; case SPACEL: p = "SPACEL"; break; case NSPACEL: p = "NSPACEL"; break; default: FAIL("corrupted regexp opcode"); } if (p) sv_catpv(sv, p); } #endif /* DEBUGGING */ void pregfree(r) struct regexp *r; { if (!r) return; if (r->precomp) { Safefree(r->precomp); r->precomp = Nullch; } if (r->subbase) { Safefree(r->subbase); r->subbase = Nullch; } if (r->regmust) { SvREFCNT_dec(r->regmust); r->regmust = Nullsv; } if (r->regstart) { SvREFCNT_dec(r->regstart); r->regstart = Nullsv; } Safefree(r->startp); Safefree(r->endp); Safefree(r); }