/* * Copyright (c) 2002 by The XFree86 Project, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * Except as contained in this notice, the name of the XFree86 Project shall * not be used in advertising or otherwise to promote the sale, use or other * dealings in this Software without prior written authorization from the * XFree86 Project. * * Author: Paulo César Pereira de Andrade */ /* $XFree86: xc/programs/xedit/lisp/re/re.c,v 1.8 2002/11/17 07:51:30 paulo Exp $ */ #include #include "rep.h" /* * Types */ /* Information used when generating the final form of the compiled re. */ struct _re_inf { rec_alt *alt; unsigned char *cod; long len; long spc; /* Start offset of special repetition instruction */ long sr[MAX_DEPTH]; /* Jump offset of special repetition instruction */ long sj[MAX_DEPTH]; /* Just a flag, to know if this nesting is for a special repetition */ char sp[MAX_DEPTH]; int bas; /* Alternatives/repetitions depth */ int par; /* Open parenthesis counter */ int ref; /* Backreference counter */ rec_pat *apat; /* Alternatives duplicate patterns * if a special repetition is found, * this is done to somewhat simplify * the bytecode engine and still allow * some complex (and time consuming) * patterns. */ int flags; int ecode; }; /* This structure is not associated with re_cod as it's data only matters * to the current match search. */ struct _re_eng { unsigned char *bas; /* Base string pointer */ unsigned char *str; /* String to search for pattern */ unsigned char *end; /* Where to stop searching */ unsigned char *cod; /* Pointer in the re_cod structure */ long off; /* Number of used entries in so/eo etc */ /* Match offset/nesting information */ long so[MAX_DEPTH]; /* (s)tart of (m)atch */ long eo[MAX_DEPTH]; /* (e)nd of (m)atch */ long sv[MAX_DEPTH]; /* (s)a(v)e match end offset */ long re[MAX_DEPTH]; /* (re)petition count */ long ss[MAX_DEPTH]; /* (s)ave (s)tart of match */ unsigned char *rcod[MAX_DEPTH]; /* restart position in regex code */ unsigned char *rstr[MAX_DEPTH]; /* restart position in string */ /* Group/backreference information */ long goff; long gso[9]; long geo[9]; }; /* * Prototypes */ static void reinit(void); static int rec_check(re_inf*, int); static int rec_code(re_inf*, ReCode); static int rec_byte(re_inf*, int); static int rec_byte_byte(re_inf*, int, int); static int rec_code_byte(re_inf*, ReCode, int); static int rec_length(re_inf*, int); static int rec_code_byte_byte(re_inf*, ReCode, int, int); static int rec_build_alt(re_inf*, rec_alt*); static int rec_build_pat(re_inf*, rec_pat*); static int rec_build_rng(re_inf*, rec_rng*); static int rec_build_grp(re_inf*, rec_grp*); static int rec_build_stl(re_inf*, rec_stl*); static int rec_build_rep(re_inf*, rec_rep*); static int rec_inc_spc(re_inf*); static int rec_dec_spc(re_inf*); static int rec_add_spc(re_inf*, int); static int rec_off_spc(re_inf*); static int rec_alt_spc(re_inf*, int); static int rec_rep_spc(re_inf*, int); #ifdef DEBUG static void redump(re_cod*); #endif /* * Initialization */ unsigned char re__alnum[256]; unsigned char re__odigit[256]; unsigned char re__ddigit[256]; unsigned char re__xdigit[256]; unsigned char re__control[256]; /* * Implementation */ int recomp(re_cod *preg, const char *pattern, int flags) { int i, ecode; re_inf inf; reinit(); preg->cod = NULL; inf.alt = irec_comp(pattern, flags & RE_PEND ? preg->re_endp : pattern + strlen(pattern), flags, &ecode); if (ecode != 0) return (ecode); inf.cod = NULL; inf.len = inf.spc = 0; inf.bas = 0; inf.par = 0; inf.ref = 0; inf.apat = NULL; inf.flags = flags; inf.ecode = 0; for (i = 0; i < MAX_DEPTH; i++) inf.sp[i] = 0; /* First byte is runtime modifier flags */ if (rec_byte(&inf, flags & (RE_NEWLINE | RE_NOSUB)) == 0 && rec_byte(&inf, 0xff) == 0 && rec_build_alt(&inf, inf.alt) == 0 && rec_rep_spc(&inf, 0) == 0 && rec_code(&inf, Re_Done) == 0) { /* Number of possible references, loops will not leave this * value correct, but it is cheap to read it from the second * byte, instead of adding several extra checks in the bytecode. */ if (inf.ref) inf.cod[1] = inf.ref - 1; preg->cod = inf.cod; /* Public structure member */ preg->re_nsub = inf.ref; } irec_free_alt(inf.alt); if (inf.ecode) free(inf.cod); #ifdef DEBUG else if (flags & RE_DUMP) redump(preg); #endif return (inf.ecode); } int reexec(const re_cod *preg, const char *string, int nmatch, re_mat pmat[], int flags) { unsigned char *ptr, *str, newline, nosub; int len, si, ci, bas, i, j, k, l, m; re_eng eng; if (preg == NULL || preg->cod == NULL || nmatch < 0 || ((flags & RE_STARTEND) && (pmat == NULL || pmat[0].rm_eo < pmat[0].rm_so))) return (RE_INVARG); eng.str = (unsigned char*)string; if (flags & RE_STARTEND) { eng.end = eng.str + pmat[0].rm_eo; eng.str += pmat[0].rm_so; } else eng.end = eng.str + strlen(string); eng.bas = eng.str; nosub = preg->cod[0] & RE_NOSUB; newline = preg->cod[0] & RE_NEWLINE; eng.cod = preg->cod + 2; if (!nosub && preg->cod[1] != 0xff) { for (i = 0; i <= preg->cod[1]; i++) { eng.gso[i] = 0; eng.geo[i] = -1; } } /* Setup to search for start of match from the first character */ eng.so[0] = 0; eng.eo[0] = eng.sv[0] = -1; eng.rcod[0] = eng.cod; eng.rstr[0] = eng.str + 1; eng.off = 0; eng.goff = -1; for (ci = si = 1;;) { reset: switch (*eng.cod) { /**************************************************** * One byte codes * ****************************************************/ case Re_Any: if (eng.str == eng.end || (newline && eng.str[0] == '\n')) goto fail; goto match; case Re_AnyEatAnyTimes: if (newline) { for (ptr = eng.str; ptr < eng.end; ptr++) { if (*ptr == '\n') break; } si = ptr - eng.str; } else si = eng.end - eng.str; goto match; case Re_AnyEatMaybe: si = eng.end > eng.str; if (newline && si && eng.str[0] == '\n') si = 0; goto match; case Re_AnyEatAtLeast: if (newline) { for (ptr = eng.str; ptr < eng.end; ptr++) { if (*ptr == '\n') break; } si = ptr - eng.str; } else si = eng.end - eng.str; if (si == 0) { si = 1; goto fail; } goto match; case Re_Odigit: if (eng.str >= eng.end) goto fail; if (re__odigit[eng.str[0]]) goto match; goto fail; case Re_OdigitNot: if (eng.str >= eng.end || re__odigit[eng.str[0]]) goto fail; goto match; case Re_Digit: if (eng.str >= eng.end) goto fail; if (re__ddigit[eng.str[0]]) goto match; goto fail; case Re_DigitNot: if (eng.str >= eng.end || re__ddigit[eng.str[0]]) goto fail; goto match; case Re_Xdigit: if (eng.str >= eng.end) goto fail; if (re__xdigit[eng.str[0]]) goto match; goto fail; case Re_XdigitNot: if (eng.str >= eng.end || re__xdigit[eng.str[0]]) goto fail; goto match; case Re_Space: if (eng.str >= eng.end) goto fail; if (eng.str[0] == ' ' || eng.str[0] == '\t') goto match; goto fail; case Re_SpaceNot: if (eng.str >= eng.end) goto fail; if (eng.str[0] != ' ' && eng.str[0] != '\t') goto match; goto fail; case Re_Tab: if (eng.str >= eng.end) goto fail; if (eng.str[0] == '\t') goto match; goto fail; case Re_Newline: if (eng.str >= eng.end) goto fail; if (eng.str[0] == '\n') goto match; goto fail; case Re_Lower: if (eng.str >= eng.end) goto fail; if (eng.str[0] >= 'a' && eng.str[0] <= 'z') goto match; goto fail; case Re_Upper: if (eng.str >= eng.end) goto fail; if (eng.str[0] >= 'A' && eng.str[0] <= 'Z') goto match; goto fail; case Re_Alnum: if (eng.str >= eng.end) goto fail; if (re__alnum[eng.str[0]]) goto match; goto fail; case Re_AlnumNot: if (eng.str >= eng.end) goto fail; if (re__alnum[eng.str[0]]) goto fail; goto match; case Re_Control: if (eng.str >= eng.end) goto fail; if (re__control[eng.str[0]]) goto match; goto fail; case Re_ControlNot: if (eng.str >= eng.end || re__control[eng.str[0]]) goto fail; goto match; /**************************************************** * One byte codes, match special emtpy strings * ****************************************************/ case Re_Bol: if (eng.str == eng.bas) { if ((flags & RE_NOTBOL)) { /* String does not start at the beginning of a line */ if (newline) goto fail; goto wont; } si = 0; goto match; } if (newline && eng.str[-1] == '\n') { si = 0; goto match; } goto fail; case Re_Eol: if (eng.str == eng.end) { if (flags & RE_NOTEOL) /* String does not finish at the end of a line */ goto wont; si = 0; goto match; } if (newline && eng.str[0] == '\n') { si = 0; goto match; } goto fail; case Re_Bow: if (eng.str >= eng.end || (eng.str > eng.bas && (re__alnum[eng.str[-1]]))) goto fail; if (re__alnum[eng.str[0]]) { si = 0; goto match; } goto fail; case Re_Eow: if (eng.str == eng.bas || (eng.str < eng.end && re__alnum[eng.str[0]])) goto fail; if (re__alnum[eng.str[-1]]) { si = 0; goto match; } goto fail; /**************************************************** * One byte code, one byte argument * ****************************************************/ case Re_Literal: if (eng.str >= eng.end) goto fail; if (eng.str[0] == eng.cod[1]) { ci = 2; goto match; } goto fail; case Re_LiteralNot: if (eng.str >= eng.end) goto fail; if (eng.str[0] != eng.cod[1]) { ci = 2; goto match; } goto fail; case Re_SearchLiteral: for (str = eng.str; str < eng.end; str++) { if (*str == eng.cod[1]) { ci = 2; eng.str = str; goto match; } } /* This bytecode only happens in the toplevel */ eng.so[0] = str - eng.bas; eng.str = str; goto fail; /**************************************************** * One byte code, two bytes argument * ****************************************************/ case Re_CaseLiteral: if (eng.str >= eng.end) goto fail; if (eng.str[0] == eng.cod[1] || eng.str[0] == eng.cod[2]) { ci = 3; goto match; } goto fail; case Re_CaseLiteralNot: if (eng.str >= eng.end) goto fail; if (eng.str[0] != eng.cod[1] && eng.str[0] != eng.cod[2]) { ci = 3; goto match; } goto fail; case Re_SearchCaseLiteral: for (str = eng.str; str < eng.end; str++) { if (*str == eng.cod[1] || *str == eng.cod[2]) { ci = 3; eng.str = str; goto match; } } eng.so[0] = str - eng.bas; eng.str = str; goto fail; /**************************************************** * One byte codes, two arguments, n bytes * ****************************************************/ case Re_String: len = eng.cod[1]; if (len & 0x80) { i = 3; len = (len & 0x7f) + (eng.cod[2] << 7); } else i = 2; if (eng.end - eng.str < len) goto fail; ptr = eng.cod + i; str = eng.str; for (k = len; k > 0; k--) { if (*ptr++ != *str++) goto fail; } ci = i + len; si = len; goto match; case Re_SearchString: len = eng.cod[1]; if (len & 0x80) { i = 3; len = (len & 0x7f) + (eng.cod[2] << 7); } else i = 2; for (str = eng.str; eng.end - str >= len; str = eng.str++) { for (ptr = eng.cod + i, str = eng.str, k = len; k > 0; k--) if (*ptr++ != *str++) break; if (k == 0) { /* Substring found */ ci = i + len; si = str - eng.str; goto match; } } eng.so[0] = eng.end - eng.bas; eng.str = eng.end; goto fail; case Re_CaseString: len = eng.cod[1]; if (len & 0x80) { i = 3; len = (len & 0x7f) + (eng.cod[2] << 7); } else i = 2; len >>= 1; /* Check if there are at least len/2 bytes left, string * is represented as two bytes, lower and upper case */ if (eng.end - eng.str < len) goto fail; ptr = eng.cod + i; str = eng.str; for (k = len; k > 0; str++, ptr += 2, k--) { if (*str != ptr[0] && *str != ptr[1]) goto fail; } ci = i + (len << 1); si = len; goto match; case Re_SearchCaseString: len = eng.cod[1]; if (len & 0x80) { i = 3; len = (len & 0x7f) + (eng.cod[2] << 7); } else i = 2; len >>= 1; for (str = eng.str; eng.end - str >= len; str = eng.str++) { for (ptr = eng.cod + i, str = eng.str, k = len; k > 0; k--, ptr += 2, str++) if (ptr[0] != str[0] && ptr[1] != str[0]) break; if (k == 0) { /* Substring found */ ci = i + (len << 1); si = str - eng.str; goto match; } } eng.so[0] = eng.end - eng.bas; eng.str = eng.end; goto fail; case Re_StringList: /* Number of strings */ k = eng.cod[1]; /* Where to jump after match */ bas = eng.cod[2] | (eng.cod[3] << 8); str = eng.str; ptr = eng.cod + k + 4; l = eng.end - eng.str; for (j = 0; j < k; j++) { len = eng.cod[j + 4]; if (len <= l) { for (i = 0; i < len; i++) if (ptr[i] != str[i]) goto next_stl; goto stl_match; } next_stl: ptr += len; } goto fail; stl_match: ci = bas; si = len; goto match; case Re_CaseStringList: /* Number of strings */ k = eng.cod[1]; /* Where to jump after match */ bas = eng.cod[2] | (eng.cod[3] << 8); str = eng.str; ptr = eng.cod + k + 4; l = eng.end - eng.str; for (j = 0; j < k; j++) { len = eng.cod[j + 4]; if ((len >> 1) <= l) { for (i = m = 0; i < len; m++, i += 2) if (ptr[i] != str[m] && ptr[i + 1] != str[m]) goto next_cstl; goto cstl_match; } next_cstl: ptr += len; } goto fail; cstl_match: ci = bas; si = len >> 1; goto match; case Re_LargeStringList: /* Where to jump after match */ bas = eng.cod[1] | (eng.cod[2] << 8); str = eng.str; /* First entry in index map */ ptr = eng.cod + 3; i = (int)str[0] << 1; j = ptr[i] | (ptr[i + 1] << 8); if (j == 0xffff) /* No entry with this byte */ goto fail; /* Bytes left in input */ l = eng.end - eng.str; /* First entry matching initial byte */ ptr += 512 + j; for (len = ptr[0]; str[0] == ptr[1]; ptr += len + 1, len = ptr[0]) { if (len <= l) { for (i = 1; i < len; i++) { if (ptr[i + 1] != str[i]) goto next_lstl; } ci = bas; si = len; goto match; } next_lstl:; } goto fail; case Re_LargeCaseStringList: /* Where to jump after match */ bas = eng.cod[1] | (eng.cod[2] << 8); str = eng.str; /* First entry in index map */ ptr = eng.cod + 3; i = (int)str[0] << 1; j = ptr[i] | (ptr[i + 1] << 8); if (j == 0xffff) /* No entry with this byte */ goto fail; /* Bytes left in input */ l = eng.end - eng.str; /* First entry matching initial byte */ ptr += 512 + j; for (len = ptr[0]; str[0] == ptr[1] || str[0] == ptr[2]; ptr += len + 1, len = ptr[0]) { if ((k = (len >> 1)) <= l) { for (i = 2, j = 1; i < len; i += 2, j++) { if (ptr[i + 1] != str[j] && ptr[i + 2] != str[j]) goto next_lcstl; } ci = bas; si = k; goto match; } next_lcstl:; } goto fail; /**************************************************** * Character range matching * ****************************************************/ case Re_Range: if (eng.str < eng.end && eng.cod[eng.str[0] + 1]) { ci = 257; goto match; } goto fail; case Re_RangeNot: if (eng.str >= eng.end || eng.cod[eng.str[0] + 1]) goto fail; ci = 257; goto match; /**************************************************** * Group handling * ****************************************************/ case Re_Open: if (++eng.goff >= 9) return (RE_ASSERT); eng.gso[eng.goff] = eng.str - eng.bas; ++eng.cod; continue; case Re_Close: eng.geo[eng.goff] = eng.str - eng.bas; ++eng.cod; continue; case Re_Update: bas = eng.cod[1]; eng.geo[eng.goff] = eng.str - eng.bas; eng.cod += 2; /* + Update + bas */ continue; /**************************************************** * Backreference * ****************************************************/ case Re_Backref: i = eng.cod[1]; j = eng.gso[i]; k = eng.geo[i]; len = k - j; if (k < j || eng.end - eng.str < len) goto fail; ptr = eng.bas + j; str = eng.str; for (l = len; l > 0; l--) { if (*ptr++ != *str++) goto fail; } ci = 2; si = len; goto match; /**************************************************** * Alternatives handling * ****************************************************/ case Re_Alt: bas = eng.off; if (++eng.off >= MAX_DEPTH) return (RE_ASSERT); /* Get offset of next alternative */ i = eng.cod[1] | (eng.cod[2] << 8); /* Setup for next alternative if the current fails */ eng.rcod[eng.off] = eng.cod + i + 1; /* + Alt */ /* If fail, test the next alternative in the same string */ eng.rstr[eng.off] = eng.str; /* Setup match offsets */ if (eng.so[bas] <= eng.eo[bas]) eng.so[eng.off] = eng.eo[bas]; else eng.so[eng.off] = eng.so[bas]; eng.sv[eng.off] = eng.eo[eng.off] = eng.so[eng.off] - 1; /* Save start of possible previous matches */ eng.ss[eng.off] = eng.so[bas]; /* Skip code */ eng.cod += 3; /* Go try the first alternative */ continue; case Re_AltNext: bas = eng.off - 1; /* Check if matched and if it is a better match */ if (eng.eo[eng.off] >= eng.so[eng.off] && eng.sv[eng.off] - eng.so[eng.off] < eng.eo[eng.off] - eng.so[eng.off]) eng.sv[eng.off] = eng.eo[eng.off]; /* Get offset of next alternative */ i = eng.cod[1] | (eng.cod[2] << 8); /* Setup for next alternative if the current fails */ eng.rcod[eng.off] = eng.cod + i + 1; /* + AltNext */ /* Setup match offset */ eng.eo[eng.off] = eng.so[eng.off] - 1; /* Reset string for next alternative */ eng.str = eng.rstr[eng.off]; /* Skip code */ eng.cod += 3; /* Go try the next alternative */ continue; case Re_AltDone: bas = eng.off - 1; /* Check if matched and if it is a better match */ if (eng.sv[eng.off] - eng.so[eng.off] < eng.eo[eng.off] - eng.so[eng.off]) eng.sv[eng.off] = eng.eo[eng.off]; /* If there is a match */ if (eng.sv[eng.off] >= eng.so[eng.off]) { eng.so[bas] = eng.ss[eng.off]; eng.eo[bas] = eng.sv[eng.off]; eng.str = eng.bas + eng.eo[bas]; /* Pop stack and skip code */ --eng.off; ++eng.cod; /* Go try next regular expression pattern */ continue; } /* Failed, reset string and pop stack */ eng.str = eng.rstr[eng.off]; --eng.off; goto fail; /**************************************************** * Repetition * ****************************************************/ /* Note that the repetition counter is not * updated for *, + and ? * it is easy to updated, but since it is not * really useful, code to do it was removed * to save a few cpu cicles. */ #define REPETITION_SETUP() \ if (++eng.off >= MAX_DEPTH) \ return (RE_ASSERT); \ \ /* Return here for recovery if match fail */ \ eng.rcod[eng.off] = eng.cod; \ \ /* Setup match offsets */ \ if (eng.so[bas] <= eng.eo[bas]) \ eng.so[eng.off] = eng.eo[bas]; \ else \ eng.so[eng.off] = eng.so[bas]; \ eng.ss[eng.off] = eng.so[bas]; \ eng.sv[eng.off] = eng.eo[eng.off] = eng.so[eng.off] - 1;\ \ /* Skip repetition instruction */ \ eng.cod += 4; case Re_AnyTimes: bas = eng.cod[1]; if (eng.off == bas) { /* First iteration */ REPETITION_SETUP(); } else { if (eng.eo[eng.off] >= eng.so[eng.off] && eng.eo[eng.off] > eng.sv[eng.off]) { /* Update offset of match */ eng.sv[eng.off] = eng.eo[eng.off]; /* Skip repetition instruction */ eng.cod += 4; } else { /* Match failed but it is ok */ len = eng.cod[2] | (eng.cod[3] << 8); eng.so[bas] = eng.ss[eng.off]; if (eng.sv[eng.off] >= eng.so[eng.off]) /* Something matched earlier, update */ eng.eo[bas] = eng.sv[eng.off]; else if (eng.eo[bas] < eng.so[bas]) /* Empty match */ eng.eo[bas] = eng.so[bas]; /* Try next pattern at correct offset */ eng.str = eng.bas + eng.eo[bas]; /* Pop stack and skip code */ --eng.off; eng.cod += len; } } continue; case Re_Maybe: bas = eng.cod[1]; if (eng.off == bas) { /* First iteration */ REPETITION_SETUP(); } else { /* Matched or first iteration is done */ len = eng.cod[2] | (eng.cod[3] << 8); eng.so[bas] = eng.ss[eng.off]; if (eng.eo[eng.off] > eng.so[eng.off]) { /* Something matched earlier, update */ eng.eo[bas] = eng.eo[eng.off]; eng.str = eng.bas + eng.eo[bas]; /* Don't need to update counter */ } else { /* Empty match */ if (eng.eo[bas] < eng.so[bas]) eng.eo[bas] = eng.so[bas]; /* Try next pattern at correct offset */ eng.str = eng.bas + eng.eo[bas]; } /* Pop stack */ --eng.off; /* Skip code */ eng.cod += len; } continue; case Re_AtLeast: bas = eng.cod[1]; if (eng.off == bas) { /* First iteration */ REPETITION_SETUP(); } else { if (eng.eo[eng.off] >= eng.so[eng.off] && eng.eo[eng.off] > eng.sv[eng.off]) { /* Update offset of match */ eng.sv[eng.off] = eng.eo[eng.off]; /* Skip repetition instruction */ eng.cod += 4; } else { /* Last try failed */ len = eng.cod[2] | (eng.cod[3] << 8); if (eng.sv[eng.off] >= eng.so[eng.off]) { /* Something matched earlier, update */ eng.so[bas] = eng.ss[eng.off]; eng.eo[bas] = eng.sv[eng.off]; eng.str = eng.bas + eng.eo[bas]; } else { /* Do it here, so that the fail label does * not need to do too expensive work for * simple patterns. */ eng.so[bas] = eng.str - eng.bas; /* Zero matches, pop stack and restart */ --eng.off; goto fail; } /* Pop stack and skip code */ --eng.off; eng.cod += len; } } continue; /**************************************************** * Repetition with arguments * ****************************************************/ case Re_Exact: #define COMPLEX_REPETITION_SETUP_0() \ i = eng.cod[1]; \ bas = eng.cod[2]; #define COMPLEX_REPETITION_SETUP() \ /* First iteration */ \ if (++eng.off >= MAX_DEPTH) \ return (RE_ASSERT); \ \ /* Remeber number or repetitions */ \ eng.re[eng.off] = 0; \ \ /* Return here for recovery if match fail */ \ eng.rcod[eng.off] = eng.cod; \ \ /* Setup match offsets */ \ if (eng.so[bas] <= eng.eo[bas]) \ eng.so[eng.off] = eng.eo[bas]; \ else \ eng.so[eng.off] = eng.so[bas]; \ eng.sv[eng.off] = eng.eo[eng.off] = eng.so[eng.off] - 1;\ eng.ss[eng.off] = eng.so[bas]; \ \ /* Skip repetition instruction */ \ eng.cod += 5; COMPLEX_REPETITION_SETUP_0(); if (eng.off == bas) { /* First iteration */ COMPLEX_REPETITION_SETUP(); } else { if (eng.eo[eng.off] >= eng.so[eng.off] && eng.eo[eng.off] > eng.sv[eng.off]) { /* Update offset of match */ eng.sv[eng.off] = eng.eo[eng.off]; /* Update repetition counter */ if (++eng.re[eng.off] == i) { /* Matched the required times */ eng.so[bas] = eng.ss[eng.off]; eng.eo[bas] = eng.sv[eng.off]; eng.str = eng.bas + eng.eo[bas]; /* Update code */ k = eng.cod[3] | (eng.cod[4] << 8); eng.cod += k; /* Pop stack and go for next pattern */ --eng.off; continue; } /* Skip repetition instruction */ eng.cod += 5; } else { /* Do it here, so that the fail label does * not need to do too expensive work for * simple patterns. */ eng.so[bas] = eng.str - eng.bas; /* Pop stack and restart */ --eng.off; goto fail; } } continue; case Re_Min: COMPLEX_REPETITION_SETUP_0(); if (eng.off == bas) { /* First iteration */ COMPLEX_REPETITION_SETUP(); } else { if (eng.eo[eng.off] >= eng.so[eng.off] && eng.eo[eng.off] > eng.sv[eng.off]) { /* Update offset of match */ eng.sv[eng.off] = eng.eo[eng.off]; /* Update repetition counter */ ++eng.re[eng.off]; /* Skip repetition instruction and try again */ eng.cod += 5; } else { /* Match failed! */ if (eng.re[eng.off] < i) { /* Do it here, so that the fail label does * not need to do too expensive work for * simple patterns. */ eng.so[bas] = eng.str - eng.bas; /* Didn't match required number of times */ --eng.off; goto fail; } else { /* Matched minimum number of times */ eng.eo[bas] = eng.sv[eng.off]; eng.str = eng.bas + eng.eo[bas]; k = eng.cod[3] | (eng.cod[4] << 8); /* Update code and pop stack */ eng.cod += k; --eng.off; } } } continue; case Re_Max: COMPLEX_REPETITION_SETUP_0(); if (eng.off == bas) { /* First iteration */ COMPLEX_REPETITION_SETUP(); } else { if (eng.eo[eng.off] >= eng.so[eng.off] && eng.eo[eng.off] > eng.sv[eng.off]) { /* Update offset of match */ eng.sv[eng.off] = eng.eo[eng.off]; /* Update repetition counter */ if (++eng.re[eng.off] == i) { /* Matched the maximum times */ eng.so[bas] = eng.ss[eng.off]; eng.eo[bas] = eng.sv[eng.off]; eng.str = eng.bas + eng.eo[bas]; k = eng.cod[3] | (eng.cod[4] << 8); /* Update code and pop stack */ eng.cod += k; --eng.off; continue; } /* Skip repetition instruction and try again */ eng.cod += 5; } else { /* No matches, but zero matches are ok */ k = eng.cod[3] | (eng.cod[4] << 8); if (eng.sv[eng.off] >= eng.so[eng.off]) { /* Something matched earlier, update */ eng.so[bas] = eng.ss[eng.off]; eng.eo[bas] = eng.sv[eng.off]; eng.str = eng.bas + eng.eo[bas]; } else { /* Empty match */ if (eng.eo[bas] < eng.so[bas]) eng.eo[bas] = eng.so[bas]; /* Try next pattern at correct offset */ eng.str = eng.bas + eng.eo[bas]; } /* Pop stack and update code */ --eng.off; eng.cod += k; } } continue; case Re_MinMax: bas = eng.cod[3]; if (eng.off == bas) { /* First iteration */ COMPLEX_REPETITION_SETUP(); } else { if (eng.eo[eng.off] >= eng.so[eng.off] && eng.eo[eng.off] > eng.sv[eng.off]) { /* Update offset of match */ eng.sv[eng.off] = eng.eo[eng.off]; /* Update repetition counter */ if (++eng.re[eng.off] == eng.cod[2]) { /* Matched the maximum times */ eng.so[bas] = eng.ss[eng.off]; eng.eo[bas] = eng.sv[eng.off]; eng.str = eng.bas + eng.eo[bas]; k = eng.cod[4] | (eng.cod[5] << 8); /* Update code and pop stack */ eng.cod += k; --eng.off; continue; } /* Skip repetition instruction and try again */ eng.cod += 6; } else { /* Match failed! */ if (eng.re[eng.off] < eng.cod[1]) { /* Do it here, so that the fail label does * not need to do too expensive work for * simple patterns. */ eng.so[bas] = eng.str - eng.bas; /* Didn't match required number of times */ --eng.off; goto fail; } else { /* Matched minimum number of times */ eng.so[bas] = eng.ss[eng.off]; eng.eo[bas] = eng.sv[eng.off]; eng.str = eng.bas + eng.eo[bas]; k = eng.cod[4] | (eng.cod[5] << 8); /* Update code and pop stack */ eng.cod += k; --eng.off; } } } continue; /**************************************************** * Special repetition handling * ****************************************************/ case Re_AnyAnyTimes: /* code(1) + bas(1) + gbas(1) + jump(2) */ bas = eng.cod[1]; if (eng.off == bas) { /* First iteration */ if (++eng.off >= MAX_DEPTH) return (RE_ASSERT); /* Return here for recovery if match fail */ eng.rcod[eng.off] = eng.cod; /* If fail, test the next pattern at the same point */ eng.rstr[eng.off] = eng.str; /* Setup match offsets */ eng.so[eng.off] = eng.str - eng.bas; eng.eo[eng.off] = eng.so[eng.off] - 1; if (newline) /* Use the repetition counter to store start of * skipped string, to later check if skipping a * newline. */ eng.re[eng.off] = eng.so[eng.off]; /* Save start of possible previous matches */ eng.ss[eng.off] = eng.so[bas]; /* Skip repetition instruction */ eng.cod += 5; } else { /* -1 as an unsigned char */ if (eng.cod[2] != 0xff) eng.goff = eng.cod[2]; else eng.goff = -1; if (newline) { ptr = eng.bas + eng.re[eng.off]; str = eng.bas + eng.so[eng.off]; for (; ptr < str; ptr++) if (*ptr == '\n') { eng.cod = eng.rcod[0]; eng.so[0] = ptr - eng.bas + 1; eng.eo[0] = eng.so[0] - 1; eng.rstr[0] = eng.str = ptr + 1; eng.off = 0; goto reset; } /* If looping, don't do too many noops */ eng.re[eng.off] = ptr - eng.bas; } if (eng.eo[eng.off] >= eng.so[eng.off]) { /* Note that this is only true if all possibly * nested special repetitions also matched. */ if (eng.goff >= 0) { if (eng.cod[5] == Re_Update) eng.gso[eng.goff] = eng.eo[bas] + (eng.so[bas] > eng.eo[bas]); else if (eng.geo[eng.goff] < eng.so[eng.off]) eng.geo[eng.goff] = eng.so[eng.off]; } /* Jump relative offset */ len = eng.cod[3] | (eng.cod[4] << 8); /* Restore offset from where started trying */ eng.so[bas] = eng.ss[eng.off]; eng.eo[bas] = eng.eo[eng.off]; /* Pop stack and skip code */ --eng.off; eng.cod += len; } else { /* Only give up if the entire string was scanned */ if (eng.str < eng.end) { /* Update restart point for next pattern */ eng.str = ++eng.rstr[eng.off]; /* Reset start of nested match */ eng.so[eng.off] = eng.str - eng.bas; /* Skip repetition instruction */ eng.cod += 5; } else { /* Entire string scanned and failed */ /* Jump relative offset */ len = eng.cod[3] | (eng.cod[4] << 8); /* Restore offset from where started trying */ eng.so[bas] = eng.ss[eng.off]; eng.eo[bas] = eng.ss[eng.off] - 1; /* Pop stack and skip code */ --eng.off; eng.cod += len; } } } continue; /* This is significantly different than matching .* * because it may need to restart several times since it is * possible to find too many false positives, for example: * a.*b => once one "a" is found, scan all * the remaining string searching for a "b" * a.?b => the string may have too many "a"s, but the * first occurrences of "a" may not be followed * by any-character and a "b" or a single "b". */ case Re_AnyMaybe: bas = eng.cod[1]; if (eng.off == bas) { /* First iteration */ if (++eng.off >= MAX_DEPTH) return (RE_ASSERT); /* Return here for recovery if match fail */ eng.rcod[eng.off] = eng.cod; /* First try without eating a byte */ eng.rstr[eng.off] = eng.str; /* Remember this is the first try if match fail */ eng.re[eng.off] = 0; /* Setup match offsets */ eng.so[eng.off] = eng.str - eng.bas; eng.eo[eng.off] = eng.so[eng.off] - 1; /* Save start of possible previous matches */ eng.ss[eng.off] = eng.so[bas]; /* Skip repetition instruction */ eng.cod += 5; } else { /* -1 as an unsigned char */ if (eng.cod[2] != 0xff) eng.goff = eng.cod[2]; else eng.goff = -1; if (eng.eo[eng.off] >= eng.so[eng.off]) { /* Something matched */ if (eng.goff >= 0) { if (eng.cod[5] == Re_Update) eng.gso[eng.goff] = eng.eo[bas] + (eng.so[bas] > eng.eo[bas]); else if (eng.geo[eng.goff] < eng.so[eng.off]) eng.geo[eng.goff] = eng.so[eng.off]; } /* Jump relative offset */ len = eng.cod[3] | (eng.cod[4] << 8); /* Update offset of match */ eng.eo[bas] = eng.eo[eng.off]; /* Pop stack and skip code */ --eng.off; eng.cod += len; } else if (eng.re[eng.off] == 0 && (!newline || eng.rstr[eng.off][1] != '\n')) { /* Try this time skiping a byte */ ++eng.re[eng.off]; /* Reset string, skip code and go try one time more */ eng.str = ++eng.rstr[eng.off]; eng.cod += 5; } else { /* Failed to match */ /* Update offsets */ eng.eo[bas] = eng.ss[eng.off]; eng.so[bas] = eng.eo[bas] + 1; eng.str = eng.rstr[eng.off] + (eng.re[eng.off] == 0); /* Pop stack and return to toplevel code */ --eng.off; if (eng.str >= eng.end) goto wont; eng.cod = eng.rcod[bas]; } } continue; /* .+ almost identical to .* but requires eating at least one byte */ case Re_AnyAtLeast: bas = eng.cod[1]; if (eng.off == bas) { /* First iteration */ if (++eng.off >= MAX_DEPTH) return (RE_ASSERT); /* Return here for recovery if match fail */ eng.rcod[eng.off] = eng.cod; /* Skip one byte for the restart string */ if (newline && eng.str[0] == '\n') { /* Cannot skip newline */ eng.cod = eng.rcod[0]; eng.rstr[0] = ++eng.str; eng.so[0] = eng.str - eng.bas; eng.eo[0] = eng.so[0] - 1; eng.off = 0; goto reset; } eng.rstr[eng.off] = ++eng.str; /* Setup match offsets */ eng.so[eng.off] = eng.str - eng.bas; eng.eo[eng.off] = eng.so[eng.off] - 1; if (newline) /* Use the repetition counter to store start of * skipped string, to later check if skipping a * newline. */ eng.re[eng.off] = eng.so[eng.off]; /* Save start of possible previous matches */ eng.ss[eng.off] = eng.so[bas]; /* Skip repetition instruction */ eng.cod += 5; } else { /* -1 as an unsigned char */ if (eng.cod[2] != 0xff) eng.goff = eng.cod[2]; else eng.goff = -1; if (newline) { ptr = eng.bas + eng.re[eng.off]; str = eng.bas + eng.so[eng.off]; for (; ptr < str; ptr++) if (*ptr == '\n') { eng.cod = eng.rcod[0]; eng.so[0] = ptr - eng.bas + 1; eng.eo[0] = eng.so[0] - 1; eng.rstr[0] = eng.str = ptr + 1; eng.off = 0; goto reset; } /* If looping, don't do too many noops */ eng.re[eng.off] = ptr - eng.bas; } if (eng.eo[eng.off] >= eng.so[eng.off]) { /* Note that this is only true if all possibly * nested special repetitions also matched. */ if (eng.goff >= 0) { if (eng.cod[5] == Re_Update) eng.gso[eng.goff] = eng.eo[bas] + (eng.so[bas] > eng.eo[bas]); else if (eng.geo[eng.goff] < eng.so[eng.off]) eng.geo[eng.goff] = eng.so[eng.off]; } /* Jump relative offset */ len = eng.cod[3] | (eng.cod[4] << 8); /* Restore offset from where started trying */ eng.so[bas] = eng.ss[eng.off]; eng.eo[bas] = eng.eo[eng.off]; /* Pop stack and skip code */ --eng.off; eng.cod += len; } else { /* Only give up if the entire string was scanned */ if (eng.str < eng.end) { /* Update restart point for next pattern */ eng.str = ++eng.rstr[eng.off]; /* Reset start of nested match */ eng.so[eng.off] = eng.str - eng.bas; /* Skip repetition instruction */ eng.cod += 5; } else { /* Entire string scanned and failed */ /* Jump relative offset */ len = eng.cod[3] | (eng.cod[4] << 8); /* Restore offset from where started trying */ eng.so[bas] = eng.ss[eng.off]; eng.eo[bas] = eng.ss[eng.off] - 1; /* Pop stack and skip code */ --eng.off; eng.cod += len; } } } continue; /**************************************************** * Repetition matched! * ****************************************************/ case Re_RepJump: /* eng.cod[1] is toplevel offset of repetition */ if (eng.off > eng.cod[1]) /* If still needs to try matches */ eng.cod -= eng.cod[2]; else eng.cod += 3; /* + RepJump + bas + len-size */ continue; case Re_RepLongJump: /* eng.cod[1] is toplevel offset of repetition */ if (eng.off > eng.cod[1]) /* If still needs to try matches */ eng.cod -= eng.cod[2] | (eng.cod[3] << 8); else eng.cod += 4; /* + RepLongJump + bas + len-size */ continue; /**************************************************** * Finished * ****************************************************/ case Re_DoneIf: if (eng.eo[eng.off] >= eng.so[eng.off]) { eng.so[0] = eng.ss[eng.off]; eng.eo[0] = eng.eo[eng.off]; goto done; } ++eng.cod; continue; case Re_MaybeDone: if (eng.eo[eng.off] >= eng.so[eng.off]) { eng.so[0] = eng.ss[eng.off]; eng.eo[0] = eng.eo[eng.off]; goto done; } ++eng.cod; continue; case Re_Done: goto done; default: /* Fatal internal error */ return (RE_ASSERT); } wont: /* Surely won't match */ if (eng.off == 0) { eng.eo[0] = eng.so[0] - 1; break; } fail: if (eng.off == 0) { /* If the entire string scanned */ if (++eng.str > eng.end) { eng.eo[0] = eng.so[0] - 1; break; } eng.goff = -1; /* Update start of possible match after restart */ if (eng.eo[0] >= eng.so[0]) { /* If first fail */ eng.str = eng.rstr[0]; ++eng.rstr[0]; eng.so[0] = eng.str - eng.bas; eng.eo[0] = eng.so[eng.off] - 1; } else /* Just trying at next byte */ ++eng.so[0]; } else /* Remember this match failed */ eng.eo[eng.off] = eng.so[eng.off] - 1; /* Restart code */ eng.cod = eng.rcod[eng.off]; continue; match: /* If first match */ if (eng.eo[eng.off] < eng.so[eng.off]) { if (eng.off == 0) eng.rstr[0] = eng.str + 1; eng.so[eng.off] = eng.eo[eng.off] = eng.str - eng.bas; } eng.eo[eng.off] += si; eng.cod += ci; eng.str += si; ci = si = 1; continue; done: break; } if (nmatch) { if (flags & RE_STARTEND) len = pmat[0].rm_so; else len = 0; if (!nosub) { if (preg->cod[1] != 0xff) eng.goff = preg->cod[1]; pmat[0].rm_so = eng.so[0]; pmat[0].rm_eo = eng.eo[0]; for (i = 1; i < nmatch; i++) { if (i - 1 <= eng.goff) { pmat[i].rm_so = eng.gso[i - 1]; pmat[i].rm_eo = eng.geo[i - 1]; } else { pmat[i].rm_so = 0; pmat[i].rm_eo = -1; } } if (len) { /* Update offsets, since the match was done in a substring */ j = eng.goff + 2 > nmatch ? nmatch : eng.goff + 2; for (i = 0; i < j; i++) { pmat[i].rm_so += len; pmat[i].rm_eo += len; } } } else { /* Already know these values, allow compiling the regex with * RE_NOSUB to use parenthesis only for grouping, but avoiding * the runtime overhead of keeping track of the subexpression * offsets. */ pmat[0].rm_so = eng.so[0] + len; pmat[0].rm_eo = eng.eo[0] + len; } } return (eng.so[0] <= eng.eo[0] ? 0 : RE_NOMATCH); } int reerror(int ecode, const re_cod *preg, char *ebuffer, int ebuffer_size) { static char *errors[] = { "No error", "Failed to match", /* NOMATCH */ /* Errors not generated */ "Invalid regular expression", /* BADPAT */ "Invalid collating element", /* ECOLLATE */ "Invalid character class", /* ECTYPE */ "`\' applied to unescapable character", /* EESCAPE */ "Invalid backreference number", /* ESUBREG */ "Brackets `[ ]' not balanced", /* EBRACK */ "Parentheses `( )' not balanced", /* EPAREN */ "Braces `{ }' not balanced", /* EBRACE */ "Invalid repetition count(s) in `{ }'", /* BADBR */ "Invalid character range in `[ ]'", /* ERANGE */ "Out of memory", /* ESPACE */ "`?', `*', or `+' operand invalid", /* BADRPT */ "Empty (sub)expression", /* EMPTY */ "Assertion error - you found a bug", /* ASSERT */ "Invalid argument" /* INVARG */ }; char *str; if (ecode >= 0 && ecode < sizeof(errors) / sizeof(errors[0])) str = errors[ecode]; else str = "Unknown error"; return (snprintf(ebuffer, ebuffer_size, "%s", str)); } void refree(re_cod *cod) { free(cod->cod); cod->cod = NULL; } static void reinit(void) { int i; static int first = 1; if (!first) return; first = 0; re__alnum['_'] = 1; for (i = '0'; i <= '7'; i++) re__alnum[i] = re__odigit[i] = re__ddigit[i] = re__xdigit[i] = 1; for (; i <= '9'; i++) re__alnum[i] = re__ddigit[i] = re__xdigit[i] = 1; for (i = 'a'; i <= 'f'; i++) re__alnum[i] = re__xdigit[i] = 1; for (; i <= 'z'; i++) re__alnum[i] = 1; for (i = 'A'; i <= 'F'; i++) re__alnum[i] = re__xdigit[i] = 1; for (; i <= 'Z'; i++) re__alnum[i] = 1; for (i = 1; i < 32; i++) re__control[i] = 1; re__control[127] = 1; /* Don't show tabs as control characters */ re__control['\t'] = 0; } static int rec_check(re_inf *inf, int count) { if (inf->len + count >= inf->spc) { int spc; unsigned char *cod; if ((spc = (count % 64)) != 0) spc = 64 - spc; spc += count + inf->spc; if ((cod = realloc(inf->cod, spc)) == NULL) return (inf->ecode = RE_ESPACE); inf->cod = cod; inf->spc = spc; } return (inf->ecode); } static int rec_code(re_inf *inf, ReCode code) { if (rec_check(inf, 1) == 0) inf->cod[inf->len++] = code; return (inf->ecode); } static int rec_byte(re_inf *inf, int value) { if (rec_check(inf, 1) == 0) inf->cod[inf->len++] = value; return (inf->ecode); } static int rec_code_byte(re_inf *inf, ReCode code, int value) { if (rec_check(inf, 2) == 0) { inf->cod[inf->len++] = code; inf->cod[inf->len++] = value; } return (inf->ecode); } static int rec_length(re_inf *inf, int length) { int lo, hi, two; if (length >= 16384) return (inf->ecode = RE_ESPACE); lo = length & 0xff; hi = length & 0xff00; two = ((length > 0x7f) != 0) + 1; if (two == 2) { hi <<= 1; hi |= (lo & 0x80) != 0; lo |= 0x80; } if (rec_check(inf, two) == 0) { inf->cod[inf->len++] = lo; if (two == 2) inf->cod[inf->len++] = hi >> 8; } return (inf->ecode); } static int rec_byte_byte(re_inf *inf, int value0, int value1) { if (rec_check(inf, 2) == 0) { inf->cod[inf->len++] = value0; inf->cod[inf->len++] = value1; } return (inf->ecode); } static int rec_code_byte_byte(re_inf *inf, ReCode code, int value0, int value1) { if (rec_check(inf, 3) == 0) { inf->cod[inf->len++] = code; inf->cod[inf->len++] = value0; inf->cod[inf->len++] = value1; } return (inf->ecode); } static int rec_build_alt(re_inf *inf, rec_alt *alt) { int offset, value, bas = inf->bas + 1; if (alt) { if (alt->next) { if (rec_inc_spc(inf)) return (inf->ecode); /* A real a list of alternatives */ rec_code(inf, Re_Alt); offset = inf->len; /* Remember current offset */ rec_byte_byte(inf, 0, 0); /* Reserve two bytes for retry address */ while (alt && inf->ecode == 0) { if (rec_build_pat(inf, alt->pat)) break; alt = alt->next; if (alt && inf->ecode == 0) { /* Handle (hyper)complex repetitions */ if (inf->bas != bas) { /* Duplicate patterns up to end of expression */ rec_build_pat(inf, inf->apat); /* Restore engine state for next alternative(s) */ rec_alt_spc(inf, bas - 1); } /* If the jump would be so long */ if ((value = inf->len - offset) >= 16384) { inf->ecode = RE_ESPACE; break; } inf->cod[offset] = value & 0xff; inf->cod[offset + 1] = (value & 0xff00) >> 8; rec_code(inf, Re_AltNext); offset = inf->len; rec_byte_byte(inf, 0, 0); } } if (inf->ecode == 0) { /* Handle (hyper)complex repetitions */ if (inf->bas != bas) { /* Duplicate patterns up to end of expression */ rec_build_pat(inf, inf->apat); /* Restore engine state for next alternative(s) */ rec_alt_spc(inf, bas - 1); } /* If the jump would be so long */ if ((value = inf->len - offset) >= 16384) return (inf->ecode = RE_ESPACE); inf->cod[offset] = value & 0xff; inf->cod[offset + 1] = (value & 0xff00) >> 8; /* Last jump is here */ rec_code(inf, Re_AltDone); } rec_dec_spc(inf); } else /* Single alternative */ rec_build_pat(inf, alt->pat); } return (inf->ecode); } static int rec_build_pat(re_inf *inf, rec_pat *pat) { rec_pat *apat; int length, offset = 0, distance, jump = 0, bas = 0; while (pat && inf->ecode == 0) { if (pat->rep) { bas = inf->bas; if (pat->type == Rep_Group && !inf->par && rec_code(inf, Re_Open)) return (inf->ecode); if (rec_inc_spc(inf)) return (inf->ecode); offset = inf->len; if (rec_build_rep(inf, pat->rep)) break; /* Reserve space to jump after repetition done */ jump = inf->len; rec_byte_byte(inf, 0, 0); } switch (pat->type) { case Rep_AnyAnyTimes: case Rep_AnyMaybe: case Rep_AnyAtLeast: if (rec_add_spc(inf, pat->type == Rep_AnyMaybe)) return (inf->ecode); if (rec_code(inf, (ReCode)pat->type) == 0 && rec_byte(inf, inf->bas - 1) == 0 && rec_byte(inf, inf->ref - 1) == 0) rec_off_spc(inf); break; case Rep_Literal: case Rep_LiteralNot: case Rep_SearchLiteral: rec_code_byte(inf, (ReCode)pat->type, pat->data.chr); break; case Rep_CaseLiteral: case Rep_CaseLiteralNot: case Rep_SearchCaseLiteral: rec_code_byte_byte(inf, (ReCode)pat->type, pat->data.cse.lower, pat->data.cse.upper); break; case Rep_Range: case Rep_RangeNot: if (rec_code(inf, (ReCode)pat->type) == 0) rec_build_rng(inf, pat->data.rng); break; case Rep_String: case Rep_SearchString: case Rep_CaseString: case Rep_SearchCaseString: rec_code(inf, (ReCode)pat->type); length = strlen((char*)pat->data.str); if (rec_length(inf, length) == 0 && rec_check(inf, length) == 0) { memcpy(inf->cod + inf->len, pat->data.str, length); inf->len += length; } break; case Rep_Any: case Rep_AnyEatAnyTimes: case Rep_AnyEatMaybe: case Rep_AnyEatAtLeast: case Rep_Odigit: case Rep_OdigitNot: case Rep_Digit: case Rep_DigitNot: case Rep_Xdigit: case Rep_XdigitNot: case Rep_Space: case Rep_SpaceNot: case Rep_Tab: case Rep_Newline: case Rep_Lower: case Rep_Upper: case Rep_Alnum: case Rep_AlnumNot: case Rep_Control: case Rep_ControlNot: case Rep_Bol: case Rep_Eol: case Rep_Bow: case Rep_Eow: rec_code(inf, (ReCode)pat->type); break; case Rep_Backref: rec_code_byte(inf, Re_Backref, pat->data.chr); break; case Rep_Group: if (pat->rep == NULL && !inf->par && rec_code(inf, Re_Open)) break; apat = inf->apat; inf->apat = pat->next; rec_build_grp(inf, pat->data.grp); inf->apat = apat; break; case Rep_StringList: rec_build_stl(inf, pat->data.stl); break; } if (pat->rep) { #if 0 if (rec_dec_spc(inf)) return (inf->ecode); #else if (rec_rep_spc(inf, bas)) return (inf->ecode); #endif distance = inf->len - offset; if (distance > 255) { if (rec_code(inf, Re_RepLongJump) || rec_byte(inf, inf->bas) || rec_byte(inf, distance & 0xff) || rec_byte(inf, (distance & 0xff00) >> 8)) break; } else if (rec_code(inf, Re_RepJump) || rec_byte(inf, inf->bas) || rec_byte(inf, distance)) break; distance = inf->len - offset; inf->cod[jump] = distance & 0xff; inf->cod[jump + 1] = (distance & 0xff00) >> 8; } pat = pat->next; } return (inf->ecode); } static int rec_build_rng(re_inf *inf, rec_rng *rng) { if (rec_check(inf, sizeof(rng->range)) == 0) { memcpy(inf->cod + inf->len, rng->range, sizeof(rng->range)); inf->len += sizeof(rng->range); } return (inf->ecode); } static int rec_build_grp(re_inf *inf, rec_grp *grp) { int par = inf->par; if (!(inf->flags & RE_NOSUB)) { ++inf->par; if (par == 0) ++inf->ref; if (rec_build_alt(inf, grp->alt) == 0) { if (par == 0) { if (grp->comp) rec_code_byte(inf, Re_Update, inf->ref - 1); else rec_code(inf, Re_Close); } } --inf->par; } else rec_build_alt(inf, grp->alt); return (inf->ecode); } static int rec_build_stl(re_inf *inf, rec_stl *stl) { int i, len, rlen; ReCode code; /* Calculate jump distance information */ rlen = stl->tlen + stl->nstrs + 4; /* + code + nstrs + place-offset + data-length */ if (stl->nstrs >= LARGE_STL_COUNT) { rlen += 511; /* Don't write number of strings */ code = stl->type == Rep_StringList ? Re_LargeStringList : Re_LargeCaseStringList; } else code = (ReCode)stl->type; if (rlen >= 16386) return (inf->ecode = RE_ESPACE); if (rec_check(inf, rlen) || rec_code(inf, code)) return (inf->ecode); /* Space is allocated, just write the data */ if (stl->nstrs < LARGE_STL_COUNT) inf->cod[inf->len++] = stl->nstrs; inf->cod[inf->len++] = rlen & 0xff; inf->cod[inf->len++] = (rlen & 0xff00) >> 8; if (stl->nstrs < LARGE_STL_COUNT) { for (i = 0; i < stl->nstrs; i++) inf->cod[inf->len++] = stl->lens[i]; for (i = 0; i < stl->nstrs; i++) { len = stl->lens[i]; if (len > 2) { memcpy(inf->cod + inf->len, stl->strs[i], len); inf->len += len; } else { if (len == 1) inf->cod[inf->len++] = (long)stl->strs[i]; else { inf->cod[inf->len++] = (long)stl->strs[i] & 0xff; inf->cod[inf->len++] = ((long)stl->strs[i] & 0xff00) >> 8; } } } } else { /* The string length goes before the string itself */ int j, chl, chu; /* Fill everything with an invalid jump address */ memset(inf->cod + inf->len, 0xff, 512); for (i = len = 0, j = -1; i < stl->nstrs; i++) { chl = stl->lens[i] > 2 ? stl->strs[i][0] : (long)stl->strs[i] & 0xff; if (chl != j) { inf->cod[inf->len + (chl << 1)] = len & 0xff; inf->cod[inf->len + (chl << 1) + 1] = (len & 0xff00) >> 8; if (code == Re_LargeCaseStringList) { chu = stl->lens[i] > 2 ? stl->strs[i][1] : ((long)(stl->strs[i]) & 0xff00) >> 8; inf->cod[inf->len + (chu << 1)] = len & 0xff; inf->cod[inf->len + (chu << 1) + 1] = (len & 0xff00) >> 8; } j = chl; } len += stl->lens[i] + 1; } inf->len += 512; for (i = 0; i < stl->nstrs; i++) { len = stl->lens[i]; inf->cod[inf->len++] = len; if (len > 2) { memcpy(inf->cod + inf->len, stl->strs[i], len); inf->len += len; } else { if (len == 1) inf->cod[inf->len++] = (long)stl->strs[i]; else { inf->cod[inf->len++] = (long)stl->strs[i] & 0xff; inf->cod[inf->len++] = ((long)stl->strs[i] & 0xff00) >> 8; } } } } return (inf->ecode); } static int rec_build_rep(re_inf *inf, rec_rep *rep) { if (rep) { switch (rep->type) { case Rer_AnyTimes: case Rer_AtLeast: case Rer_Maybe: rec_code(inf, (ReCode)rep->type); break; case Rer_Exact: if (rec_code(inf, Re_Exact) == 0) rec_byte(inf, rep->mine); break; case Rer_Min: if (rec_code(inf, Re_Min) == 0) rec_byte(inf, rep->mine); break; case Rer_Max: if (rec_code(inf, Re_Max) == 0) rec_byte(inf, rep->maxc); break; case Rer_MinMax: if (rec_code(inf, Re_MinMax) == 0 && rec_byte(inf, rep->mine) == 0) rec_byte(inf, rep->maxc); break; } /* It is incremented in rec_build_pat */ rec_byte(inf, inf->bas - 1); } return (inf->ecode); } static int rec_inc_spc(re_inf *inf) { if (++inf->bas >= MAX_DEPTH) return (inf->ecode = RE_ESPACE); return (inf->ecode); } static int rec_dec_spc(re_inf *inf) { if (--inf->bas < 0) return (inf->ecode = RE_ASSERT); return (inf->ecode); } static int rec_add_spc(re_inf *inf, int maybe) { if (++inf->bas >= MAX_DEPTH) return (inf->ecode = RE_ESPACE); inf->sp[inf->bas] = maybe + 1; return (inf->ecode); } /* Could be joined with rec_rep_spc, code almost identical */ static int rec_alt_spc(re_inf *inf, int top) { int distance, i, bas = inf->bas; while ((inf->bas > top) && inf->sp[inf->bas]) { /* Jump to this repetition for cleanup */ distance = inf->len - inf->sr[inf->bas]; /* This will generate a jump to a jump decision opcode */ inf->sj[inf->bas] = inf->len; if (distance > 255) { if (rec_code(inf, Re_RepLongJump) || rec_byte(inf, inf->bas - 1) || rec_byte(inf, distance & 0xff) || rec_byte(inf, (distance & 0xff00) >> 8)) break; } else if (rec_code(inf, Re_RepJump) || rec_byte(inf, inf->bas - 1) || rec_byte(inf, distance)) break; /* Top of stack value before repetition, or end condition value */ --inf->bas; } i = inf->bas + 1; if (inf->ecode == 0 && i <= bas && inf->sp[i]) { /* Only the repetition at the bottom jump to code after testing * all possibilities */ distance = inf->len - inf->sr[i]; inf->cod[inf->sr[i] + 3] = distance & 0xff; inf->cod[inf->sr[i] + 4] = (distance & 0xff00) >> 8; /* The bottom jump is here */ if (rec_code(inf, inf->sp[i] == 1 ? Re_DoneIf : Re_MaybeDone)) return (inf->ecode); /* Generate jumps to the previous special repetition */ for (++i; i <= bas; i++) { if (inf->sp[i]) { distance = inf->sj[i] - inf->sr[i]; inf->cod[inf->sr[i] + 3] = distance & 0xff; inf->cod[inf->sr[i] + 4] = (distance & 0xff00) >> 8; } } } return (inf->ecode); } static int rec_rep_spc(re_inf *inf, int top) { int distance, i, bas = inf->bas; while (inf->bas > top) { if (inf->sp[inf->bas]) { /* Jump to this repetition for cleanup */ distance = inf->len - inf->sr[inf->bas]; /* This will generate a jump to a jump decision opcode */ inf->sj[inf->bas] = inf->len; if (distance > 255) { if (rec_code(inf, Re_RepLongJump) || rec_byte(inf, inf->bas - 1) || rec_byte(inf, distance & 0xff) || rec_byte(inf, (distance & 0xff00) >> 8)) break; } else if (rec_code(inf, Re_RepJump) || rec_byte(inf, inf->bas - 1) || rec_byte(inf, distance)) break; } /* Top of stack value before repetition, or end condition value */ --inf->bas; } /* Find first special repetition offset. XXX This should be a noop */ for (i = 0; i < bas; i++) if (inf->sp[i]) break; if (inf->ecode == 0 && i <= bas && inf->sp[i]) { /* Only the repetition at the bottom jump to code after testing * all possibilities */ distance = inf->len - inf->sr[i]; inf->cod[inf->sr[i] + 3] = distance & 0xff; inf->cod[inf->sr[i] + 4] = (distance & 0xff00) >> 8; /* Generate jumps to the previous special repetition */ for (++i; i <= bas; i++) { if (inf->sp[i]) { distance = inf->sj[i] - inf->sr[i]; inf->cod[inf->sr[i] + 3] = distance & 0xff; inf->cod[inf->sr[i] + 4] = (distance & 0xff00) >> 8; } } } return (inf->ecode); } static int rec_off_spc(re_inf *inf) { /* The jump address before the three bytes instruction */ inf->sr[inf->bas] = inf->len - 3; /* Don't know yet where to go after done with the special * repetition, just reserve two bytes for the jump address. */ return (rec_byte_byte(inf, 0, 0)); } #ifdef DEBUG static void redump(re_cod *code) { int i, j, k; unsigned char *cod = code->cod, *stl; if (cod[0] & RE_NOSUB) printf("Nosub\n"); if (cod[0] & RE_NEWLINE) printf("Newline\n"); ++cod; if (cod[0] != 0xff) printf("%d backrefs\n", cod[0] + 1); ++cod; for (;;) { switch (*cod++) { case Re_Open: printf("Open"); break; case Re_Close: printf("Close"); break; case Re_Update: printf("Update (%d)", (int)*cod++); break; case Re_Alt: printf("Alt"); i = cod[0] | cod[1]; cod += 2; printf(" %d", i); break; case Re_AltNext: printf("Alt-next"); i = cod[0] | cod[1]; cod += 2; printf(" %d", i); break; case Re_AltDone: printf("Alt-done"); break; case Re_AnyTimes: printf("-> Anytimes %d", (int)*cod++); i = cod[0] | (cod[1] << 8); cod += 2; printf(" /%d", i); break; case Re_AnyEatAnyTimes: printf("Any-eat-anytimes"); break; case Re_AnyAnyTimes: printf("-> Any-anytimes %d", (int)*cod++); printf(" (%d)", (int)*cod++); i = cod[0] | (cod[1] << 8); cod += 2; printf(" /%d", i); break; case Re_AnyEatMaybe: printf("Any-eat-maybe"); break; case Re_AnyMaybe: printf("-> Any-maybe %d", (int)*cod++); printf(" (%d)", (int)*cod++); i = cod[0] | (cod[1] << 8); cod += 2; printf(" /%d", i); break; case Re_AnyAtLeast: printf("-> Any-atleast %d", (int)*cod++); printf(" (%d)", (int)*cod++); i = cod[0] | (cod[1] << 8); cod += 2; printf(" /%d", i); break; case Re_AnyEatAtLeast: printf("Any-eat-atleast"); break; case Re_Maybe: printf("-> Maybe %d", (int)*cod++); i = cod[0] | (cod[1] << 8); cod += 2; printf(" /%d", i); break; case Re_AtLeast: printf("-> Atleast %d", (int)*cod++); i = cod[0] | (cod[1] << 8); cod += 2; printf(" /%d", i); break; case Re_Exact: printf("-> Exact "); i = *cod++; printf("%d", i); printf(" %d", (int)*cod++); i = cod[0] | (cod[1] << 8); cod += 2; printf(" /%d", i); break; case Re_Min: printf("-> Min "); i = *cod++; printf("%d", i); printf(" %d", (int)*cod++); i = cod[0] | (cod[1] << 8); cod += 2; printf(" /%d", i); break; case Re_Max: printf("-> Max "); i = *cod++; printf("%d", i); printf(" %d", (int)*cod++); i = cod[0] | (cod[1] << 8); cod += 2; printf(" /%d", i); break; case Re_MinMax: printf("-> Min-max "); i = *cod++; printf("%d ", i); i = *cod++; printf("%d", i); printf(" %d", (int)*cod++); i = cod[0] | (cod[1] << 8); cod += 2; printf(" /%d", i); break; case Re_RepJump: printf("<- Rep-jump %d ", (int)*cod++); i = *cod++; printf("%d", i); break; case Re_RepLongJump: printf("<- Rep-long-jump %d ", (int)*cod++); i = cod[0] | (cod[1] << 8); printf("%d", i); break; case Re_Any: printf("Any"); break; case Re_Odigit: printf("Odigit"); break; case Re_OdigitNot: printf("Odigit-not"); break; case Re_Digit: printf("Digit"); break; case Re_DigitNot: printf("Digit-not"); break; case Re_Xdigit: printf("Xdigit"); break; case Re_XdigitNot: printf("Xdigit-not"); break; case Re_Space: printf("Space"); break; case Re_SpaceNot: printf("Space-not"); break; case Re_Tab: printf("Tab"); break; case Re_Newline: printf("Newline"); break; case Re_Lower: printf("Lower"); break; case Re_Upper: printf("Upper"); break; case Re_Alnum: printf("Alnum"); break; case Re_AlnumNot: printf("Alnum-not"); break; case Re_Control: printf("Control"); break; case Re_ControlNot: printf("Control-not"); break; case Re_Bol: printf("Bol"); break; case Re_Eol: printf("Eol"); break; case Re_Bow: printf("Bow"); break; case Re_Eow: printf("Eow"); break; case Re_Range: printf("Range "); goto range; case Re_RangeNot: printf("Range-not "); range: for (i = 0; i < 256; i += 32) { for (j = k = 0; j < 32; j++) k |= (*cod++ & 1) << (31 - j); printf("%x ", k); } break; case Re_Literal: printf("Literal %c", *cod++); break; case Re_LiteralNot: printf("Literal-not %c", *cod++); break; case Re_SearchLiteral: printf("Search-literal %c", *cod++); break; case Re_CaseLiteral: printf("Case-literal %c", *cod++); putchar(*cod++); break; case Re_CaseLiteralNot: printf("Case-literal-not %c", *cod++); putchar(*cod++); break; case Re_SearchCaseLiteral: printf("Search-case-literal %c", *cod++); putchar(*cod++); break; case Re_String: printf("String "); goto string; case Re_SearchString: printf("Search-string "); goto string; case Re_CaseString: printf("Case-string "); goto string; case Re_SearchCaseString: printf("Search-case-string "); string: i = *cod++; if (i & 0x80) i = (i & 0x7f) | (*cod++ << 7); for (j = 0; j < i; j++) putchar(*cod++); break; case Re_StringList: printf("String-list"); goto string_list; case Re_CaseStringList: printf("Case-string-list"); string_list: j = *cod++; cod += 2; stl = cod + j; for (i = 0; i < j; i++) { k = *cod++; putchar(i ? ',' : ' '); fwrite(stl, k, 1, stdout); stl += k; } cod = stl; break; case Re_LargeStringList: printf("Large-string-list"); large_string_list: i = cod[0] | (cod[1] << 8); stl = cod + i - 1; for (i = 0, cod += 514; cod < stl; i++) { k = *cod++; putchar(i ? ',' : ' '); fwrite(cod, k, 1, stdout); cod += k; } cod = stl; break; case Re_LargeCaseStringList: printf("Large-case-string-list"); goto large_string_list; case Re_Backref: printf("Backref %d", (int)*cod++); break; case Re_DoneIf: printf("Done-if"); break; case Re_MaybeDone: printf("Maybe-done"); break; case Re_Done: printf("Done\n"); return; } putchar('\n'); } } #endif