diff options
Diffstat (limited to 'lisp/re/reo.c')
-rw-r--r-- | lisp/re/reo.c | 685 |
1 files changed, 685 insertions, 0 deletions
diff --git a/lisp/re/reo.c b/lisp/re/reo.c new file mode 100644 index 0000000..59cbf3b --- /dev/null +++ b/lisp/re/reo.c @@ -0,0 +1,685 @@ +/* + * Copyright (c) 2002 by The XFree86 Project, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF + * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * Except as contained in this notice, the name of the XFree86 Project shall + * not be used in advertising or otherwise to promote the sale, use or other + * dealings in this Software without prior written authorization from the + * XFree86 Project. + * + * Author: Paulo César Pereira de Andrade + */ + +/* $XFree86: xc/programs/xedit/lisp/re/reo.c,v 1.9 2002/11/15 07:01:33 paulo Exp $ */ + +#include "rep.h" + +/* + * This file is a placeholder to add code to analyse and optimize the + * intermediate data structure generated in rep.c. + * Character ranges are optimized while being generated. + */ + +/* + * Types + */ +typedef struct _orec_inf { + rec_alt *alt; /* Main alternatives list */ + rec_grp *grp; /* Current group pointer */ + int flags; + int ecode; +} orec_inf; + +/* + * Prototypes + */ +static int orec_alt(orec_inf*, rec_alt*); +static int orec_pat(orec_inf*, rec_pat*); +static int orec_grp(orec_inf*, rec_grp*); +static int orec_pat_bad_rpt(orec_inf*, rec_pat*); +static int orec_pat_bad_forward_rpt(orec_inf*, rec_pat*); +static int orec_pat_rng(orec_inf*, rec_pat*); +static int orec_pat_cse(orec_inf*, rec_pat*); +static int orec_pat_cse_can(orec_inf*, rec_pat*); +static int orec_str_list(orec_inf*, rec_alt*, int, int); + +/* + * Initialization + */ +extern unsigned char re__alnum[256]; +extern unsigned char re__odigit[256]; +extern unsigned char re__ddigit[256]; +extern unsigned char re__xdigit[256]; +extern unsigned char re__control[256]; + +/* + * Implementation + */ +int +orec_comp(rec_alt *alt, int flags) +{ + orec_inf inf; + + inf.alt = alt; + inf.grp = NULL; + inf.flags = flags; + inf.ecode = 0; + + orec_alt(&inf, alt); + + return (inf.ecode); +} + +void +orec_free_stl(rec_stl *stl) +{ + int i; + + for (i = 0; i < stl->nstrs; i++) { + if (stl->lens[i] > 2) + free(stl->strs[i]); + } + + free(stl->lens); + free(stl->strs); + free(stl); +} + + +static int +orec_alt(orec_inf *inf, rec_alt *alt) +{ + if (alt) { + rec_alt *ptr = alt; + int ret, count = 0, str = 1, cstr = 1, lits = 0, clits = 0; + + /* Check if can build a string list */ + if (ptr->next) { + /* If more than one alternative */ + while (ptr && (str || cstr)) { + if (ptr->pat == NULL || ptr->pat->rep != NULL) { + cstr = str = 0; + break; + } + if ((inf->flags & RE_ICASE)) { + if (!(ret = orec_pat_cse_can(inf, ptr->pat))) { + cstr = str = 0; + break; + } + if (ret == 1) + ++lits; + else if (ret == 2) + ++clits; + } + else if (ptr->pat->next == NULL) { + if (ptr->pat->type != Rep_String) { + if (ptr->pat->type != Rep_Literal) { + str = 0; + if (ptr->pat->type != Rep_CaseString) { + if (ptr->pat->type != Rep_CaseLiteral) + cstr = 0; + else + ++clits; + } + else if (strlen((char*)ptr->pat->data.str) >= 255) + str = cstr = 0; + } + else + ++lits; + } + else if (strlen((char*)ptr->pat->data.str) >= 255) + str = cstr = 0; + } + else { + str = cstr = 0; + break; + } + if (++count >= 255) + str = cstr = 0; + ptr = ptr->next; + } + + if (str || cstr) { + if (inf->flags & RE_ICASE) { + for (ptr = alt; ptr; ptr = ptr->next) { + if (orec_pat_cse(inf, ptr->pat)) + return (inf->ecode); + } + str = 0; + } + return (orec_str_list(inf, alt, str, count)); + } + } + else if (alt == inf->alt && alt->pat && alt->pat->rep == NULL) { + /* If the toplevel single alternative */ + switch (alt->pat->type) { + /* One of these will always be true for RE_NOSPEC, + * but can also be optimized for simple patterns */ + case Rep_Literal: + alt->pat->type = Rep_SearchLiteral; + break; + case Rep_CaseLiteral: + alt->pat->type = Rep_SearchCaseLiteral; + break; + case Rep_String: + alt->pat->type = Rep_SearchString; + break; + case Rep_CaseString: + alt->pat->type = Rep_SearchCaseString; + break; + default: + break; + } + } + + while (alt) { + orec_pat(inf, alt->pat); + alt = alt->next; + } + } + + return (inf->ecode); +} + +static int +orec_pat(orec_inf *inf, rec_pat *pat) +{ + rec_pat *next; + + while (pat) { + switch (pat->type) { + case Rep_AnyAnyTimes: + if (pat->next == NULL) { + rec_grp *grp = inf->grp; + + next = NULL; + while (grp) { + next = grp->parent->next; + /* Cannot check if is .*$ as the input + * may be a substring */ + if (next) + break; + grp = grp->pgrp; + } + if (next == NULL) { + /* <re>.* */ + pat->type = Rep_AnyEatAnyTimes; + grp = inf->grp; + while (grp) { + --grp->comp; + next = grp->parent->next; + if (next) + break; + grp = grp->pgrp; + } + } + else if (orec_pat_bad_rpt(inf, next)) + return (inf->ecode); + } + else if (orec_pat_bad_rpt(inf, pat->next)) + return (inf->ecode); + break; + case Rep_AnyMaybe: + if (pat->next == NULL) { + rec_grp *grp = inf->grp; + + next = NULL; + while (grp) { + next = grp->parent->next; + if (next) + break; + grp = grp->pgrp; + } + if (next == NULL) { + /* <re>.? */ + pat->type = Rep_AnyEatMaybe; + grp = inf->grp; + while (grp) { + --grp->comp; + next = grp->parent->next; + if (next) + break; + grp = grp->pgrp; + } + } + else if (orec_pat_bad_rpt(inf, next)) + return (inf->ecode); + } + else if (orec_pat_bad_rpt(inf, pat->next)) + return (inf->ecode); + break; + case Rep_AnyAtLeast: + if (pat->next == NULL) { + rec_grp *grp = inf->grp; + + next = NULL; + while (grp) { + next = grp->parent->next; + if (next) + break; + grp = grp->pgrp; + } + if (next == NULL) { + /* <re>.+ */ + pat->type = Rep_AnyEatAtLeast; + grp = inf->grp; + while (grp) { + --grp->comp; + next = grp->parent->next; + if (next) + break; + grp = grp->pgrp; + } + } + else if (orec_pat_bad_rpt(inf, next)) + return (inf->ecode); + } + else if (orec_pat_bad_rpt(inf, pat->next)) + return (inf->ecode); + break; + case Rep_Range: + case Rep_RangeNot: + orec_pat_rng(inf, pat); + break; + case Rep_Group: + orec_grp(inf, pat->data.grp); + break; + default: + break; + } + pat = pat->next; + } + + return (inf->ecode); +} + +static int +orec_pat_bad_rpt(orec_inf *inf, rec_pat *pat) +{ + switch (pat->type) { + /* Not really an error, but aren't supported by the library. + * Includes: .*.*, .+<re>? .*<re>*, (.*)(<re>*), etc. + */ + + /* Not a repetition, but mathes anything... */ + case Rep_Any: + + /* Zero length matches */ + case Rep_Eol: + if (!(inf->flags & RE_NEWLINE)) + break; + case Rep_Bol: + case Rep_Bow: + case Rep_Eow: + + /* Repetitions */ + case Rep_AnyAnyTimes: + case Rep_AnyMaybe: + case Rep_AnyAtLeast: + inf->ecode = RE_BADRPT; + break; + + /* Check if the first group element is a complex pattern */ + case Rep_Group: + if (pat->rep == NULL) { + if (pat->data.grp->alt) { + for (pat = pat->data.grp->alt->pat; pat; pat = pat->next) { + if (orec_pat_bad_rpt(inf, pat)) + break; + } + } + break; + } + /*FALLTHROUGH*/ + default: + if (pat->rep) + inf->ecode = RE_BADRPT; + break; + } + + if (!inf->ecode && pat && pat->next) + orec_pat_bad_forward_rpt(inf, pat->next); + + return (inf->ecode); +} + +static int +orec_pat_bad_forward_rpt(orec_inf *inf, rec_pat *pat) +{ + if (pat->rep) { + switch (pat->rep->type) { + case Rer_MinMax: + if (pat->rep->mine > 0) + break; + case Rer_AnyTimes: + case Rer_Maybe: + case Rer_Max: + inf->ecode = RE_BADRPT; + default: + break; + } + } + else if (pat->type == Rep_Group && + pat->data.grp->alt && + pat->data.grp->alt->pat) + orec_pat_bad_forward_rpt(inf, pat->data.grp->alt->pat); + + return (inf->ecode); +} + +static int +orec_grp(orec_inf *inf, rec_grp *grp) +{ + rec_grp *prev = inf->grp; + + inf->grp = grp; + orec_alt(inf, grp->alt); + /* Could also just say: inf->grp = grp->gparent */ + inf->grp = prev; + + return (inf->ecode); +} + +static int +orec_pat_rng(orec_inf *inf, rec_pat *pat) +{ + int i, j[2], count; + rec_pat_t type = pat->type; + unsigned char *range = pat->data.rng->range; + + for (i = count = j[0] = j[1] = 0; i < 256; i++) { + if (range[i]) { + if (count == 2) { + ++count; + break; + } + j[count++] = i; + } + } + + if (count == 1 || + (count == 2 && + ((islower(j[0]) && toupper(j[0]) == j[1]) || + (isupper(j[0]) && tolower(j[0]) == j[1])))) { + free(pat->data.rng); + if (count == 1) { + pat->data.chr = j[0]; + pat->type = type == Rep_Range ? Rep_Literal : Rep_LiteralNot; + } + else { + pat->data.cse.upper = j[0]; + pat->data.cse.lower = j[1]; + pat->type = type == Rep_Range ? Rep_CaseLiteral : Rep_CaseLiteralNot; + } + } + else { + if (memcmp(re__alnum, range, 256) == 0) + type = type == Rep_Range ? Rep_Alnum : Rep_AlnumNot; + else if (memcmp(re__odigit, range, 256) == 0) + type = type == Rep_Range ? Rep_Odigit : Rep_OdigitNot; + else if (memcmp(re__ddigit, range, 256) == 0) + type = type == Rep_Range ? Rep_Digit : Rep_DigitNot; + else if (memcmp(re__xdigit, range, 256) == 0) + type = type == Rep_Range ? Rep_Xdigit : Rep_XdigitNot; + else if (memcmp(re__control, range, 256) == 0) + type = type == Rep_Range ? Rep_Control : Rep_ControlNot; + + if (type != pat->type) { + free(pat->data.rng); + pat->type = type; + } + } + + return (inf->ecode); +} + +/* Join patterns if required, will only fail on memory allocation failure: + */ +static int +orec_pat_cse(orec_inf *inf, rec_pat *pat) +{ + rec_pat_t type; + int i, len, length; + rec_pat *ptr, *next; + unsigned char *str, *tofree; + + if (pat->next == NULL && pat->type == Rep_CaseString) + return (inf->ecode); + + type = Rep_CaseString; + + /* First calculate how many bytes will be required */ + for (ptr = pat, length = 1; ptr; ptr = ptr->next) { + switch (ptr->type) { + case Rep_Literal: + length += 2; + break; + case Rep_String: + length += strlen((char*)ptr->data.str) << 1; + break; + case Rep_CaseLiteral: + length += 2; + break; + case Rep_CaseString: + length += strlen((char*)ptr->data.str); + break; + default: + break; + } + } + + if ((str = malloc(length)) == NULL) + return (inf->ecode = RE_ESPACE); + + for (ptr = pat, length = 0; ptr; ptr = next) { + tofree = NULL; + next = ptr->next; + switch (ptr->type) { + case Rep_Literal: + str[length++] = ptr->data.chr; + str[length++] = ptr->data.chr; + break; + case Rep_String: + tofree = ptr->data.str; + len = strlen((char*)tofree); + for (i = 0; i < len; i++) { + str[length++] = tofree[i]; + str[length++] = tofree[i]; + } + break; + case Rep_CaseLiteral: + str[length++] = ptr->data.cse.lower; + str[length++] = ptr->data.cse.upper; + break; + case Rep_CaseString: + tofree = ptr->data.str; + len = strlen((char*)tofree); + memcpy(str + length, tofree, len); + length += len; + break; + default: + break; + } + if (tofree) + free(tofree); + if (ptr != pat) + free(ptr); + } + str[length] = '\0'; + + pat->type = type; + pat->data.str = str; + pat->next = NULL; + + return (inf->ecode); +} + +/* Return 0 if the patterns in the list cannot be merged, 1 if will + * be a simple string, 2 if a case string. + * This is useful when building an alternative list that is composed + * only of strings, but the regex is case insensitive, in wich case + * the first pass may have splited some patterns, but if it is a member + * of an alternatives list, the cost of using a string list is smaller */ +static int +orec_pat_cse_can(orec_inf *inf, rec_pat *pat) +{ + int ret; + + if (pat == NULL) + return (0); + + for (ret = 1; pat; pat = pat->next) { + if (pat->rep) + return (0); + switch (pat->type) { + case Rep_Literal: + case Rep_String: + break; + case Rep_CaseLiteral: + case Rep_CaseString: + ret = 2; + break; + default: + return (0); + } + } + + return (ret); +} + + +/* XXX If everything is a (case) byte, the pattern should be + * [abcde] instead of a|b|c|d|e (or [aAbBcCdDeE] instead of aA|bB|cC|dD|eE) + * as a string list works fine, but as a character range + * should be faster, and maybe could be converted here. But not + * very important, if performance is required, it should have already + * been done in the pattern. + */ +static int +orec_str_list(orec_inf *inf, rec_alt *alt, int str, int count) +{ + rec_stl *stl; + rec_pat *pat; + rec_alt *ptr, *next; + int i, j, tlen, len, is; + + if ((stl = calloc(1, sizeof(rec_stl))) == NULL) + return (inf->ecode = RE_ESPACE); + + if ((stl->lens = malloc(sizeof(unsigned char) * count)) == NULL) { + free(stl); + return (inf->ecode = RE_ESPACE); + } + + if ((stl->strs = malloc(sizeof(char*) * count)) == NULL) { + free(stl->lens); + free(stl); + return (inf->ecode = RE_ESPACE); + } + + if ((pat = calloc(1, sizeof(rec_pat))) == NULL) { + free(stl->strs); + free(stl->lens); + free(stl); + return (inf->ecode = RE_ESPACE); + } + + pat->data.stl = stl; + pat->type = Rep_StringList; + stl->type = str ? Resl_StringList : Resl_CaseStringList; + for (i = tlen = 0, ptr = alt; i < count; i++) { + next = ptr->next; + switch (ptr->pat->type) { + case Rep_Literal: + is = len = 1; + break; + case Rep_CaseLiteral: + is = len = 2; + break; + default: + is = 0; + len = strlen((char*)ptr->pat->data.str); + break; + } + tlen += len; + stl->lens[i] = len; + if (!is) { + if (len > 2) + stl->strs[i] = ptr->pat->data.str; + else { + if (len == 1) + stl->strs[i] = (void*)(long)(ptr->pat->data.str[0]); + else + stl->strs[i] = (void*)(long) + (ptr->pat->data.str[0] | + ((int)ptr->pat->data.str[1] << 8)); + free(ptr->pat->data.str); + } + } + else { + if (is == 1) + stl->strs[i] = (void*)(long)ptr->pat->data.chr; + else + stl->strs[i] = (void*)(long) + (ptr->pat->data.cse.lower | + (ptr->pat->data.cse.upper << 8)); + } + free(ptr->pat); + if (i) + free(ptr); + ptr = next; + } + stl->tlen = tlen; + stl->nstrs = count; + + alt->pat = pat; + alt->next = NULL; + + { + int li, lj; + unsigned char ci, cj, *str; + + /* Don't need a stable sort, there shouldn't be duplicated strings, + * but don't check for it either. Only need to make sure that all + * strings that start with the same byte are together */ + for (i = 0; i < count; i++) { + li = stl->lens[i]; + ci = li > 2 ? stl->strs[i][0] : (long)stl->strs[i] & 0xff; + for (j = i + 1; j < count; j++) { + lj = stl->lens[j]; + cj = lj > 2 ? stl->strs[j][0] : (long)stl->strs[j] & 0xff; + if ((count >= LARGE_STL_COUNT && cj < ci) || + (cj == ci && lj > li)) { + /* If both strings start with the same byte, + * put the longer first */ + str = stl->strs[j]; + stl->strs[j] = stl->strs[i]; + stl->strs[i] = str; + stl->lens[j] = li; + stl->lens[i] = lj; + li ^= lj; lj ^= li; li ^= lj; + ci ^= cj; cj ^= ci; ci ^= cj; + } + } + } + } + + return (inf->ecode); +} |