summaryrefslogtreecommitdiff
path: root/lisp/re/reo.c
diff options
context:
space:
mode:
Diffstat (limited to 'lisp/re/reo.c')
-rw-r--r--lisp/re/reo.c685
1 files changed, 685 insertions, 0 deletions
diff --git a/lisp/re/reo.c b/lisp/re/reo.c
new file mode 100644
index 0000000..59cbf3b
--- /dev/null
+++ b/lisp/re/reo.c
@@ -0,0 +1,685 @@
+/*
+ * Copyright (c) 2002 by The XFree86 Project, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Except as contained in this notice, the name of the XFree86 Project shall
+ * not be used in advertising or otherwise to promote the sale, use or other
+ * dealings in this Software without prior written authorization from the
+ * XFree86 Project.
+ *
+ * Author: Paulo César Pereira de Andrade
+ */
+
+/* $XFree86: xc/programs/xedit/lisp/re/reo.c,v 1.9 2002/11/15 07:01:33 paulo Exp $ */
+
+#include "rep.h"
+
+/*
+ * This file is a placeholder to add code to analyse and optimize the
+ * intermediate data structure generated in rep.c.
+ * Character ranges are optimized while being generated.
+ */
+
+/*
+ * Types
+ */
+typedef struct _orec_inf {
+ rec_alt *alt; /* Main alternatives list */
+ rec_grp *grp; /* Current group pointer */
+ int flags;
+ int ecode;
+} orec_inf;
+
+/*
+ * Prototypes
+ */
+static int orec_alt(orec_inf*, rec_alt*);
+static int orec_pat(orec_inf*, rec_pat*);
+static int orec_grp(orec_inf*, rec_grp*);
+static int orec_pat_bad_rpt(orec_inf*, rec_pat*);
+static int orec_pat_bad_forward_rpt(orec_inf*, rec_pat*);
+static int orec_pat_rng(orec_inf*, rec_pat*);
+static int orec_pat_cse(orec_inf*, rec_pat*);
+static int orec_pat_cse_can(orec_inf*, rec_pat*);
+static int orec_str_list(orec_inf*, rec_alt*, int, int);
+
+/*
+ * Initialization
+ */
+extern unsigned char re__alnum[256];
+extern unsigned char re__odigit[256];
+extern unsigned char re__ddigit[256];
+extern unsigned char re__xdigit[256];
+extern unsigned char re__control[256];
+
+/*
+ * Implementation
+ */
+int
+orec_comp(rec_alt *alt, int flags)
+{
+ orec_inf inf;
+
+ inf.alt = alt;
+ inf.grp = NULL;
+ inf.flags = flags;
+ inf.ecode = 0;
+
+ orec_alt(&inf, alt);
+
+ return (inf.ecode);
+}
+
+void
+orec_free_stl(rec_stl *stl)
+{
+ int i;
+
+ for (i = 0; i < stl->nstrs; i++) {
+ if (stl->lens[i] > 2)
+ free(stl->strs[i]);
+ }
+
+ free(stl->lens);
+ free(stl->strs);
+ free(stl);
+}
+
+
+static int
+orec_alt(orec_inf *inf, rec_alt *alt)
+{
+ if (alt) {
+ rec_alt *ptr = alt;
+ int ret, count = 0, str = 1, cstr = 1, lits = 0, clits = 0;
+
+ /* Check if can build a string list */
+ if (ptr->next) {
+ /* If more than one alternative */
+ while (ptr && (str || cstr)) {
+ if (ptr->pat == NULL || ptr->pat->rep != NULL) {
+ cstr = str = 0;
+ break;
+ }
+ if ((inf->flags & RE_ICASE)) {
+ if (!(ret = orec_pat_cse_can(inf, ptr->pat))) {
+ cstr = str = 0;
+ break;
+ }
+ if (ret == 1)
+ ++lits;
+ else if (ret == 2)
+ ++clits;
+ }
+ else if (ptr->pat->next == NULL) {
+ if (ptr->pat->type != Rep_String) {
+ if (ptr->pat->type != Rep_Literal) {
+ str = 0;
+ if (ptr->pat->type != Rep_CaseString) {
+ if (ptr->pat->type != Rep_CaseLiteral)
+ cstr = 0;
+ else
+ ++clits;
+ }
+ else if (strlen((char*)ptr->pat->data.str) >= 255)
+ str = cstr = 0;
+ }
+ else
+ ++lits;
+ }
+ else if (strlen((char*)ptr->pat->data.str) >= 255)
+ str = cstr = 0;
+ }
+ else {
+ str = cstr = 0;
+ break;
+ }
+ if (++count >= 255)
+ str = cstr = 0;
+ ptr = ptr->next;
+ }
+
+ if (str || cstr) {
+ if (inf->flags & RE_ICASE) {
+ for (ptr = alt; ptr; ptr = ptr->next) {
+ if (orec_pat_cse(inf, ptr->pat))
+ return (inf->ecode);
+ }
+ str = 0;
+ }
+ return (orec_str_list(inf, alt, str, count));
+ }
+ }
+ else if (alt == inf->alt && alt->pat && alt->pat->rep == NULL) {
+ /* If the toplevel single alternative */
+ switch (alt->pat->type) {
+ /* One of these will always be true for RE_NOSPEC,
+ * but can also be optimized for simple patterns */
+ case Rep_Literal:
+ alt->pat->type = Rep_SearchLiteral;
+ break;
+ case Rep_CaseLiteral:
+ alt->pat->type = Rep_SearchCaseLiteral;
+ break;
+ case Rep_String:
+ alt->pat->type = Rep_SearchString;
+ break;
+ case Rep_CaseString:
+ alt->pat->type = Rep_SearchCaseString;
+ break;
+ default:
+ break;
+ }
+ }
+
+ while (alt) {
+ orec_pat(inf, alt->pat);
+ alt = alt->next;
+ }
+ }
+
+ return (inf->ecode);
+}
+
+static int
+orec_pat(orec_inf *inf, rec_pat *pat)
+{
+ rec_pat *next;
+
+ while (pat) {
+ switch (pat->type) {
+ case Rep_AnyAnyTimes:
+ if (pat->next == NULL) {
+ rec_grp *grp = inf->grp;
+
+ next = NULL;
+ while (grp) {
+ next = grp->parent->next;
+ /* Cannot check if is .*$ as the input
+ * may be a substring */
+ if (next)
+ break;
+ grp = grp->pgrp;
+ }
+ if (next == NULL) {
+ /* <re>.* */
+ pat->type = Rep_AnyEatAnyTimes;
+ grp = inf->grp;
+ while (grp) {
+ --grp->comp;
+ next = grp->parent->next;
+ if (next)
+ break;
+ grp = grp->pgrp;
+ }
+ }
+ else if (orec_pat_bad_rpt(inf, next))
+ return (inf->ecode);
+ }
+ else if (orec_pat_bad_rpt(inf, pat->next))
+ return (inf->ecode);
+ break;
+ case Rep_AnyMaybe:
+ if (pat->next == NULL) {
+ rec_grp *grp = inf->grp;
+
+ next = NULL;
+ while (grp) {
+ next = grp->parent->next;
+ if (next)
+ break;
+ grp = grp->pgrp;
+ }
+ if (next == NULL) {
+ /* <re>.? */
+ pat->type = Rep_AnyEatMaybe;
+ grp = inf->grp;
+ while (grp) {
+ --grp->comp;
+ next = grp->parent->next;
+ if (next)
+ break;
+ grp = grp->pgrp;
+ }
+ }
+ else if (orec_pat_bad_rpt(inf, next))
+ return (inf->ecode);
+ }
+ else if (orec_pat_bad_rpt(inf, pat->next))
+ return (inf->ecode);
+ break;
+ case Rep_AnyAtLeast:
+ if (pat->next == NULL) {
+ rec_grp *grp = inf->grp;
+
+ next = NULL;
+ while (grp) {
+ next = grp->parent->next;
+ if (next)
+ break;
+ grp = grp->pgrp;
+ }
+ if (next == NULL) {
+ /* <re>.+ */
+ pat->type = Rep_AnyEatAtLeast;
+ grp = inf->grp;
+ while (grp) {
+ --grp->comp;
+ next = grp->parent->next;
+ if (next)
+ break;
+ grp = grp->pgrp;
+ }
+ }
+ else if (orec_pat_bad_rpt(inf, next))
+ return (inf->ecode);
+ }
+ else if (orec_pat_bad_rpt(inf, pat->next))
+ return (inf->ecode);
+ break;
+ case Rep_Range:
+ case Rep_RangeNot:
+ orec_pat_rng(inf, pat);
+ break;
+ case Rep_Group:
+ orec_grp(inf, pat->data.grp);
+ break;
+ default:
+ break;
+ }
+ pat = pat->next;
+ }
+
+ return (inf->ecode);
+}
+
+static int
+orec_pat_bad_rpt(orec_inf *inf, rec_pat *pat)
+{
+ switch (pat->type) {
+ /* Not really an error, but aren't supported by the library.
+ * Includes: .*.*, .+<re>? .*<re>*, (.*)(<re>*), etc.
+ */
+
+ /* Not a repetition, but mathes anything... */
+ case Rep_Any:
+
+ /* Zero length matches */
+ case Rep_Eol:
+ if (!(inf->flags & RE_NEWLINE))
+ break;
+ case Rep_Bol:
+ case Rep_Bow:
+ case Rep_Eow:
+
+ /* Repetitions */
+ case Rep_AnyAnyTimes:
+ case Rep_AnyMaybe:
+ case Rep_AnyAtLeast:
+ inf->ecode = RE_BADRPT;
+ break;
+
+ /* Check if the first group element is a complex pattern */
+ case Rep_Group:
+ if (pat->rep == NULL) {
+ if (pat->data.grp->alt) {
+ for (pat = pat->data.grp->alt->pat; pat; pat = pat->next) {
+ if (orec_pat_bad_rpt(inf, pat))
+ break;
+ }
+ }
+ break;
+ }
+ /*FALLTHROUGH*/
+ default:
+ if (pat->rep)
+ inf->ecode = RE_BADRPT;
+ break;
+ }
+
+ if (!inf->ecode && pat && pat->next)
+ orec_pat_bad_forward_rpt(inf, pat->next);
+
+ return (inf->ecode);
+}
+
+static int
+orec_pat_bad_forward_rpt(orec_inf *inf, rec_pat *pat)
+{
+ if (pat->rep) {
+ switch (pat->rep->type) {
+ case Rer_MinMax:
+ if (pat->rep->mine > 0)
+ break;
+ case Rer_AnyTimes:
+ case Rer_Maybe:
+ case Rer_Max:
+ inf->ecode = RE_BADRPT;
+ default:
+ break;
+ }
+ }
+ else if (pat->type == Rep_Group &&
+ pat->data.grp->alt &&
+ pat->data.grp->alt->pat)
+ orec_pat_bad_forward_rpt(inf, pat->data.grp->alt->pat);
+
+ return (inf->ecode);
+}
+
+static int
+orec_grp(orec_inf *inf, rec_grp *grp)
+{
+ rec_grp *prev = inf->grp;
+
+ inf->grp = grp;
+ orec_alt(inf, grp->alt);
+ /* Could also just say: inf->grp = grp->gparent */
+ inf->grp = prev;
+
+ return (inf->ecode);
+}
+
+static int
+orec_pat_rng(orec_inf *inf, rec_pat *pat)
+{
+ int i, j[2], count;
+ rec_pat_t type = pat->type;
+ unsigned char *range = pat->data.rng->range;
+
+ for (i = count = j[0] = j[1] = 0; i < 256; i++) {
+ if (range[i]) {
+ if (count == 2) {
+ ++count;
+ break;
+ }
+ j[count++] = i;
+ }
+ }
+
+ if (count == 1 ||
+ (count == 2 &&
+ ((islower(j[0]) && toupper(j[0]) == j[1]) ||
+ (isupper(j[0]) && tolower(j[0]) == j[1])))) {
+ free(pat->data.rng);
+ if (count == 1) {
+ pat->data.chr = j[0];
+ pat->type = type == Rep_Range ? Rep_Literal : Rep_LiteralNot;
+ }
+ else {
+ pat->data.cse.upper = j[0];
+ pat->data.cse.lower = j[1];
+ pat->type = type == Rep_Range ? Rep_CaseLiteral : Rep_CaseLiteralNot;
+ }
+ }
+ else {
+ if (memcmp(re__alnum, range, 256) == 0)
+ type = type == Rep_Range ? Rep_Alnum : Rep_AlnumNot;
+ else if (memcmp(re__odigit, range, 256) == 0)
+ type = type == Rep_Range ? Rep_Odigit : Rep_OdigitNot;
+ else if (memcmp(re__ddigit, range, 256) == 0)
+ type = type == Rep_Range ? Rep_Digit : Rep_DigitNot;
+ else if (memcmp(re__xdigit, range, 256) == 0)
+ type = type == Rep_Range ? Rep_Xdigit : Rep_XdigitNot;
+ else if (memcmp(re__control, range, 256) == 0)
+ type = type == Rep_Range ? Rep_Control : Rep_ControlNot;
+
+ if (type != pat->type) {
+ free(pat->data.rng);
+ pat->type = type;
+ }
+ }
+
+ return (inf->ecode);
+}
+
+/* Join patterns if required, will only fail on memory allocation failure:
+ */
+static int
+orec_pat_cse(orec_inf *inf, rec_pat *pat)
+{
+ rec_pat_t type;
+ int i, len, length;
+ rec_pat *ptr, *next;
+ unsigned char *str, *tofree;
+
+ if (pat->next == NULL && pat->type == Rep_CaseString)
+ return (inf->ecode);
+
+ type = Rep_CaseString;
+
+ /* First calculate how many bytes will be required */
+ for (ptr = pat, length = 1; ptr; ptr = ptr->next) {
+ switch (ptr->type) {
+ case Rep_Literal:
+ length += 2;
+ break;
+ case Rep_String:
+ length += strlen((char*)ptr->data.str) << 1;
+ break;
+ case Rep_CaseLiteral:
+ length += 2;
+ break;
+ case Rep_CaseString:
+ length += strlen((char*)ptr->data.str);
+ break;
+ default:
+ break;
+ }
+ }
+
+ if ((str = malloc(length)) == NULL)
+ return (inf->ecode = RE_ESPACE);
+
+ for (ptr = pat, length = 0; ptr; ptr = next) {
+ tofree = NULL;
+ next = ptr->next;
+ switch (ptr->type) {
+ case Rep_Literal:
+ str[length++] = ptr->data.chr;
+ str[length++] = ptr->data.chr;
+ break;
+ case Rep_String:
+ tofree = ptr->data.str;
+ len = strlen((char*)tofree);
+ for (i = 0; i < len; i++) {
+ str[length++] = tofree[i];
+ str[length++] = tofree[i];
+ }
+ break;
+ case Rep_CaseLiteral:
+ str[length++] = ptr->data.cse.lower;
+ str[length++] = ptr->data.cse.upper;
+ break;
+ case Rep_CaseString:
+ tofree = ptr->data.str;
+ len = strlen((char*)tofree);
+ memcpy(str + length, tofree, len);
+ length += len;
+ break;
+ default:
+ break;
+ }
+ if (tofree)
+ free(tofree);
+ if (ptr != pat)
+ free(ptr);
+ }
+ str[length] = '\0';
+
+ pat->type = type;
+ pat->data.str = str;
+ pat->next = NULL;
+
+ return (inf->ecode);
+}
+
+/* Return 0 if the patterns in the list cannot be merged, 1 if will
+ * be a simple string, 2 if a case string.
+ * This is useful when building an alternative list that is composed
+ * only of strings, but the regex is case insensitive, in wich case
+ * the first pass may have splited some patterns, but if it is a member
+ * of an alternatives list, the cost of using a string list is smaller */
+static int
+orec_pat_cse_can(orec_inf *inf, rec_pat *pat)
+{
+ int ret;
+
+ if (pat == NULL)
+ return (0);
+
+ for (ret = 1; pat; pat = pat->next) {
+ if (pat->rep)
+ return (0);
+ switch (pat->type) {
+ case Rep_Literal:
+ case Rep_String:
+ break;
+ case Rep_CaseLiteral:
+ case Rep_CaseString:
+ ret = 2;
+ break;
+ default:
+ return (0);
+ }
+ }
+
+ return (ret);
+}
+
+
+/* XXX If everything is a (case) byte, the pattern should be
+ * [abcde] instead of a|b|c|d|e (or [aAbBcCdDeE] instead of aA|bB|cC|dD|eE)
+ * as a string list works fine, but as a character range
+ * should be faster, and maybe could be converted here. But not
+ * very important, if performance is required, it should have already
+ * been done in the pattern.
+ */
+static int
+orec_str_list(orec_inf *inf, rec_alt *alt, int str, int count)
+{
+ rec_stl *stl;
+ rec_pat *pat;
+ rec_alt *ptr, *next;
+ int i, j, tlen, len, is;
+
+ if ((stl = calloc(1, sizeof(rec_stl))) == NULL)
+ return (inf->ecode = RE_ESPACE);
+
+ if ((stl->lens = malloc(sizeof(unsigned char) * count)) == NULL) {
+ free(stl);
+ return (inf->ecode = RE_ESPACE);
+ }
+
+ if ((stl->strs = malloc(sizeof(char*) * count)) == NULL) {
+ free(stl->lens);
+ free(stl);
+ return (inf->ecode = RE_ESPACE);
+ }
+
+ if ((pat = calloc(1, sizeof(rec_pat))) == NULL) {
+ free(stl->strs);
+ free(stl->lens);
+ free(stl);
+ return (inf->ecode = RE_ESPACE);
+ }
+
+ pat->data.stl = stl;
+ pat->type = Rep_StringList;
+ stl->type = str ? Resl_StringList : Resl_CaseStringList;
+ for (i = tlen = 0, ptr = alt; i < count; i++) {
+ next = ptr->next;
+ switch (ptr->pat->type) {
+ case Rep_Literal:
+ is = len = 1;
+ break;
+ case Rep_CaseLiteral:
+ is = len = 2;
+ break;
+ default:
+ is = 0;
+ len = strlen((char*)ptr->pat->data.str);
+ break;
+ }
+ tlen += len;
+ stl->lens[i] = len;
+ if (!is) {
+ if (len > 2)
+ stl->strs[i] = ptr->pat->data.str;
+ else {
+ if (len == 1)
+ stl->strs[i] = (void*)(long)(ptr->pat->data.str[0]);
+ else
+ stl->strs[i] = (void*)(long)
+ (ptr->pat->data.str[0] |
+ ((int)ptr->pat->data.str[1] << 8));
+ free(ptr->pat->data.str);
+ }
+ }
+ else {
+ if (is == 1)
+ stl->strs[i] = (void*)(long)ptr->pat->data.chr;
+ else
+ stl->strs[i] = (void*)(long)
+ (ptr->pat->data.cse.lower |
+ (ptr->pat->data.cse.upper << 8));
+ }
+ free(ptr->pat);
+ if (i)
+ free(ptr);
+ ptr = next;
+ }
+ stl->tlen = tlen;
+ stl->nstrs = count;
+
+ alt->pat = pat;
+ alt->next = NULL;
+
+ {
+ int li, lj;
+ unsigned char ci, cj, *str;
+
+ /* Don't need a stable sort, there shouldn't be duplicated strings,
+ * but don't check for it either. Only need to make sure that all
+ * strings that start with the same byte are together */
+ for (i = 0; i < count; i++) {
+ li = stl->lens[i];
+ ci = li > 2 ? stl->strs[i][0] : (long)stl->strs[i] & 0xff;
+ for (j = i + 1; j < count; j++) {
+ lj = stl->lens[j];
+ cj = lj > 2 ? stl->strs[j][0] : (long)stl->strs[j] & 0xff;
+ if ((count >= LARGE_STL_COUNT && cj < ci) ||
+ (cj == ci && lj > li)) {
+ /* If both strings start with the same byte,
+ * put the longer first */
+ str = stl->strs[j];
+ stl->strs[j] = stl->strs[i];
+ stl->strs[i] = str;
+ stl->lens[j] = li;
+ stl->lens[i] = lj;
+ li ^= lj; lj ^= li; li ^= lj;
+ ci ^= cj; cj ^= ci; ci ^= cj;
+ }
+ }
+ }
+ }
+
+ return (inf->ecode);
+}