/* * Copyright (c) 2002 by The XFree86 Project, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * * Except as contained in this notice, the name of the XFree86 Project shall * not be used in advertising or otherwise to promote the sale, use or other * dealings in this Software without prior written authorization from the * XFree86 Project. * * Author: Paulo César Pereira de Andrade */ /* $XFree86: xc/programs/xedit/lisp/re/reo.c,v 1.9 2002/11/15 07:01:33 paulo Exp $ */ #include "rep.h" /* * This file is a placeholder to add code to analyse and optimize the * intermediate data structure generated in rep.c. * Character ranges are optimized while being generated. */ /* * Types */ typedef struct _orec_inf { rec_alt *alt; /* Main alternatives list */ rec_grp *grp; /* Current group pointer */ int flags; int ecode; } orec_inf; /* * Prototypes */ static int orec_alt(orec_inf*, rec_alt*); static int orec_pat(orec_inf*, rec_pat*); static int orec_grp(orec_inf*, rec_grp*); static int orec_pat_bad_rpt(orec_inf*, rec_pat*); static int orec_pat_bad_forward_rpt(orec_inf*, rec_pat*); static int orec_pat_rng(orec_inf*, rec_pat*); static int orec_pat_cse(orec_inf*, rec_pat*); static int orec_pat_cse_can(orec_inf*, rec_pat*); static int orec_str_list(orec_inf*, rec_alt*, int, int); /* * Initialization */ extern unsigned char re__alnum[256]; extern unsigned char re__odigit[256]; extern unsigned char re__ddigit[256]; extern unsigned char re__xdigit[256]; extern unsigned char re__control[256]; /* * Implementation */ int orec_comp(rec_alt *alt, int flags) { orec_inf inf; inf.alt = alt; inf.grp = NULL; inf.flags = flags; inf.ecode = 0; orec_alt(&inf, alt); return (inf.ecode); } void orec_free_stl(rec_stl *stl) { int i; for (i = 0; i < stl->nstrs; i++) { if (stl->lens[i] > 2) free(stl->strs[i]); } free(stl->lens); free(stl->strs); free(stl); } static int orec_alt(orec_inf *inf, rec_alt *alt) { if (alt) { rec_alt *ptr = alt; int ret, count = 0, str = 1, cstr = 1, lits = 0, clits = 0; /* Check if can build a string list */ if (ptr->next) { /* If more than one alternative */ while (ptr && (str || cstr)) { if (ptr->pat == NULL || ptr->pat->rep != NULL) { cstr = str = 0; break; } if ((inf->flags & RE_ICASE)) { if (!(ret = orec_pat_cse_can(inf, ptr->pat))) { cstr = str = 0; break; } if (ret == 1) ++lits; else if (ret == 2) ++clits; } else if (ptr->pat->next == NULL) { if (ptr->pat->type != Rep_String) { if (ptr->pat->type != Rep_Literal) { str = 0; if (ptr->pat->type != Rep_CaseString) { if (ptr->pat->type != Rep_CaseLiteral) cstr = 0; else ++clits; } else if (strlen((char*)ptr->pat->data.str) >= 255) str = cstr = 0; } else ++lits; } else if (strlen((char*)ptr->pat->data.str) >= 255) str = cstr = 0; } else { str = cstr = 0; break; } if (++count >= 255) str = cstr = 0; ptr = ptr->next; } if (str || cstr) { if (inf->flags & RE_ICASE) { for (ptr = alt; ptr; ptr = ptr->next) { if (orec_pat_cse(inf, ptr->pat)) return (inf->ecode); } str = 0; } return (orec_str_list(inf, alt, str, count)); } } else if (alt == inf->alt && alt->pat && alt->pat->rep == NULL) { /* If the toplevel single alternative */ switch (alt->pat->type) { /* One of these will always be true for RE_NOSPEC, * but can also be optimized for simple patterns */ case Rep_Literal: alt->pat->type = Rep_SearchLiteral; break; case Rep_CaseLiteral: alt->pat->type = Rep_SearchCaseLiteral; break; case Rep_String: alt->pat->type = Rep_SearchString; break; case Rep_CaseString: alt->pat->type = Rep_SearchCaseString; break; default: break; } } while (alt) { orec_pat(inf, alt->pat); alt = alt->next; } } return (inf->ecode); } static int orec_pat(orec_inf *inf, rec_pat *pat) { rec_pat *next; while (pat) { switch (pat->type) { case Rep_AnyAnyTimes: if (pat->next == NULL) { rec_grp *grp = inf->grp; next = NULL; while (grp) { next = grp->parent->next; /* Cannot check if is .*$ as the input * may be a substring */ if (next) break; grp = grp->pgrp; } if (next == NULL) { /* .* */ pat->type = Rep_AnyEatAnyTimes; grp = inf->grp; while (grp) { --grp->comp; next = grp->parent->next; if (next) break; grp = grp->pgrp; } } else if (orec_pat_bad_rpt(inf, next)) return (inf->ecode); } else if (orec_pat_bad_rpt(inf, pat->next)) return (inf->ecode); break; case Rep_AnyMaybe: if (pat->next == NULL) { rec_grp *grp = inf->grp; next = NULL; while (grp) { next = grp->parent->next; if (next) break; grp = grp->pgrp; } if (next == NULL) { /* .? */ pat->type = Rep_AnyEatMaybe; grp = inf->grp; while (grp) { --grp->comp; next = grp->parent->next; if (next) break; grp = grp->pgrp; } } else if (orec_pat_bad_rpt(inf, next)) return (inf->ecode); } else if (orec_pat_bad_rpt(inf, pat->next)) return (inf->ecode); break; case Rep_AnyAtLeast: if (pat->next == NULL) { rec_grp *grp = inf->grp; next = NULL; while (grp) { next = grp->parent->next; if (next) break; grp = grp->pgrp; } if (next == NULL) { /* .+ */ pat->type = Rep_AnyEatAtLeast; grp = inf->grp; while (grp) { --grp->comp; next = grp->parent->next; if (next) break; grp = grp->pgrp; } } else if (orec_pat_bad_rpt(inf, next)) return (inf->ecode); } else if (orec_pat_bad_rpt(inf, pat->next)) return (inf->ecode); break; case Rep_Range: case Rep_RangeNot: orec_pat_rng(inf, pat); break; case Rep_Group: orec_grp(inf, pat->data.grp); break; default: break; } pat = pat->next; } return (inf->ecode); } static int orec_pat_bad_rpt(orec_inf *inf, rec_pat *pat) { switch (pat->type) { /* Not really an error, but aren't supported by the library. * Includes: .*.*, .+? .**, (.*)(*), etc. */ /* Not a repetition, but mathes anything... */ case Rep_Any: /* Zero length matches */ case Rep_Eol: if (!(inf->flags & RE_NEWLINE)) break; case Rep_Bol: case Rep_Bow: case Rep_Eow: /* Repetitions */ case Rep_AnyAnyTimes: case Rep_AnyMaybe: case Rep_AnyAtLeast: inf->ecode = RE_BADRPT; break; /* Check if the first group element is a complex pattern */ case Rep_Group: if (pat->rep == NULL) { if (pat->data.grp->alt) { for (pat = pat->data.grp->alt->pat; pat; pat = pat->next) { if (orec_pat_bad_rpt(inf, pat)) break; } } break; } /*FALLTHROUGH*/ default: if (pat->rep) inf->ecode = RE_BADRPT; break; } if (!inf->ecode && pat && pat->next) orec_pat_bad_forward_rpt(inf, pat->next); return (inf->ecode); } static int orec_pat_bad_forward_rpt(orec_inf *inf, rec_pat *pat) { if (pat->rep) { switch (pat->rep->type) { case Rer_MinMax: if (pat->rep->mine > 0) break; case Rer_AnyTimes: case Rer_Maybe: case Rer_Max: inf->ecode = RE_BADRPT; default: break; } } else if (pat->type == Rep_Group && pat->data.grp->alt && pat->data.grp->alt->pat) orec_pat_bad_forward_rpt(inf, pat->data.grp->alt->pat); return (inf->ecode); } static int orec_grp(orec_inf *inf, rec_grp *grp) { rec_grp *prev = inf->grp; inf->grp = grp; orec_alt(inf, grp->alt); /* Could also just say: inf->grp = grp->gparent */ inf->grp = prev; return (inf->ecode); } static int orec_pat_rng(orec_inf *inf, rec_pat *pat) { int i, j[2], count; rec_pat_t type = pat->type; unsigned char *range = pat->data.rng->range; for (i = count = j[0] = j[1] = 0; i < 256; i++) { if (range[i]) { if (count == 2) { ++count; break; } j[count++] = i; } } if (count == 1 || (count == 2 && ((islower(j[0]) && toupper(j[0]) == j[1]) || (isupper(j[0]) && tolower(j[0]) == j[1])))) { free(pat->data.rng); if (count == 1) { pat->data.chr = j[0]; pat->type = type == Rep_Range ? Rep_Literal : Rep_LiteralNot; } else { pat->data.cse.upper = j[0]; pat->data.cse.lower = j[1]; pat->type = type == Rep_Range ? Rep_CaseLiteral : Rep_CaseLiteralNot; } } else { if (memcmp(re__alnum, range, 256) == 0) type = type == Rep_Range ? Rep_Alnum : Rep_AlnumNot; else if (memcmp(re__odigit, range, 256) == 0) type = type == Rep_Range ? Rep_Odigit : Rep_OdigitNot; else if (memcmp(re__ddigit, range, 256) == 0) type = type == Rep_Range ? Rep_Digit : Rep_DigitNot; else if (memcmp(re__xdigit, range, 256) == 0) type = type == Rep_Range ? Rep_Xdigit : Rep_XdigitNot; else if (memcmp(re__control, range, 256) == 0) type = type == Rep_Range ? Rep_Control : Rep_ControlNot; if (type != pat->type) { free(pat->data.rng); pat->type = type; } } return (inf->ecode); } /* Join patterns if required, will only fail on memory allocation failure: */ static int orec_pat_cse(orec_inf *inf, rec_pat *pat) { rec_pat_t type; int i, len, length; rec_pat *ptr, *next; unsigned char *str, *tofree; if (pat->next == NULL && pat->type == Rep_CaseString) return (inf->ecode); type = Rep_CaseString; /* First calculate how many bytes will be required */ for (ptr = pat, length = 1; ptr; ptr = ptr->next) { switch (ptr->type) { case Rep_Literal: length += 2; break; case Rep_String: length += strlen((char*)ptr->data.str) << 1; break; case Rep_CaseLiteral: length += 2; break; case Rep_CaseString: length += strlen((char*)ptr->data.str); break; default: break; } } if ((str = malloc(length)) == NULL) return (inf->ecode = RE_ESPACE); for (ptr = pat, length = 0; ptr; ptr = next) { tofree = NULL; next = ptr->next; switch (ptr->type) { case Rep_Literal: str[length++] = ptr->data.chr; str[length++] = ptr->data.chr; break; case Rep_String: tofree = ptr->data.str; len = strlen((char*)tofree); for (i = 0; i < len; i++) { str[length++] = tofree[i]; str[length++] = tofree[i]; } break; case Rep_CaseLiteral: str[length++] = ptr->data.cse.lower; str[length++] = ptr->data.cse.upper; break; case Rep_CaseString: tofree = ptr->data.str; len = strlen((char*)tofree); memcpy(str + length, tofree, len); length += len; break; default: break; } if (tofree) free(tofree); if (ptr != pat) free(ptr); } str[length] = '\0'; pat->type = type; pat->data.str = str; pat->next = NULL; return (inf->ecode); } /* Return 0 if the patterns in the list cannot be merged, 1 if will * be a simple string, 2 if a case string. * This is useful when building an alternative list that is composed * only of strings, but the regex is case insensitive, in wich case * the first pass may have splited some patterns, but if it is a member * of an alternatives list, the cost of using a string list is smaller */ static int orec_pat_cse_can(orec_inf *inf, rec_pat *pat) { int ret; if (pat == NULL) return (0); for (ret = 1; pat; pat = pat->next) { if (pat->rep) return (0); switch (pat->type) { case Rep_Literal: case Rep_String: break; case Rep_CaseLiteral: case Rep_CaseString: ret = 2; break; default: return (0); } } return (ret); } /* XXX If everything is a (case) byte, the pattern should be * [abcde] instead of a|b|c|d|e (or [aAbBcCdDeE] instead of aA|bB|cC|dD|eE) * as a string list works fine, but as a character range * should be faster, and maybe could be converted here. But not * very important, if performance is required, it should have already * been done in the pattern. */ static int orec_str_list(orec_inf *inf, rec_alt *alt, int str, int count) { rec_stl *stl; rec_pat *pat; rec_alt *ptr, *next; int i, j, tlen, len, is; if ((stl = calloc(1, sizeof(rec_stl))) == NULL) return (inf->ecode = RE_ESPACE); if ((stl->lens = malloc(sizeof(unsigned char) * count)) == NULL) { free(stl); return (inf->ecode = RE_ESPACE); } if ((stl->strs = malloc(sizeof(char*) * count)) == NULL) { free(stl->lens); free(stl); return (inf->ecode = RE_ESPACE); } if ((pat = calloc(1, sizeof(rec_pat))) == NULL) { free(stl->strs); free(stl->lens); free(stl); return (inf->ecode = RE_ESPACE); } pat->data.stl = stl; pat->type = Rep_StringList; stl->type = str ? Resl_StringList : Resl_CaseStringList; for (i = tlen = 0, ptr = alt; i < count; i++) { next = ptr->next; switch (ptr->pat->type) { case Rep_Literal: is = len = 1; break; case Rep_CaseLiteral: is = len = 2; break; default: is = 0; len = strlen((char*)ptr->pat->data.str); break; } tlen += len; stl->lens[i] = len; if (!is) { if (len > 2) stl->strs[i] = ptr->pat->data.str; else { if (len == 1) stl->strs[i] = (void*)(long)(ptr->pat->data.str[0]); else stl->strs[i] = (void*)(long) (ptr->pat->data.str[0] | ((int)ptr->pat->data.str[1] << 8)); free(ptr->pat->data.str); } } else { if (is == 1) stl->strs[i] = (void*)(long)ptr->pat->data.chr; else stl->strs[i] = (void*)(long) (ptr->pat->data.cse.lower | (ptr->pat->data.cse.upper << 8)); } free(ptr->pat); if (i) free(ptr); ptr = next; } stl->tlen = tlen; stl->nstrs = count; alt->pat = pat; alt->next = NULL; { int li, lj; unsigned char ci, cj, *str; /* Don't need a stable sort, there shouldn't be duplicated strings, * but don't check for it either. Only need to make sure that all * strings that start with the same byte are together */ for (i = 0; i < count; i++) { li = stl->lens[i]; ci = li > 2 ? stl->strs[i][0] : (long)stl->strs[i] & 0xff; for (j = i + 1; j < count; j++) { lj = stl->lens[j]; cj = lj > 2 ? stl->strs[j][0] : (long)stl->strs[j] & 0xff; if ((count >= LARGE_STL_COUNT && cj < ci) || (cj == ci && lj > li)) { /* If both strings start with the same byte, * put the longer first */ str = stl->strs[j]; stl->strs[j] = stl->strs[i]; stl->strs[i] = str; stl->lens[j] = li; stl->lens[i] = lj; li ^= lj; lj ^= li; li ^= lj; ci ^= cj; cj ^= ci; ci ^= cj; } } } } return (inf->ecode); }