summaryrefslogtreecommitdiff
path: root/lisp/re/rep.h
diff options
context:
space:
mode:
Diffstat (limited to 'lisp/re/rep.h')
-rw-r--r--lisp/re/rep.h369
1 files changed, 369 insertions, 0 deletions
diff --git a/lisp/re/rep.h b/lisp/re/rep.h
new file mode 100644
index 0000000..5e4d5d5
--- /dev/null
+++ b/lisp/re/rep.h
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2002 by The XFree86 Project, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE XFREE86 PROJECT BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF
+ * OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Except as contained in this notice, the name of the XFree86 Project shall
+ * not be used in advertising or otherwise to promote the sale, use or other
+ * dealings in this Software without prior written authorization from the
+ * XFree86 Project.
+ *
+ * Author: Paulo César Pereira de Andrade
+ */
+
+/* $XFree86: xc/programs/xedit/lisp/re/rep.h,v 1.3 2002/11/25 02:35:32 paulo Exp $ */
+
+#include "re.h"
+
+#ifndef _rep_h
+#define _rep_h
+
+/*
+ * Local defines
+ */
+
+#ifdef MIN
+#undef MIN
+#endif
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+#ifdef MAX
+#undef MAX
+#endif
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+/* This value can not be larger than 255, a depth value is the nesting of
+ * repetition operations and alternatives. The number of nested parenthesis
+ * does not matter, but a repetition on the pattern inside the parenthesis
+ * does. Note also that you cannot have more than 9 parenthesis pairs in
+ * an expression.
+ * Depth is always at least 1. So for MAX_DEPTH 8, it is only allowed
+ * 7 complex repetitions. A complex repetition is a dot followed by an
+ * repetition operator. It is called a complex repetition because dot
+ * matches anything but the empty string, so the engine needs to test
+ * all possible combinations until the end of the string is found.
+ * Repetitions like .* use one depth until the end of the string is found,
+ * for example a.*b.*c.*d has depth 4, while a*b*c*d has depth 2.
+ */
+#define MAX_DEPTH 8
+
+/* Minimum number of strings to generate a "large" string list, that is,
+ * sort the strings and allocate 512 extra bytes to map the first string
+ * with a given initial byte. */
+#define LARGE_STL_COUNT 16
+
+/*
+ * Local types
+ */
+/* Intermediate compilation types declaration */
+ /* (r)egular (e)xpression (c)ompile (c)a(se) */
+typedef struct _rec_cse rec_cse;
+
+ /* (r)egular (e)xpression (c)ompile (r)a(ng)e */
+typedef struct _rec_rng rec_rng;
+
+ /* (r)egular (e)xpression (c)ompile (pat)tern */
+typedef struct _rec_pat rec_pat;
+
+ /* (r)egular (e)xpression (c)ompile (rep)etition */
+typedef struct _rec_rep rec_rep;
+
+ /* (r)egular (e)xpression (c)ompile (gr)ou(p) */
+typedef struct _rec_grp rec_grp;
+
+ /* (r)egular (e)xpression (c)ompile (alt)ernatives */
+typedef struct _rec_alt rec_alt;
+
+
+/* Optimization types */
+ /* (r)egular (e)xpression (c)ompile (st)ring (l)ist */
+typedef struct _rec_stl rec_stl;
+
+/* Final compilation and execution types */
+ /* (re)gular expression (inf)ormation */
+typedef struct _re_inf re_inf;
+
+ /* (re)gular expression (eng)ine */
+typedef struct _re_eng re_eng;
+
+
+/* Codes used by the engine */
+typedef enum {
+ /* Grouping */
+ Re_Open, /* ( */
+ Re_Close, /* ) */
+ Re_Update, /* Like Re_Close, but is inside a loop */
+
+ /* Alternatives */
+ Re_Alt, /* Start alternative list, + next offset */
+ Re_AltNext, /* Next alternative, + next offset */
+ Re_AltDone, /* Finish alternative list */
+
+ /* Repetition */
+ Re_AnyTimes, /* * */
+ Re_Maybe, /* ? */
+ Re_AtLeast, /* +, at least one */
+
+ /* Repetition like */
+ Re_AnyAnyTimes, /* .*<re> */
+ Re_AnyMaybe, /* .?<re> */
+ Re_AnyAtLeast, /* .+<re> */
+
+ Re_AnyEatAnyTimes, /* Expression ends with .* */
+ Re_AnyEatMaybe, /* Expression ends with .? */
+ Re_AnyEatAtLeast, /* Expression ends with .+ */
+
+ /* Repetition with arguments */
+ Re_Exact, /* {e} */
+ Re_Min, /* {n,} */
+ Re_Max, /* {,m} */
+ Re_MinMax, /* {n,m} */
+
+ /* Repetition helper instruction */
+ Re_RepJump, /* Special code, go back to repetition */
+ Re_RepLongJump, /* Jump needs two bytes */
+ /* After the repetition data, all repetitions have an offset
+ * to the code after the repetition */
+
+ /* Matching */
+ Re_Any, /* . */
+ Re_Odigit, /* \o */
+ Re_OdigitNot, /* \O */
+ Re_Digit, /* \d */
+ Re_DigitNot, /* \D */
+ Re_Xdigit, /* \x */
+ Re_XdigitNot, /* \x */
+ Re_Space, /* \s */
+ Re_SpaceNot, /* \S */
+ Re_Tab, /* \t */
+ Re_Newline, /* \n */
+ Re_Lower, /* \l */
+ Re_Upper, /* \u */
+ Re_Alnum, /* \w */
+ Re_AlnumNot, /* \W */
+ Re_Control, /* \c */
+ Re_ControlNot, /* \C */
+ Re_Bol, /* ^ */
+ Re_Eol, /* $ */
+ Re_Bow, /* \< */
+ Re_Eow, /* \> */
+
+ /* Range matching information */
+ Re_Range, /* + 256 bytes */
+ Re_RangeNot, /* + 256 bytes */
+
+ /* Matching with arguments */
+ Re_Literal, /* + character */
+ Re_CaseLiteral, /* + lower + upper */
+ Re_LiteralNot, /* + character */
+ Re_CaseLiteralNot, /* + lower + upper */
+ Re_String, /* + length + string */
+ Re_CaseString, /* + length + string in format lower-upper */
+
+ /* These are useful to start matching, or when RE_NOSPEC is used. */
+ Re_SearchLiteral,
+ Re_SearchCaseLiteral,
+ Re_SearchString,
+ Re_SearchCaseString,
+
+ Re_StringList, /* + total-length + lengths + strings */
+ Re_CaseStringList, /* + total-length + lengths + strings */
+
+ Re_LargeStringList, /* + total-length + lengths + map + strings */
+ Re_LargeCaseStringList, /* + total-length + lengths + map + strings */
+
+ /* Backreference */
+ Re_Backref, /* + reference number */
+
+ /* The last codes */
+ Re_DoneIf, /* Done if at end of input */
+ Re_MaybeDone, /* Done */
+ Re_Done /* If this code found, finished execution */
+} ReCode;
+
+
+/* (r)egular (e)xpresssion (pat)rern (t)ype */
+typedef enum _rec_pat_t {
+ Rep_Literal = Re_Literal,
+ Rep_CaseLiteral = Re_CaseLiteral,
+ Rep_LiteralNot = Re_LiteralNot,
+ Rep_CaseLiteralNot = Re_CaseLiteralNot,
+ Rep_Range = Re_Range,
+ Rep_RangeNot = Re_RangeNot,
+ Rep_String = Re_String,
+ Rep_CaseString = Re_CaseString,
+ Rep_SearchLiteral = Re_SearchLiteral,
+ Rep_SearchCaseLiteral = Re_SearchCaseLiteral,
+ Rep_SearchString = Re_SearchString,
+ Rep_SearchCaseString = Re_SearchCaseString,
+ Rep_Any = Re_Any,
+ Rep_AnyAnyTimes = Re_AnyAnyTimes,
+ Rep_AnyEatAnyTimes = Re_AnyEatAnyTimes,
+ Rep_AnyMaybe = Re_AnyMaybe,
+ Rep_AnyEatMaybe = Re_AnyEatMaybe,
+ Rep_AnyAtLeast = Re_AnyAtLeast,
+ Rep_AnyEatAtLeast = Re_AnyEatAtLeast,
+ Rep_Odigit = Re_Odigit,
+ Rep_OdigitNot = Re_OdigitNot,
+ Rep_Digit = Re_Digit,
+ Rep_DigitNot = Re_DigitNot,
+ Rep_Xdigit = Re_Xdigit,
+ Rep_XdigitNot = Re_XdigitNot,
+ Rep_Space = Re_Space,
+ Rep_SpaceNot = Re_SpaceNot,
+ Rep_Tab = Re_Tab,
+ Rep_Newline = Re_Newline,
+ Rep_Lower = Re_Lower,
+ Rep_Upper = Re_Upper,
+ Rep_Alnum = Re_Alnum,
+ Rep_AlnumNot = Re_AlnumNot,
+ Rep_Control = Re_Control,
+ Rep_ControlNot = Re_ControlNot,
+ Rep_Bol = Re_Bol,
+ Rep_Eol = Re_Eol,
+ Rep_Bow = Re_Bow,
+ Rep_Eow = Re_Eow,
+ Rep_Backref = Re_Backref,
+ Rep_StringList = Re_StringList,
+ Rep_Group = Re_Open
+} rec_pat_t;
+
+
+/* (r)egular (e)xpression (rep)etition (t)ype */
+typedef enum _rec_rep_t {
+ Rer_AnyTimes = Re_AnyTimes,
+ Rer_AtLeast = Re_AtLeast,
+ Rer_Maybe = Re_Maybe,
+ Rer_Exact = Re_Exact,
+ Rer_Min = Re_Min,
+ Rer_Max = Re_Max,
+ Rer_MinMax = Re_MinMax
+} rec_rep_t;
+
+
+/* Decide at re compilation time what is lowercase and what is uppercase */
+struct _rec_cse {
+ unsigned char lower;
+ unsigned char upper;
+};
+
+
+/* A rec_rng is used only during compilation, just a character map */
+struct _rec_rng {
+ unsigned char range[256];
+};
+
+
+/* A rec_pat is used only during compilation, and can be viewed as
+ * a regular expression element like a match to any character, a match
+ * to the beginning or end of the line, etc.
+ * It is implemented as a linked list, and does not have nesting.
+ * The data field can contain:
+ * chr: the value of a single character to match.
+ * cse: the upper and lower case value of a character to match.
+ * rng: a character map to match or not match.
+ * str: a simple string or a string where every two bytes
+ * represents the character to match, in lower/upper
+ * case sequence.
+ * The rep field is not used for strings, strings are broken in the
+ * last character in this case. That is, strings are just a concatenation
+ * of several character matches.
+ */
+struct _rec_pat {
+ rec_pat_t type;
+ rec_pat *next, *prev; /* Linked list information */
+ union {
+ unsigned char chr;
+ rec_cse cse;
+ rec_rng *rng;
+ rec_grp *grp;
+ unsigned char *str;
+ rec_stl *stl;
+ } data;
+ rec_rep *rep; /* Pattern repetition information */
+};
+
+
+/* A rec_rep is used only during compilation, and can be viewed as:
+ *
+ * ? or * or + or {<e>} or {<m>,} or {,<M>} or {<m>,<M>}
+ *
+ * where <e> is "exact", <m> is "minimum" and <M> is "maximum".
+ * In the compiled step it can also be just a NULL pointer, that
+ * is actually equivalent to {1}.
+ */
+struct _rec_rep {
+ rec_rep_t type;
+ short mine; /* minimum or exact number of matches */
+ short maxc; /* maximum number of matches */
+};
+
+
+/* A rec_alt is used only during compilation, and can be viewed as:
+ *
+ * <re>|<re>
+ *
+ * where <re> is any regular expression. The expressions are nested
+ * using the grp field of the rec_pat structure.
+ */
+struct _rec_alt {
+ rec_alt *next, *prev; /* Linked list information */
+ rec_pat *pat;
+};
+
+
+/* A rec_grp is a place holder for expressions enclosed in parenthesis
+ * and is linked to the compilation data by an rec_pat structure. */
+struct _rec_grp {
+ rec_pat *parent; /* Reference to parent pattern */
+ rec_alt *alt; /* The pattern information */
+ rec_alt *palt; /* Parent alternative */
+ rec_grp *pgrp; /* Nested groups */
+ int comp; /* (comp)lex repetition pattern inside group */
+};
+
+
+/* Optimization compilation types definition */
+ /* (r)egular (e)xpression (c)ompile (st)ring (l)ist (t)ype */
+typedef enum {
+ Resl_StringList = Re_StringList,
+ Resl_CaseStringList = Re_CaseStringList
+} rec_stl_t;
+
+struct _rec_stl {
+ rec_stl_t type;
+ int nstrs; /* Number of strings in list */
+ int tlen; /* Total length of all strings */
+ unsigned char *lens; /* Vector of string lengths */
+ unsigned char **strs; /* The strings */
+};
+
+
+/*
+ * Prototypes
+ */
+ /* rep.c */
+rec_alt *irec_comp(const char*, const char*, int, int*);
+void irec_free_alt(rec_alt*);
+
+ /* reo.c */
+int orec_comp(rec_alt*, int);
+void orec_free_stl(rec_stl*);
+
+#endif /* _rep_h */