diff options
Diffstat (limited to 'gnu/usr.bin/grep/dfa.c')
-rw-r--r-- | gnu/usr.bin/grep/dfa.c | 681 |
1 files changed, 366 insertions, 315 deletions
diff --git a/gnu/usr.bin/grep/dfa.c b/gnu/usr.bin/grep/dfa.c index 4a454a9664a..048e901c5e8 100644 --- a/gnu/usr.bin/grep/dfa.c +++ b/gnu/usr.bin/grep/dfa.c @@ -1,5 +1,5 @@ /* dfa.c - deterministic extended regexp routines for GNU - Copyright (C) 1988 Free Software Foundation, Inc. + Copyright 1988, 1998, 2000 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -13,23 +13,23 @@ You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software - Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ - -#ifndef lint -static char rcsid[] = "$Id: dfa.c,v 1.1 1995/10/18 08:40:17 deraadt Exp $"; -#endif /* not lint */ + Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */ /* Written June, 1988 by Mike Haertel Modified July, 1988 by Arthur David Olson to assist BMG speedups */ +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + #include <assert.h> #include <ctype.h> #include <stdio.h> +#include <sys/types.h> #ifdef STDC_HEADERS #include <stdlib.h> #else -#include <sys/types.h> extern char *calloc(), *malloc(), *realloc(); extern void free(); #endif @@ -42,23 +42,16 @@ extern void free(); #include <strings.h> #endif +#ifndef DEBUG /* use the same approach as regex.c */ +#undef assert +#define assert(e) +#endif /* DEBUG */ + #ifndef isgraph #define isgraph(C) (isprint(C) && !isspace(C)) #endif -#ifdef isascii -#define ISALPHA(C) (isascii(C) && isalpha(C)) -#define ISUPPER(C) (isascii(C) && isupper(C)) -#define ISLOWER(C) (isascii(C) && islower(C)) -#define ISDIGIT(C) (isascii(C) && isdigit(C)) -#define ISXDIGIT(C) (isascii(C) && isxdigit(C)) -#define ISSPACE(C) (isascii(C) && isspace(C)) -#define ISPUNCT(C) (isascii(C) && ispunct(C)) -#define ISALNUM(C) (isascii(C) && isalnum(C)) -#define ISPRINT(C) (isascii(C) && isprint(C)) -#define ISGRAPH(C) (isascii(C) && isgraph(C)) -#define ISCNTRL(C) (isascii(C) && iscntrl(C)) -#else +#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII)) #define ISALPHA(C) isalpha(C) #define ISUPPER(C) isupper(C) #define ISLOWER(C) islower(C) @@ -70,57 +63,130 @@ extern void free(); #define ISPRINT(C) isprint(C) #define ISGRAPH(C) isgraph(C) #define ISCNTRL(C) iscntrl(C) +#else +#define ISALPHA(C) (isascii(C) && isalpha(C)) +#define ISUPPER(C) (isascii(C) && isupper(C)) +#define ISLOWER(C) (isascii(C) && islower(C)) +#define ISDIGIT(C) (isascii(C) && isdigit(C)) +#define ISXDIGIT(C) (isascii(C) && isxdigit(C)) +#define ISSPACE(C) (isascii(C) && isspace(C)) +#define ISPUNCT(C) (isascii(C) && ispunct(C)) +#define ISALNUM(C) (isascii(C) && isalnum(C)) +#define ISPRINT(C) (isascii(C) && isprint(C)) +#define ISGRAPH(C) (isascii(C) && isgraph(C)) +#define ISCNTRL(C) (isascii(C) && iscntrl(C)) +#endif + +/* ISASCIIDIGIT differs from ISDIGIT, as follows: + - Its arg may be any int or unsigned int; it need not be an unsigned char. + - It's guaranteed to evaluate its argument exactly once. + - It's typically faster. + Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that + only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless + it's important to use the locale's definition of `digit' even when the + host does not conform to Posix. */ +#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9) + +/* If we (don't) have I18N. */ +/* glibc defines _ */ +#ifndef _ +# ifdef HAVE_LIBINTL_H +# include <libintl.h> +# ifndef _ +# define _(Str) gettext (Str) +# endif +# else +# define _(Str) (Str) +# endif #endif -#include "dfa.h" #include "regex.h" +#include "dfa.h" -#if __STDC__ -typedef void *ptr_t; -#else -typedef char *ptr_t; +/* HPUX, define those as macros in sys/param.h */ +#ifdef setbit +# undef setbit +#endif +#ifdef clrbit +# undef clrbit #endif -static void dfamust(); +static void dfamust PARAMS ((struct dfa *dfa)); + +static ptr_t xcalloc PARAMS ((size_t n, size_t s)); +static ptr_t xmalloc PARAMS ((size_t n)); +static ptr_t xrealloc PARAMS ((ptr_t p, size_t n)); +#ifdef DEBUG +static void prtok PARAMS ((token t)); +#endif +static int tstbit PARAMS ((int b, charclass c)); +static void setbit PARAMS ((int b, charclass c)); +static void clrbit PARAMS ((int b, charclass c)); +static void copyset PARAMS ((charclass src, charclass dst)); +static void zeroset PARAMS ((charclass s)); +static void notset PARAMS ((charclass s)); +static int equal PARAMS ((charclass s1, charclass s2)); +static int charclass_index PARAMS ((charclass s)); +static int looking_at PARAMS ((const char *s)); +static token lex PARAMS ((void)); +static void addtok PARAMS ((token t)); +static void atom PARAMS ((void)); +static int nsubtoks PARAMS ((int tindex)); +static void copytoks PARAMS ((int tindex, int ntokens)); +static void closure PARAMS ((void)); +static void branch PARAMS ((void)); +static void regexp PARAMS ((int toplevel)); +static void copy PARAMS ((position_set *src, position_set *dst)); +static void insert PARAMS ((position p, position_set *s)); +static void merge PARAMS ((position_set *s1, position_set *s2, position_set *m)); +static void delete PARAMS ((position p, position_set *s)); +static int state_index PARAMS ((struct dfa *d, position_set *s, + int newline, int letter)); +static void build_state PARAMS ((int s, struct dfa *d)); +static void build_state_zero PARAMS ((struct dfa *d)); +static char *icatalloc PARAMS ((char *old, char *new)); +static char *icpyalloc PARAMS ((char *string)); +static char *istrstr PARAMS ((char *lookin, char *lookfor)); +static void ifree PARAMS ((char *cp)); +static void freelist PARAMS ((char **cpp)); +static char **enlist PARAMS ((char **cpp, char *new, size_t len)); +static char **comsubs PARAMS ((char *left, char *right)); +static char **addlists PARAMS ((char **old, char **new)); +static char **inboth PARAMS ((char **left, char **right)); static ptr_t -xcalloc(n, s) - int n; - size_t s; +xcalloc (size_t n, size_t s) { ptr_t r = calloc(n, s); if (!r) - dfaerror("Memory exhausted"); + dfaerror(_("Memory exhausted")); return r; } static ptr_t -xmalloc(n) - size_t n; +xmalloc (size_t n) { ptr_t r = malloc(n); assert(n != 0); if (!r) - dfaerror("Memory exhausted"); + dfaerror(_("Memory exhausted")); return r; } static ptr_t -xrealloc(p, n) - ptr_t p; - size_t n; +xrealloc (ptr_t p, size_t n) { ptr_t r = realloc(p, n); assert(n != 0); if (!r) - dfaerror("Memory exhausted"); + dfaerror(_("Memory exhausted")); return r; } -#define CALLOC(p, t, n) ((p) = (t *) xcalloc((n), sizeof (t))) +#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t))) #define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t))) #define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t))) @@ -136,8 +202,7 @@ xrealloc(p, n) #ifdef DEBUG static void -prtok(t) - token t; +prtok (token t) { char *s; @@ -175,33 +240,25 @@ prtok(t) /* Stuff pertaining to charclasses. */ static int -tstbit(b, c) - int b; - charclass c; +tstbit (int b, charclass c) { return c[b / INTBITS] & 1 << b % INTBITS; } static void -setbit(b, c) - int b; - charclass c; +setbit (int b, charclass c) { c[b / INTBITS] |= 1 << b % INTBITS; } static void -clrbit(b, c) - int b; - charclass c; +clrbit (int b, charclass c) { c[b / INTBITS] &= ~(1 << b % INTBITS); } static void -copyset(src, dst) - charclass src; - charclass dst; +copyset (charclass src, charclass dst) { int i; @@ -210,8 +267,7 @@ copyset(src, dst) } static void -zeroset(s) - charclass s; +zeroset (charclass s) { int i; @@ -220,8 +276,7 @@ zeroset(s) } static void -notset(s) - charclass s; +notset (charclass s) { int i; @@ -230,9 +285,7 @@ notset(s) } static int -equal(s1, s2) - charclass s1; - charclass s2; +equal (charclass s1, charclass s2) { int i; @@ -247,8 +300,7 @@ static struct dfa *dfa; /* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */ static int -charclass_index(s) - charclass s; +charclass_index (charclass s) { int i; @@ -262,20 +314,22 @@ charclass_index(s) } /* Syntax bits controlling the behavior of the lexical analyzer. */ -static int syntax_bits, syntax_bits_set; +static reg_syntax_t syntax_bits, syntax_bits_set; /* Flag for case-folding letters into sets. */ static int case_fold; +/* End-of-line byte in data. */ +static unsigned char eolbyte; + /* Entry point to set syntax options. */ void -dfasyntax(bits, fold) - int bits; - int fold; +dfasyntax (reg_syntax_t bits, int fold, int eol) { syntax_bits_set = 1; syntax_bits = bits; case_fold = fold; + eolbyte = eol; } /* Lexical analyzer. All the dross that deals with the obnoxious @@ -285,7 +339,7 @@ dfasyntax(bits, fold) static char *lexstart; /* Pointer to beginning of input string. */ static char *lexptr; /* Pointer to next input character. */ -static lexleft; /* Number of characters remaining. */ +static int lexleft; /* Number of characters remaining. */ static token lasttok; /* Previous token returned; initially END. */ static int laststart; /* True if we're separated from beginning or (, | only by zero-width characters. */ @@ -296,15 +350,21 @@ static int minrep, maxrep; /* Repeat counts for {m,n}. */ #define FETCH(c, eoferr) \ { \ if (! lexleft) \ - if (eoferr != 0) \ - dfaerror(eoferr); \ - else \ - return END; \ + { \ + if (eoferr != 0) \ + dfaerror (eoferr); \ + else \ + return lasttok = END; \ + } \ (c) = (unsigned char) *lexptr++; \ --lexleft; \ } +#ifdef __STDC__ +#define FUNC(F, P) static int F(int c) { return P(c); } +#else #define FUNC(F, P) static int F(c) int c; { return P(c); } +#endif FUNC(is_alpha, ISALPHA) FUNC(is_upper, ISUPPER) @@ -318,32 +378,41 @@ FUNC(is_print, ISPRINT) FUNC(is_graph, ISGRAPH) FUNC(is_cntrl, ISCNTRL) +static int +is_blank (int c) +{ + return (c == ' ' || c == '\t'); +} + /* The following list maps the names of the Posix named character classes to predicate functions that determine whether a given character is in the class. The leading [ has already been eaten by the lexical analyzer. */ static struct { - char *name; - int (*pred)(); + const char *name; + int (*pred) PARAMS ((int)); } prednames[] = { - ":alpha:]", is_alpha, - ":upper:]", is_upper, - ":lower:]", is_lower, - ":digit:]", is_digit, - ":xdigit:]", is_xdigit, - ":space:]", is_space, - ":punct:]", is_punct, - ":alnum:]", is_alnum, - ":print:]", is_print, - ":graph:]", is_graph, - ":cntrl:]", is_cntrl, - 0 + { ":alpha:]", is_alpha }, + { ":upper:]", is_upper }, + { ":lower:]", is_lower }, + { ":digit:]", is_digit }, + { ":xdigit:]", is_xdigit }, + { ":space:]", is_space }, + { ":punct:]", is_punct }, + { ":alnum:]", is_alnum }, + { ":print:]", is_print }, + { ":graph:]", is_graph }, + { ":cntrl:]", is_cntrl }, + { ":blank:]", is_blank }, + { 0 } }; +/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */ +#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_') + static int -looking_at(s) - char *s; +looking_at (char const *s) { - int len; + size_t len; len = strlen(s); if (lexleft < len) @@ -352,12 +421,14 @@ looking_at(s) } static token -lex() +lex (void) { token c, c1, c2; int backslash = 0, invert; charclass ccl; int i; + char lo[2]; + char hi[2]; /* Basic plan: We fetch a character. If it's a backslash, we set the backslash flag and go through the loop again. @@ -374,7 +445,7 @@ lex() if (backslash) goto normal_char; if (lexleft == 0) - dfaerror("Unfinished \\ escape"); + dfaerror(_("Unfinished \\ escape")); backslash = 1; break; @@ -420,23 +491,33 @@ lex() } goto normal_char; + case '`': + if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) + return lasttok = BEGLINE; /* FIXME: should be beginning of string */ + goto normal_char; + + case '\'': + if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) + return lasttok = ENDLINE; /* FIXME: should be end of string */ + goto normal_char; + case '<': - if (backslash) + if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) return lasttok = BEGWORD; goto normal_char; case '>': - if (backslash) + if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) return lasttok = ENDWORD; goto normal_char; case 'b': - if (backslash) + if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) return lasttok = LIMWORD; goto normal_char; case 'B': - if (backslash) + if (backslash && !(syntax_bits & RE_NO_GNU_OPS)) return lasttok = NOTLIMWORD; goto normal_char; @@ -470,44 +551,76 @@ lex() goto normal_char; if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0)) goto normal_char; - minrep = maxrep = 0; + if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart) + goto normal_char; + + if (syntax_bits & RE_NO_BK_BRACES) + { + /* Scan ahead for a valid interval; if it's not valid, + treat it as a literal '{'. */ + int lo = -1, hi = -1; + char const *p = lexptr; + char const *lim = p + lexleft; + for (; p != lim && ISASCIIDIGIT (*p); p++) + lo = (lo < 0 ? 0 : lo * 10) + *p - '0'; + if (p != lim && *p == ',') + while (++p != lim && ISASCIIDIGIT (*p)) + hi = (hi < 0 ? 0 : hi * 10) + *p - '0'; + else + hi = lo; + if (p == lim || *p != '}' + || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo)) + goto normal_char; + } + + minrep = 0; /* Cases: {M} - exact count {M,} - minimum count, maximum is infinity - {,M} - 0 through M {M,N} - M through N */ - FETCH(c, "unfinished repeat count"); - if (ISDIGIT(c)) + FETCH(c, _("unfinished repeat count")); + if (ISASCIIDIGIT (c)) { minrep = c - '0'; for (;;) { - FETCH(c, "unfinished repeat count"); - if (!ISDIGIT(c)) + FETCH(c, _("unfinished repeat count")); + if (! ISASCIIDIGIT (c)) break; minrep = 10 * minrep + c - '0'; } } - else if (c != ',') - dfaerror("malformed repeat count"); + else + dfaerror(_("malformed repeat count")); if (c == ',') - for (;;) - { - FETCH(c, "unfinished repeat count"); - if (!ISDIGIT(c)) - break; - maxrep = 10 * maxrep + c - '0'; - } + { + FETCH (c, _("unfinished repeat count")); + if (! ISASCIIDIGIT (c)) + maxrep = -1; + else + { + maxrep = c - '0'; + for (;;) + { + FETCH (c, _("unfinished repeat count")); + if (! ISASCIIDIGIT (c)) + break; + maxrep = 10 * maxrep + c - '0'; + } + if (0 <= maxrep && maxrep < minrep) + dfaerror (_("malformed repeat count")); + } + } else maxrep = minrep; if (!(syntax_bits & RE_NO_BK_BRACES)) { if (c != '\\') - dfaerror("malformed repeat count"); - FETCH(c, "unfinished repeat count"); + dfaerror(_("malformed repeat count")); + FETCH(c, _("unfinished repeat count")); } if (c != '}') - dfaerror("malformed repeat count"); + dfaerror(_("malformed repeat count")); laststart = 0; return lasttok = REPMN; @@ -549,7 +662,7 @@ lex() zeroset(ccl); notset(ccl); if (!(syntax_bits & RE_DOT_NEWLINE)) - clrbit('\n', ccl); + clrbit(eolbyte, ccl); if (syntax_bits & RE_DOT_NOT_NULL) clrbit('\0', ccl); laststart = 0; @@ -557,25 +670,25 @@ lex() case 'w': case 'W': - if (!backslash) + if (!backslash || (syntax_bits & RE_NO_GNU_OPS)) goto normal_char; zeroset(ccl); for (c2 = 0; c2 < NOTCHAR; ++c2) - if (ISALNUM(c2)) + if (IS_WORD_CONSTITUENT(c2)) setbit(c2, ccl); if (c == 'W') notset(ccl); laststart = 0; return lasttok = CSET + charclass_index(ccl); - + case '[': if (backslash) goto normal_char; zeroset(ccl); - FETCH(c, "Unbalanced ["); + FETCH(c, _("Unbalanced [")); if (c == '^') { - FETCH(c, "Unbalanced ["); + FETCH(c, _("Unbalanced [")); invert = 1; } else @@ -592,20 +705,25 @@ lex() for (c1 = 0; prednames[c1].name; ++c1) if (looking_at(prednames[c1].name)) { + int (*pred)() = prednames[c1].pred; + if (case_fold + && (pred == is_upper || pred == is_lower)) + pred = is_alpha; + for (c2 = 0; c2 < NOTCHAR; ++c2) - if ((*prednames[c1].pred)(c2)) + if ((*pred)(c2)) setbit(c2, ccl); lexptr += strlen(prednames[c1].name); lexleft -= strlen(prednames[c1].name); - FETCH(c1, "Unbalanced ["); + FETCH(c1, _("Unbalanced [")); goto skip; } if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) - FETCH(c, "Unbalanced ["); - FETCH(c1, "Unbalanced ["); + FETCH(c, _("Unbalanced [")); + FETCH(c1, _("Unbalanced [")); if (c1 == '-') { - FETCH(c2, "Unbalanced ["); + FETCH(c2, _("Unbalanced [")); if (c2 == ']') { /* In the case [x-], the - is an ordinary hyphen, @@ -618,22 +736,32 @@ lex() { if (c2 == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) - FETCH(c2, "Unbalanced ["); - FETCH(c1, "Unbalanced ["); + FETCH(c2, _("Unbalanced [")); + FETCH(c1, _("Unbalanced [")); } } else c2 = c; - while (c <= c2) + + lo[0] = c; lo[1] = '\0'; + hi[0] = c2; hi[1] = '\0'; + for (c = 0; c < NOTCHAR; c++) { - setbit(c, ccl); - if (case_fold) - if (ISUPPER(c)) - setbit(tolower(c), ccl); - else if (ISLOWER(c)) - setbit(toupper(c), ccl); - ++c; + char ch[2]; + ch[0] = c; ch[1] = '\0'; + if (strcoll (lo, ch) <= 0 && strcoll (ch, hi) <= 0) + { + setbit (c, ccl); + if (case_fold) + { + if (ISUPPER (c)) + setbit (tolower (c), ccl); + else if (ISLOWER (c)) + setbit (toupper (c), ccl); + } + } } + skip: ; } @@ -642,7 +770,7 @@ lex() { notset(ccl); if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) - clrbit('\n', ccl); + clrbit(eolbyte, ccl); } laststart = 0; return lasttok = CSET + charclass_index(ccl); @@ -667,12 +795,13 @@ lex() /* The above loop should consume at most a backslash and some other character. */ abort(); + return END; /* keeps pedantic compilers happy. */ } /* Recursive descent parser for regular expressions. */ static token tok; /* Lookahead token. */ -static depth; /* Current depth of a hypothetical stack +static int depth; /* Current depth of a hypothetical stack holding deferred productions. This is used to determine the depth that will be required of the real stack later on in @@ -681,8 +810,7 @@ static depth; /* Current depth of a hypothetical stack /* Add the given token to the parse tree, maintaining the depth count and updating the maximum depth if necessary. */ static void -addtok(t) - token t; +addtok (token t) { REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex); dfa->tokens[dfa->tindex++] = t; @@ -740,14 +868,8 @@ addtok(t) The parser builds a parse tree in postfix form in an array of tokens. */ -#if __STDC__ -static void regexp(int); -#else -static void regexp(); -#endif - static void -atom() +atom (void) { if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD @@ -761,7 +883,7 @@ atom() tok = lex(); regexp(0); if (tok != RPAREN) - dfaerror("Unbalanced ("); + dfaerror(_("Unbalanced (")); tok = lex(); } else @@ -770,7 +892,7 @@ atom() /* Return the number of tokens in the given subexpression. */ static int -nsubtoks(tindex) +nsubtoks (int tindex) { int ntoks1; @@ -792,8 +914,7 @@ nsubtoks(tindex) /* Copy the given subexpression to the top of the tree. */ static void -copytoks(tindex, ntokens) - int tindex, ntokens; +copytoks (int tindex, int ntokens) { int i; @@ -802,7 +923,7 @@ copytoks(tindex, ntokens) } static void -closure() +closure (void) { int tindex, ntokens, i; @@ -812,7 +933,7 @@ closure() { ntokens = nsubtoks(dfa->tindex); tindex = dfa->tindex - ntokens; - if (maxrep == 0) + if (maxrep < 0) addtok(PLUS); if (minrep == 0) addtok(QMARK); @@ -837,7 +958,7 @@ closure() } static void -branch() +branch (void) { closure(); while (tok != RPAREN && tok != OR && tok >= 0) @@ -848,8 +969,7 @@ branch() } static void -regexp(toplevel) - int toplevel; +regexp (int toplevel) { branch(); while (tok == OR) @@ -867,11 +987,7 @@ regexp(toplevel) length of the string, so s can include NUL characters. D is a pointer to the struct dfa to parse into. */ void -dfaparse(s, len, d) - char *s; - size_t len; - struct dfa *d; - +dfaparse (char *s, size_t len, struct dfa *d) { dfa = d; lexstart = lexptr = s; @@ -881,7 +997,7 @@ dfaparse(s, len, d) parens = 0; if (! syntax_bits_set) - dfaerror("No syntax specified"); + dfaerror(_("No syntax specified")); tok = lex(); depth = d->depth; @@ -889,7 +1005,7 @@ dfaparse(s, len, d) regexp(1); if (tok != END) - dfaerror("Unbalanced )"); + dfaerror(_("Unbalanced )")); addtok(END - d->nregexps); addtok(CAT); @@ -904,9 +1020,7 @@ dfaparse(s, len, d) /* Copy one set to another; the destination must be large enough. */ static void -copy(src, dst) - position_set *src; - position_set *dst; +copy (position_set *src, position_set *dst) { int i; @@ -920,15 +1034,13 @@ copy(src, dst) the same index then their constraints are logically or'd together. S->elems must point to an array large enough to hold the resulting set. */ static void -insert(p, s) - position p; - position_set *s; +insert (position p, position_set *s) { int i; position t1, t2; for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i) - ; + continue; if (i < s->nelem && p.index == s->elems[i].index) s->elems[i].constraint |= p.constraint; else @@ -947,10 +1059,7 @@ insert(p, s) /* Merge two sets of positions into a third. The result is exactly as if the positions of both sets were inserted into an initially empty set. */ static void -merge(s1, s2, m) - position_set *s1; - position_set *s2; - position_set *m; +merge (position_set *s1, position_set *s2, position_set *m) { int i = 0, j = 0; @@ -973,9 +1082,7 @@ merge(s1, s2, m) /* Delete a position from a set. */ static void -delete(p, s) - position p; - position_set *s; +delete (position p, position_set *s) { int i; @@ -992,11 +1099,7 @@ delete(p, s) state. Newline and letter tell whether we got here on a newline or letter, respectively. */ static int -state_index(d, s, newline, letter) - struct dfa *d; - position_set *s; - int newline; - int letter; +state_index (struct dfa *d, position_set *s, int newline, int letter) { int hash = 0; int constraint; @@ -1061,10 +1164,8 @@ state_index(d, s, newline, letter) that position with the elements of its follow labeled with an appropriate constraint. Repeat exhaustively until no funny positions are left. S->elems must be large enough to hold the result. */ -void -epsclosure(s, d) - position_set *s; - struct dfa *d; +static void +epsclosure (position_set *s, struct dfa *d) { int i, j; int *visited; @@ -1176,9 +1277,7 @@ epsclosure(s, d) scheme; the number of elements in each set deeper in the stack can be used to determine the address of a particular set's array. */ void -dfaanalyze(d, searchflag) - struct dfa *d; - int searchflag; +dfaanalyze (struct dfa *d, int searchflag) { int *nullable; /* Nullable stack. */ int *nfirstpos; /* Element count stack for firstpos sets. */ @@ -1439,10 +1538,7 @@ dfaanalyze(d, searchflag) create a new group labeled with the characters of C and insert this position in that group. */ void -dfastate(s, d, trans) - int s; - struct dfa *d; - int trans[]; +dfastate (int s, struct dfa *d, int trans[]) { position_set grps[NOTCHAR]; /* As many as will ever be needed. */ charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */ @@ -1463,7 +1559,7 @@ dfastate(s, d, trans) int state_newline; /* New state on a newline transition. */ int wants_letter; /* New state wants to know letter context. */ int state_letter; /* New state on a letter transition. */ - static initialized; /* Flag for static initialization. */ + static int initialized; /* Flag for static initialization. */ int i, j, k; /* Initialize the set of letters, if necessary. */ @@ -1471,9 +1567,9 @@ dfastate(s, d, trans) { initialized = 1; for (i = 0; i < NOTCHAR; ++i) - if (ISALNUM(i)) + if (IS_WORD_CONSTITUENT(i)) setbit(i, letters); - setbit('\n', newline); + setbit(eolbyte, newline); } zeroset(matches); @@ -1494,7 +1590,7 @@ dfastate(s, d, trans) { if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, d->states[s].newline, 1)) - clrbit('\n', matches); + clrbit(eolbyte, matches); if (! MATCHES_NEWLINE_CONTEXT(pos.constraint, d->states[s].newline, 0)) for (j = 0; j < CHARCLASS_INTS; ++j) @@ -1510,7 +1606,7 @@ dfastate(s, d, trans) /* If there are no characters left, there's no point in going on. */ for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j) - ; + continue; if (j == CHARCLASS_INTS) continue; } @@ -1528,7 +1624,7 @@ dfastate(s, d, trans) matches. */ intersectf = 0; for (k = 0; k < CHARCLASS_INTS; ++k) - (intersect[k] = matches[k] & labels[j][k]) ? intersectf = 1 : 0; + (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0; if (! intersectf) continue; @@ -1539,8 +1635,8 @@ dfastate(s, d, trans) /* Even an optimizing compiler can't know this for sure. */ int match = matches[k], label = labels[j][k]; - (leftovers[k] = ~match & label) ? leftoversf = 1 : 0; - (matches[k] = match & ~label) ? matchesf = 1 : 0; + (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0; + (matches[k] = match & ~label) ? (matchesf = 1) : 0; } /* If there were leftovers, create a new group labeled with them. */ @@ -1604,12 +1700,8 @@ dfastate(s, d, trans) else state_letter = state; for (i = 0; i < NOTCHAR; ++i) - if (i == '\n') - trans[i] = state_newline; - else if (ISALNUM(i)) - trans[i] = state_letter; - else - trans[i] = state; + trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state; + trans[eolbyte] = state_newline; } else for (i = 0; i < NOTCHAR; ++i) @@ -1633,7 +1725,7 @@ dfastate(s, d, trans) /* Find out if the new state will want any context information. */ wants_newline = 0; - if (tstbit('\n', labels[i])) + if (tstbit(eolbyte, labels[i])) for (j = 0; j < follows.nelem; ++j) if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint)) wants_newline = 1; @@ -1665,9 +1757,9 @@ dfastate(s, d, trans) { int c = j * INTBITS + k; - if (c == '\n') + if (c == eolbyte) trans[c] = state_newline; - else if (ISALNUM(c)) + else if (IS_WORD_CONSTITUENT(c)) trans[c] = state_letter; else if (c < NOTCHAR) trans[c] = state; @@ -1688,9 +1780,7 @@ dfastate(s, d, trans) TODO: Improve this comment, get rid of the unnecessary redundancy. */ static void -build_state(s, d) - int s; - struct dfa *d; +build_state (int s, struct dfa *d) { int *trans; /* The new transition table. */ int i; @@ -1756,8 +1846,8 @@ build_state(s, d) /* Keep the newline transition in a special place so we can use it as a sentinel. */ - d->newlines[s] = trans['\n']; - trans['\n'] = -1; + d->newlines[s] = trans[eolbyte]; + trans[eolbyte] = -1; if (ACCEPTING(s, *d)) d->fails[s] = trans; @@ -1766,8 +1856,7 @@ build_state(s, d) } static void -build_state_zero(d) - struct dfa *d; +build_state_zero (struct dfa *d) { d->tralloc = 1; d->trcount = 0; @@ -1793,20 +1882,16 @@ build_state_zero(d) match needs to be verified by a backtracking matcher. Otherwise we store a 0 in *backref. */ char * -dfaexec(d, begin, end, newline, count, backref) - struct dfa *d; - char *begin; - char *end; - int newline; - int *count; - int *backref; +dfaexec (struct dfa *d, char *begin, char *end, + int newline, int *count, int *backref) { - register s, s1, tmp; /* Current state. */ + register int s, s1, tmp; /* Current state. */ register unsigned char *p; /* Current input character. */ - register **trans, *t; /* Copy of d->trans so it can be optimized + register int **trans, *t; /* Copy of d->trans so it can be optimized into a register. */ - static sbit[NOTCHAR]; /* Table for anding with d->success. */ - static sbit_init; + register unsigned char eol = eolbyte; /* Likewise for eolbyte. */ + static int sbit[NOTCHAR]; /* Table for anding with d->success. */ + static int sbit_init; if (! sbit_init) { @@ -1814,12 +1899,8 @@ dfaexec(d, begin, end, newline, count, backref) sbit_init = 1; for (i = 0; i < NOTCHAR; ++i) - if (i == '\n') - sbit[i] = 4; - else if (ISALNUM(i)) - sbit[i] = 2; - else - sbit[i] = 1; + sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1; + sbit[eol] = 4; } if (! d->tralloc) @@ -1828,34 +1909,25 @@ dfaexec(d, begin, end, newline, count, backref) s = s1 = 0; p = (unsigned char *) begin; trans = d->trans; - *end = '\n'; + *end = eol; for (;;) { - /* The dreaded inner loop. */ - if ((t = trans[s]) != 0) - do - { - s1 = t[*p++]; - if (! (t = trans[s1])) - goto last_was_s; - s = t[*p++]; - } - while ((t = trans[s]) != 0); - goto last_was_s1; - last_was_s: - tmp = s, s = s1, s1 = tmp; - last_was_s1: + while ((t = trans[s]) != 0) { /* hand-optimized loop */ + s1 = t[*p++]; + if ((t = trans[s1]) == 0) { + tmp = s ; s = s1 ; s1 = tmp ; /* swap */ + break; + } + s = t[*p++]; + } if (s >= 0 && p <= (unsigned char *) end && d->fails[s]) { if (d->success[s] & sbit[*p]) { if (backref) - if (d->states[s].backref) - *backref = 1; - else - *backref = 0; + *backref = (d->states[s].backref != 0); return (char *) p; } @@ -1865,7 +1937,7 @@ dfaexec(d, begin, end, newline, count, backref) } /* If the previous character was a newline, count it. */ - if (count && (char *) p <= end && p[-1] == '\n') + if (count && (char *) p <= end && p[-1] == eol) ++*count; /* Check if we've run off the end of the buffer. */ @@ -1879,7 +1951,7 @@ dfaexec(d, begin, end, newline, count, backref) continue; } - if (p[-1] == '\n' && newline) + if (p[-1] == eol && newline) { s = d->newlines[s1]; continue; @@ -1892,8 +1964,7 @@ dfaexec(d, begin, end, newline, count, backref) /* Initialize the components of a dfa that the other routines don't initialize for themselves. */ void -dfainit(d) - struct dfa *d; +dfainit (struct dfa *d) { d->calloc = 1; MALLOC(d->charclasses, charclass, d->calloc); @@ -1911,32 +1982,28 @@ dfainit(d) /* Parse and analyze a single string of the given length. */ void -dfacomp(s, len, d, searchflag) - char *s; - size_t len; - struct dfa *d; - int searchflag; +dfacomp (char *s, size_t len, struct dfa *d, int searchflag) { if (case_fold) /* dummy folding in service of dfamust() */ { - char *copy; + char *lcopy; int i; - copy = malloc(len); - if (!copy) - dfaerror("out of memory"); - + lcopy = malloc(len); + if (!lcopy) + dfaerror(_("out of memory")); + /* This is a kludge. */ case_fold = 0; for (i = 0; i < len; ++i) - if (ISUPPER(s[i])) - copy[i] = tolower(s[i]); + if (ISUPPER ((unsigned char) s[i])) + lcopy[i] = tolower ((unsigned char) s[i]); else - copy[i] = s[i]; + lcopy[i] = s[i]; dfainit(d); - dfaparse(copy, len, d); - free(copy); + dfaparse(lcopy, len, d); + free(lcopy); dfamust(d); d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0; case_fold = 1; @@ -1954,8 +2021,7 @@ dfacomp(s, len, d, searchflag) /* Free the storage held by the components of a dfa. */ void -dfafree(d) - struct dfa *d; +dfafree (struct dfa *d) { int i; struct dfamust *dm, *ndm; @@ -1974,9 +2040,10 @@ dfafree(d) free((ptr_t) d->trans[i]); else if (d->fails[i]) free((ptr_t) d->fails[i]); - free((ptr_t) d->realtrans); - free((ptr_t) d->fails); - free((ptr_t) d->newlines); + if (d->realtrans) free((ptr_t) d->realtrans); + if (d->fails) free((ptr_t) d->fails); + if (d->newlines) free((ptr_t) d->newlines); + if (d->success) free((ptr_t) d->success); for (dm = d->musts; dm; dm = ndm) { ndm = dm->next; @@ -2015,9 +2082,9 @@ dfafree(d) Type left right is in ---- ---- ----- -- -- char c # c # c # c # c - + CSET ZERO ZERO ZERO ZERO - + STAR ZERO ZERO ZERO ZERO QMARK ZERO ZERO ZERO ZERO @@ -2028,12 +2095,12 @@ dfafree(d) p->left : q->right : q->is!=ZERO) ? q->in plus p->is##q->left p->right##q->is p->is##q->is : p->right##q->left ZERO - + OR longest common longest common (do p->is and substrings common to leading trailing q->is have same p->in and q->in - (sub)sequence (sub)sequence length and - of p->left of p->right content) ? - and q->left and q->right p->is : NULL + (sub)sequence (sub)sequence length and + of p->left of p->right content) ? + and q->left and q->right p->is : NULL If there's anything else we recognize in the tree, all four sequences get set to zero-length sequences. If there's something we don't recognize in the tree, @@ -2060,18 +2127,16 @@ dfafree(d) Does optimization actually accomplish anything, or is the automaton you get from "psi|epsilon" (for example) the same as the one you get from "psi" (for example)? - + Are optimizable r.e.'s likely to be used in real-life situations (something like 'ab*' is probably unlikely; something like is 'psi|epsilon' is likelier)? */ static char * -icatalloc(old, new) - char *old; - char *new; +icatalloc (char *old, char *new) { char *result; - int oldsize, newsize; + size_t oldsize, newsize; newsize = (new == NULL) ? 0 : strlen(new); if (old == NULL) @@ -2089,19 +2154,16 @@ icatalloc(old, new) } static char * -icpyalloc(string) - char *string; +icpyalloc (char *string) { return icatalloc((char *) NULL, string); } static char * -istrstr(lookin, lookfor) - char *lookin; - char *lookfor; +istrstr (char *lookin, char *lookfor) { char *cp; - int len; + size_t len; len = strlen(lookfor); for (cp = lookin; *cp != '\0'; ++cp) @@ -2111,16 +2173,14 @@ istrstr(lookin, lookfor) } static void -ifree(cp) - char *cp; +ifree (char *cp) { if (cp != NULL) free(cp); } static void -freelist(cpp) - char **cpp; +freelist (char **cpp) { int i; @@ -2134,10 +2194,7 @@ freelist(cpp) } static char ** -enlist(cpp, new, len) - char **cpp; - char *new; - int len; +enlist (char **cpp, char *new, size_t len) { int i, j; @@ -2182,14 +2239,12 @@ enlist(cpp, new, len) list of their distinct common substrings. Return NULL if something seems wild. */ static char ** -comsubs(left, right) - char *left; - char *right; +comsubs (char *left, char *right) { char **cpp; char *lcp; char *rcp; - int i, len; + size_t i, len; if (left == NULL || right == NULL) return NULL; @@ -2204,7 +2259,7 @@ comsubs(left, right) while (rcp != NULL) { for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) - ; + continue; if (i > len) len = i; rcp = index(rcp + 1, *lcp); @@ -2218,9 +2273,7 @@ comsubs(left, right) } static char ** -addlists(old, new) -char **old; -char **new; +addlists (char **old, char **new) { int i; @@ -2238,9 +2291,7 @@ char **new; /* Given two lists of substrings, return a new list giving substrings common to both. */ static char ** -inboth(left, right) - char **left; - char **right; +inboth (char **left, char **right) { char **both; char **temp; @@ -2264,6 +2315,7 @@ inboth(left, right) } both = addlists(both, temp); freelist(temp); + free(temp); if (both == NULL) return NULL; } @@ -2280,16 +2332,14 @@ typedef struct } must; static void -resetmust(mp) -must *mp; +resetmust (must *mp) { mp->left[0] = mp->right[0] = mp->is[0] = '\0'; freelist(mp->in); } static void -dfamust(dfa) -struct dfa *dfa; +dfamust (struct dfa *dfa) { must *musts; must *mp; @@ -2300,8 +2350,9 @@ struct dfa *dfa; token t; static must must0; struct dfamust *dm; + static char empty_string[] = ""; - result = ""; + result = empty_string; exact = 0; musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts); if (musts == NULL) @@ -2488,7 +2539,7 @@ struct dfa *dfa; resetmust(mp); mp->is[0] = mp->left[0] = mp->right[0] = t; mp->is[1] = mp->left[1] = mp->right[1] = '\0'; - mp->in = enlist(mp->in, mp->is, 1); + mp->in = enlist(mp->in, mp->is, (size_t)1); if (mp->in == NULL) goto done; } |