summaryrefslogtreecommitdiff
path: root/gnu/usr.bin/grep/dfa.c
diff options
context:
space:
mode:
Diffstat (limited to 'gnu/usr.bin/grep/dfa.c')
-rw-r--r--gnu/usr.bin/grep/dfa.c681
1 files changed, 366 insertions, 315 deletions
diff --git a/gnu/usr.bin/grep/dfa.c b/gnu/usr.bin/grep/dfa.c
index 4a454a9664a..048e901c5e8 100644
--- a/gnu/usr.bin/grep/dfa.c
+++ b/gnu/usr.bin/grep/dfa.c
@@ -1,5 +1,5 @@
/* dfa.c - deterministic extended regexp routines for GNU
- Copyright (C) 1988 Free Software Foundation, Inc.
+ Copyright 1988, 1998, 2000 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -13,23 +13,23 @@
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
-
-#ifndef lint
-static char rcsid[] = "$Id: dfa.c,v 1.1 1995/10/18 08:40:17 deraadt Exp $";
-#endif /* not lint */
+ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA */
/* Written June, 1988 by Mike Haertel
Modified July, 1988 by Arthur David Olson to assist BMG speedups */
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
+#include <sys/types.h>
#ifdef STDC_HEADERS
#include <stdlib.h>
#else
-#include <sys/types.h>
extern char *calloc(), *malloc(), *realloc();
extern void free();
#endif
@@ -42,23 +42,16 @@ extern void free();
#include <strings.h>
#endif
+#ifndef DEBUG /* use the same approach as regex.c */
+#undef assert
+#define assert(e)
+#endif /* DEBUG */
+
#ifndef isgraph
#define isgraph(C) (isprint(C) && !isspace(C))
#endif
-#ifdef isascii
-#define ISALPHA(C) (isascii(C) && isalpha(C))
-#define ISUPPER(C) (isascii(C) && isupper(C))
-#define ISLOWER(C) (isascii(C) && islower(C))
-#define ISDIGIT(C) (isascii(C) && isdigit(C))
-#define ISXDIGIT(C) (isascii(C) && isxdigit(C))
-#define ISSPACE(C) (isascii(C) && isspace(C))
-#define ISPUNCT(C) (isascii(C) && ispunct(C))
-#define ISALNUM(C) (isascii(C) && isalnum(C))
-#define ISPRINT(C) (isascii(C) && isprint(C))
-#define ISGRAPH(C) (isascii(C) && isgraph(C))
-#define ISCNTRL(C) (isascii(C) && iscntrl(C))
-#else
+#if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
#define ISALPHA(C) isalpha(C)
#define ISUPPER(C) isupper(C)
#define ISLOWER(C) islower(C)
@@ -70,57 +63,130 @@ extern void free();
#define ISPRINT(C) isprint(C)
#define ISGRAPH(C) isgraph(C)
#define ISCNTRL(C) iscntrl(C)
+#else
+#define ISALPHA(C) (isascii(C) && isalpha(C))
+#define ISUPPER(C) (isascii(C) && isupper(C))
+#define ISLOWER(C) (isascii(C) && islower(C))
+#define ISDIGIT(C) (isascii(C) && isdigit(C))
+#define ISXDIGIT(C) (isascii(C) && isxdigit(C))
+#define ISSPACE(C) (isascii(C) && isspace(C))
+#define ISPUNCT(C) (isascii(C) && ispunct(C))
+#define ISALNUM(C) (isascii(C) && isalnum(C))
+#define ISPRINT(C) (isascii(C) && isprint(C))
+#define ISGRAPH(C) (isascii(C) && isgraph(C))
+#define ISCNTRL(C) (isascii(C) && iscntrl(C))
+#endif
+
+/* ISASCIIDIGIT differs from ISDIGIT, as follows:
+ - Its arg may be any int or unsigned int; it need not be an unsigned char.
+ - It's guaranteed to evaluate its argument exactly once.
+ - It's typically faster.
+ Posix 1003.2-1992 section 2.5.2.1 page 50 lines 1556-1558 says that
+ only '0' through '9' are digits. Prefer ISASCIIDIGIT to ISDIGIT unless
+ it's important to use the locale's definition of `digit' even when the
+ host does not conform to Posix. */
+#define ISASCIIDIGIT(c) ((unsigned) (c) - '0' <= 9)
+
+/* If we (don't) have I18N. */
+/* glibc defines _ */
+#ifndef _
+# ifdef HAVE_LIBINTL_H
+# include <libintl.h>
+# ifndef _
+# define _(Str) gettext (Str)
+# endif
+# else
+# define _(Str) (Str)
+# endif
#endif
-#include "dfa.h"
#include "regex.h"
+#include "dfa.h"
-#if __STDC__
-typedef void *ptr_t;
-#else
-typedef char *ptr_t;
+/* HPUX, define those as macros in sys/param.h */
+#ifdef setbit
+# undef setbit
+#endif
+#ifdef clrbit
+# undef clrbit
#endif
-static void dfamust();
+static void dfamust PARAMS ((struct dfa *dfa));
+
+static ptr_t xcalloc PARAMS ((size_t n, size_t s));
+static ptr_t xmalloc PARAMS ((size_t n));
+static ptr_t xrealloc PARAMS ((ptr_t p, size_t n));
+#ifdef DEBUG
+static void prtok PARAMS ((token t));
+#endif
+static int tstbit PARAMS ((int b, charclass c));
+static void setbit PARAMS ((int b, charclass c));
+static void clrbit PARAMS ((int b, charclass c));
+static void copyset PARAMS ((charclass src, charclass dst));
+static void zeroset PARAMS ((charclass s));
+static void notset PARAMS ((charclass s));
+static int equal PARAMS ((charclass s1, charclass s2));
+static int charclass_index PARAMS ((charclass s));
+static int looking_at PARAMS ((const char *s));
+static token lex PARAMS ((void));
+static void addtok PARAMS ((token t));
+static void atom PARAMS ((void));
+static int nsubtoks PARAMS ((int tindex));
+static void copytoks PARAMS ((int tindex, int ntokens));
+static void closure PARAMS ((void));
+static void branch PARAMS ((void));
+static void regexp PARAMS ((int toplevel));
+static void copy PARAMS ((position_set *src, position_set *dst));
+static void insert PARAMS ((position p, position_set *s));
+static void merge PARAMS ((position_set *s1, position_set *s2, position_set *m));
+static void delete PARAMS ((position p, position_set *s));
+static int state_index PARAMS ((struct dfa *d, position_set *s,
+ int newline, int letter));
+static void build_state PARAMS ((int s, struct dfa *d));
+static void build_state_zero PARAMS ((struct dfa *d));
+static char *icatalloc PARAMS ((char *old, char *new));
+static char *icpyalloc PARAMS ((char *string));
+static char *istrstr PARAMS ((char *lookin, char *lookfor));
+static void ifree PARAMS ((char *cp));
+static void freelist PARAMS ((char **cpp));
+static char **enlist PARAMS ((char **cpp, char *new, size_t len));
+static char **comsubs PARAMS ((char *left, char *right));
+static char **addlists PARAMS ((char **old, char **new));
+static char **inboth PARAMS ((char **left, char **right));
static ptr_t
-xcalloc(n, s)
- int n;
- size_t s;
+xcalloc (size_t n, size_t s)
{
ptr_t r = calloc(n, s);
if (!r)
- dfaerror("Memory exhausted");
+ dfaerror(_("Memory exhausted"));
return r;
}
static ptr_t
-xmalloc(n)
- size_t n;
+xmalloc (size_t n)
{
ptr_t r = malloc(n);
assert(n != 0);
if (!r)
- dfaerror("Memory exhausted");
+ dfaerror(_("Memory exhausted"));
return r;
}
static ptr_t
-xrealloc(p, n)
- ptr_t p;
- size_t n;
+xrealloc (ptr_t p, size_t n)
{
ptr_t r = realloc(p, n);
assert(n != 0);
if (!r)
- dfaerror("Memory exhausted");
+ dfaerror(_("Memory exhausted"));
return r;
}
-#define CALLOC(p, t, n) ((p) = (t *) xcalloc((n), sizeof (t)))
+#define CALLOC(p, t, n) ((p) = (t *) xcalloc((size_t)(n), sizeof (t)))
#define MALLOC(p, t, n) ((p) = (t *) xmalloc((n) * sizeof (t)))
#define REALLOC(p, t, n) ((p) = (t *) xrealloc((ptr_t) (p), (n) * sizeof (t)))
@@ -136,8 +202,7 @@ xrealloc(p, n)
#ifdef DEBUG
static void
-prtok(t)
- token t;
+prtok (token t)
{
char *s;
@@ -175,33 +240,25 @@ prtok(t)
/* Stuff pertaining to charclasses. */
static int
-tstbit(b, c)
- int b;
- charclass c;
+tstbit (int b, charclass c)
{
return c[b / INTBITS] & 1 << b % INTBITS;
}
static void
-setbit(b, c)
- int b;
- charclass c;
+setbit (int b, charclass c)
{
c[b / INTBITS] |= 1 << b % INTBITS;
}
static void
-clrbit(b, c)
- int b;
- charclass c;
+clrbit (int b, charclass c)
{
c[b / INTBITS] &= ~(1 << b % INTBITS);
}
static void
-copyset(src, dst)
- charclass src;
- charclass dst;
+copyset (charclass src, charclass dst)
{
int i;
@@ -210,8 +267,7 @@ copyset(src, dst)
}
static void
-zeroset(s)
- charclass s;
+zeroset (charclass s)
{
int i;
@@ -220,8 +276,7 @@ zeroset(s)
}
static void
-notset(s)
- charclass s;
+notset (charclass s)
{
int i;
@@ -230,9 +285,7 @@ notset(s)
}
static int
-equal(s1, s2)
- charclass s1;
- charclass s2;
+equal (charclass s1, charclass s2)
{
int i;
@@ -247,8 +300,7 @@ static struct dfa *dfa;
/* Find the index of charclass s in dfa->charclasses, or allocate a new charclass. */
static int
-charclass_index(s)
- charclass s;
+charclass_index (charclass s)
{
int i;
@@ -262,20 +314,22 @@ charclass_index(s)
}
/* Syntax bits controlling the behavior of the lexical analyzer. */
-static int syntax_bits, syntax_bits_set;
+static reg_syntax_t syntax_bits, syntax_bits_set;
/* Flag for case-folding letters into sets. */
static int case_fold;
+/* End-of-line byte in data. */
+static unsigned char eolbyte;
+
/* Entry point to set syntax options. */
void
-dfasyntax(bits, fold)
- int bits;
- int fold;
+dfasyntax (reg_syntax_t bits, int fold, int eol)
{
syntax_bits_set = 1;
syntax_bits = bits;
case_fold = fold;
+ eolbyte = eol;
}
/* Lexical analyzer. All the dross that deals with the obnoxious
@@ -285,7 +339,7 @@ dfasyntax(bits, fold)
static char *lexstart; /* Pointer to beginning of input string. */
static char *lexptr; /* Pointer to next input character. */
-static lexleft; /* Number of characters remaining. */
+static int lexleft; /* Number of characters remaining. */
static token lasttok; /* Previous token returned; initially END. */
static int laststart; /* True if we're separated from beginning or (, |
only by zero-width characters. */
@@ -296,15 +350,21 @@ static int minrep, maxrep; /* Repeat counts for {m,n}. */
#define FETCH(c, eoferr) \
{ \
if (! lexleft) \
- if (eoferr != 0) \
- dfaerror(eoferr); \
- else \
- return END; \
+ { \
+ if (eoferr != 0) \
+ dfaerror (eoferr); \
+ else \
+ return lasttok = END; \
+ } \
(c) = (unsigned char) *lexptr++; \
--lexleft; \
}
+#ifdef __STDC__
+#define FUNC(F, P) static int F(int c) { return P(c); }
+#else
#define FUNC(F, P) static int F(c) int c; { return P(c); }
+#endif
FUNC(is_alpha, ISALPHA)
FUNC(is_upper, ISUPPER)
@@ -318,32 +378,41 @@ FUNC(is_print, ISPRINT)
FUNC(is_graph, ISGRAPH)
FUNC(is_cntrl, ISCNTRL)
+static int
+is_blank (int c)
+{
+ return (c == ' ' || c == '\t');
+}
+
/* The following list maps the names of the Posix named character classes
to predicate functions that determine whether a given character is in
the class. The leading [ has already been eaten by the lexical analyzer. */
static struct {
- char *name;
- int (*pred)();
+ const char *name;
+ int (*pred) PARAMS ((int));
} prednames[] = {
- ":alpha:]", is_alpha,
- ":upper:]", is_upper,
- ":lower:]", is_lower,
- ":digit:]", is_digit,
- ":xdigit:]", is_xdigit,
- ":space:]", is_space,
- ":punct:]", is_punct,
- ":alnum:]", is_alnum,
- ":print:]", is_print,
- ":graph:]", is_graph,
- ":cntrl:]", is_cntrl,
- 0
+ { ":alpha:]", is_alpha },
+ { ":upper:]", is_upper },
+ { ":lower:]", is_lower },
+ { ":digit:]", is_digit },
+ { ":xdigit:]", is_xdigit },
+ { ":space:]", is_space },
+ { ":punct:]", is_punct },
+ { ":alnum:]", is_alnum },
+ { ":print:]", is_print },
+ { ":graph:]", is_graph },
+ { ":cntrl:]", is_cntrl },
+ { ":blank:]", is_blank },
+ { 0 }
};
+/* Return non-zero if C is a `word-constituent' byte; zero otherwise. */
+#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
+
static int
-looking_at(s)
- char *s;
+looking_at (char const *s)
{
- int len;
+ size_t len;
len = strlen(s);
if (lexleft < len)
@@ -352,12 +421,14 @@ looking_at(s)
}
static token
-lex()
+lex (void)
{
token c, c1, c2;
int backslash = 0, invert;
charclass ccl;
int i;
+ char lo[2];
+ char hi[2];
/* Basic plan: We fetch a character. If it's a backslash,
we set the backslash flag and go through the loop again.
@@ -374,7 +445,7 @@ lex()
if (backslash)
goto normal_char;
if (lexleft == 0)
- dfaerror("Unfinished \\ escape");
+ dfaerror(_("Unfinished \\ escape"));
backslash = 1;
break;
@@ -420,23 +491,33 @@ lex()
}
goto normal_char;
+ case '`':
+ if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
+ return lasttok = BEGLINE; /* FIXME: should be beginning of string */
+ goto normal_char;
+
+ case '\'':
+ if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
+ return lasttok = ENDLINE; /* FIXME: should be end of string */
+ goto normal_char;
+
case '<':
- if (backslash)
+ if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
return lasttok = BEGWORD;
goto normal_char;
case '>':
- if (backslash)
+ if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
return lasttok = ENDWORD;
goto normal_char;
case 'b':
- if (backslash)
+ if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
return lasttok = LIMWORD;
goto normal_char;
case 'B':
- if (backslash)
+ if (backslash && !(syntax_bits & RE_NO_GNU_OPS))
return lasttok = NOTLIMWORD;
goto normal_char;
@@ -470,44 +551,76 @@ lex()
goto normal_char;
if (backslash != ((syntax_bits & RE_NO_BK_BRACES) == 0))
goto normal_char;
- minrep = maxrep = 0;
+ if (!(syntax_bits & RE_CONTEXT_INDEP_OPS) && laststart)
+ goto normal_char;
+
+ if (syntax_bits & RE_NO_BK_BRACES)
+ {
+ /* Scan ahead for a valid interval; if it's not valid,
+ treat it as a literal '{'. */
+ int lo = -1, hi = -1;
+ char const *p = lexptr;
+ char const *lim = p + lexleft;
+ for (; p != lim && ISASCIIDIGIT (*p); p++)
+ lo = (lo < 0 ? 0 : lo * 10) + *p - '0';
+ if (p != lim && *p == ',')
+ while (++p != lim && ISASCIIDIGIT (*p))
+ hi = (hi < 0 ? 0 : hi * 10) + *p - '0';
+ else
+ hi = lo;
+ if (p == lim || *p != '}'
+ || lo < 0 || RE_DUP_MAX < hi || (0 <= hi && hi < lo))
+ goto normal_char;
+ }
+
+ minrep = 0;
/* Cases:
{M} - exact count
{M,} - minimum count, maximum is infinity
- {,M} - 0 through M
{M,N} - M through N */
- FETCH(c, "unfinished repeat count");
- if (ISDIGIT(c))
+ FETCH(c, _("unfinished repeat count"));
+ if (ISASCIIDIGIT (c))
{
minrep = c - '0';
for (;;)
{
- FETCH(c, "unfinished repeat count");
- if (!ISDIGIT(c))
+ FETCH(c, _("unfinished repeat count"));
+ if (! ISASCIIDIGIT (c))
break;
minrep = 10 * minrep + c - '0';
}
}
- else if (c != ',')
- dfaerror("malformed repeat count");
+ else
+ dfaerror(_("malformed repeat count"));
if (c == ',')
- for (;;)
- {
- FETCH(c, "unfinished repeat count");
- if (!ISDIGIT(c))
- break;
- maxrep = 10 * maxrep + c - '0';
- }
+ {
+ FETCH (c, _("unfinished repeat count"));
+ if (! ISASCIIDIGIT (c))
+ maxrep = -1;
+ else
+ {
+ maxrep = c - '0';
+ for (;;)
+ {
+ FETCH (c, _("unfinished repeat count"));
+ if (! ISASCIIDIGIT (c))
+ break;
+ maxrep = 10 * maxrep + c - '0';
+ }
+ if (0 <= maxrep && maxrep < minrep)
+ dfaerror (_("malformed repeat count"));
+ }
+ }
else
maxrep = minrep;
if (!(syntax_bits & RE_NO_BK_BRACES))
{
if (c != '\\')
- dfaerror("malformed repeat count");
- FETCH(c, "unfinished repeat count");
+ dfaerror(_("malformed repeat count"));
+ FETCH(c, _("unfinished repeat count"));
}
if (c != '}')
- dfaerror("malformed repeat count");
+ dfaerror(_("malformed repeat count"));
laststart = 0;
return lasttok = REPMN;
@@ -549,7 +662,7 @@ lex()
zeroset(ccl);
notset(ccl);
if (!(syntax_bits & RE_DOT_NEWLINE))
- clrbit('\n', ccl);
+ clrbit(eolbyte, ccl);
if (syntax_bits & RE_DOT_NOT_NULL)
clrbit('\0', ccl);
laststart = 0;
@@ -557,25 +670,25 @@ lex()
case 'w':
case 'W':
- if (!backslash)
+ if (!backslash || (syntax_bits & RE_NO_GNU_OPS))
goto normal_char;
zeroset(ccl);
for (c2 = 0; c2 < NOTCHAR; ++c2)
- if (ISALNUM(c2))
+ if (IS_WORD_CONSTITUENT(c2))
setbit(c2, ccl);
if (c == 'W')
notset(ccl);
laststart = 0;
return lasttok = CSET + charclass_index(ccl);
-
+
case '[':
if (backslash)
goto normal_char;
zeroset(ccl);
- FETCH(c, "Unbalanced [");
+ FETCH(c, _("Unbalanced ["));
if (c == '^')
{
- FETCH(c, "Unbalanced [");
+ FETCH(c, _("Unbalanced ["));
invert = 1;
}
else
@@ -592,20 +705,25 @@ lex()
for (c1 = 0; prednames[c1].name; ++c1)
if (looking_at(prednames[c1].name))
{
+ int (*pred)() = prednames[c1].pred;
+ if (case_fold
+ && (pred == is_upper || pred == is_lower))
+ pred = is_alpha;
+
for (c2 = 0; c2 < NOTCHAR; ++c2)
- if ((*prednames[c1].pred)(c2))
+ if ((*pred)(c2))
setbit(c2, ccl);
lexptr += strlen(prednames[c1].name);
lexleft -= strlen(prednames[c1].name);
- FETCH(c1, "Unbalanced [");
+ FETCH(c1, _("Unbalanced ["));
goto skip;
}
if (c == '\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
- FETCH(c, "Unbalanced [");
- FETCH(c1, "Unbalanced [");
+ FETCH(c, _("Unbalanced ["));
+ FETCH(c1, _("Unbalanced ["));
if (c1 == '-')
{
- FETCH(c2, "Unbalanced [");
+ FETCH(c2, _("Unbalanced ["));
if (c2 == ']')
{
/* In the case [x-], the - is an ordinary hyphen,
@@ -618,22 +736,32 @@ lex()
{
if (c2 == '\\'
&& (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS))
- FETCH(c2, "Unbalanced [");
- FETCH(c1, "Unbalanced [");
+ FETCH(c2, _("Unbalanced ["));
+ FETCH(c1, _("Unbalanced ["));
}
}
else
c2 = c;
- while (c <= c2)
+
+ lo[0] = c; lo[1] = '\0';
+ hi[0] = c2; hi[1] = '\0';
+ for (c = 0; c < NOTCHAR; c++)
{
- setbit(c, ccl);
- if (case_fold)
- if (ISUPPER(c))
- setbit(tolower(c), ccl);
- else if (ISLOWER(c))
- setbit(toupper(c), ccl);
- ++c;
+ char ch[2];
+ ch[0] = c; ch[1] = '\0';
+ if (strcoll (lo, ch) <= 0 && strcoll (ch, hi) <= 0)
+ {
+ setbit (c, ccl);
+ if (case_fold)
+ {
+ if (ISUPPER (c))
+ setbit (tolower (c), ccl);
+ else if (ISLOWER (c))
+ setbit (toupper (c), ccl);
+ }
+ }
}
+
skip:
;
}
@@ -642,7 +770,7 @@ lex()
{
notset(ccl);
if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE)
- clrbit('\n', ccl);
+ clrbit(eolbyte, ccl);
}
laststart = 0;
return lasttok = CSET + charclass_index(ccl);
@@ -667,12 +795,13 @@ lex()
/* The above loop should consume at most a backslash
and some other character. */
abort();
+ return END; /* keeps pedantic compilers happy. */
}
/* Recursive descent parser for regular expressions. */
static token tok; /* Lookahead token. */
-static depth; /* Current depth of a hypothetical stack
+static int depth; /* Current depth of a hypothetical stack
holding deferred productions. This is
used to determine the depth that will be
required of the real stack later on in
@@ -681,8 +810,7 @@ static depth; /* Current depth of a hypothetical stack
/* Add the given token to the parse tree, maintaining the depth count and
updating the maximum depth if necessary. */
static void
-addtok(t)
- token t;
+addtok (token t)
{
REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex);
dfa->tokens[dfa->tindex++] = t;
@@ -740,14 +868,8 @@ addtok(t)
The parser builds a parse tree in postfix form in an array of tokens. */
-#if __STDC__
-static void regexp(int);
-#else
-static void regexp();
-#endif
-
static void
-atom()
+atom (void)
{
if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF
|| tok == BEGLINE || tok == ENDLINE || tok == BEGWORD
@@ -761,7 +883,7 @@ atom()
tok = lex();
regexp(0);
if (tok != RPAREN)
- dfaerror("Unbalanced (");
+ dfaerror(_("Unbalanced ("));
tok = lex();
}
else
@@ -770,7 +892,7 @@ atom()
/* Return the number of tokens in the given subexpression. */
static int
-nsubtoks(tindex)
+nsubtoks (int tindex)
{
int ntoks1;
@@ -792,8 +914,7 @@ nsubtoks(tindex)
/* Copy the given subexpression to the top of the tree. */
static void
-copytoks(tindex, ntokens)
- int tindex, ntokens;
+copytoks (int tindex, int ntokens)
{
int i;
@@ -802,7 +923,7 @@ copytoks(tindex, ntokens)
}
static void
-closure()
+closure (void)
{
int tindex, ntokens, i;
@@ -812,7 +933,7 @@ closure()
{
ntokens = nsubtoks(dfa->tindex);
tindex = dfa->tindex - ntokens;
- if (maxrep == 0)
+ if (maxrep < 0)
addtok(PLUS);
if (minrep == 0)
addtok(QMARK);
@@ -837,7 +958,7 @@ closure()
}
static void
-branch()
+branch (void)
{
closure();
while (tok != RPAREN && tok != OR && tok >= 0)
@@ -848,8 +969,7 @@ branch()
}
static void
-regexp(toplevel)
- int toplevel;
+regexp (int toplevel)
{
branch();
while (tok == OR)
@@ -867,11 +987,7 @@ regexp(toplevel)
length of the string, so s can include NUL characters. D is a pointer to
the struct dfa to parse into. */
void
-dfaparse(s, len, d)
- char *s;
- size_t len;
- struct dfa *d;
-
+dfaparse (char *s, size_t len, struct dfa *d)
{
dfa = d;
lexstart = lexptr = s;
@@ -881,7 +997,7 @@ dfaparse(s, len, d)
parens = 0;
if (! syntax_bits_set)
- dfaerror("No syntax specified");
+ dfaerror(_("No syntax specified"));
tok = lex();
depth = d->depth;
@@ -889,7 +1005,7 @@ dfaparse(s, len, d)
regexp(1);
if (tok != END)
- dfaerror("Unbalanced )");
+ dfaerror(_("Unbalanced )"));
addtok(END - d->nregexps);
addtok(CAT);
@@ -904,9 +1020,7 @@ dfaparse(s, len, d)
/* Copy one set to another; the destination must be large enough. */
static void
-copy(src, dst)
- position_set *src;
- position_set *dst;
+copy (position_set *src, position_set *dst)
{
int i;
@@ -920,15 +1034,13 @@ copy(src, dst)
the same index then their constraints are logically or'd together.
S->elems must point to an array large enough to hold the resulting set. */
static void
-insert(p, s)
- position p;
- position_set *s;
+insert (position p, position_set *s)
{
int i;
position t1, t2;
for (i = 0; i < s->nelem && p.index < s->elems[i].index; ++i)
- ;
+ continue;
if (i < s->nelem && p.index == s->elems[i].index)
s->elems[i].constraint |= p.constraint;
else
@@ -947,10 +1059,7 @@ insert(p, s)
/* Merge two sets of positions into a third. The result is exactly as if
the positions of both sets were inserted into an initially empty set. */
static void
-merge(s1, s2, m)
- position_set *s1;
- position_set *s2;
- position_set *m;
+merge (position_set *s1, position_set *s2, position_set *m)
{
int i = 0, j = 0;
@@ -973,9 +1082,7 @@ merge(s1, s2, m)
/* Delete a position from a set. */
static void
-delete(p, s)
- position p;
- position_set *s;
+delete (position p, position_set *s)
{
int i;
@@ -992,11 +1099,7 @@ delete(p, s)
state. Newline and letter tell whether we got here on a newline or
letter, respectively. */
static int
-state_index(d, s, newline, letter)
- struct dfa *d;
- position_set *s;
- int newline;
- int letter;
+state_index (struct dfa *d, position_set *s, int newline, int letter)
{
int hash = 0;
int constraint;
@@ -1061,10 +1164,8 @@ state_index(d, s, newline, letter)
that position with the elements of its follow labeled with an appropriate
constraint. Repeat exhaustively until no funny positions are left.
S->elems must be large enough to hold the result. */
-void
-epsclosure(s, d)
- position_set *s;
- struct dfa *d;
+static void
+epsclosure (position_set *s, struct dfa *d)
{
int i, j;
int *visited;
@@ -1176,9 +1277,7 @@ epsclosure(s, d)
scheme; the number of elements in each set deeper in the stack can be
used to determine the address of a particular set's array. */
void
-dfaanalyze(d, searchflag)
- struct dfa *d;
- int searchflag;
+dfaanalyze (struct dfa *d, int searchflag)
{
int *nullable; /* Nullable stack. */
int *nfirstpos; /* Element count stack for firstpos sets. */
@@ -1439,10 +1538,7 @@ dfaanalyze(d, searchflag)
create a new group labeled with the characters of C and insert this
position in that group. */
void
-dfastate(s, d, trans)
- int s;
- struct dfa *d;
- int trans[];
+dfastate (int s, struct dfa *d, int trans[])
{
position_set grps[NOTCHAR]; /* As many as will ever be needed. */
charclass labels[NOTCHAR]; /* Labels corresponding to the groups. */
@@ -1463,7 +1559,7 @@ dfastate(s, d, trans)
int state_newline; /* New state on a newline transition. */
int wants_letter; /* New state wants to know letter context. */
int state_letter; /* New state on a letter transition. */
- static initialized; /* Flag for static initialization. */
+ static int initialized; /* Flag for static initialization. */
int i, j, k;
/* Initialize the set of letters, if necessary. */
@@ -1471,9 +1567,9 @@ dfastate(s, d, trans)
{
initialized = 1;
for (i = 0; i < NOTCHAR; ++i)
- if (ISALNUM(i))
+ if (IS_WORD_CONSTITUENT(i))
setbit(i, letters);
- setbit('\n', newline);
+ setbit(eolbyte, newline);
}
zeroset(matches);
@@ -1494,7 +1590,7 @@ dfastate(s, d, trans)
{
if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
d->states[s].newline, 1))
- clrbit('\n', matches);
+ clrbit(eolbyte, matches);
if (! MATCHES_NEWLINE_CONTEXT(pos.constraint,
d->states[s].newline, 0))
for (j = 0; j < CHARCLASS_INTS; ++j)
@@ -1510,7 +1606,7 @@ dfastate(s, d, trans)
/* If there are no characters left, there's no point in going on. */
for (j = 0; j < CHARCLASS_INTS && !matches[j]; ++j)
- ;
+ continue;
if (j == CHARCLASS_INTS)
continue;
}
@@ -1528,7 +1624,7 @@ dfastate(s, d, trans)
matches. */
intersectf = 0;
for (k = 0; k < CHARCLASS_INTS; ++k)
- (intersect[k] = matches[k] & labels[j][k]) ? intersectf = 1 : 0;
+ (intersect[k] = matches[k] & labels[j][k]) ? (intersectf = 1) : 0;
if (! intersectf)
continue;
@@ -1539,8 +1635,8 @@ dfastate(s, d, trans)
/* Even an optimizing compiler can't know this for sure. */
int match = matches[k], label = labels[j][k];
- (leftovers[k] = ~match & label) ? leftoversf = 1 : 0;
- (matches[k] = match & ~label) ? matchesf = 1 : 0;
+ (leftovers[k] = ~match & label) ? (leftoversf = 1) : 0;
+ (matches[k] = match & ~label) ? (matchesf = 1) : 0;
}
/* If there were leftovers, create a new group labeled with them. */
@@ -1604,12 +1700,8 @@ dfastate(s, d, trans)
else
state_letter = state;
for (i = 0; i < NOTCHAR; ++i)
- if (i == '\n')
- trans[i] = state_newline;
- else if (ISALNUM(i))
- trans[i] = state_letter;
- else
- trans[i] = state;
+ trans[i] = (IS_WORD_CONSTITUENT(i)) ? state_letter : state;
+ trans[eolbyte] = state_newline;
}
else
for (i = 0; i < NOTCHAR; ++i)
@@ -1633,7 +1725,7 @@ dfastate(s, d, trans)
/* Find out if the new state will want any context information. */
wants_newline = 0;
- if (tstbit('\n', labels[i]))
+ if (tstbit(eolbyte, labels[i]))
for (j = 0; j < follows.nelem; ++j)
if (PREV_NEWLINE_DEPENDENT(follows.elems[j].constraint))
wants_newline = 1;
@@ -1665,9 +1757,9 @@ dfastate(s, d, trans)
{
int c = j * INTBITS + k;
- if (c == '\n')
+ if (c == eolbyte)
trans[c] = state_newline;
- else if (ISALNUM(c))
+ else if (IS_WORD_CONSTITUENT(c))
trans[c] = state_letter;
else if (c < NOTCHAR)
trans[c] = state;
@@ -1688,9 +1780,7 @@ dfastate(s, d, trans)
TODO: Improve this comment, get rid of the unnecessary redundancy. */
static void
-build_state(s, d)
- int s;
- struct dfa *d;
+build_state (int s, struct dfa *d)
{
int *trans; /* The new transition table. */
int i;
@@ -1756,8 +1846,8 @@ build_state(s, d)
/* Keep the newline transition in a special place so we can use it as
a sentinel. */
- d->newlines[s] = trans['\n'];
- trans['\n'] = -1;
+ d->newlines[s] = trans[eolbyte];
+ trans[eolbyte] = -1;
if (ACCEPTING(s, *d))
d->fails[s] = trans;
@@ -1766,8 +1856,7 @@ build_state(s, d)
}
static void
-build_state_zero(d)
- struct dfa *d;
+build_state_zero (struct dfa *d)
{
d->tralloc = 1;
d->trcount = 0;
@@ -1793,20 +1882,16 @@ build_state_zero(d)
match needs to be verified by a backtracking matcher. Otherwise
we store a 0 in *backref. */
char *
-dfaexec(d, begin, end, newline, count, backref)
- struct dfa *d;
- char *begin;
- char *end;
- int newline;
- int *count;
- int *backref;
+dfaexec (struct dfa *d, char *begin, char *end,
+ int newline, int *count, int *backref)
{
- register s, s1, tmp; /* Current state. */
+ register int s, s1, tmp; /* Current state. */
register unsigned char *p; /* Current input character. */
- register **trans, *t; /* Copy of d->trans so it can be optimized
+ register int **trans, *t; /* Copy of d->trans so it can be optimized
into a register. */
- static sbit[NOTCHAR]; /* Table for anding with d->success. */
- static sbit_init;
+ register unsigned char eol = eolbyte; /* Likewise for eolbyte. */
+ static int sbit[NOTCHAR]; /* Table for anding with d->success. */
+ static int sbit_init;
if (! sbit_init)
{
@@ -1814,12 +1899,8 @@ dfaexec(d, begin, end, newline, count, backref)
sbit_init = 1;
for (i = 0; i < NOTCHAR; ++i)
- if (i == '\n')
- sbit[i] = 4;
- else if (ISALNUM(i))
- sbit[i] = 2;
- else
- sbit[i] = 1;
+ sbit[i] = (IS_WORD_CONSTITUENT(i)) ? 2 : 1;
+ sbit[eol] = 4;
}
if (! d->tralloc)
@@ -1828,34 +1909,25 @@ dfaexec(d, begin, end, newline, count, backref)
s = s1 = 0;
p = (unsigned char *) begin;
trans = d->trans;
- *end = '\n';
+ *end = eol;
for (;;)
{
- /* The dreaded inner loop. */
- if ((t = trans[s]) != 0)
- do
- {
- s1 = t[*p++];
- if (! (t = trans[s1]))
- goto last_was_s;
- s = t[*p++];
- }
- while ((t = trans[s]) != 0);
- goto last_was_s1;
- last_was_s:
- tmp = s, s = s1, s1 = tmp;
- last_was_s1:
+ while ((t = trans[s]) != 0) { /* hand-optimized loop */
+ s1 = t[*p++];
+ if ((t = trans[s1]) == 0) {
+ tmp = s ; s = s1 ; s1 = tmp ; /* swap */
+ break;
+ }
+ s = t[*p++];
+ }
if (s >= 0 && p <= (unsigned char *) end && d->fails[s])
{
if (d->success[s] & sbit[*p])
{
if (backref)
- if (d->states[s].backref)
- *backref = 1;
- else
- *backref = 0;
+ *backref = (d->states[s].backref != 0);
return (char *) p;
}
@@ -1865,7 +1937,7 @@ dfaexec(d, begin, end, newline, count, backref)
}
/* If the previous character was a newline, count it. */
- if (count && (char *) p <= end && p[-1] == '\n')
+ if (count && (char *) p <= end && p[-1] == eol)
++*count;
/* Check if we've run off the end of the buffer. */
@@ -1879,7 +1951,7 @@ dfaexec(d, begin, end, newline, count, backref)
continue;
}
- if (p[-1] == '\n' && newline)
+ if (p[-1] == eol && newline)
{
s = d->newlines[s1];
continue;
@@ -1892,8 +1964,7 @@ dfaexec(d, begin, end, newline, count, backref)
/* Initialize the components of a dfa that the other routines don't
initialize for themselves. */
void
-dfainit(d)
- struct dfa *d;
+dfainit (struct dfa *d)
{
d->calloc = 1;
MALLOC(d->charclasses, charclass, d->calloc);
@@ -1911,32 +1982,28 @@ dfainit(d)
/* Parse and analyze a single string of the given length. */
void
-dfacomp(s, len, d, searchflag)
- char *s;
- size_t len;
- struct dfa *d;
- int searchflag;
+dfacomp (char *s, size_t len, struct dfa *d, int searchflag)
{
if (case_fold) /* dummy folding in service of dfamust() */
{
- char *copy;
+ char *lcopy;
int i;
- copy = malloc(len);
- if (!copy)
- dfaerror("out of memory");
-
+ lcopy = malloc(len);
+ if (!lcopy)
+ dfaerror(_("out of memory"));
+
/* This is a kludge. */
case_fold = 0;
for (i = 0; i < len; ++i)
- if (ISUPPER(s[i]))
- copy[i] = tolower(s[i]);
+ if (ISUPPER ((unsigned char) s[i]))
+ lcopy[i] = tolower ((unsigned char) s[i]);
else
- copy[i] = s[i];
+ lcopy[i] = s[i];
dfainit(d);
- dfaparse(copy, len, d);
- free(copy);
+ dfaparse(lcopy, len, d);
+ free(lcopy);
dfamust(d);
d->cindex = d->tindex = d->depth = d->nleaves = d->nregexps = 0;
case_fold = 1;
@@ -1954,8 +2021,7 @@ dfacomp(s, len, d, searchflag)
/* Free the storage held by the components of a dfa. */
void
-dfafree(d)
- struct dfa *d;
+dfafree (struct dfa *d)
{
int i;
struct dfamust *dm, *ndm;
@@ -1974,9 +2040,10 @@ dfafree(d)
free((ptr_t) d->trans[i]);
else if (d->fails[i])
free((ptr_t) d->fails[i]);
- free((ptr_t) d->realtrans);
- free((ptr_t) d->fails);
- free((ptr_t) d->newlines);
+ if (d->realtrans) free((ptr_t) d->realtrans);
+ if (d->fails) free((ptr_t) d->fails);
+ if (d->newlines) free((ptr_t) d->newlines);
+ if (d->success) free((ptr_t) d->success);
for (dm = d->musts; dm; dm = ndm)
{
ndm = dm->next;
@@ -2015,9 +2082,9 @@ dfafree(d)
Type left right is in
---- ---- ----- -- --
char c # c # c # c # c
-
+
CSET ZERO ZERO ZERO ZERO
-
+
STAR ZERO ZERO ZERO ZERO
QMARK ZERO ZERO ZERO ZERO
@@ -2028,12 +2095,12 @@ dfafree(d)
p->left : q->right : q->is!=ZERO) ? q->in plus
p->is##q->left p->right##q->is p->is##q->is : p->right##q->left
ZERO
-
+
OR longest common longest common (do p->is and substrings common to
leading trailing q->is have same p->in and q->in
- (sub)sequence (sub)sequence length and
- of p->left of p->right content) ?
- and q->left and q->right p->is : NULL
+ (sub)sequence (sub)sequence length and
+ of p->left of p->right content) ?
+ and q->left and q->right p->is : NULL
If there's anything else we recognize in the tree, all four sequences get set
to zero-length sequences. If there's something we don't recognize in the tree,
@@ -2060,18 +2127,16 @@ dfafree(d)
Does optimization actually accomplish anything,
or is the automaton you get from "psi|epsilon" (for example)
the same as the one you get from "psi" (for example)?
-
+
Are optimizable r.e.'s likely to be used in real-life situations
(something like 'ab*' is probably unlikely; something like is
'psi|epsilon' is likelier)? */
static char *
-icatalloc(old, new)
- char *old;
- char *new;
+icatalloc (char *old, char *new)
{
char *result;
- int oldsize, newsize;
+ size_t oldsize, newsize;
newsize = (new == NULL) ? 0 : strlen(new);
if (old == NULL)
@@ -2089,19 +2154,16 @@ icatalloc(old, new)
}
static char *
-icpyalloc(string)
- char *string;
+icpyalloc (char *string)
{
return icatalloc((char *) NULL, string);
}
static char *
-istrstr(lookin, lookfor)
- char *lookin;
- char *lookfor;
+istrstr (char *lookin, char *lookfor)
{
char *cp;
- int len;
+ size_t len;
len = strlen(lookfor);
for (cp = lookin; *cp != '\0'; ++cp)
@@ -2111,16 +2173,14 @@ istrstr(lookin, lookfor)
}
static void
-ifree(cp)
- char *cp;
+ifree (char *cp)
{
if (cp != NULL)
free(cp);
}
static void
-freelist(cpp)
- char **cpp;
+freelist (char **cpp)
{
int i;
@@ -2134,10 +2194,7 @@ freelist(cpp)
}
static char **
-enlist(cpp, new, len)
- char **cpp;
- char *new;
- int len;
+enlist (char **cpp, char *new, size_t len)
{
int i, j;
@@ -2182,14 +2239,12 @@ enlist(cpp, new, len)
list of their distinct common substrings. Return NULL if something
seems wild. */
static char **
-comsubs(left, right)
- char *left;
- char *right;
+comsubs (char *left, char *right)
{
char **cpp;
char *lcp;
char *rcp;
- int i, len;
+ size_t i, len;
if (left == NULL || right == NULL)
return NULL;
@@ -2204,7 +2259,7 @@ comsubs(left, right)
while (rcp != NULL)
{
for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i)
- ;
+ continue;
if (i > len)
len = i;
rcp = index(rcp + 1, *lcp);
@@ -2218,9 +2273,7 @@ comsubs(left, right)
}
static char **
-addlists(old, new)
-char **old;
-char **new;
+addlists (char **old, char **new)
{
int i;
@@ -2238,9 +2291,7 @@ char **new;
/* Given two lists of substrings, return a new list giving substrings
common to both. */
static char **
-inboth(left, right)
- char **left;
- char **right;
+inboth (char **left, char **right)
{
char **both;
char **temp;
@@ -2264,6 +2315,7 @@ inboth(left, right)
}
both = addlists(both, temp);
freelist(temp);
+ free(temp);
if (both == NULL)
return NULL;
}
@@ -2280,16 +2332,14 @@ typedef struct
} must;
static void
-resetmust(mp)
-must *mp;
+resetmust (must *mp)
{
mp->left[0] = mp->right[0] = mp->is[0] = '\0';
freelist(mp->in);
}
static void
-dfamust(dfa)
-struct dfa *dfa;
+dfamust (struct dfa *dfa)
{
must *musts;
must *mp;
@@ -2300,8 +2350,9 @@ struct dfa *dfa;
token t;
static must must0;
struct dfamust *dm;
+ static char empty_string[] = "";
- result = "";
+ result = empty_string;
exact = 0;
musts = (must *) malloc((dfa->tindex + 1) * sizeof *musts);
if (musts == NULL)
@@ -2488,7 +2539,7 @@ struct dfa *dfa;
resetmust(mp);
mp->is[0] = mp->left[0] = mp->right[0] = t;
mp->is[1] = mp->left[1] = mp->right[1] = '\0';
- mp->in = enlist(mp->in, mp->is, 1);
+ mp->in = enlist(mp->in, mp->is, (size_t)1);
if (mp->in == NULL)
goto done;
}