summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTed Unangst <tedu@cvs.openbsd.org>2003-06-23 22:05:24 +0000
committerTed Unangst <tedu@cvs.openbsd.org>2003-06-23 22:05:24 +0000
commit943ad3c82d880acc9851f5086b4475401e539e42 (patch)
treee7102624316dbb10b7087ef49e74566a50b23c60
parent1e3542cd612d942114a8eac614daad44324691a8 (diff)
faster grep for simple patterns. derived from a patch by sean farley.
this makes searching for constant strings much faster by avoiding regex. ok deraadt@
-rw-r--r--usr.bin/grep/grep.c43
-rw-r--r--usr.bin/grep/grep.h20
-rw-r--r--usr.bin/grep/mmfile.c23
-rw-r--r--usr.bin/grep/util.c250
4 files changed, 322 insertions, 14 deletions
diff --git a/usr.bin/grep/grep.c b/usr.bin/grep/grep.c
index ffc333d817b..bc75324af47 100644
--- a/usr.bin/grep/grep.c
+++ b/usr.bin/grep/grep.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: grep.c,v 1.12 2003/06/23 07:52:18 deraadt Exp $ */
+/* $OpenBSD: grep.c,v 1.13 2003/06/23 22:05:23 tedu Exp $ */
/*-
* Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
@@ -49,6 +49,7 @@ int matchall; /* shortcut */
int patterns, pattern_sz;
char **pattern;
regex_t *r_pattern;
+fastgrep_t *fg_pattern;
/* For regex errors */
char re_error[RE_ERROR_BUF + 1];
@@ -93,6 +94,8 @@ enum {
int first; /* flag whether or not this is our first match */
int tail; /* lines left to print */
int lead; /* number of lines in leading context queue */
+int boleol; /* At least one pattern has a bol or eol */
+int maxPatternLen; /* Longest length of all patterns */
extern char *__progname;
@@ -163,13 +166,16 @@ add_pattern(char *pat, size_t len)
}
if (patterns == pattern_sz) {
pattern_sz *= 2;
- pattern = grep_realloc(pattern, ++pattern_sz);
+ pattern = grep_realloc(pattern, ++pattern_sz * sizeof(int));
}
if (pat[len - 1] == '\n')
--len;
pattern[patterns] = grep_malloc(len + 1);
strlcpy(pattern[patterns], pat, len + 1);
++patterns;
+
+ if (len > maxPatternLen)
+ maxPatternLen = len;
}
static void
@@ -200,6 +206,24 @@ read_patterns(char *fn)
fclose(f);
}
+static void
+free_patterns(void)
+{
+ int i;
+
+ for (i = 0; i < patterns; i++) {
+ if (fg_pattern[i].pattern)
+ free(fg_pattern[i].pattern);
+ else
+ regfree(&r_pattern[i]);
+ free(pattern[i]);
+ }
+
+ free(fg_pattern);
+ free(r_pattern);
+ free(pattern);
+}
+
int
main(int argc, char *argv[])
{
@@ -293,6 +317,7 @@ main(int argc, char *argv[])
break;
case 'i':
case 'y':
+ iflag = 1;
cflags |= REG_ICASE;
break;
case 'l':
@@ -382,11 +407,17 @@ main(int argc, char *argv[])
}
cflags |= Eflag ? REG_EXTENDED : REG_BASIC;
+ fg_pattern = grep_malloc(patterns * sizeof(*fg_pattern));
r_pattern = grep_malloc(patterns * sizeof(regex_t));
for (i = 0; i < patterns; ++i) {
- if ((c = regcomp(&r_pattern[i], pattern[i], cflags))) {
- regerror(c, &r_pattern[i], re_error, RE_ERROR_BUF);
- errx(1, "%s", re_error);
+ /* Check if cheating is allowed */
+ if (fastcomp(&fg_pattern[i], pattern[i])) {
+ /* Fall back to full regex library */
+ if ((c = regcomp(&r_pattern[i], pattern[i], cflags))) {
+ regerror(c, &r_pattern[i], re_error,
+ RE_ERROR_BUF);
+ errx(1, "%s", re_error);
+ }
}
}
@@ -402,5 +433,7 @@ main(int argc, char *argv[])
for (c = 0; argc--; ++argv)
c += procfile(*argv);
+ free_patterns();
+
exit(!c);
}
diff --git a/usr.bin/grep/grep.h b/usr.bin/grep/grep.h
index d9981e3808b..782343ae066 100644
--- a/usr.bin/grep/grep.h
+++ b/usr.bin/grep/grep.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: grep.h,v 1.3 2003/06/23 00:55:09 tedu Exp $ */
+/* $OpenBSD: grep.h,v 1.4 2003/06/23 22:05:23 tedu Exp $ */
/*-
* Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
@@ -27,6 +27,7 @@
*/
#include <sys/types.h>
+#include <sys/limits.h>
#include <regex.h>
#include <stdio.h>
@@ -47,17 +48,28 @@ typedef struct {
char *dat;
} str_t;
+typedef struct {
+ unsigned char *pattern;
+ int patternLen;
+ int qsBc[UCHAR_MAX + 1];
+ /* flags */
+ int bol;
+ int eol;
+ int reversedSearch;
+} fastgrep_t;
+
/* Flags passed to regcomp() and regexec() */
extern int cflags, eflags;
/* Command line flags */
extern int Aflag, Bflag, Hflag, Lflag, Pflag, Sflag, Rflag, Zflag,
- bflag, cflag, hflag, lflag, nflag, qflag, sflag,
+ bflag, cflag, hflag, iflag, lflag, nflag, qflag, sflag,
vflag, wflag, xflag;
-extern int binbehave;
+extern int binbehave, boleol, maxPatternLen;
extern int first, lead, matchall, patterns, tail;
extern char **pattern;
+extern fastgrep_t *fg_pattern;
extern regex_t *r_pattern;
/* For regex errors */
@@ -69,7 +81,9 @@ int procfile(char *fn);
int grep_tree(char **argv);
void *grep_malloc(size_t size);
void *grep_realloc(void *ptr, size_t size);
+unsigned char *grep_strdup(const char *);
void printline(str_t *line, int sep);
+int fastcomp(fastgrep_t *, const char *);
/* queue.c */
void initqueue();
diff --git a/usr.bin/grep/mmfile.c b/usr.bin/grep/mmfile.c
index d58b213b631..2d66ddd671b 100644
--- a/usr.bin/grep/mmfile.c
+++ b/usr.bin/grep/mmfile.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: mmfile.c,v 1.4 2003/06/23 07:52:18 deraadt Exp $ */
+/* $OpenBSD: mmfile.c,v 1.5 2003/06/23 22:05:23 tedu Exp $ */
/*-
* Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
@@ -38,6 +38,7 @@
#include "grep.h"
#define MAX_MAP_LEN 1048576
+#define BLOCKSIZE 32768
mmf_t *
mmopen(char *fn, char *mode)
@@ -88,9 +89,23 @@ mmfgetln(mmf_t *mmf, size_t *l)
if (mmf->ptr >= mmf->end)
return NULL;
- for (p = mmf->ptr; mmf->ptr < mmf->end; ++mmf->ptr)
- if (*mmf->ptr == '\n')
- break;
+ if ((lflag || qflag) &! boleol) {
+ /* Find starting point to search. */
+ if (mmf->ptr == mmf->base)
+ p = mmf->ptr;
+ else
+ p = mmf->ptr - maxPatternLen;
+ /* Move the start pointer ahead for next iteration */
+ if (mmf->end - mmf->ptr > BLOCKSIZE)
+ mmf->ptr += BLOCKSIZE;
+ else
+ mmf->ptr = mmf->end;
+ } else {
+ for (p = mmf->ptr; mmf->ptr < mmf->end; ++mmf->ptr)
+ if (*mmf->ptr == '\n')
+ break;
+ }
+
*l = mmf->ptr - p;
++mmf->ptr;
return p;
diff --git a/usr.bin/grep/util.c b/usr.bin/grep/util.c
index 3ff11188f12..42959fb4447 100644
--- a/usr.bin/grep/util.c
+++ b/usr.bin/grep/util.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: util.c,v 1.5 2003/06/23 07:52:18 deraadt Exp $ */
+/* $OpenBSD: util.c,v 1.6 2003/06/23 22:05:23 tedu Exp $ */
/*-
* Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
@@ -48,6 +48,9 @@
static int linesqueued;
static int procline(str_t *l, int);
+static int grep_search(fastgrep_t *, unsigned char *, int);
+static int grep_cmp(const unsigned char *, const unsigned char *, size_t);
+static void grep_revstr(unsigned char *, int);
int
grep_tree(char **argv)
@@ -177,7 +180,11 @@ procline(str_t *l, int nottext)
pmatch.rm_so = 0;
pmatch.rm_eo = l->len;
for (c = i = 0; i < patterns; i++) {
- r = regexec(&r_pattern[i], l->dat, 0, &pmatch, eflags);
+ if (fg_pattern[i].pattern)
+ r = grep_search(&fg_pattern[i], (unsigned char *)l->dat,
+ l->len);
+ else
+ r = regexec(&r_pattern[i], l->dat, 0, &pmatch, eflags);
if (r == REG_NOMATCH && t == 0)
continue;
if (r == 0) {
@@ -222,6 +229,202 @@ print:
return c;
}
+/*
+ * Returns: -1 on failure
+ * 0 on success
+ */
+int
+fastcomp(fastgrep_t *fg, const char *pattern)
+{
+ int i;
+ int bol = 0;
+ int eol = 0;
+ int origPatternLen;
+ int shiftPatternLen;
+ int hasDot = 0;
+ int firstHalfDot = -1;
+ int firstLastHalfDot = -1;
+ int lastHalfDot = 0;
+
+ /* Initialize. */
+ origPatternLen = fg->patternLen = strlen(pattern);
+ fg->bol = 0;
+ fg->eol = 0;
+ fg->reversedSearch = 0;
+
+ /* Remove end-of-line character ('$'). */
+ if (pattern[fg->patternLen - 1] == '$') {
+ eol++;
+ fg->eol = 1;
+ fg->patternLen--;
+ boleol = 1;
+ }
+
+ /* Remove beginning-of-line character ('^'). */
+ if (pattern[0] == '^') {
+ bol++;
+ fg->bol = 1;
+ fg->patternLen--;
+ boleol = 1;
+ }
+
+ /*
+ * Copy pattern minus '^' and '$' characters at the beginning and ending of
+ * the string respectively.
+ */
+ fg->pattern = grep_strdup(pattern + bol);
+
+ /* Look for ways to cheat...er...avoid the full regex engine. */
+ for (i = 0; i < fg->patternLen; i++)
+ {
+ /* Can still cheat? */
+ if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) ||
+ (fg->pattern[i] == '_') || (fg->pattern[i] == ',') ||
+ (fg->pattern[i] == '^') || (fg->pattern[i] == '$') ||
+ (fg->pattern[i] == '=') || (fg->pattern[i] == '-') ||
+ (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) {
+ /* As long as it is good, upper case it for later. */
+ if (iflag)
+ fg->pattern[i] = toupper(fg->pattern[i]);
+ } else if (fg->pattern[i] == '.') {
+ hasDot = i;
+ if (i < fg->patternLen / 2) {
+ if (firstHalfDot < -1)
+ /* Closest dot to the beginning */
+ firstHalfDot = i;
+ } else {
+ /* Closest dot to the end of the pattern. */
+ lastHalfDot = i;
+ if (firstLastHalfDot < 0)
+ firstLastHalfDot = i;
+ }
+ } else {
+ /* Free memory and let others know this is empty. */
+ free(fg->pattern);
+ fg->pattern = NULL;
+ return (-1);
+ }
+ }
+
+ /*
+ * Determine if a reverse search would be faster based on the placement
+ * of the dots.
+ */
+ if ((!(lflag || cflag)) && ((!(bol || eol)) &&
+ ((lastHalfDot) && ((firstHalfDot < 0) ||
+ ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
+ fg->reversedSearch = 1;
+ hasDot = fg->patternLen - (firstHalfDot < 0 ?
+ firstLastHalfDot : firstHalfDot) - 1;
+ grep_revstr(fg->pattern, fg->patternLen);
+ }
+
+ /*
+ * Normal Quick Search would require a shift based on the position the
+ * next character after the comparison is within the pattern. With
+ * wildcards, the position of the last dot effects the maximum shift
+ * distance.
+ * The closer to the end the wild card is the slower the search. A
+ * reverse version of this algorithm would be useful for wildcards near
+ * the end of the string.
+ *
+ * Examples:
+ * Pattern Max shift
+ * ------- ---------
+ * this 5
+ * .his 4
+ * t.is 3
+ * th.s 2
+ * thi. 1
+ */
+
+ /* Adjust the shift based on location of the last dot ('.'). */
+ shiftPatternLen = fg->patternLen - hasDot;
+
+ /* Preprocess pattern. */
+ for (i = 0; i <= UCHAR_MAX; i++)
+ fg->qsBc[i] = shiftPatternLen;
+ for (i = hasDot + 1; i < fg->patternLen; i++) {
+ fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
+ /*
+ * If case is ignored, make the jump apply to both upper and
+ * lower cased characters. As the pattern is stored in upper
+ * case, apply the same to the lower case equivalents.
+ */
+ if (iflag)
+ fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
+ }
+
+ /*
+ * Put pattern back to normal after pre-processing to allow for easy
+ * comparisons later.
+ */
+ if (fg->reversedSearch)
+ grep_revstr(fg->pattern, fg->patternLen);
+
+ return (0);
+}
+
+static int grep_search(fastgrep_t *fg, unsigned char *data, int dataLen)
+{
+ int j;
+ int rtrnVal = REG_NOMATCH;
+
+ /* No point in going farther if we do not have enough data. */
+ if (dataLen < fg->patternLen)
+ return (rtrnVal);
+
+ /* Only try once at the beginning or ending of the line. */
+ if (fg->bol || fg->eol) {
+ /* Simple text comparison. */
+ /* Verify data is >= pattern length before searching on it. */
+ if (dataLen >= fg->patternLen) {
+ /* Determine where in data to start search at. */
+ if (fg->eol)
+ j = dataLen - fg->patternLen;
+ else
+ j = 0;
+ if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen)))
+ if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1)
+ rtrnVal = 0;
+ }
+ } else if (fg->reversedSearch) {
+ /* Quick Search algorithm. */
+ j = dataLen;
+ do {
+ if (grep_cmp(fg->pattern, data + j - fg->patternLen,
+ fg->patternLen) == -1) {
+ rtrnVal = 0;
+ break;
+ }
+
+ /* Shift if within bounds, otherwise, we are done. */
+ if (j == 0)
+ break;
+ else
+ j -= fg->qsBc[data[j - fg->patternLen - 1]];
+ } while (j >= 0);
+ } else {
+ /* Quick Search algorithm. */
+ j = 0;
+ do {
+ if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
+ rtrnVal = 0;
+ break;
+ }
+
+ /* Shift if within bounds, otherwise, we are done. */
+ if (j + fg->patternLen == dataLen)
+ break;
+ else
+ j += fg->qsBc[data[j + fg->patternLen]];
+ } while (j <= (dataLen - fg->patternLen));
+ }
+
+ return (rtrnVal);
+}
+
+
void *
grep_malloc(size_t size)
{
@@ -240,6 +443,49 @@ grep_realloc(void *ptr, size_t size)
return ptr;
}
+unsigned char *
+grep_strdup(const char *str)
+{
+ unsigned char *ptr;
+
+ if ((ptr = (unsigned char *)strdup(str)) == NULL)
+ err(1, "strdup");
+ return ptr;
+}
+
+/*
+ * Returns: i >= 0 on failure (position that it failed)
+ * -1 on success
+ */
+int
+grep_cmp(const unsigned char *pattern, const unsigned char *data,
+ size_t len)
+{
+ int i;
+
+ for (i = 0; i < len; i++) {
+ if (((pattern[i] == data[i]) || (pattern[i] == '.')) ||
+ (iflag && pattern[i] == toupper(data[i])))
+ continue;
+ return (i);
+ }
+
+ return (-1);
+}
+
+static void
+grep_revstr(unsigned char *str, int len)
+{
+ int i;
+ char c;
+
+ for (i = 0; i < len / 2; i++) {
+ c = str[i];
+ str[i] = str[len - i - 1];
+ str[len - i - 1] = c;
+ }
+}
+
void
printline(str_t *line, int sep)
{