diff options
author | Ted Unangst <tedu@cvs.openbsd.org> | 2003-06-23 22:05:24 +0000 |
---|---|---|
committer | Ted Unangst <tedu@cvs.openbsd.org> | 2003-06-23 22:05:24 +0000 |
commit | 943ad3c82d880acc9851f5086b4475401e539e42 (patch) | |
tree | e7102624316dbb10b7087ef49e74566a50b23c60 | |
parent | 1e3542cd612d942114a8eac614daad44324691a8 (diff) |
faster grep for simple patterns. derived from a patch by sean farley.
this makes searching for constant strings much faster by avoiding regex.
ok deraadt@
-rw-r--r-- | usr.bin/grep/grep.c | 43 | ||||
-rw-r--r-- | usr.bin/grep/grep.h | 20 | ||||
-rw-r--r-- | usr.bin/grep/mmfile.c | 23 | ||||
-rw-r--r-- | usr.bin/grep/util.c | 250 |
4 files changed, 322 insertions, 14 deletions
diff --git a/usr.bin/grep/grep.c b/usr.bin/grep/grep.c index ffc333d817b..bc75324af47 100644 --- a/usr.bin/grep/grep.c +++ b/usr.bin/grep/grep.c @@ -1,4 +1,4 @@ -/* $OpenBSD: grep.c,v 1.12 2003/06/23 07:52:18 deraadt Exp $ */ +/* $OpenBSD: grep.c,v 1.13 2003/06/23 22:05:23 tedu Exp $ */ /*- * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav @@ -49,6 +49,7 @@ int matchall; /* shortcut */ int patterns, pattern_sz; char **pattern; regex_t *r_pattern; +fastgrep_t *fg_pattern; /* For regex errors */ char re_error[RE_ERROR_BUF + 1]; @@ -93,6 +94,8 @@ enum { int first; /* flag whether or not this is our first match */ int tail; /* lines left to print */ int lead; /* number of lines in leading context queue */ +int boleol; /* At least one pattern has a bol or eol */ +int maxPatternLen; /* Longest length of all patterns */ extern char *__progname; @@ -163,13 +166,16 @@ add_pattern(char *pat, size_t len) } if (patterns == pattern_sz) { pattern_sz *= 2; - pattern = grep_realloc(pattern, ++pattern_sz); + pattern = grep_realloc(pattern, ++pattern_sz * sizeof(int)); } if (pat[len - 1] == '\n') --len; pattern[patterns] = grep_malloc(len + 1); strlcpy(pattern[patterns], pat, len + 1); ++patterns; + + if (len > maxPatternLen) + maxPatternLen = len; } static void @@ -200,6 +206,24 @@ read_patterns(char *fn) fclose(f); } +static void +free_patterns(void) +{ + int i; + + for (i = 0; i < patterns; i++) { + if (fg_pattern[i].pattern) + free(fg_pattern[i].pattern); + else + regfree(&r_pattern[i]); + free(pattern[i]); + } + + free(fg_pattern); + free(r_pattern); + free(pattern); +} + int main(int argc, char *argv[]) { @@ -293,6 +317,7 @@ main(int argc, char *argv[]) break; case 'i': case 'y': + iflag = 1; cflags |= REG_ICASE; break; case 'l': @@ -382,11 +407,17 @@ main(int argc, char *argv[]) } cflags |= Eflag ? REG_EXTENDED : REG_BASIC; + fg_pattern = grep_malloc(patterns * sizeof(*fg_pattern)); r_pattern = grep_malloc(patterns * sizeof(regex_t)); for (i = 0; i < patterns; ++i) { - if ((c = regcomp(&r_pattern[i], pattern[i], cflags))) { - regerror(c, &r_pattern[i], re_error, RE_ERROR_BUF); - errx(1, "%s", re_error); + /* Check if cheating is allowed */ + if (fastcomp(&fg_pattern[i], pattern[i])) { + /* Fall back to full regex library */ + if ((c = regcomp(&r_pattern[i], pattern[i], cflags))) { + regerror(c, &r_pattern[i], re_error, + RE_ERROR_BUF); + errx(1, "%s", re_error); + } } } @@ -402,5 +433,7 @@ main(int argc, char *argv[]) for (c = 0; argc--; ++argv) c += procfile(*argv); + free_patterns(); + exit(!c); } diff --git a/usr.bin/grep/grep.h b/usr.bin/grep/grep.h index d9981e3808b..782343ae066 100644 --- a/usr.bin/grep/grep.h +++ b/usr.bin/grep/grep.h @@ -1,4 +1,4 @@ -/* $OpenBSD: grep.h,v 1.3 2003/06/23 00:55:09 tedu Exp $ */ +/* $OpenBSD: grep.h,v 1.4 2003/06/23 22:05:23 tedu Exp $ */ /*- * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav @@ -27,6 +27,7 @@ */ #include <sys/types.h> +#include <sys/limits.h> #include <regex.h> #include <stdio.h> @@ -47,17 +48,28 @@ typedef struct { char *dat; } str_t; +typedef struct { + unsigned char *pattern; + int patternLen; + int qsBc[UCHAR_MAX + 1]; + /* flags */ + int bol; + int eol; + int reversedSearch; +} fastgrep_t; + /* Flags passed to regcomp() and regexec() */ extern int cflags, eflags; /* Command line flags */ extern int Aflag, Bflag, Hflag, Lflag, Pflag, Sflag, Rflag, Zflag, - bflag, cflag, hflag, lflag, nflag, qflag, sflag, + bflag, cflag, hflag, iflag, lflag, nflag, qflag, sflag, vflag, wflag, xflag; -extern int binbehave; +extern int binbehave, boleol, maxPatternLen; extern int first, lead, matchall, patterns, tail; extern char **pattern; +extern fastgrep_t *fg_pattern; extern regex_t *r_pattern; /* For regex errors */ @@ -69,7 +81,9 @@ int procfile(char *fn); int grep_tree(char **argv); void *grep_malloc(size_t size); void *grep_realloc(void *ptr, size_t size); +unsigned char *grep_strdup(const char *); void printline(str_t *line, int sep); +int fastcomp(fastgrep_t *, const char *); /* queue.c */ void initqueue(); diff --git a/usr.bin/grep/mmfile.c b/usr.bin/grep/mmfile.c index d58b213b631..2d66ddd671b 100644 --- a/usr.bin/grep/mmfile.c +++ b/usr.bin/grep/mmfile.c @@ -1,4 +1,4 @@ -/* $OpenBSD: mmfile.c,v 1.4 2003/06/23 07:52:18 deraadt Exp $ */ +/* $OpenBSD: mmfile.c,v 1.5 2003/06/23 22:05:23 tedu Exp $ */ /*- * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav @@ -38,6 +38,7 @@ #include "grep.h" #define MAX_MAP_LEN 1048576 +#define BLOCKSIZE 32768 mmf_t * mmopen(char *fn, char *mode) @@ -88,9 +89,23 @@ mmfgetln(mmf_t *mmf, size_t *l) if (mmf->ptr >= mmf->end) return NULL; - for (p = mmf->ptr; mmf->ptr < mmf->end; ++mmf->ptr) - if (*mmf->ptr == '\n') - break; + if ((lflag || qflag) &! boleol) { + /* Find starting point to search. */ + if (mmf->ptr == mmf->base) + p = mmf->ptr; + else + p = mmf->ptr - maxPatternLen; + /* Move the start pointer ahead for next iteration */ + if (mmf->end - mmf->ptr > BLOCKSIZE) + mmf->ptr += BLOCKSIZE; + else + mmf->ptr = mmf->end; + } else { + for (p = mmf->ptr; mmf->ptr < mmf->end; ++mmf->ptr) + if (*mmf->ptr == '\n') + break; + } + *l = mmf->ptr - p; ++mmf->ptr; return p; diff --git a/usr.bin/grep/util.c b/usr.bin/grep/util.c index 3ff11188f12..42959fb4447 100644 --- a/usr.bin/grep/util.c +++ b/usr.bin/grep/util.c @@ -1,4 +1,4 @@ -/* $OpenBSD: util.c,v 1.5 2003/06/23 07:52:18 deraadt Exp $ */ +/* $OpenBSD: util.c,v 1.6 2003/06/23 22:05:23 tedu Exp $ */ /*- * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav @@ -48,6 +48,9 @@ static int linesqueued; static int procline(str_t *l, int); +static int grep_search(fastgrep_t *, unsigned char *, int); +static int grep_cmp(const unsigned char *, const unsigned char *, size_t); +static void grep_revstr(unsigned char *, int); int grep_tree(char **argv) @@ -177,7 +180,11 @@ procline(str_t *l, int nottext) pmatch.rm_so = 0; pmatch.rm_eo = l->len; for (c = i = 0; i < patterns; i++) { - r = regexec(&r_pattern[i], l->dat, 0, &pmatch, eflags); + if (fg_pattern[i].pattern) + r = grep_search(&fg_pattern[i], (unsigned char *)l->dat, + l->len); + else + r = regexec(&r_pattern[i], l->dat, 0, &pmatch, eflags); if (r == REG_NOMATCH && t == 0) continue; if (r == 0) { @@ -222,6 +229,202 @@ print: return c; } +/* + * Returns: -1 on failure + * 0 on success + */ +int +fastcomp(fastgrep_t *fg, const char *pattern) +{ + int i; + int bol = 0; + int eol = 0; + int origPatternLen; + int shiftPatternLen; + int hasDot = 0; + int firstHalfDot = -1; + int firstLastHalfDot = -1; + int lastHalfDot = 0; + + /* Initialize. */ + origPatternLen = fg->patternLen = strlen(pattern); + fg->bol = 0; + fg->eol = 0; + fg->reversedSearch = 0; + + /* Remove end-of-line character ('$'). */ + if (pattern[fg->patternLen - 1] == '$') { + eol++; + fg->eol = 1; + fg->patternLen--; + boleol = 1; + } + + /* Remove beginning-of-line character ('^'). */ + if (pattern[0] == '^') { + bol++; + fg->bol = 1; + fg->patternLen--; + boleol = 1; + } + + /* + * Copy pattern minus '^' and '$' characters at the beginning and ending of + * the string respectively. + */ + fg->pattern = grep_strdup(pattern + bol); + + /* Look for ways to cheat...er...avoid the full regex engine. */ + for (i = 0; i < fg->patternLen; i++) + { + /* Can still cheat? */ + if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) || + (fg->pattern[i] == '_') || (fg->pattern[i] == ',') || + (fg->pattern[i] == '^') || (fg->pattern[i] == '$') || + (fg->pattern[i] == '=') || (fg->pattern[i] == '-') || + (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) { + /* As long as it is good, upper case it for later. */ + if (iflag) + fg->pattern[i] = toupper(fg->pattern[i]); + } else if (fg->pattern[i] == '.') { + hasDot = i; + if (i < fg->patternLen / 2) { + if (firstHalfDot < -1) + /* Closest dot to the beginning */ + firstHalfDot = i; + } else { + /* Closest dot to the end of the pattern. */ + lastHalfDot = i; + if (firstLastHalfDot < 0) + firstLastHalfDot = i; + } + } else { + /* Free memory and let others know this is empty. */ + free(fg->pattern); + fg->pattern = NULL; + return (-1); + } + } + + /* + * Determine if a reverse search would be faster based on the placement + * of the dots. + */ + if ((!(lflag || cflag)) && ((!(bol || eol)) && + ((lastHalfDot) && ((firstHalfDot < 0) || + ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) { + fg->reversedSearch = 1; + hasDot = fg->patternLen - (firstHalfDot < 0 ? + firstLastHalfDot : firstHalfDot) - 1; + grep_revstr(fg->pattern, fg->patternLen); + } + + /* + * Normal Quick Search would require a shift based on the position the + * next character after the comparison is within the pattern. With + * wildcards, the position of the last dot effects the maximum shift + * distance. + * The closer to the end the wild card is the slower the search. A + * reverse version of this algorithm would be useful for wildcards near + * the end of the string. + * + * Examples: + * Pattern Max shift + * ------- --------- + * this 5 + * .his 4 + * t.is 3 + * th.s 2 + * thi. 1 + */ + + /* Adjust the shift based on location of the last dot ('.'). */ + shiftPatternLen = fg->patternLen - hasDot; + + /* Preprocess pattern. */ + for (i = 0; i <= UCHAR_MAX; i++) + fg->qsBc[i] = shiftPatternLen; + for (i = hasDot + 1; i < fg->patternLen; i++) { + fg->qsBc[fg->pattern[i]] = fg->patternLen - i; + /* + * If case is ignored, make the jump apply to both upper and + * lower cased characters. As the pattern is stored in upper + * case, apply the same to the lower case equivalents. + */ + if (iflag) + fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i; + } + + /* + * Put pattern back to normal after pre-processing to allow for easy + * comparisons later. + */ + if (fg->reversedSearch) + grep_revstr(fg->pattern, fg->patternLen); + + return (0); +} + +static int grep_search(fastgrep_t *fg, unsigned char *data, int dataLen) +{ + int j; + int rtrnVal = REG_NOMATCH; + + /* No point in going farther if we do not have enough data. */ + if (dataLen < fg->patternLen) + return (rtrnVal); + + /* Only try once at the beginning or ending of the line. */ + if (fg->bol || fg->eol) { + /* Simple text comparison. */ + /* Verify data is >= pattern length before searching on it. */ + if (dataLen >= fg->patternLen) { + /* Determine where in data to start search at. */ + if (fg->eol) + j = dataLen - fg->patternLen; + else + j = 0; + if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen))) + if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) + rtrnVal = 0; + } + } else if (fg->reversedSearch) { + /* Quick Search algorithm. */ + j = dataLen; + do { + if (grep_cmp(fg->pattern, data + j - fg->patternLen, + fg->patternLen) == -1) { + rtrnVal = 0; + break; + } + + /* Shift if within bounds, otherwise, we are done. */ + if (j == 0) + break; + else + j -= fg->qsBc[data[j - fg->patternLen - 1]]; + } while (j >= 0); + } else { + /* Quick Search algorithm. */ + j = 0; + do { + if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) { + rtrnVal = 0; + break; + } + + /* Shift if within bounds, otherwise, we are done. */ + if (j + fg->patternLen == dataLen) + break; + else + j += fg->qsBc[data[j + fg->patternLen]]; + } while (j <= (dataLen - fg->patternLen)); + } + + return (rtrnVal); +} + + void * grep_malloc(size_t size) { @@ -240,6 +443,49 @@ grep_realloc(void *ptr, size_t size) return ptr; } +unsigned char * +grep_strdup(const char *str) +{ + unsigned char *ptr; + + if ((ptr = (unsigned char *)strdup(str)) == NULL) + err(1, "strdup"); + return ptr; +} + +/* + * Returns: i >= 0 on failure (position that it failed) + * -1 on success + */ +int +grep_cmp(const unsigned char *pattern, const unsigned char *data, + size_t len) +{ + int i; + + for (i = 0; i < len; i++) { + if (((pattern[i] == data[i]) || (pattern[i] == '.')) || + (iflag && pattern[i] == toupper(data[i]))) + continue; + return (i); + } + + return (-1); +} + +static void +grep_revstr(unsigned char *str, int len) +{ + int i; + char c; + + for (i = 0; i < len / 2; i++) { + c = str[i]; + str[i] = str[len - i - 1]; + str[len - i - 1] = c; + } +} + void printline(str_t *line, int sep) { |