diff options
author | Todd C. Miller <millert@cvs.openbsd.org> | 2004-01-25 21:36:01 +0000 |
---|---|---|
committer | Todd C. Miller <millert@cvs.openbsd.org> | 2004-01-25 21:36:01 +0000 |
commit | 87fcaaa7c3c1c0c0dd898f12de2d379baafb0ed1 (patch) | |
tree | c73dc31f440b3c08e0564898d8c004b236de23a8 /usr.bin/grep | |
parent | 6451aa97207663c0fae475e74fed893c6d313f45 (diff) |
Previously, in -w mode, for each match on a line grep would check
to see if the match was on a word boundary. However, this missed
lines where the first match was not on a word boundary but a
subsequent match was. Problem originally spotted by miod@.
We fix this by using the [[:<:]] and [[:>:]] character classes for
the slow path and by checking the word boundaries in grep_search()
for the fast path instead of doing the checks after running
regexec() or grep_search().
With this change, grep passes the new regress tests 15 and 16.
Problem originally spotted by espie@.
Diffstat (limited to 'usr.bin/grep')
-rw-r--r-- | usr.bin/grep/grep.c | 14 | ||||
-rw-r--r-- | usr.bin/grep/grep.h | 4 | ||||
-rw-r--r-- | usr.bin/grep/util.c | 78 |
3 files changed, 54 insertions, 42 deletions
diff --git a/usr.bin/grep/grep.c b/usr.bin/grep/grep.c index 111545df623..6e6a8cf6126 100644 --- a/usr.bin/grep/grep.c +++ b/usr.bin/grep/grep.c @@ -1,4 +1,4 @@ -/* $OpenBSD: grep.c,v 1.24 2003/12/11 20:49:20 mcbride Exp $ */ +/* $OpenBSD: grep.c,v 1.25 2004/01/25 21:36:00 millert Exp $ */ /*- * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav @@ -172,10 +172,16 @@ add_pattern(char *pat, size_t len) } if (pat[len - 1] == '\n') --len; - pattern[patterns] = grep_malloc(len + 1); /* pat may not be NUL-terminated */ - memcpy(pattern[patterns], pat, len); - pattern[patterns][len] = '\0'; + if (wflag) { + pattern[patterns] = grep_malloc(len + 15); + snprintf(pattern[patterns], len + 15, "[[:<:]]%.*s[[:>:]]", + (int)len, pat); + } else { + pattern[patterns] = grep_malloc(len + 1); + memcpy(pattern[patterns], pat, len); + pattern[patterns][len] = '\0'; + } ++patterns; if (len > maxPatternLen) diff --git a/usr.bin/grep/grep.h b/usr.bin/grep/grep.h index 1d886d57c20..43842ba56d5 100644 --- a/usr.bin/grep/grep.h +++ b/usr.bin/grep/grep.h @@ -1,4 +1,4 @@ -/* $OpenBSD: grep.h,v 1.9 2004/01/19 16:12:04 otto Exp $ */ +/* $OpenBSD: grep.h,v 1.10 2004/01/25 21:36:00 millert Exp $ */ /*- * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav @@ -55,6 +55,7 @@ typedef struct { /* flags */ int bol; int eol; + int wmatch; int reversedSearch; } fastgrep_t; @@ -83,7 +84,6 @@ int procfile(char *fn); int grep_tree(char **argv); void *grep_malloc(size_t size); void *grep_realloc(void *ptr, size_t size); -unsigned char *grep_strdup(const char *); void printline(str_t *line, int sep); int fastcomp(fastgrep_t *, const char *); diff --git a/usr.bin/grep/util.c b/usr.bin/grep/util.c index 341d787ce44..80c0ee0132e 100644 --- a/usr.bin/grep/util.c +++ b/usr.bin/grep/util.c @@ -1,4 +1,4 @@ -/* $OpenBSD: util.c,v 1.21 2004/01/19 16:12:04 otto Exp $ */ +/* $OpenBSD: util.c,v 1.22 2004/01/25 21:36:00 millert Exp $ */ /*- * Copyright (c) 1999 James Howard and Dag-Erling Coïdan 
Smørgrav @@ -178,25 +178,17 @@ procline(str_t *l, int nottext) } for (c = i = 0; i < patterns; i++) { - pmatch.rm_so = 0; - pmatch.rm_eo = l->len; - if (fg_pattern[i].pattern) + if (fg_pattern[i].pattern) { r = grep_search(&fg_pattern[i], (unsigned char *)l->dat, l->len, &pmatch); - else + } else { + pmatch.rm_so = 0; + pmatch.rm_eo = l->len; r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags); - if (r == 0) { - if (wflag) { - if ((pmatch.rm_so != 0 && - isword(l->dat[pmatch.rm_so - 1])) || - (pmatch.rm_eo != l->len && - isword(l->dat[pmatch.rm_eo]))) - r = REG_NOMATCH; - } - if (xflag) { - if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len) - r = REG_NOMATCH; - } + } + if (r == 0 && xflag) { + if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len) + r = REG_NOMATCH; } if (r == 0) { c++; @@ -254,6 +246,7 @@ fastcomp(fastgrep_t *fg, const char *pattern) origPatternLen = fg->patternLen = strlen(pattern); fg->bol = 0; fg->eol = 0; + fg->wmatch = 0; fg->reversedSearch = 0; /* Remove end-of-line character ('$'). */ @@ -272,11 +265,22 @@ fastcomp(fastgrep_t *fg, const char *pattern) boleol = 1; } + /* Remove enclosing [[:<:]] and [[:>:]] (word match). */ + if (fg->patternLen > 14 + fg->bol + fg->eol && + strncmp(pattern + fg->bol, "[[:<:]]", 7) == 0 && + strncmp(pattern + fg->patternLen - (7 + fg->eol), "[[:>:]]", 7) == 0) { + fg->patternLen -= 14; + fg->wmatch = 7; + } + /* - * Copy pattern minus '^' and '$' characters at the beginning and - * ending of the string respectively. + * Copy pattern minus '^' and '$' characters as well as word + * match character classes at the beginning and ending of the + * string respectively. */ - fg->pattern = grep_strdup(pattern + bol); + fg->pattern = grep_malloc(fg->patternLen + 1); + memcpy(fg->pattern, pattern + bol + fg->wmatch, fg->patternLen); + fg->pattern[fg->patternLen] = '\0'; /* Look for ways to cheat...er...avoid the full regex engine. 
*/ for (i = 0; i < fg->patternLen; i++) @@ -369,6 +373,9 @@ fastcomp(fastgrep_t *fg, const char *pattern) return (0); } +#define wmatch(d, l, s, e) \ + ((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e]))) + static int grep_search(fastgrep_t *fg, unsigned char *data, int dataLen, regmatch_t *pmatch) { @@ -393,10 +400,13 @@ grep_search(fastgrep_t *fg, unsigned char *data, int dataLen, regmatch_t *pmatch else j = 0; if (!((fg->bol && fg->eol) && (dataLen != fg->patternLen))) - if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) { - rtrnVal = 0; + if (grep_cmp(fg->pattern, data + j, + fg->patternLen) == -1) { pmatch->rm_so = j; pmatch->rm_eo = j + fg->patternLen; + if (!fg->wmatch || wmatch(data, dataLen, + pmatch->rm_so, pmatch->rm_eo)) + rtrnVal = 0; } } } else if (fg->reversedSearch) { @@ -405,10 +415,13 @@ grep_search(fastgrep_t *fg, unsigned char *data, int dataLen, regmatch_t *pmatch do { if (grep_cmp(fg->pattern, data + j - fg->patternLen, fg->patternLen) == -1) { - rtrnVal = 0; pmatch->rm_so = j - fg->patternLen; pmatch->rm_eo = j; - break; + if (!fg->wmatch || wmatch(data, dataLen, + pmatch->rm_so, pmatch->rm_eo)) { + rtrnVal = 0; + break; + } } /* Shift if within bounds, otherwise, we are done. */ if (j == fg->patternLen) @@ -420,10 +433,13 @@ grep_search(fastgrep_t *fg, unsigned char *data, int dataLen, regmatch_t *pmatch j = 0; do { if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) { - rtrnVal = 0; pmatch->rm_so = j; pmatch->rm_eo = j + fg->patternLen; - break; + if (!fg->wmatch || wmatch(data, dataLen, + pmatch->rm_so, pmatch->rm_eo)) { + rtrnVal = 0; + break; + } } /* Shift if within bounds, otherwise, we are done. */ @@ -456,16 +472,6 @@ grep_realloc(void *ptr, size_t size) return ptr; } -unsigned char * -grep_strdup(const char *str) -{ - unsigned char *ptr; - - if ((ptr = (unsigned char *)strdup(str)) == NULL) - err(2, "strdup"); - return ptr; -} - /* * Returns: i >= 0 on failure (position that it failed) * -1 on success |