diff options
author | Todd C. Miller <millert@cvs.openbsd.org> | 2020-06-10 21:01:51 +0000 |
---|---|---|
committer | Todd C. Miller <millert@cvs.openbsd.org> | 2020-06-10 21:01:51 +0000 |
commit | e5618da0be005b32ae4f7ed6e0c7d379b53b76bc (patch) | |
tree | f5ac11c8372bbc7275ba333ea57d0b0f86853a13 | |
parent | d4a9f9a2c15918d627426cc99b76e43a7c447eca (diff) |
Update awk to March 5, 2019 version.
-rw-r--r-- | usr.bin/awk/FIXES | 18 |
-rw-r--r-- | usr.bin/awk/Makefile | 4 |
-rw-r--r-- | usr.bin/awk/awk.1 | 9 |
-rw-r--r-- | usr.bin/awk/b.c | 251 |
-rw-r--r-- | usr.bin/awk/main.c | 4 |
-rw-r--r-- | usr.bin/awk/maketab.c | 10 |
6 files changed, 277 insertions, 19 deletions
diff --git a/usr.bin/awk/FIXES b/usr.bin/awk/FIXES index 0370221a135..dd8fd980ebe 100644 --- a/usr.bin/awk/FIXES +++ b/usr.bin/awk/FIXES @@ -1,4 +1,4 @@ -/* $OpenBSD: FIXES,v 1.22 2020/06/10 21:01:32 millert Exp $ */ +/* $OpenBSD: FIXES,v 1.23 2020/06/10 21:01:50 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved @@ -26,6 +26,22 @@ THIS SOFTWARE. This file lists all bug fixes, changes, etc., made since the AWK book was sent to the printers in August, 1987. +Mar 5, 2019: + Added support for POSIX-standard interval expressions (a.k.a. + bounds, a.k.a. repetition expressions) in regular expressions, + backported (via NetBSD) from Apple awk-24 (20070501). + Thanks to Martijn Dekker <martijn@inlv.org> for the port. + (Merged from PR #30.) + +Mar 3, 2019: + Merge PRs as follows: + #12: Avoid undefined behaviour when using ctype(3) functions in + relex(). Thanks to GitHub user iamleot. + #31: Make getline handle numeric strings, and update FIXES. Thanks + to GitHub user arnoldrobbins + #32: maketab: support build systems with read-only source. Thanks + to GitHub user enh. + Jan 25, 2019: Make getline handle numeric strings properly in all cases. (Thanks, Arnold.) 
diff --git a/usr.bin/awk/Makefile b/usr.bin/awk/Makefile index 352d521f799..5d817dbcea9 100644 --- a/usr.bin/awk/Makefile +++ b/usr.bin/awk/Makefile @@ -1,4 +1,4 @@ -# $OpenBSD: Makefile,v 1.16 2017/07/10 21:30:37 espie Exp $ +# $OpenBSD: Makefile,v 1.17 2020/06/10 21:01:50 millert Exp $ PROG= awk SRCS= ytab.c lex.c b.c main.c parse.c proctab.c tran.c lib.c run.c @@ -14,7 +14,7 @@ ytab.c ytab.h: awkgram.y BUILDFIRST = ytab.h proctab.c: maketab - ./maketab >proctab.c + ./maketab ytab.h >proctab.c maketab: ytab.h maketab.c ${HOSTCC} ${HOSTCFLAGS} ${.CURDIR}/maketab.c -o $@ diff --git a/usr.bin/awk/awk.1 b/usr.bin/awk/awk.1 index 7c0fda130be..fb257d029ec 100644 --- a/usr.bin/awk/awk.1 +++ b/usr.bin/awk/awk.1 @@ -1,4 +1,4 @@ -.\" $OpenBSD: awk.1,v 1.47 2020/06/10 21:00:01 millert Exp $ +.\" $OpenBSD: awk.1,v 1.48 2020/06/10 21:01:50 millert Exp $ .\" .\" Copyright (C) Lucent Technologies 1997 .\" All Rights Reserved @@ -787,10 +787,7 @@ The .Nm utility is compliant with the .St -p1003.1-2008 -specification, -except -.Nm -does not support {n,m} pattern matching. +specification. .Pp The flags .Op Fl \&dV @@ -815,6 +812,4 @@ to it. The scope rules for variables in functions are a botch; the syntax is worse. .Pp -POSIX-standard interval expressions in regular expressions are not supported. -.Pp Only eight-bit character sets are handled correctly. diff --git a/usr.bin/awk/b.c b/usr.bin/awk/b.c index 3b88f46debd..ff3ac472c6f 100644 --- a/usr.bin/awk/b.c +++ b/usr.bin/awk/b.c @@ -1,4 +1,4 @@ -/* $OpenBSD: b.c,v 1.22 2020/06/10 21:01:32 millert Exp $ */ +/* $OpenBSD: b.c,v 1.23 2020/06/10 21:01:50 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved @@ -28,6 +28,7 @@ THIS SOFTWARE. 
#define DEBUG #include <ctype.h> +#include <limits.h> #include <stdio.h> #include <string.h> #include <stdlib.h> @@ -66,6 +67,11 @@ int rlxval; static uschar *rlxstr; static uschar *prestr; /* current position in current re */ static uschar *lastre; /* origin of last re */ +static uschar *lastatom; /* origin of last Atom */ +static uschar *starttok; +static uschar *basestr; /* starts with original, replaced during + repetition processing */ +static uschar *firstbasestr; static int setcnt; static int poscnt; @@ -125,6 +131,8 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */ Node *p, *p1; fa *f; + firstbasestr = (uschar *) s; + basestr = firstbasestr; p = reparse(s); p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p); /* put ALL STAR in front of reg. exp. */ @@ -146,6 +154,10 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */ f->initstat = makeinit(f, anchor); f->anchor = anchor; f->restr = (uschar *) tostring(s); + if (firstbasestr != basestr) { + if (basestr) + xfree(basestr); + } return f; } @@ -637,9 +649,11 @@ Node *regexp(void) /* top-level parse of reg expr */ Node *primary(void) { Node *np; + int savelastatom; switch (rtok) { case CHAR: + lastatom = starttok; np = op2(CHAR, NIL, itonp(rlxval)); rtok = relex(); return (unary(np)); @@ -648,16 +662,19 @@ Node *primary(void) return (unary(op2(ALL, NIL, NIL))); case EMPTYRE: rtok = relex(); - return (unary(op2(ALL, NIL, NIL))); + return (unary(op2(EMPTYRE, NIL, NIL))); case DOT: + lastatom = starttok; rtok = relex(); return (unary(op2(DOT, NIL, NIL))); case CCL: np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr)); + lastatom = starttok; rtok = relex(); return (unary(np)); case NCCL: np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr)); + lastatom = starttok; rtok = relex(); return (unary(np)); case '^': @@ -667,6 +684,8 @@ Node *primary(void) rtok = relex(); return (unary(op2(CHAR, NIL, NIL))); case '(': + lastatom = starttok; + savelastatom = 
starttok - basestr; /* Retain over recursion */ rtok = relex(); if (rtok == ')') { /* special pleading for () */ rtok = relex(); @@ -674,6 +693,7 @@ Node *primary(void) } np = regexp(); if (rtok == ')') { + lastatom = basestr + savelastatom; /* Restore */ rtok = relex(); return (unary(np)); } @@ -688,8 +708,12 @@ Node *primary(void) Node *concat(Node *np) { switch (rtok) { - case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(': + case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(': return (concat(op2(CAT, np, primary()))); + case EMPTYRE: + rtok = relex(); + return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")), + primary()))); } return (np); } @@ -773,6 +797,117 @@ struct charclass { { NULL, 0, NULL }, }; +#define REPEAT_SIMPLE 0 +#define REPEAT_PLUS_APPENDED 1 +#define REPEAT_WITH_Q 2 +#define REPEAT_ZERO 3 + +static int +replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom, + int atomlen, int firstnum, int secondnum, int special_case) +{ + int i, j; + uschar *buf = 0; + int ret = 1; + int init_q = (firstnum==0); /* first added char will be ? 
*/ + int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */ + int prefix_length = reptok - basestr; /* prefix includes first rep */ + int suffix_length = strlen((char *) reptok) - reptoklen; /* string after rep specifier */ + int size = prefix_length + suffix_length; + + if (firstnum > 1) { /* add room for reps 2 through firstnum */ + size += atomlen*(firstnum-1); + } + + /* Adjust size of buffer for special cases */ + if (special_case == REPEAT_PLUS_APPENDED) { + size++; /* for the final + */ + } else if (special_case == REPEAT_WITH_Q) { + size += init_q + (atomlen+1)* n_q_reps; + } else if (special_case == REPEAT_ZERO) { + size += 2; /* just a null ERE: () */ + } + if ((buf = (uschar *) malloc(size+1)) == NULL) + FATAL("out of space in reg expr %.10s..", lastre); + memcpy(buf, basestr, prefix_length); /* copy prefix */ + j = prefix_length; + if (special_case == REPEAT_ZERO) { + j -= atomlen; + buf[j++] = '('; + buf[j++] = ')'; + } + for (i=1; i < firstnum; i++) { /* copy x reps */ + memcpy(&buf[j], atom, atomlen); + j += atomlen; + } + if (special_case == REPEAT_PLUS_APPENDED) { + buf[j++] = '+'; + } else if (special_case == REPEAT_WITH_Q) { + if (init_q) buf[j++] = '?'; + for (i=0; i < n_q_reps; i++) { /* copy x? 
reps */ + memcpy(&buf[j], atom, atomlen); + j += atomlen; + buf[j++] = '?'; + } + } + memcpy(&buf[j], reptok+reptoklen, suffix_length); + if (special_case == REPEAT_ZERO) { + buf[j+suffix_length] = '\0'; + } else { + buf[size] = '\0'; + } + /* free old basestr */ + if (firstbasestr != basestr) { + if (basestr) + xfree(basestr); + } + basestr = buf; + prestr = buf + prefix_length; + if (special_case == REPEAT_ZERO) { + prestr -= atomlen; + ret++; + } + return ret; +} + +static int repeat(const uschar *reptok, int reptoklen, const uschar *atom, + int atomlen, int firstnum, int secondnum) +{ + /* + In general, the repetition specifier or "bound" is replaced here + by an equivalent ERE string, repeating the immediately previous atom + and appending ? and + as needed. Note that the first copy of the + atom is left in place, except in the special_case of a zero-repeat + (i.e., {0}). + */ + if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */ + if (firstnum < 2) { + /* 0 or 1: should be handled before you get here */ + FATAL("internal error"); + } else { + return replace_repeat(reptok, reptoklen, atom, atomlen, + firstnum, secondnum, REPEAT_PLUS_APPENDED); + } + } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */ + if (firstnum == 0) { /* {0} or {0,0} */ + /* This case is unusual because the resulting + replacement string might actually be SMALLER than + the original ERE */ + return replace_repeat(reptok, reptoklen, atom, atomlen, + firstnum, secondnum, REPEAT_ZERO); + } else { /* (firstnum >= 1) */ + return replace_repeat(reptok, reptoklen, atom, atomlen, + firstnum, secondnum, REPEAT_SIMPLE); + } + } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */ + /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? 
*/ + return replace_repeat(reptok, reptoklen, atom, atomlen, + firstnum, secondnum, REPEAT_WITH_Q); + } else { /* Error - shouldn't be here (n>m) */ + FATAL("internal error"); + } + return 0; +} int relex(void) /* lexical analyzer for reparse */ { @@ -783,6 +918,11 @@ int relex(void) /* lexical analyzer for reparse */ uschar *bp; struct charclass *cc; int i; + int num, m, commafound, digitfound; + const uschar *startreptok; + +rescan: + starttok = prestr; switch (c = *prestr++) { case '|': return OR; @@ -839,7 +979,7 @@ int relex(void) /* lexical analyzer for reparse */ * not without first adapting the entire * program to track each string's length. */ - for (i = 1; i < NCHARS; i++) { + for (i = 1; i <= UCHAR_MAX; i++) { if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2")) FATAL("out of space for reg expr %.10s...", lastre); if (cc->cc_func(i)) { @@ -849,6 +989,40 @@ int relex(void) /* lexical analyzer for reparse */ } } else *bp++ = c; + } else if (c == '[' && *prestr == '.') { + char collate_char; + prestr++; + collate_char = *prestr++; + if (*prestr == '.' && prestr[1] == ']') { + prestr += 2; + /* Found it: map via locale TBD: for + now, simply return this char. This + is sufficient to pass conformance + test awk.ex 156 + */ + if (*prestr == ']') { + prestr++; + rlxval = collate_char; + return CHAR; + } + } + } else if (c == '[' && *prestr == '=') { + char equiv_char; + prestr++; + equiv_char = *prestr++; + if (*prestr == '=' && prestr[1] == ']') { + prestr += 2; + /* Found it: map via locale TBD: for now + simply return this char. 
This is + sufficient to pass conformance test + awk.ex 156 + */ + if (*prestr == ']') { + prestr++; + rlxval = equiv_char; + return CHAR; + } + } } else if (c == '\0') { FATAL("nonterminated character class %.20s", lastre); } else if (bp == buf) { /* 1st char is special */ @@ -863,6 +1037,75 @@ int relex(void) /* lexical analyzer for reparse */ } else *bp++ = c; } + break; + case '{': + if (isdigit(*(prestr))) { + num = 0; /* Process as a repetition */ + n = -1; m = -1; + commafound = 0; + digitfound = 0; + startreptok = prestr-1; + /* Remember start of previous atom here ? */ + } else { /* just a { char, not a repetition */ + rlxval = c; + return CHAR; + } + for (; ; ) { + if ((c = *prestr++) == '}') { + if (commafound) { + if (digitfound) { /* {n,m} */ + m = num; + if (m<n) + FATAL("illegal repetition expression: class %.20s", + lastre); + if ((n==0) && (m==1)) { + return QUEST; + } + } else { /* {n,} */ + if (n==0) return STAR; + if (n==1) return PLUS; + } + } else { + if (digitfound) { /* {n} same as {n,n} */ + n = num; + m = num; + } else { /* {} */ + FATAL("illegal repetition expression: class %.20s", + lastre); + } + } + if (repeat(starttok, prestr-starttok, lastatom, + startreptok - lastatom, n, m) > 0) { + if ((n==0) && (m==0)) { + return EMPTYRE; + } + /* must rescan input for next token */ + goto rescan; + } + /* Failed to replace: eat up {...} characters + and treat like just PLUS */ + return PLUS; + } else if (c == '\0') { + FATAL("nonterminated character class %.20s", + lastre); + } else if (isdigit(c)) { + num = 10 * num + c - '0'; + digitfound = 1; + } else if (c == ',') { + if (commafound) + FATAL("illegal repetition expression: class %.20s", + lastre); + /* looking for {n,} or {n,m} */ + commafound = 1; + n = num; + digitfound = 0; /* reset */ + num = 0; + } else { + FATAL("illegal repetition expression: class %.20s", + lastre); + } + } + break; } } diff --git a/usr.bin/awk/main.c b/usr.bin/awk/main.c index 3af780e1225..bbf1adfb0d8 100644 --- 
a/usr.bin/awk/main.c +++ b/usr.bin/awk/main.c @@ -1,4 +1,4 @@ -/* $OpenBSD: main.c,v 1.27 2020/06/10 21:01:32 millert Exp $ */ +/* $OpenBSD: main.c,v 1.28 2020/06/10 21:01:50 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved @@ -23,7 +23,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ****************************************************************/ -const char *version = "version 20190125"; +const char *version = "version 20190305"; #define DEBUG #include <stdio.h> diff --git a/usr.bin/awk/maketab.c b/usr.bin/awk/maketab.c index f9b4d0a8fac..1d6b3abb2ae 100644 --- a/usr.bin/awk/maketab.c +++ b/usr.bin/awk/maketab.c @@ -1,4 +1,4 @@ -/* $OpenBSD: maketab.c,v 1.13 2020/06/10 21:01:13 millert Exp $ */ +/* $OpenBSD: maketab.c,v 1.14 2020/06/10 21:01:50 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved @@ -126,8 +126,12 @@ int main(int argc, char *argv[]) for (i = SIZE; --i >= 0; ) names[i] = ""; - if ((fp = fopen("ytab.h", "r")) == NULL) { - fprintf(stderr, "maketab: can't open ytab.h!\n"); + if (argc != 2) { + fprintf(stderr, "usage: maketab YTAB_H\n"); + exit(1); + } + if ((fp = fopen(argv[1], "r")) == NULL) { + fprintf(stderr, "maketab can't open %s!\n", argv[1]); exit(1); } printf("static char *printname[%d] = {\n", SIZE); |