src - OpenBSD base system

diff options


context:
space:
mode:

author	Todd C. Miller <millert@cvs.openbsd.org>	2020-06-10 21:01:51 +0000
committer	Todd C. Miller <millert@cvs.openbsd.org>	2020-06-10 21:01:51 +0000
commit	e5618da0be005b32ae4f7ed6e0c7d379b53b76bc (patch)
tree	f5ac11c8372bbc7275ba333ea57d0b0f86853a13
parent	d4a9f9a2c15918d627426cc99b76e43a7c447eca (diff)

Update awk to March 5, 2019 version.

-rw-r--r--	usr.bin/awk/FIXES	18
-rw-r--r--	usr.bin/awk/Makefile	4
-rw-r--r--	usr.bin/awk/awk.1	9
-rw-r--r--	usr.bin/awk/b.c	251
-rw-r--r--	usr.bin/awk/main.c	4
-rw-r--r--	usr.bin/awk/maketab.c	10

6 files changed, 277 insertions, 19 deletions

diff --git a/usr.bin/awk/FIXES b/usr.bin/awk/FIXES
index 0370221a135..dd8fd980ebe 100644
--- a/usr.bin/awk/FIXES
+++ b/usr.bin/awk/FIXES

@@ -1,4 +1,4 @@

-/* $OpenBSD: FIXES,v 1.22 2020/06/10 21:01:32 millert Exp $ */

+/* $OpenBSD: FIXES,v 1.23 2020/06/10 21:01:50 millert Exp $ */

/****************************************************************

@@ -26,6 +26,22 @@ THIS SOFTWARE.

This file lists all bug fixes, changes, etc., made since the AWK book

was sent to the printers in August, 1987.

+Mar 5, 2019:

+ Added support for POSIX-standard interval expressions (a.k.a.

+ bounds, a.k.a. repetition expressions) in regular expressions,

+ backported (via NetBSD) from Apple awk-24 (20070501).

+ Thanks to Martijn Dekker <martijn@inlv.org> for the port.

+ (Merged from PR #30.)

+Mar 3, 2019:

+ Merge PRs as follows:

+ #12: Avoid undefined behaviour when using ctype(3) functions in

+ relex(). Thanks to GitHub user iamleot.

+ #31: Make getline handle numeric strings, and update FIXES. Thanks

+ to GitHub user arnoldrobbins

+ #32: maketab: support build systems with read-only source. Thanks

+ to GitHub user enh.

Jan 25, 2019:

Make getline handle numeric strings properly in all cases.

(Thanks, Arnold.)

diff --git a/usr.bin/awk/Makefile b/usr.bin/awk/Makefile
index 352d521f799..5d817dbcea9 100644
--- a/usr.bin/awk/Makefile
+++ b/usr.bin/awk/Makefile

@@ -1,4 +1,4 @@

-# $OpenBSD: Makefile,v 1.16 2017/07/10 21:30:37 espie Exp $

+# $OpenBSD: Makefile,v 1.17 2020/06/10 21:01:50 millert Exp $

PROG= awk

SRCS= ytab.c lex.c b.c main.c parse.c proctab.c tran.c lib.c run.c

@@ -14,7 +14,7 @@ ytab.c ytab.h: awkgram.y

BUILDFIRST = ytab.h

proctab.c: maketab

- ./maketab >proctab.c

+ ./maketab ytab.h >proctab.c

maketab: ytab.h maketab.c

${HOSTCC} ${HOSTCFLAGS} ${.CURDIR}/maketab.c -o $@

diff --git a/usr.bin/awk/awk.1 b/usr.bin/awk/awk.1
index 7c0fda130be..fb257d029ec 100644
--- a/usr.bin/awk/awk.1
+++ b/usr.bin/awk/awk.1

@@ -1,4 +1,4 @@

-.\" $OpenBSD: awk.1,v 1.47 2020/06/10 21:00:01 millert Exp $

+.\" $OpenBSD: awk.1,v 1.48 2020/06/10 21:01:50 millert Exp $

.\"

.\" Copyright (C) Lucent Technologies 1997

@@ -787,10 +787,7 @@ The

.Nm

utility is compliant with the

.St -p1003.1-2008

-specification,

-except

-.Nm

-does not support {n,m} pattern matching.

+specification.

.Pp

The flags

.Op Fl \&dV

@@ -815,6 +812,4 @@ to it.

The scope rules for variables in functions are a botch;

the syntax is worse.

.Pp

-POSIX-standard interval expressions in regular expressions are not supported.

-.Pp

Only eight-bit character sets are handled correctly.

diff --git a/usr.bin/awk/b.c b/usr.bin/awk/b.c
index 3b88f46debd..ff3ac472c6f 100644
--- a/usr.bin/awk/b.c
+++ b/usr.bin/awk/b.c

@@ -1,4 +1,4 @@

-/* $OpenBSD: b.c,v 1.22 2020/06/10 21:01:32 millert Exp $ */

+/* $OpenBSD: b.c,v 1.23 2020/06/10 21:01:50 millert Exp $ */

/****************************************************************

@@ -28,6 +28,7 @@ THIS SOFTWARE.

#define DEBUG

#include <ctype.h>

+#include <limits.h>

#include <stdio.h>

#include <string.h>

#include <stdlib.h>

@@ -66,6 +67,11 @@ int rlxval;

static uschar *rlxstr;

static uschar *prestr; /* current position in current re */

static uschar *lastre; /* origin of last re */

+static uschar *lastatom; /* origin of last Atom */

+static uschar *starttok;

+static uschar *basestr; /* starts with original, replaced during

+ repetition processing */

+static uschar *firstbasestr;

static int setcnt;

static int poscnt;

@@ -125,6 +131,8 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */

Node *p, *p1;

fa *f;

+ firstbasestr = (uschar *) s;

+ basestr = firstbasestr;

p = reparse(s);

p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);

/* put ALL STAR in front of reg. exp. */

@@ -146,6 +154,10 @@ fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */

f->initstat = makeinit(f, anchor);

f->anchor = anchor;

f->restr = (uschar *) tostring(s);

+ if (firstbasestr != basestr) {

+ if (basestr)

+ xfree(basestr);

+ }

return f;

}

@@ -637,9 +649,11 @@ Node *regexp(void) /* top-level parse of reg expr */

Node *primary(void)

{

Node *np;

+ int savelastatom;

switch (rtok) {

case CHAR:

+ lastatom = starttok;

np = op2(CHAR, NIL, itonp(rlxval));

rtok = relex();

return (unary(np));

@@ -648,16 +662,19 @@ Node *primary(void)

return (unary(op2(ALL, NIL, NIL)));

case EMPTYRE:

rtok = relex();

- return (unary(op2(ALL, NIL, NIL)));

+ return (unary(op2(EMPTYRE, NIL, NIL)));

case DOT:

+ lastatom = starttok;

rtok = relex();

return (unary(op2(DOT, NIL, NIL)));

case CCL:

np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));

+ lastatom = starttok;

rtok = relex();

return (unary(np));

case NCCL:

np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));

+ lastatom = starttok;

rtok = relex();

return (unary(np));

case '^':

@@ -667,6 +684,8 @@ Node *primary(void)

rtok = relex();

return (unary(op2(CHAR, NIL, NIL)));

case '(':

+ lastatom = starttok;

+ savelastatom = starttok - basestr; /* Retain over recursion */

rtok = relex();

if (rtok == ')') { /* special pleading for () */

rtok = relex();

@@ -674,6 +693,7 @@ Node *primary(void)

}

np = regexp();

if (rtok == ')') {

+ lastatom = basestr + savelastatom; /* Restore */

rtok = relex();

return (unary(np));

}

@@ -688,8 +708,12 @@ Node *primary(void)

Node *concat(Node *np)

{

switch (rtok) {

- case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':

+ case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':

return (concat(op2(CAT, np, primary())));

+ case EMPTYRE:

+ rtok = relex();

+ return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),

+ primary())));

}

return (np);

}

@@ -773,6 +797,117 @@ struct charclass {

{ NULL, 0, NULL },

};

+#define REPEAT_SIMPLE 0

+#define REPEAT_PLUS_APPENDED 1

+#define REPEAT_WITH_Q 2

+#define REPEAT_ZERO 3

+static int

+replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,

+ int atomlen, int firstnum, int secondnum, int special_case)

+ int i, j;

+ uschar *buf = 0;

+ int ret = 1;

+ int init_q = (firstnum==0); /* first added char will be ? */

+ int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */

+ int prefix_length = reptok - basestr; /* prefix includes first rep */

+ int suffix_length = strlen((char *) reptok) - reptoklen; /* string after rep specifier */

+ int size = prefix_length + suffix_length;

+ if (firstnum > 1) { /* add room for reps 2 through firstnum */

+ size += atomlen*(firstnum-1);

+ }

+ /* Adjust size of buffer for special cases */

+ if (special_case == REPEAT_PLUS_APPENDED) {

+ size++; /* for the final + */

+ } else if (special_case == REPEAT_WITH_Q) {

+ size += init_q + (atomlen+1)* n_q_reps;

+ } else if (special_case == REPEAT_ZERO) {

+ size += 2; /* just a null ERE: () */

+ }

+ if ((buf = (uschar *) malloc(size+1)) == NULL)

+ FATAL("out of space in reg expr %.10s..", lastre);

+ memcpy(buf, basestr, prefix_length); /* copy prefix */

+ j = prefix_length;

+ if (special_case == REPEAT_ZERO) {

+ j -= atomlen;

+ buf[j++] = '(';

+ buf[j++] = ')';

+ }

+ for (i=1; i < firstnum; i++) { /* copy x reps */

+ memcpy(&buf[j], atom, atomlen);

+ j += atomlen;

+ }

+ if (special_case == REPEAT_PLUS_APPENDED) {

+ buf[j++] = '+';

+ } else if (special_case == REPEAT_WITH_Q) {

+ if (init_q) buf[j++] = '?';

+ for (i=0; i < n_q_reps; i++) { /* copy x? reps */

+ memcpy(&buf[j], atom, atomlen);

+ j += atomlen;

+ buf[j++] = '?';

+ }

+ memcpy(&buf[j], reptok+reptoklen, suffix_length);

+ if (special_case == REPEAT_ZERO) {

+ buf[j+suffix_length] = '\0';

+ } else {

+ buf[size] = '\0';

+ }

+ /* free old basestr */

+ if (firstbasestr != basestr) {

+ if (basestr)

+ xfree(basestr);

+ }

+ basestr = buf;

+ prestr = buf + prefix_length;

+ if (special_case == REPEAT_ZERO) {

+ prestr -= atomlen;

+ ret++;

+ }

+ return ret;

+static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,

+ int atomlen, int firstnum, int secondnum)

+ /*

+ In general, the repetition specifier or "bound" is replaced here

+ by an equivalent ERE string, repeating the immediately previous atom

+ and appending ? and + as needed. Note that the first copy of the

+ atom is left in place, except in the special_case of a zero-repeat

+ (i.e., {0}).

+ */

+ if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */

+ if (firstnum < 2) {

+ /* 0 or 1: should be handled before you get here */

+ FATAL("internal error");

+ } else {

+ return replace_repeat(reptok, reptoklen, atom, atomlen,

+ firstnum, secondnum, REPEAT_PLUS_APPENDED);

+ }

+ } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */

+ if (firstnum == 0) { /* {0} or {0,0} */

+ /* This case is unusual because the resulting

+ replacement string might actually be SMALLER than

+ the original ERE */

+ return replace_repeat(reptok, reptoklen, atom, atomlen,

+ firstnum, secondnum, REPEAT_ZERO);

+ } else { /* (firstnum >= 1) */

+ return replace_repeat(reptok, reptoklen, atom, atomlen,

+ firstnum, secondnum, REPEAT_SIMPLE);

+ }

+ } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */

+ /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */

+ return replace_repeat(reptok, reptoklen, atom, atomlen,

+ firstnum, secondnum, REPEAT_WITH_Q);

+ } else { /* Error - shouldn't be here (n>m) */

+ FATAL("internal error");

+ }

+ return 0;

int relex(void) /* lexical analyzer for reparse */

{

@@ -783,6 +918,11 @@ int relex(void) /* lexical analyzer for reparse */

uschar *bp;

struct charclass *cc;

int i;

+ int num, m, commafound, digitfound;

+ const uschar *startreptok;

+rescan:

+ starttok = prestr;

switch (c = *prestr++) {

case '|': return OR;

@@ -839,7 +979,7 @@ int relex(void) /* lexical analyzer for reparse */

* not without first adapting the entire

* program to track each string's length.

- for (i = 1; i < NCHARS; i++) {

+ for (i = 1; i <= UCHAR_MAX; i++) {

if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))

FATAL("out of space for reg expr %.10s...", lastre);

if (cc->cc_func(i)) {

@@ -849,6 +989,40 @@ int relex(void) /* lexical analyzer for reparse */

}

} else

*bp++ = c;

+ } else if (c == '[' && *prestr == '.') {

+ char collate_char;

+ prestr++;

+ collate_char = *prestr++;

+ if (*prestr == '.' && prestr[1] == ']') {

+ prestr += 2;

+ /* Found it: map via locale TBD: for

+ now, simply return this char. This

+ is sufficient to pass conformance

+ test awk.ex 156

+ */

+ if (*prestr == ']') {

+ prestr++;

+ rlxval = collate_char;

+ return CHAR;

+ }

+ } else if (c == '[' && *prestr == '=') {

+ char equiv_char;

+ prestr++;

+ equiv_char = *prestr++;

+ if (*prestr == '=' && prestr[1] == ']') {

+ prestr += 2;

+ /* Found it: map via locale TBD: for now

+ simply return this char. This is

+ sufficient to pass conformance test

+ awk.ex 156

+ */

+ if (*prestr == ']') {

+ prestr++;

+ rlxval = equiv_char;

+ return CHAR;

+ }

} else if (c == '\0') {

FATAL("nonterminated character class %.20s", lastre);

} else if (bp == buf) { /* 1st char is special */

@@ -863,6 +1037,75 @@ int relex(void) /* lexical analyzer for reparse */

} else

*bp++ = c;

}

+ break;

+ case '{':

+ if (isdigit(*(prestr))) {

+ num = 0; /* Process as a repetition */

+ n = -1; m = -1;

+ commafound = 0;

+ digitfound = 0;

+ startreptok = prestr-1;

+ /* Remember start of previous atom here ? */

+ } else { /* just a { char, not a repetition */

+ rlxval = c;

+ return CHAR;

+ }

+ for (; ; ) {

+ if ((c = *prestr++) == '}') {

+ if (commafound) {

+ if (digitfound) { /* {n,m} */

+ m = num;

+ if (m<n)

+ FATAL("illegal repetition expression: class %.20s",

+ lastre);

+ if ((n==0) && (m==1)) {

+ return QUEST;

+ }

+ } else { /* {n,} */

+ if (n==0) return STAR;

+ if (n==1) return PLUS;

+ }

+ } else {

+ if (digitfound) { /* {n} same as {n,n} */

+ n = num;

+ m = num;

+ } else { /* {} */

+ FATAL("illegal repetition expression: class %.20s",

+ lastre);

+ }

+ if (repeat(starttok, prestr-starttok, lastatom,

+ startreptok - lastatom, n, m) > 0) {

+ if ((n==0) && (m==0)) {

+ return EMPTYRE;

+ }

+ /* must rescan input for next token */

+ goto rescan;

+ }

+ /* Failed to replace: eat up {...} characters

+ and treat like just PLUS */

+ return PLUS;

+ } else if (c == '\0') {

+ FATAL("nonterminated character class %.20s",

+ lastre);

+ } else if (isdigit(c)) {

+ num = 10 * num + c - '0';

+ digitfound = 1;

+ } else if (c == ',') {

+ if (commafound)

+ FATAL("illegal repetition expression: class %.20s",

+ lastre);

+ /* looking for {n,} or {n,m} */

+ commafound = 1;

+ n = num;

+ digitfound = 0; /* reset */

+ num = 0;

+ } else {

+ FATAL("illegal repetition expression: class %.20s",

+ lastre);

+ }

+ break;

}

diff --git a/usr.bin/awk/main.c b/usr.bin/awk/main.c
index 3af780e1225..bbf1adfb0d8 100644
--- a/usr.bin/awk/main.c
+++ b/usr.bin/awk/main.c

@@ -1,4 +1,4 @@

-/* $OpenBSD: main.c,v 1.27 2020/06/10 21:01:32 millert Exp $ */

+/* $OpenBSD: main.c,v 1.28 2020/06/10 21:01:50 millert Exp $ */

/****************************************************************

@@ -23,7 +23,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF

THIS SOFTWARE.

****************************************************************/

-const char *version = "version 20190125";

+const char *version = "version 20190305";

#define DEBUG

#include <stdio.h>

diff --git a/usr.bin/awk/maketab.c b/usr.bin/awk/maketab.c
index f9b4d0a8fac..1d6b3abb2ae 100644
--- a/usr.bin/awk/maketab.c
+++ b/usr.bin/awk/maketab.c

@@ -1,4 +1,4 @@

-/* $OpenBSD: maketab.c,v 1.13 2020/06/10 21:01:13 millert Exp $ */

+/* $OpenBSD: maketab.c,v 1.14 2020/06/10 21:01:50 millert Exp $ */

/****************************************************************

@@ -126,8 +126,12 @@ int main(int argc, char *argv[])

for (i = SIZE; --i >= 0; )

names[i] = "";

- if ((fp = fopen("ytab.h", "r")) == NULL) {

- fprintf(stderr, "maketab: can't open ytab.h!\n");

+ if (argc != 2) {

+ fprintf(stderr, "usage: maketab YTAB_H\n");

+ exit(1);

+ }

+ if ((fp = fopen(argv[1], "r")) == NULL) {

+ fprintf(stderr, "maketab can't open %s!\n", argv[1]);

exit(1);

}

printf("static char *printname[%d] = {\n", SIZE);