src - OpenBSD base system

diff options


context:
space:
mode:

author	Todd C. Miller <millert@cvs.openbsd.org>	2023-11-22 01:01:22 +0000
committer	Todd C. Miller <millert@cvs.openbsd.org>	2023-11-22 01:01:22 +0000
commit	996ad718e8cd39d414989a2bb7d97761e63cc57e (patch)
tree	b967c198d8068ac4b3b5131b694fa44bfbbae085 /usr.bin
parent	145a77d85085fe1cf2c10c3d77fd045c09290eea (diff)

Update awk to the Nov 20, 2023 version.

This includes a rewrite of the fnematch() function as well as a refactoring of the sub and gsub implementation.

Diffstat (limited to 'usr.bin')

-rw-r--r--  usr.bin/awk/FIXES
-rw-r--r--  usr.bin/awk/b.c  150
-rw-r--r--  usr.bin/awk/main.c
-rw-r--r--  usr.bin/awk/maketab.c
-rw-r--r--  usr.bin/awk/proto.h
-rw-r--r--  usr.bin/awk/run.c  272

6 files changed, 199 insertions, 250 deletions

diff --git a/usr.bin/awk/FIXES b/usr.bin/awk/FIXES
index a13ca50ccde..5d2b4595980 100644
--- a/usr.bin/awk/FIXES
+++ b/usr.bin/awk/FIXES

@@ -25,6 +25,18 @@ THIS SOFTWARE.

This file lists all bug fixes, changes, etc., made since the

second edition of the AWK book was published in September 2023.

+Nov 20, 2023

+ rewrite of fnematch to fix a number of issues, including

+ extraneous output, out-of-bounds access, number of bytes

+ to push back after a failed match etc.

+ thanks to Miguel Pineiro Jr.

+Nov 15, 2023

+ Man page edit, regression test fixes. thanks to Arnold Robbins

+ consolidation of sub and gsub into dosub, removing duplicate

+ code. thanks to Miguel Pineiro Jr.

+ gcc replaced with cc everywhere.

Oct 30, 2023:

multiple fixes and a minor code cleanup.

disabled utf-8 for non-multibyte locales, such as C or POSIX.

diff --git a/usr.bin/awk/b.c b/usr.bin/awk/b.c
index 543fbf798c7..09548d7e5f4 100644
--- a/usr.bin/awk/b.c
+++ b/usr.bin/awk/b.c

@@ -1,4 +1,4 @@

-/* $OpenBSD: b.c,v 1.47 2023/11/15 18:56:53 millert Exp $ */

+/* $OpenBSD: b.c,v 1.48 2023/11/22 01:01:21 millert Exp $ */

/****************************************************************

@@ -770,59 +770,6 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */

#define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long

-// Read one rune at a time from the given FILE*. Return both

-// the bytes and the actual rune.

-struct runedata {

- int rune;

- size_t len;

- char bytes[6];

-};

-struct runedata getrune(FILE *fp)

- struct runedata result;

- int c, i, next;

- memset(&result, 0, sizeof(result));

- c = getc(fp);

- if (c == EOF)

- return result; // result.rune == 0 --> EOF

- else if (c < 128 || awk_mb_cur_max == 1) {

- result.bytes[0] = c;

- result.len = 1;

- result.rune = c;

- return result;

- }

- // need to get bytes and fill things in

- result.bytes[0] = c;

- result.len = 1;

- next = 1;

- for (i = 1; i < MAX_UTF_BYTES; i++) {

- c = getc(fp);

- if (c == EOF)

- break;

- result.bytes[next++] = c;

- result.len++;

- }

- // put back any extra input bytes

- int actual_len = u8_nextlen(result.bytes);

- while (result.len > actual_len) {

- ungetc(result.bytes[--result.len], fp);

- }

- result.bytes[result.len] = '\0';

- (void) u8_rune(& result.rune, (uschar *) result.bytes);

- return result;

* NAME

* fnematch

@@ -840,60 +787,76 @@ struct runedata getrune(FILE *fp)

bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)

{

- char *buf = *pbuf;

+ char *i, *j, *k, *buf = *pbuf;

int bufsize = *pbufsize;

- int i, j, k, ns, s;

- struct runedata r;

+ int c, n, ns, s;

s = pfa->initstat;

patlen = 0;

- * All indices relative to buf.

- * i <= j <= k <= bufsize

+ * buf <= i <= j <= k <= buf+bufsize

- * i: origin of active substring (first byte of first character)

- * j: current character (last byte of current character)

- * k: destination of next getc()

+ * i: origin of active substring

+ * j: current character

+ * k: destination of the next getc

- i = -1, k = 0;

- do {

- j = i++;

- do {

- r = getrune(f);

- if (r.len == 0) {

- r.len = 1; // store NUL byte for EOF

+ i = j = k = buf;

+ do {

+ /*

+ * Call u8_rune with at least MAX_UTF_BYTES ahead in

+ * the buffer until EOF interferes.

+ */

+ if (k - j < MAX_UTF_BYTES) {

+ if (k + MAX_UTF_BYTES > buf + bufsize) {

+ adjbuf(&buf, &bufsize,

+ bufsize + MAX_UTF_BYTES,

+ quantum, 0, "fnematch");

}

- j += r.len;

- if (j >= bufsize) {

- if (!adjbuf(&buf, &bufsize, j+1, quantum, 0, "fnematch"))

- FATAL("stream '%.30s...' too long", buf);

+ for (n = MAX_UTF_BYTES ; n > 0; n--) {

+ *k++ = (c = getc(f)) != EOF ? c : 0;

+ if (c == EOF) {

+ if (ferror(f))

+ FATAL("fnematch: getc error");

+ break;

+ }

}

- memcpy(buf + k, r.bytes, r.len);

- k += r.len;

+ }

- if ((ns = get_gototab(pfa, s, r.rune)) != 0)

- s = ns;

- else

- s = cgoto(pfa, s, r.rune);

+ j += u8_rune(&c, (uschar *)j);

- if (pfa->out[s]) { /* final state */

- patlen = j - i + 1;

- if (r.rune == 0) /* don't count $ */

- patlen--;

- }

- } while (buf[j] && s != 1);

+ if ((ns = get_gototab(pfa, s, c)) != 0)

+ s = ns;

+ else

+ s = cgoto(pfa, s, c);

+ if (pfa->out[s]) { /* final state */

+ patbeg = i;

+ patlen = j - i;

+ if (c == 0) /* don't count $ */

+ patlen--;

+ }

+ if (c && s != 1)

+ continue; /* origin i still viable, next j */

+ if (patlen)

+ break; /* best match found */

+ /* no match at origin i, next i and start over */

+ i += u8_rune(&c, (uschar *)i);

+ if (c == 0)

+ break; /* no match */

+ j = i;

s = 2;

- if (r.len > 1)

- i += r.len - 1; // i incremented around the loop

- } while (buf[i] && !patlen);

+ } while (1);

/* adjbuf() may have relocated a resized buffer. Inform the world. */

*pbuf = buf;

*pbufsize = bufsize;

if (patlen) {

- patbeg = buf + i;

* Under no circumstances is the last character fed to

* the automaton part of the match. It is EOF's nullbyte,

@@ -905,10 +868,11 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)

* (except for EOF's nullbyte, if present) and null

* terminate the buffer.

- for (; r.len > 0; r.len--)

- if (buf[--k] && ungetc(buf[k], f) == EOF)

- FATAL("unable to ungetc '%c'", buf[k]);

- buf[k-patlen] = '\0';

+ do

+ if (*--k && ungetc(*k, f) == EOF)

+ FATAL("unable to ungetc '%c'", *k);

+ while (k > patbeg + patlen);

+ *k = '\0';

return true;

}

else

diff --git a/usr.bin/awk/main.c b/usr.bin/awk/main.c
index d5acca8368f..ddec8e820a1 100644
--- a/usr.bin/awk/main.c
+++ b/usr.bin/awk/main.c

@@ -1,4 +1,4 @@

-/* $OpenBSD: main.c,v 1.64 2023/10/31 01:08:51 millert Exp $ */

+/* $OpenBSD: main.c,v 1.65 2023/11/22 01:01:21 millert Exp $ */

/****************************************************************

@@ -23,7 +23,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF

THIS SOFTWARE.

****************************************************************/

-const char *version = "version 20231030";

+const char *version = "version 20231120";

#define DEBUG

#include <stdio.h>

diff --git a/usr.bin/awk/maketab.c b/usr.bin/awk/maketab.c
index 2c4adf72c78..4f2756b4bb0 100644
--- a/usr.bin/awk/maketab.c
+++ b/usr.bin/awk/maketab.c

@@ -1,4 +1,4 @@

-/* $OpenBSD: maketab.c,v 1.21 2023/10/30 17:52:54 millert Exp $ */

+/* $OpenBSD: maketab.c,v 1.22 2023/11/22 01:01:21 millert Exp $ */

/****************************************************************

@@ -53,8 +53,8 @@ struct xx

{ ARRAY, "array", NULL },

{ INDIRECT, "indirect", "$(" },

{ SUBSTR, "substr", "substr" },

- { SUB, "sub", "sub" },

- { GSUB, "gsub", "gsub" },

+ { SUB, "dosub", "sub" },

+ { GSUB, "dosub", "gsub" },

{ INDEX, "sindex", "sindex" },

{ SPRINTF, "awksprintf", "sprintf " },

{ ADD, "arith", " + " },

diff --git a/usr.bin/awk/proto.h b/usr.bin/awk/proto.h
index 7d9aa3c24cf..4c2fafd0825 100644
--- a/usr.bin/awk/proto.h
+++ b/usr.bin/awk/proto.h

@@ -1,4 +1,4 @@

-/* $OpenBSD: proto.h,v 1.22 2023/09/17 14:49:44 millert Exp $ */

+/* $OpenBSD: proto.h,v 1.23 2023/11/22 01:01:21 millert Exp $ */

/****************************************************************

@@ -199,8 +199,7 @@ extern FILE *openfile(int, const char *, bool *);

extern const char *filename(FILE *);

extern Cell *closefile(Node **, int);

extern void closeall(void);

-extern Cell *sub(Node **, int);

-extern Cell *gsub(Node **, int);

+extern Cell *dosub(Node **, int);

extern Cell *gensub(Node **, int);

extern FILE *popen(const char *, const char *);

diff --git a/usr.bin/awk/run.c b/usr.bin/awk/run.c
index 6e72ec1ceb7..ba9469d69f5 100644
--- a/usr.bin/awk/run.c
+++ b/usr.bin/awk/run.c

@@ -1,4 +1,4 @@

-/* $OpenBSD: run.c,v 1.80 2023/10/28 22:38:22 millert Exp $ */

+/* $OpenBSD: run.c,v 1.81 2023/11/22 01:01:21 millert Exp $ */

/****************************************************************

@@ -2518,169 +2518,143 @@ static void flush_all(void)

void backsub(char **pb_ptr, const char **sptr_ptr);

-Cell *sub(Node **a, int nnn) /* substitute command */

+Cell *dosub(Node **a, int subop) /* sub and gsub */

{

- const char *sptr, *q;

- Cell *x, *y, *result;

- char *t, *buf, *pb;

fa *pfa;

+ int tempstat;

+ char *repl;

+ Cell *x;

+ char *buf = NULL;

+ char *pb = NULL;

int bufsz = recsize;

- if ((buf = (char *) malloc(bufsz)) == NULL)

- FATAL("out of memory in sub");

- x = execute(a[3]); /* target string */

- t = getsval(x);

- if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */

- pfa = (fa *) a[1]; /* regular expression */

- else {

- y = execute(a[1]);

- pfa = makedfa(getsval(y), 1);

- tempfree(y);

+ const char *r, *s;

+ const char *start;

+ const char *noempty = NULL; /* empty match disallowed here */

+ size_t m = 0; /* match count */

+ size_t whichm; /* which match to select, 0 = global */

+ int mtype; /* match type */

+ if (a[0] == NULL) { /* 0 => a[1] is already-compiled regexpr */

+ pfa = (fa *) a[1];

+ } else {

+ x = execute(a[1]);

+ pfa = makedfa(getsval(x), 1);

+ tempfree(x);

}

- y = execute(a[2]); /* replacement string */

- result = False;

- if (pmatch(pfa, t)) {

- sptr = t;

- adjbuf(&buf, &bufsz, 1+patbeg-sptr, recsize, 0, "sub");

- pb = buf;

- while (sptr < patbeg)

- *pb++ = *sptr++;

- sptr = getsval(y);

- while (*sptr != '\0') {

- adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "sub");

- if (*sptr == '\\') {

- backsub(&pb, &sptr);

- } else if (*sptr == '&') {

- sptr++;

- adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "sub");

- for (q = patbeg; q < patbeg+patlen; )

- *pb++ = *q++;

- } else

- *pb++ = *sptr++;

+ x = execute(a[2]); /* replacement string */

+ repl = tostring(getsval(x));

+ tempfree(x);

+ switch (subop) {

+ case SUB:

+ whichm = 1;

+ x = execute(a[3]); /* source string */

+ break;

+ case GSUB:

+ whichm = 0;

+ x = execute(a[3]); /* source string */

+ break;

+ default:

+ FATAL("dosub: unrecognized subop: %d", subop);

+ }

+ start = getsval(x);

+ while (pmatch(pfa, start)) {

+ if (buf == NULL) {

+ if ((pb = buf = malloc(bufsz)) == NULL)

+ FATAL("out of memory in dosub");

+ tempstat = pfa->initstat;

+ pfa->initstat = 2;

}

- *pb = '\0';

- if (pb > buf + bufsz)

- FATAL("sub result1 %.30s too big; can't happen", buf);

- sptr = patbeg + patlen;

- if ((patlen == 0 && *patbeg) || (patlen && *(sptr-1))) {

- adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "sub");

- while ((*pb++ = *sptr++) != '\0')

- continue;

+ /* match types */

+ #define MT_IGNORE 0 /* unselected or invalid */

+ #define MT_INSERT 1 /* selected, empty */

+ #define MT_REPLACE 2 /* selected, not empty */

+ /* an empty match just after replacement is invalid */

+ if (patbeg == noempty && patlen == 0) {

+ mtype = MT_IGNORE; /* invalid, not counted */

+ } else if (whichm == ++m || whichm == 0) {

+ mtype = patlen ? MT_REPLACE : MT_INSERT;

+ } else {

+ mtype = MT_IGNORE; /* unselected, but counted */

}

- if (pb > buf + bufsz)

- FATAL("sub result2 %.30s too big; can't happen", buf);

- setsval(x, buf); /* BUG: should be able to avoid copy */

- result = True;

- }

- tempfree(x);

- tempfree(y);

- free(buf);

- return result;

-Cell *gsub(Node **a, int nnn) /* global substitute */

- Cell *x, *y;

- char *rptr, *pb;

- const char *q, *t, *sptr;

- char *buf;

- fa *pfa;

- int mflag, tempstat, num;

- int bufsz = recsize;

- int charlen = 0;

+ /* leading text: */

+ if (patbeg > start) {

+ adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - start),

+ recsize, &pb, "dosub");

+ s = start;

+ while (s < patbeg)

+ *pb++ = *s++;

+ }

- if ((buf = (char *) malloc(bufsz)) == NULL)

- FATAL("out of memory in gsub");

- mflag = 0; /* if mflag == 0, can replace empty string */

- num = 0;

- x = execute(a[3]); /* target string */

- t = getsval(x);

- if (a[0] == NULL) /* 0 => a[1] is already-compiled regexpr */

- pfa = (fa *) a[1]; /* regular expression */

- else {

- y = execute(a[1]);

- pfa = makedfa(getsval(y), 1);

- tempfree(y);

- }

- y = execute(a[2]); /* replacement string */

- if (pmatch(pfa, t)) {

- tempstat = pfa->initstat;

- pfa->initstat = 2;

- pb = buf;

- rptr = getsval(y);

- do {

- if (patlen == 0 && *patbeg != '\0') { /* matched empty string */

- if (mflag == 0) { /* can replace empty */

- num++;

- sptr = rptr;

- while (*sptr != '\0') {

- adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");

- if (*sptr == '\\') {

- backsub(&pb, &sptr);

- } else if (*sptr == '&') {

- sptr++;

- adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");

- for (q = patbeg; q < patbeg+patlen; )

- *pb++ = *q++;

- } else

- *pb++ = *sptr++;

- }

- if (*t == '\0') /* at end */

- goto done;

- adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gsub");

- charlen = u8_nextlen(t);

- while (charlen-- > 0)

- *pb++ = *t++;

- if (pb > buf + bufsz) /* BUG: not sure of this test */

- FATAL("gsub result0 %.30s too big; can't happen", buf);

- mflag = 0;

- }

- else { /* matched nonempty string */

- num++;

- sptr = t;

- adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gsub");

- while (sptr < patbeg)

- *pb++ = *sptr++;

- sptr = rptr;

- while (*sptr != '\0') {

- adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gsub");

- if (*sptr == '\\') {

- backsub(&pb, &sptr);

- } else if (*sptr == '&') {

- sptr++;

- adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gsub");

- for (q = patbeg; q < patbeg+patlen; )

- *pb++ = *q++;

- } else

- *pb++ = *sptr++;

- }

- t = patbeg + patlen;

- if (patlen == 0 || *t == '\0' || *(t-1) == '\0')

- goto done;

- if (pb > buf + bufsz)

- FATAL("gsub result1 %.30s too big; can't happen", buf);

- mflag = 1;

+ if (mtype == MT_IGNORE)

+ goto matching_text; /* skip replacement text */

+ r = repl;

+ while (*r != 0) {

+ adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "dosub");

+ if (*r == '\\') {

+ backsub(&pb, &r);

+ } else if (*r == '&') {

+ r++;

+ adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize,

+ &pb, "dosub");

+ for (s = patbeg; s < patbeg+patlen; )

+ *pb++ = *s++;

+ } else {

+ *pb++ = *r++;

}

- } while (pmatch(pfa,t));

- sptr = t;

- adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gsub");

- while ((*pb++ = *sptr++) != '\0')

- continue;

- done: if (pb < buf + bufsz)

- *pb = '\0';

- else if (*(pb-1) != '\0')

- FATAL("gsub result2 %.30s truncated; can't happen", buf);

- setsval(x, buf); /* BUG: should be able to avoid copy + free */

+ }

+matching_text:

+ if (mtype == MT_REPLACE || *patbeg == '\0')

+ goto next_search; /* skip matching text */

+ if (patlen == 0)

+ patlen = u8_nextlen(patbeg);

+ adjbuf(&buf, &bufsz, (pb-buf) + patlen, recsize, &pb, "dosub");

+ s = patbeg;

+ while (s < patbeg + patlen)

+ *pb++ = *s++;

+next_search:

+ start = patbeg + patlen;

+ if (m == whichm || *patbeg == '\0')

+ break;

+ if (mtype == MT_REPLACE)

+ noempty = start;

+ #undef MT_IGNORE

+ #undef MT_INSERT

+ #undef MT_REPLACE

+ }

+ xfree(repl);

+ if (buf != NULL) {

pfa->initstat = tempstat;

+ /* trailing text */

+ adjbuf(&buf, &bufsz, 1+strlen(start)+pb-buf, 0, &pb, "dosub");

+ while ((*pb++ = *start++) != '\0')

+ ;

+ setsval(x, buf);

+ free(buf);

}

tempfree(x);

- tempfree(y);

x = gettemp();

x->tval = NUM;

- x->fval = num;

- free(buf);

- return(x);

+ x->fval = m;

+ return x;

}

Cell *gensub(Node **a, int nnn) /* global selective substitute */