summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTodd C. Miller <millert@cvs.openbsd.org>2023-11-25 16:31:34 +0000
committerTodd C. Miller <millert@cvs.openbsd.org>2023-11-25 16:31:34 +0000
commitf32f94607735461e1e24ccbce40bac51d106532d (patch)
treec416ae920a4e4551aac8097a08c09bcf28c59ed0
parentde53af60e639ddb1e917427d64abb108d64ff5c3 (diff)
Update awk to the Nov 24, 2023 version.
-rw-r--r--usr.bin/awk/FIXES16
-rw-r--r--usr.bin/awk/awk.h13
-rw-r--r--usr.bin/awk/b.c130
-rw-r--r--usr.bin/awk/lex.c16
-rw-r--r--usr.bin/awk/main.c4
-rw-r--r--usr.bin/awk/run.c5
6 files changed, 137 insertions, 47 deletions
diff --git a/usr.bin/awk/FIXES b/usr.bin/awk/FIXES
index 5d2b4595980..52f49e3eed0 100644
--- a/usr.bin/awk/FIXES
+++ b/usr.bin/awk/FIXES
@@ -25,13 +25,24 @@ THIS SOFTWARE.
This file lists all bug fixes, changes, etc., made since the
second edition of the AWK book was published in September 2023.
-Nov 20, 2023
+Nov 24, 2023:
+ Fix issue #199: gototab improvements to dynamically resize the
+ table, qsort and bsearch to improve the lookup speed as the
+ table gets larger for multibyte input. thanks to Arnold Robbins.
+
+Nov 23, 2023:
+ Fix Issue #169, related to escape sequences in strings.
+ Thanks to Github user rajeevvp.
+ Fix Issue #147, reported by Github user drawkula, and fixed
+ by Miguel Pineiro Jr.
+
+Nov 20, 2023:
rewrite of fnematch to fix a number of issues, including
extraneous output, out-of-bounds access, number of bytes
to push back after a failed match etc.
thanks to Miguel Pineiro Jr.
-Nov 15, 2023
+Nov 15, 2023:
Man page edit, regression test fixes. thanks to Arnold Robbins
consolidation of sub and gsub into dosub, removing duplicate
code. thanks to Miguel Pineiro Jr.
@@ -44,7 +55,6 @@ Oct 30, 2023:
systems. also fixed an out-of-bounds read for empty CCL.
fixed a buffer overflow in substr with utf-8 strings.
many thanks to Todd C Miller.
-
Sep 24, 2023:
fnematch and getrune have been overhauled to solve issues around
diff --git a/usr.bin/awk/awk.h b/usr.bin/awk/awk.h
index a57e27eaffd..3b5c67b60c3 100644
--- a/usr.bin/awk/awk.h
+++ b/usr.bin/awk/awk.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: awk.h,v 1.30 2023/09/18 19:32:19 millert Exp $ */
+/* $OpenBSD: awk.h,v 1.31 2023/11/25 16:31:33 millert Exp $ */
/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved
@@ -257,14 +257,19 @@ typedef struct rrow {
int *lfollow;
} rrow;
-typedef struct gtt { /* gototab entry */
+typedef struct gtte { /* gototab entry */
unsigned int ch;
unsigned int state;
+} gtte;
+
+typedef struct gtt { /* gototab */
+ size_t allocated;
+ size_t inuse;
+ gtte *entries;
} gtt;
typedef struct fa {
- gtt **gototab;
- int gototab_len;
+ gtt *gototab;
uschar *out;
uschar *restr;
int **posns;
diff --git a/usr.bin/awk/b.c b/usr.bin/awk/b.c
index 09548d7e5f4..523e3ee195c 100644
--- a/usr.bin/awk/b.c
+++ b/usr.bin/awk/b.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: b.c,v 1.48 2023/11/22 01:01:21 millert Exp $ */
+/* $OpenBSD: b.c,v 1.49 2023/11/25 16:31:33 millert Exp $ */
/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved
@@ -97,9 +97,8 @@ extern int u8_nextlen(const char *s);
mechanism of the goto table used 8-bit byte indices into the
gototab entries to compute the next state. Unicode is a lot
bigger, so the gototab entries are now structs with a character
- and a next state, and there is a linear search of the characters
- to find the state. (Yes, this is slower, by a significant
- amount. Tough.)
+ and a next state. These are sorted by code point and binary
+ searched.
Throughout the RE mechanism in b.c, utf-8 characters are
converted to their utf-32 value. This mostly shows up in
@@ -114,9 +113,10 @@ extern int u8_nextlen(const char *s);
*/
+static int entry_cmp(const void *l, const void *r);
static int get_gototab(fa*, int, int);
static int set_gototab(fa*, int, int, int);
-static void reset_gototab(fa*, int);
+static void clear_gototab(fa*, int);
extern int u8_rune(int *, const uschar *);
static int *
@@ -151,7 +151,7 @@ resizesetvec(const char *f)
static void
resize_state(fa *f, int state)
{
- gtt **p;
+ gtt *p;
uschar *p2;
int **p3;
int i, new_count;
@@ -161,7 +161,7 @@ resize_state(fa *f, int state)
new_count = state + 10; /* needs to be tuned */
- p = (gtt **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
+ p = (gtt *) reallocarray(f->gototab, new_count, sizeof(gtt));
if (p == NULL)
goto out;
f->gototab = p;
@@ -177,13 +177,14 @@ resize_state(fa *f, int state)
f->posns = p3;
for (i = f->state_count; i < new_count; ++i) {
- f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab));
- if (f->gototab[i] == NULL)
+ f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
+ if (f->gototab[i].entries == NULL)
goto out;
- f->out[i] = 0;
+ f->gototab[i].allocated = NCHARS;
+ f->gototab[i].inuse = 0;
+ f->out[i] = 0;
f->posns[i] = NULL;
}
- f->gototab_len = NCHARS; /* should be variable, growable */
f->state_count = new_count;
return;
out:
@@ -277,7 +278,7 @@ int makeinit(fa *f, bool anchor)
}
if ((f->posns[2])[1] == f->accept)
f->out[2] = 1;
- reset_gototab(f, 2);
+ clear_gototab(f, 2);
f->curstat = cgoto(f, 2, HAT);
if (anchor) {
*f->posns[2] = k-1; /* leave out position 0 */
@@ -601,37 +602,104 @@ int member(int c, int *sarg) /* is c in s? */
return(0);
}
+static void resize_gototab(fa *f, int state)
+{
+ size_t new_size = f->gototab[state].allocated * 2;
+ gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
+ if (p == NULL)
+ overflo(__func__);
+
+ // need to initialized the new memory to zero
+ size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size
+ memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out
+
+ f->gototab[state].allocated = new_size; // update gotottab info
+ f->gototab[state].entries = p;
+}
+
static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */
{
- int i;
- for (i = 0; i < f->gototab_len; i++) {
- if (f->gototab[state][i].ch == 0)
- break;
- if (f->gototab[state][i].ch == ch)
- return f->gototab[state][i].state;
- }
- return 0;
+ gtte key;
+ gtte *item;
+
+ key.ch = ch;
+ key.state = 0; /* irrelevant */
+ item = bsearch(& key, f->gototab[state].entries,
+ f->gototab[state].inuse, sizeof(gtte),
+ entry_cmp);
+
+ if (item == NULL)
+ return 0;
+ else
+ return item->state;
}
-static void reset_gototab(fa *f, int state) /* hide gototab inplementation */
+static int entry_cmp(const void *l, const void *r)
{
- memset(f->gototab[state], 0, f->gototab_len * sizeof(**f->gototab));
+ const gtte *left, *right;
+
+ left = (const gtte *) l;
+ right = (const gtte *) r;
+
+ return left->ch - right->ch;
}
static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */
{
- int i;
- for (i = 0; i < f->gototab_len; i++) {
- if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) {
- f->gototab[state][i].ch = ch;
- f->gototab[state][i].state = val;
- return val;
+ if (f->gototab[state].inuse == 0) {
+ f->gototab[state].entries[0].ch = ch;
+ f->gototab[state].entries[0].state = val;
+ f->gototab[state].inuse++;
+ return val;
+ } else if (ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
+ // not seen yet, insert and return
+ gtt *tab = & f->gototab[state];
+ if (tab->inuse + 1 >= tab->allocated)
+ resize_gototab(f, state);
+
+ f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch;
+ f->gototab[state].entries[f->gototab[state].inuse-1].state = val;
+ f->gototab[state].inuse++;
+ return val;
+ } else {
+ // maybe we have it, maybe we don't
+ gtte key;
+ gtte *item;
+
+ key.ch = ch;
+ key.state = 0; /* irrelevant */
+ item = bsearch(& key, f->gototab[state].entries,
+ f->gototab[state].inuse, sizeof(gtte),
+ entry_cmp);
+
+ if (item != NULL) {
+ // we have it, update state and return
+ item->state = val;
+ return item->state;
}
+ // otherwise, fall through to insert and reallocate.
}
- overflo(__func__);
+
+ gtt *tab = & f->gototab[state];
+ if (tab->inuse + 1 >= tab->allocated)
+ resize_gototab(f, state);
+ ++tab->inuse;
+ f->gototab[state].entries[tab->inuse].ch = ch;
+ f->gototab[state].entries[tab->inuse].state = val;
+
+ qsort(f->gototab[state].entries,
+ f->gototab[state].inuse, sizeof(gtte), entry_cmp);
+
return val; /* not used anywhere at the moment */
}
+static void clear_gototab(fa *f, int state)
+{
+ memset(f->gototab[state].entries, 0,
+ f->gototab[state].allocated * sizeof(gtte));
+ f->gototab[state].inuse = 0;
+}
+
int match(fa *f, const char *p0) /* shortest match ? */
{
int s, ns;
@@ -1460,6 +1528,7 @@ int cgoto(fa *f, int s, int c)
/* add tmpset to current set of states */
++(f->curstat);
resize_state(f, f->curstat);
+ clear_gototab(f, f->curstat);
xfree(f->posns[f->curstat]);
p = intalloc(setcnt + 1, __func__);
@@ -1483,7 +1552,8 @@ void freefa(fa *f) /* free a finite automaton */
if (f == NULL)
return;
for (i = 0; i < f->state_count; i++)
- xfree(f->gototab[i])
+ xfree(f->gototab[i].entries);
+ xfree(f->gototab);
for (i = 0; i <= f->curstat; i++)
xfree(f->posns[i]);
for (i = 0; i <= f->accept; i++) {
diff --git a/usr.bin/awk/lex.c b/usr.bin/awk/lex.c
index b1f422d4093..4c7371de3bb 100644
--- a/usr.bin/awk/lex.c
+++ b/usr.bin/awk/lex.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: lex.c,v 1.31 2023/09/17 14:49:44 millert Exp $ */
+/* $OpenBSD: lex.c,v 1.32 2023/11/25 16:31:33 millert Exp $ */
/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved
@@ -432,8 +432,12 @@ int string(void)
{
int i;
+ if (!isxdigit(peek())) {
+ unput(c);
+ break;
+ }
n = 0;
- for (i = 1; i <= 2; i++) {
+ for (i = 0; i < 2; i++) {
c = input();
if (c == 0)
break;
@@ -444,13 +448,13 @@ int string(void)
n += (c - '0');
else
n += 10 + (c - 'a');
- } else
+ } else {
+ unput(c);
break;
+ }
}
- if (n)
+ if (i)
*bp++ = n;
- else
- unput(c);
break;
}
diff --git a/usr.bin/awk/main.c b/usr.bin/awk/main.c
index ddec8e820a1..a2f53635351 100644
--- a/usr.bin/awk/main.c
+++ b/usr.bin/awk/main.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: main.c,v 1.65 2023/11/22 01:01:21 millert Exp $ */
+/* $OpenBSD: main.c,v 1.66 2023/11/25 16:31:33 millert Exp $ */
/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved
@@ -23,7 +23,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
****************************************************************/
-const char *version = "version 20231120";
+const char *version = "version 20231124";
#define DEBUG
#include <stdio.h>
diff --git a/usr.bin/awk/run.c b/usr.bin/awk/run.c
index ba9469d69f5..6114768edf1 100644
--- a/usr.bin/awk/run.c
+++ b/usr.bin/awk/run.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: run.c,v 1.81 2023/11/22 01:01:21 millert Exp $ */
+/* $OpenBSD: run.c,v 1.82 2023/11/25 16:31:33 millert Exp $ */
/****************************************************************
Copyright (C) Lucent Technologies 1997
All Rights Reserved
@@ -1541,8 +1541,9 @@ Cell *assign(Node **a, int n) /* a[0] = a[1], a[0] += a[1], etc. */
if (x == y && !(x->tval & (FLD|REC)) && x != nfloc)
; /* self-assignment: leave alone unless it's a field or NF */
else if ((y->tval & (STR|NUM)) == (STR|NUM)) {
+ yf = getfval(y);
setsval(x, getsval(y));
- x->fval = getfval(y);
+ x->fval = yf;
x->tval |= NUM;
}
else if (isstr(y))