src - OpenBSD base system

diff options


context:
space:
mode:

author	Todd C. Miller <millert@cvs.openbsd.org>	2023-09-17 14:49:45 +0000
committer	Todd C. Miller <millert@cvs.openbsd.org>	2023-09-17 14:49:45 +0000
commit	c5246d2718cf85e86ceff0c93e17b160815813c4 (patch)
tree	1010eb43cacfc5d7357890fc5a58a08c324ce8ae /usr.bin/awk/run.c
parent	c749387d621c11d2c961dce4fefb26b736c15bca (diff)

Update to the One True Awk, 2nd edition (Sep 12, 2023).

This corresponds to the 2nd edition of "The AWK Programming Language" and adds support for UTF-8 and comma-separated value inputs.

Diffstat (limited to 'usr.bin/awk/run.c')

-rw-r--r--

usr.bin/awk/run.c

560

1 files changed, 523 insertions, 37 deletions

diff --git a/usr.bin/awk/run.c b/usr.bin/awk/run.c
index 5d87b43e8e6..6d2658fcb75 100644
--- a/usr.bin/awk/run.c
+++ b/usr.bin/awk/run.c

@@ -1,4 +1,4 @@

-/* $OpenBSD: run.c,v 1.74 2022/09/21 01:42:59 millert Exp $ */

+/* $OpenBSD: run.c,v 1.75 2023/09/17 14:49:44 millert Exp $ */

/****************************************************************

@@ -27,7 +27,6 @@ THIS SOFTWARE.

#include <stdio.h>

#include <ctype.h>

#include <errno.h>

-#include <wchar.h>

#include <wctype.h>

#include <fcntl.h>

#include <setjmp.h>

@@ -41,8 +40,10 @@ THIS SOFTWARE.

#include "awk.h"

#include "awkgram.tab.h"

static void stdinit(void);

static void flush_all(void);

+static char *wide_char_to_byte_str(int rune, size_t *outlen);

#if 1

#define tempfree(x) do { if (istemp(x)) tfree(x); } while (/*CONSTCOND*/0)

@@ -580,11 +581,225 @@ Cell *intest(Node **a, int n) /* a[0] is index (list), a[1] is symtab */

}

+/* ======== utf-8 code ========== */

+/*

+ * Awk strings can contain ascii, random 8-bit items (eg Latin-1),

+ * or utf-8. u8_isutf tests whether a string starts with a valid

+ * utf-8 sequence, and returns 0 if not (e.g., high bit set).

+ * u8_nextlen returns length of next valid sequence, which is

+ * 1 for ascii, 2..4 for utf-8, or 1 for high bit non-utf.

+ * u8_strlen returns length of string in valid utf-8 sequences

+ * and/or high-bit bytes. Conversion functions go between byte

+ * number and character number.

+ *

+ * In theory, this behaves the same as before for non-utf8 bytes.

+ *

+ * Limited checking! This is a potential security hole.

+ */

/*
 * Is s the beginning of a valid utf-8 sequence?
 * Returns the length of the sequence (1..4) if yes, 0 if no.
 *
 * Only the bit patterns of the lead byte and of the continuation
 * bytes are checked; overlong forms and out-of-range values are not
 * rejected.
 */
int u8_isutf(const char *s)
{
	const unsigned char *u = (const unsigned char *) s;
	unsigned char c = u[0];

	if (c < 128)
		return 1; /* what if it's 0? */

	/*
	 * A continuation byte (10xxxxxx) is never NUL, so testing the
	 * bytes left to right stops at the string terminator by itself.
	 * No strlen() of the rest of the string is needed.
	 */
	if ((c & 0xE0) == 0xC0) {	/* 110xxxxx 10xxxxxx */
		if ((u[1] & 0xC0) == 0x80)
			return 2;
		return 0;
	}
	if ((c & 0xF0) == 0xE0) {	/* 1110xxxx 10xxxxxx 10xxxxxx */
		if ((u[1] & 0xC0) == 0x80 && (u[2] & 0xC0) == 0x80)
			return 3;
		return 0;
	}
	if ((c & 0xF8) == 0xF0) {	/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		if ((u[1] & 0xC0) == 0x80 && (u[2] & 0xC0) == 0x80
		    && (u[3] & 0xC0) == 0x80)
			return 4;
		return 0;
	}
	return 0;	/* stray continuation byte or 0xF8..0xFF */
}

/*
 * Convert (prefix of) utf8 string to utf-32 rune.
 * Sets *rune to the value, returns the number of bytes consumed.
 *
 * A byte that does not start a well-formed 2..4 byte sequence is
 * returned as is, with length 1.
 * No further error checking: watch out.
 */
int u8_rune(int *rune, const char *s)
{
	const unsigned char *u = (const unsigned char *) s;
	unsigned char c = u[0];

	if (c < 128) {
		*rune = c;
		return 1;
	}

	/*
	 * Continuation bytes are never NUL, so the left-to-right tests
	 * below cannot read past the terminator; no strlen() is needed.
	 */
	if ((c & 0xE0) == 0xC0 && (u[1] & 0xC0) == 0x80) {
		/* 110xxxxx 10xxxxxx */
		*rune = ((c & 0x1F) << 6) | (u[1] & 0x3F);
		return 2;
	}
	if ((c & 0xF0) == 0xE0 && (u[1] & 0xC0) == 0x80
	    && (u[2] & 0xC0) == 0x80) {
		/* 1110xxxx 10xxxxxx 10xxxxxx */
		*rune = ((c & 0xF) << 12) | ((u[1] & 0x3F) << 6) | (u[2] & 0x3F);
		return 3;
	}
	if ((c & 0xF8) == 0xF0 && (u[1] & 0xC0) == 0x80
	    && (u[2] & 0xC0) == 0x80 && (u[3] & 0xC0) == 0x80) {
		/* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		*rune = ((c & 0x7) << 18) | ((u[1] & 0x3F) << 12)
		    | ((u[2] & 0x3F) << 6) | (u[3] & 0x3F);
		return 4;
	}

	/* returns one byte if sequence doesn't look like utf */
	*rune = c;
	return 1;
}

/*
 * Return length in bytes of the next sequence in s:
 * 1 for ascii or a byte that does not start valid utf8,
 * 2..4 for a valid utf8 sequence.
 *
 * Never returns 0, so callers can always advance by the result.
 */
int u8_nextlen(const char *s)
{
	int len;

	len = u8_isutf(s);
	if (len == 0)	/* not utf8: treat as a single byte */
		len = 1;
	return len;
}

/*
 * Return the length of s counted in utf-8 characters, where every
 * byte that is not part of a valid utf-8 sequence counts as one
 * character of its own.
 */
int u8_strlen(const char *s)
{
	int totlen = 0;

	/*
	 * u8_nextlen() only reports a multi-byte length after it has
	 * seen every continuation byte, so each step lands on or before
	 * the terminating NUL and no separate overrun check is needed.
	 */
	while (*s != '\0') {
		s += u8_nextlen(s);
		totlen++;
	}
	return totlen;
}

/*
 * Convert utf-8 char number in a string to its byte offset.
 *
 * charnum is origin 0: the result is the offset of the first byte of
 * the charnum'th character.  If s holds fewer than charnum characters
 * the offset of the terminating NUL is returned, so the result is
 * always a valid index into s.
 */
int u8_char2byte(const char *s, int charnum)
{
	int n;
	int bytenum = 0;

	/* stop at the NUL: never step past the end of the string */
	while (charnum > 0 && *s != '\0') {
		n = u8_nextlen(s);
		s += n;
		bytenum += n;
		charnum--;
	}
	return bytenum;
}

/*
 * Convert byte offset in s to utf-8 char number that starts there.
 *
 * One character is counted for every sequence whose first byte is at
 * an offset <= bytenum, so offset 0 gives 1 (origin 1).  A negative
 * bytenum gives 0; a bytenum beyond the terminating NUL gives -1.
 */
int u8_byte2char(const char *s, int bytenum)
{
	int i, len, b;
	int charnum = 0; /* BUG: what origin? */
	/* should be 0 to match start==0 which means no match */

	b = strlen(s);
	if (bytenum > b) {
		return -1; /* ??? */
	}
	for (i = 0; i <= bytenum; i += len) {
		len = u8_nextlen(s+i);
		charnum++;
	}
	return charnum;
}

/* runetochar() adapted from rune.c in the Plan 9 distribution */

enum
{
	/*
	 * Replacement for runes above Runemax.  U+FFFD encodes as the
	 * well-formed sequence EF BF BD; a value of 128 here would be
	 * written by the three-byte branch below as the overlong (and
	 * therefore invalid) sequence E0 82 80.
	 */
	Runeerror = 0xFFFD,
	Runemax = 0x10FFFF,

	Bit1 = 7,
	Bitx = 6,
	Bit2 = 5,
	Bit3 = 4,
	Bit4 = 3,
	Bit5 = 2,

	T1 = ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
	Tx = ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
	T2 = ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
	T3 = ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
	T4 = ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
	T5 = ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */

	Rune1 = (1<<(Bit1+0*Bitx))-1,	/* 0000 0000 0000 0000 0111 1111 */
	Rune2 = (1<<(Bit2+1*Bitx))-1,	/* 0000 0000 0000 0111 1111 1111 */
	Rune3 = (1<<(Bit3+2*Bitx))-1,	/* 0000 0000 1111 1111 1111 1111 */
	Rune4 = (1<<(Bit4+3*Bitx))-1,	/* 0011 1111 1111 1111 1111 1111 */

	Maskx = (1<<Bitx)-1,		/* 0011 1111 */
	Testx = Maskx ^ 0xFF,		/* 1100 0000 */
};

/*
 * Encode rune c as utf-8 into str and return the number of bytes
 * written (1..4).  str must have room for 4 bytes; no terminating
 * NUL is added.  Runes above Runemax are written as Runeerror.
 */
int runetochar(char *str, int c)
{
	/* one character sequence 00000-0007F => 00-7F */
	if (c <= Rune1) {
		str[0] = c;
		return 1;
	}

	/* two character sequence 00080-007FF => T2 Tx */
	if (c <= Rune2) {
		str[0] = T2 | (c >> 1*Bitx);
		str[1] = Tx | (c & Maskx);
		return 2;
	}

	/* out of range: substitute the error rune before encoding */
	if (c > Runemax)
		c = Runeerror;

	/* three character sequence 00800-0FFFF => T3 Tx Tx */
	if (c <= Rune3) {
		str[0] = T3 | (c >> 2*Bitx);
		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
		str[2] = Tx | (c & Maskx);
		return 3;
	}

	/* four character sequence 010000-10FFFF => T4 Tx Tx Tx */
	str[0] = T4 | (c >> 3*Bitx);
	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
	str[3] = Tx | (c & Maskx);
	return 4;
}

+/* ========== end of utf8 code =========== */

Cell *matchop(Node **a, int n) /* ~ and match() */

{

Cell *x, *y;

char *s, *t;

int i;

+ int cstart, cpatlen, len;

fa *pfa;

int (*mf)(fa *, const char *) = match, mode = 0;

@@ -605,9 +820,21 @@ Cell *matchop(Node **a, int n) /* ~ and match() */

}

tempfree(x);

if (n == MATCHFCN) {

- int start = patbeg - s + 1;

- if (patlen < 0)

- start = 0;

+ int start = patbeg - s + 1; /* origin 1 */

+ if (patlen < 0) {

+ start = 0; /* not found */

+ } else {

+ cstart = u8_byte2char(s, start-1);

+ cpatlen = 0;

+ for (i = 0; i < patlen; i += len) {

+ len = u8_nextlen(patbeg+i);

+ cpatlen++;

+ }

+ start = cstart;

+ patlen = cpatlen;

+ }

setfval(rstartloc, (Awkfloat) start);

setfval(rlengthloc, (Awkfloat) patlen);

x = gettemp();

@@ -658,10 +885,15 @@ Cell *relop(Node **a, int n) /* a[0 < a[1], etc. */

int i;

Cell *x, *y;

Awkfloat j;

+ bool x_is_nan, y_is_nan;

x = execute(a[0]);

y = execute(a[1]);

+ x_is_nan = isnan(x->fval);

+ y_is_nan = isnan(y->fval);

if (x->tval&NUM && y->tval&NUM) {

+ if ((x_is_nan || y_is_nan) && n != NE)

+ return(False);

j = x->fval - y->fval;

i = j<0? -1: (j>0? 1: 0);

} else {

@@ -674,7 +906,8 @@ Cell *relop(Node **a, int n) /* a[0 < a[1], etc. */

else return(False);

case LE: if (i<=0) return(True);

else return(False);

- case NE: if (i!=0) return(True);

+ case NE: if (x_is_nan && y_is_nan) return(True);

+ else if (i!=0) return(True);

else return(False);

case EQ: if (i == 0) return(True);

else return(False);

@@ -743,6 +976,7 @@ Cell *indirect(Node **a, int n) /* $( a[0] ) */

Cell *substr(Node **a, int nnn) /* substr(a[0], a[1], a[2]) */

{

int k, m, n;

+ int mb, nb;

char *s;

int temp;

Cell *x, *y, *z = NULL;

@@ -778,12 +1012,16 @@ Cell *substr(Node **a, int nnn) /* substr(a[0], a[1], a[2]) */

n = 0;

else if (n > k - m)

n = k - m;

+ /* m is start, n is length from there */

DPRINTF("substr: m=%d, n=%d, s=%s\n", m, n, s);

y = gettemp();

- temp = s[n+m-1]; /* with thanks to John Linderman */

- s[n+m-1] = '\0';

- setsval(y, s + m - 1);

- s[n+m-1] = temp;

+ mb = u8_char2byte(s, m-1); /* byte offset of start char in s */

+ nb = u8_char2byte(s, m-1+n); /* byte offset of end+1 char in s */

+ temp = s[nb]; /* with thanks to John Linderman */

+ s[nb] = '\0';

+ setsval(y, s + mb);

+ s[nb] = temp;

tempfree(x);

return(y);

}

@@ -804,7 +1042,15 @@ Cell *sindex(Node **a, int nnn) /* index(a[0], a[1]) */

for (q = p1, p2 = s2; *p2 != '\0' && *q == *p2; q++, p2++)

continue;

if (*p2 == '\0') {

- v = (Awkfloat) (p1 - s1 + 1); /* origin 1 */

+ /* v = (Awkfloat) (p1 - s1 + 1); origin 1 */

+ /* should be a function: used in match() as well */

+ int i, len;

+ v = 0;

+ for (i = 0; i < p1-s1+1; i += len) {

+ len = u8_nextlen(s1+i);

+ v++;

+ }

break;

}

@@ -814,6 +1060,18 @@ Cell *sindex(Node **a, int nnn) /* index(a[0], a[1]) */

return(z);

}

/*
 * Return 1 if s contains any utf-8 (2 bytes or more) character,
 * 0 if every character in s is a single byte.
 */
int has_utf8(char *s)
{
	int n;

	for (n = 0; *s != 0; s += n) {
		n = u8_nextlen(s);
		if (n > 1)	/* found a multi-byte sequence */
			return 1;
	}
	return 0;
}

#define MAXNUMSIZE 50

int format(char **pbuf, int *pbufsize, const char *s, Node *a) /* printf-like conversions */

@@ -856,7 +1114,6 @@ int format(char **pbuf, int *pbufsize, const char *s, Node *a) /* printf-like co

s += 2;

continue;

}

- /* have to be real careful in case this is a huge number, eg, %100000d */

fmtwd = atoi(s+1);

if (fmtwd < 0)

fmtwd = -fmtwd;

@@ -929,7 +1186,8 @@ int format(char **pbuf, int *pbufsize, const char *s, Node *a) /* printf-like co

n = fmtwd;

adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format5");

switch (flag) {

- case '?': snprintf(p, BUFSZ(p), "%s", fmt); /* unknown, so dump it too */

+ case '?':

+ snprintf(p, BUFSZ(p), "%s", fmt); /* unknown, so dump it too */

t = getsval(x);

n = strlen(t);

if (fmtwd > n)

@@ -943,29 +1201,176 @@ int format(char **pbuf, int *pbufsize, const char *s, Node *a) /* printf-like co

case 'f': snprintf(p, BUFSZ(p), fmt, getfval(x)); break;

case 'd': snprintf(p, BUFSZ(p), fmt, (intmax_t) getfval(x)); break;

case 'u': snprintf(p, BUFSZ(p), fmt, (uintmax_t) getfval(x)); break;

- case 's':

+ case 's': {

t = getsval(x);

n = strlen(t);

- if (fmtwd > n)

- n = fmtwd;

- if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7"))

- FATAL("huge string/format (%d chars) in printf %.30s... ran format() out of memory", n, t);

- snprintf(p, BUFSZ(p), fmt, t);

+ /* if simple format or no utf-8 in the string, sprintf works */

+ if (!has_utf8(t) || strcmp(fmt,"%s") == 0) {

+ if (fmtwd > n)

+ n = fmtwd;

+ if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7"))

+ FATAL("huge string/format (%d chars) in printf %.30s..." \

+ " ran format() out of memory", n, t);

+ snprintf(p, BUFSZ(p), fmt, t);

+ break;

+ }

+ /* get here if string has utf-8 chars and fmt is not plain %s */

+ /* "%-w.ps", where -, w and .p are all optional */

+ /* '0' before the w is a flag character */

+ /* fmt points at % */

+ int ljust = 0, wid = 0, prec = n, pad = 0;

+ char *f = fmt+1;

+ if (f[0] == '-') {

+ ljust = 1;

+ f++;

+ }

+ // flags '0' and '+' are recognized but skipped

+ if (f[0] == '0') {

+ f++;

+ if (f[0] == '+')

+ f++;

+ }

+ if (f[0] == '+') {

+ f++;

+ if (f[0] == '0')

+ f++;

+ }

+ if (isdigit((uschar)f[0])) { /* there is a wid */

+ wid = strtol(f, &f, 10);

+ }

+ if (f[0] == '.') { /* there is a .prec */

+ prec = strtol(++f, &f, 10);

+ }

+ if (prec > u8_strlen(t))

+ prec = u8_strlen(t);

+ pad = wid>prec ? wid - prec : 0; // has to be >= 0

+ int i, k, n;

+ if (ljust) { // print prec chars from t, then pad blanks

+ n = u8_char2byte(t, prec);

+ for (k = 0; k < n; k++) {

+ //putchar(t[k]);

+ *p++ = t[k];

+ }

+ for (i = 0; i < pad; i++) {

+ //printf(" ");

+ *p++ = ' ';

+ }

+ } else { // print pad blanks, then prec chars from t

+ for (i = 0; i < pad; i++) {

+ //printf(" ");

+ *p++ = ' ';

+ }

+ n = u8_char2byte(t, prec);

+ for (k = 0; k < n; k++) {

+ //putchar(t[k]);

+ *p++ = t[k];

+ }

+ *p = 0;

break;

- case 'c':

+ }

+ case 'c': {

+ /*

+ * If a numeric value is given, awk should just turn

+ * it into a character and print it:

+ * BEGIN { printf("%c\n", 65) }

+ * prints "A".

+ *

+ * But what if the numeric value is > 128 and

+ * represents a valid Unicode code point?!? We do

+ * our best to convert it back into UTF-8. If we

+ * can't, we output the encoding of the Unicode

+ * "invalid character", 0xFFFD.

+ */

if (isnum(x)) {

- if ((int)getfval(x))

- snprintf(p, BUFSZ(p), fmt, (int) getfval(x));

- else {

+ int charval = (int) getfval(x);

+ if (charval != 0) {

+ if (charval < 128)

+ snprintf(p, BUFSZ(p), fmt, charval);

+ else {

+ // possible unicode character

+ size_t count;

+ char *bs = wide_char_to_byte_str(charval, &count);

+ if (bs == NULL) { // invalid character

+ // use unicode invalid character, 0xFFFD

+ bs = "\357\277\275";

+ count = 3;

+ }

+ t = bs;

+ n = count;

+ goto format_percent_c;

+ }

+ } else {

*p++ = '\0'; /* explicit null byte */

*p = '\0'; /* next output will start here */

}

- } else

+ break;

+ }

+ t = getsval(x);

+ n = u8_nextlen(t);

+ format_percent_c:

+ if (n < 2) { /* not utf8 */

snprintf(p, BUFSZ(p), fmt, getsval(x)[0]);

+ break;

+ }

+ // utf8 character, almost same song and dance as for %s

+ int ljust = 0, wid = 0, prec = n, pad = 0;

+ char *f = fmt+1;

+ if (f[0] == '-') {

+ ljust = 1;

+ f++;

+ }

+ // flags '0' and '+' are recognized but skipped

+ if (f[0] == '0') {

+ f++;

+ if (f[0] == '+')

+ f++;

+ }

+ if (f[0] == '+') {

+ f++;

+ if (f[0] == '0')

+ f++;

+ }

+ if (isdigit((uschar)f[0])) { /* there is a wid */

+ wid = strtol(f, &f, 10);

+ }

+ if (f[0] == '.') { /* there is a .prec */

+ prec = strtol(++f, &f, 10);

+ }

+ if (prec > 1) // %c --> only one character

+ prec = 1;

+ pad = wid>prec ? wid - prec : 0; // has to be >= 0

+ int i;

+ if (ljust) { // print one char from t, then pad blanks

+ for (int i = 0; i < n; i++)

+ *p++ = t[i];

+ for (i = 0; i < pad; i++) {

+ //printf(" ");

+ *p++ = ' ';

+ }

+ } else { // print pad blanks, then prec chars from t

+ for (i = 0; i < pad; i++) {

+ //printf(" ");

+ *p++ = ' ';

+ }

+ for (int i = 0; i < n; i++)

+ *p++ = t[i];

+ }

+ *p = 0;

break;

+ }

default:

FATAL("can't happen: bad conversion %c in format()", flag);

}

tempfree(x);

p += strlen(p);

s++;

@@ -1265,7 +1670,7 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */

char *origfs = NULL;

int sep;

char temp, num[50];

- int n, tempstat, arg3type;

+ int j, n, tempstat, arg3type;

double result;

y = execute(a[0]); /* source string */

@@ -1274,20 +1679,22 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */

FATAL("out of space in split");

tempfree(y);

arg3type = ptoi(a[3]);

- if (a[2] == NULL) /* fs string */

+ if (a[2] == NULL) { /* BUG: CSV should override implicit fs but not explicit */

fs = getsval(fsloc);

- else if (arg3type == STRING) { /* split(str,arr,"string") */

+ } else if (arg3type == STRING) { /* split(str,arr,"string") */

x = execute(a[2]);

fs = origfs = strdup(getsval(x));

if (fs == NULL)

FATAL("out of space in split");

tempfree(x);

- } else if (arg3type == REGEXPR)

+ } else if (arg3type == REGEXPR) {

fs = "(regexpr)"; /* split(str,arr,/regexpr/) */

- else

+ } else {

FATAL("illegal type of split");

+ }

sep = *fs;

ap = execute(a[1]); /* array name */

+ /* BUG 7/26/22: this appears not to reset array: see C1/asplit */

freesymtab(ap);

DPRINTF("split: s=|%s|, a=%s, sep=|%s|\n", s, NN(ap->nval), fs);

ap->tval &= ~STR;

@@ -1341,7 +1748,41 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */

setsymtab(num, s, 0.0, STR, (Array *) ap->sval);

spdone:

pfa = NULL;

- } else if (sep == ' ') {

+ } else if (a[2] == NULL && CSV) { /* CSV only if no explicit separator */

+ char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */

+ for (;;) {

+ char *fr = newt;

+ n++;

+ if (*s == '"' ) { /* start of "..." */

+ for (s++ ; *s != '\0'; ) {

+ if (*s == '"' && s[1] != '\0' && s[1] == '"') {

+ s += 2; /* doubled quote */

+ *fr++ = '"';

+ } else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) {

+ s++; /* skip over closing quote */

+ break;

+ } else {

+ *fr++ = *s++;

+ }

+ *fr++ = 0;

+ } else { /* unquoted field */

+ while (*s != ',' && *s != '\0')

+ *fr++ = *s++;

+ *fr++ = 0;

+ }

+ snprintf(num, sizeof(num), "%d", n);

+ if (is_number(newt, &result))

+ setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval);

+ else

+ setsymtab(num, newt, 0.0, STR, (Array *) ap->sval);

+ if (*s++ == '\0')

+ break;

+ }

+ free(newt);

+ } else if (!CSV && sep == ' ') { /* usual case: split on white space */

for (n = 0; ; ) {

#define ISWS(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')

while (ISWS(*s))

@@ -1364,19 +1805,25 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */

if (*s != '\0')

s++;

}

} else if (sep == 0) { /* new: split(s, a, "") => 1 char/elem */

- for (n = 0; *s != '\0'; s++) {

- char buf[2];

+ for (n = 0; *s != '\0'; s += u8_nextlen(s)) {

+ char buf[10];

n++;

snprintf(num, sizeof(num), "%d", n);

- buf[0] = *s;

- buf[1] = '\0';

+ for (j = 0; j < u8_nextlen(s); j++) {

+ buf[j] = s[j];

+ }

+ buf[j] = '\0';

if (isdigit((uschar)buf[0]))

setsymtab(num, buf, atof(buf), STR|NUM, (Array *) ap->sval);

else

setsymtab(num, buf, 0.0, STR, (Array *) ap->sval);

}

- } else if (*s != '\0') {

+ } else if (*s != '\0') { /* some random single character */

for (;;) {

n++;

t = s;

@@ -1535,6 +1982,7 @@ static char *nawk_convert(const char *s, int (*fun_c)(int),

size_t n = 0;

wchar_t wc;

size_t sz = MB_CUR_MAX;

+ int unused;

if (sz == 1) {

buf = tostring(s);

@@ -1554,7 +2002,7 @@ static char *nawk_convert(const char *s, int (*fun_c)(int),

* doesn't work.)

* Increment said variable to avoid a different warning.

- int unused = wctomb(NULL, L'\0');

+ unused = wctomb(NULL, L'\0');

unused++;

ps = s;

@@ -1629,7 +2077,7 @@ Cell *bltin(Node **a, int n) /* builtin functions. a[0] is type, a[1] is arg lis

if (isarr(x))

u = ((Array *) x->sval)->nelem; /* GROT. should be function*/

else

- u = strlen(getsval(x));

+ u = u8_strlen(getsval(x));

break;

case FLOG:

errno = 0;

@@ -2402,3 +2850,41 @@ void backsub(char **pb_ptr, const char **sptr_ptr) /* handle \\& variations */

*pb_ptr = pb;

*sptr_ptr = sptr;

}

/*
 * Encode the Unicode code point rune as a NUL-terminated utf-8 string.
 *
 * Returns a pointer to a static buffer that is overwritten by the
 * next call, and stores the number of bytes (not counting the NUL)
 * in *outlen.  Returns NULL, leaving *outlen untouched, when rune is
 * not encodable: negative, above 0x10FFFF, or a UTF-16 surrogate
 * (0xD800..0xDFFF), which has no well-formed utf-8 encoding.
 */
static char *wide_char_to_byte_str(int rune, size_t *outlen)
{
	static char buf[5];
	int len;

	if (rune < 0 || rune > 0x10FFFF)
		return NULL;
	if (rune >= 0xD800 && rune <= 0xDFFF)
		return NULL;	/* surrogates are not characters */

	memset(buf, 0, sizeof(buf));

	len = 0;
	if (rune <= 0x0000007F) {
		buf[len++] = rune;
	} else if (rune <= 0x000007FF) {
		// 110xxxxx 10xxxxxx
		buf[len++] = 0xC0 | (rune >> 6);
		buf[len++] = 0x80 | (rune & 0x3F);
	} else if (rune <= 0x0000FFFF) {
		// 1110xxxx 10xxxxxx 10xxxxxx
		buf[len++] = 0xE0 | (rune >> 12);
		buf[len++] = 0x80 | ((rune >> 6) & 0x3F);
		buf[len++] = 0x80 | (rune & 0x3F);
	} else {
		// 0x00010000 - 0x10FFFF
		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		buf[len++] = 0xF0 | (rune >> 18);
		buf[len++] = 0x80 | ((rune >> 12) & 0x3F);
		buf[len++] = 0x80 | ((rune >> 6) & 0x3F);
		buf[len++] = 0x80 | (rune & 0x3F);
	}

	*outlen = len;
	buf[len++] = '\0';

	return buf;
}