diff options
author | Ingo Schwarze <schwarze@cvs.openbsd.org> | 2015-12-02 00:56:47 +0000 |
---|---|---|
committer | Ingo Schwarze <schwarze@cvs.openbsd.org> | 2015-12-02 00:56:47 +0000 |
commit | b307a061b89a6d3a3fae5cb7e31dede52346c5b0 (patch) | |
tree | b73922c3a49053307c59c69f5a141cbfa1e0a486 /usr.bin | |
parent | 7dabb67cf11129fc5d1420c9e2228f8e873e14b4 (diff) |
UTF-8 support: Implement -c and -n
and let -d accept a multibyte delimiter character.
While here, simplify the code by switching from fgetln(3) to getline(3)
and from hand-crafted string parsing to strstr(3) and strchr(3).
OK tedu@ czarkoff@ zhuk@.
Diffstat (limited to 'usr.bin')
-rw-r--r-- | usr.bin/cut/cut.1 | 22 |
-rw-r--r-- | usr.bin/cut/cut.c | 168 |
2 files changed, 119 insertions, 71 deletions
diff --git a/usr.bin/cut/cut.1 b/usr.bin/cut/cut.1 index b02ee555df1..1edeb04e0b7 100644 --- a/usr.bin/cut/cut.1 +++ b/usr.bin/cut/cut.1 @@ -1,4 +1,4 @@ -.\" $OpenBSD: cut.1,v 1.24 2014/07/10 14:11:56 jmc Exp $ +.\" $OpenBSD: cut.1,v 1.25 2015/12/02 00:56:46 schwarze Exp $ .\" $NetBSD: cut.1,v 1.6 1995/10/02 20:19:26 jtc Exp $ .\" .\" Copyright (c) 1989, 1990, 1993 @@ -33,7 +33,7 @@ .\" .\" @(#)cut.1 8.1 (Berkeley) 6/6/93 .\" -.Dd $Mdocdate: July 10 2014 $ +.Dd $Mdocdate: December 2 2015 $ .Dt CUT 1 .Os .Sh NAME @@ -114,6 +114,8 @@ The selected fields are output, separated by the field delimiter character. .It Fl n Do not split multi-byte characters. +A character is written to standard output if and only if the byte +position holding its last byte is selected. .It Fl s Suppresses lines with no field delimiter characters. Unless specified, lines with no delimiters are passed through unmodified. @@ -145,11 +147,19 @@ utility is compliant with the .St -p1003.1-2008 specification. .Sh CAVEATS -The current implementation does not support multi-byte characters. -Consequently +The definition of a character depends on the current character set +.Xr locale 1 . +If +.Ev LC_CTYPE +is set to +.Qq C +or +.Qq POSIX , .Fl c does the same as .Fl b , -and .Fl n -has no effect. +has no effect, and +.Fl d +uses the first byte of +.Ar delim . diff --git a/usr.bin/cut/cut.c b/usr.bin/cut/cut.c index 16d4e336719..8637bb4d95f 100644 --- a/usr.bin/cut/cut.c +++ b/usr.bin/cut/cut.c @@ -1,4 +1,4 @@ -/* $OpenBSD: cut.c,v 1.22 2015/11/03 04:57:20 mmcc Exp $ */ +/* $OpenBSD: cut.c,v 1.23 2015/12/02 00:56:46 schwarze Exp $ */ /* $NetBSD: cut.c,v 1.9 1995/09/02 05:59:23 jtc Exp $ */ /* @@ -33,6 +33,7 @@ * SUCH DAMAGE. 
*/ +#include <assert.h> #include <ctype.h> #include <err.h> #include <errno.h> @@ -43,12 +44,17 @@ #include <string.h> #include <unistd.h> +char dchar[5]; +int dlen; + +int bflag; int cflag; -char dchar; int dflag; int fflag; +int nflag; int sflag; +void b_cut(FILE *, char *); void c_cut(FILE *, char *); void f_cut(FILE *, char *); void get_list(char *); @@ -61,37 +67,43 @@ main(int argc, char *argv[]) void (*fcn)(FILE *, char *); int ch, rval; - setlocale (LC_ALL, ""); + setlocale(LC_CTYPE, ""); if (pledge("stdio rpath", NULL) == -1) err(1, "pledge"); - dchar = '\t'; /* default delimiter is \t */ + dchar[0] = '\t'; /* default delimiter */ + dchar[1] = '\0'; + dlen = 1; - /* Since we don't support multi-byte characters, the -c and -b - options are equivalent, and the -n option is meaningless. */ while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1) switch(ch) { case 'b': + get_list(optarg); + bflag = 1; + break; case 'c': - fcn = c_cut; get_list(optarg); cflag = 1; break; case 'd': - dchar = *optarg; + if ((dlen = mblen(optarg, MB_CUR_MAX)) == -1) + usage(); + assert(dlen < sizeof(dchar)); + (void)memcpy(dchar, optarg, dlen); + dchar[dlen] = '\0'; dflag = 1; break; case 'f': get_list(optarg); - fcn = f_cut; fflag = 1; break; + case 'n': + nflag = 1; + break; case 's': sflag = 1; break; - case 'n': - break; case '?': default: usage(); @@ -99,12 +111,21 @@ main(int argc, char *argv[]) argc -= optind; argv += optind; - if (fflag) { - if (cflag) - usage(); - } else if (!cflag || dflag || sflag) + if (bflag + cflag + fflag != 1 || + (nflag && !bflag) || + ((dflag || sflag) && !fflag)) usage(); + if (MB_CUR_MAX == 1) { + nflag = 0; + if (cflag) { + bflag = 1; + cflag = 0; + } + } + + fcn = fflag ? f_cut : (cflag || nflag) ? 
c_cut : b_cut; + rval = 0; if (*argv) for (; *argv; ++argv) { @@ -192,7 +213,7 @@ get_list(char *list) /* ARGSUSED */ void -c_cut(FILE *fp, char *fname) +b_cut(FILE *fp, char *fname) { int ch, col; char *pos; @@ -220,65 +241,82 @@ c_cut(FILE *fp, char *fname) } void -f_cut(FILE *fp, char *fname) +c_cut(FILE *fp, char *fname) { - int ch, field, isdelim; - char *pos, *p, sep; - int output; - size_t len; - char *lbuf, *tbuf; + static char *line = NULL; + static size_t linesz = 0; + ssize_t linelen; + char *cp, *pos, *maxpos; + int len; - for (sep = dchar, tbuf = NULL; (lbuf = fgetln(fp, &len));) { - output = 0; - if (lbuf[len - 1] != '\n') { - /* no newline at the end of the last line so add one */ - if ((tbuf = malloc(len + 1)) == NULL) - err(1, NULL); - memcpy(tbuf, lbuf, len); - tbuf[len] = '\n'; - lbuf = tbuf; - } - for (isdelim = 0, p = lbuf;; ++p) { - ch = *p; - /* this should work if newline is delimiter */ - if (ch == sep) - isdelim = 1; - if (ch == '\n') { - if (!isdelim && !sflag) - (void)fwrite(lbuf, len, 1, stdout); - break; - } + while ((linelen = getline(&line, &linesz, fp)) != -1) { + if (line[linelen - 1] == '\n') + line[linelen - 1] = '\0'; + + cp = line; + pos = positions + 1; + maxpos = pos + maxval; + while(pos < maxpos && *cp != '\0') { + len = mblen(cp, MB_CUR_MAX); + if (len == -1) + len = 1; + pos += nflag ? 
len : 1; + if (pos[-1] == '\0') + cp += len; + else + while (len--) + putchar(*cp++); } - if (!isdelim) + if (autostop) + puts(cp); + else + putchar('\n'); + } +} + +void +f_cut(FILE *fp, char *fname) +{ + static char *line = NULL; + static size_t linesz = 0; + ssize_t linelen; + char *sp, *ep, *pos, *maxpos; + int output; + + while ((linelen = getline(&line, &linesz, fp)) != -1) { + if (line[linelen - 1] == '\n') + line[linelen - 1] = '\0'; + + if ((ep = strstr(line, dchar)) == NULL) { + if (!sflag) + puts(line); continue; + } pos = positions + 1; - for (field = maxval, p = lbuf; field; --field, ++pos) { - if (*pos) { - if (output++) - (void)putchar(sep); - while ((ch = *p++) != '\n' && ch != sep) - (void)putchar(ch); - } else - while ((ch = *p++) != '\n' && ch != sep) - ; - if (ch == '\n') - break; - } - if (ch != '\n') { - if (autostop) { + maxpos = pos + maxval; + output = 0; + sp = line; + for (;;) { + if (*pos++) { if (output) - (void)putchar(sep); - for (; (ch = *p) != '\n'; ++p) - (void)putchar(ch); + fputs(dchar, stdout); + while (sp < ep) + putchar(*sp++); + output = 1; } else - for (; (ch = *p) != '\n'; ++p) - ; + sp = ep; + if (*sp == '\0' || pos == maxpos) + break; + sp += dlen; + if ((ep = strstr(sp, dchar)) == NULL) + ep = strchr(sp, '\0'); } - (void)putchar('\n'); + if (autostop) + puts(sp); + else + putchar('\n'); } - if (tbuf) - free(tbuf); } void |