summary | refs | log | tree | commit | diff
path: root/usr.bin
diff options
context:
space:
mode:
author: Ingo Schwarze <schwarze@cvs.openbsd.org> 2015-12-02 00:56:47 +0000
committer: Ingo Schwarze <schwarze@cvs.openbsd.org> 2015-12-02 00:56:47 +0000
commit: b307a061b89a6d3a3fae5cb7e31dede52346c5b0 (patch)
tree: b73922c3a49053307c59c69f5a141cbfa1e0a486 /usr.bin
parent: 7dabb67cf11129fc5d1420c9e2228f8e873e14b4 (diff)
UTF-8 support: Implement -c and -n
and let -d accept a multibyte delimiter character. While here, simplify the code by switching from fgetln(3) to getline(3) and from hand-crafted string parsing to strstr(3) and strchr(3). OK tedu@ czarkoff@ zhuk@.
Diffstat (limited to 'usr.bin')
-rw-r--r-- usr.bin/cut/cut.1 22
-rw-r--r-- usr.bin/cut/cut.c 168
2 files changed, 119 insertions, 71 deletions
diff --git a/usr.bin/cut/cut.1 b/usr.bin/cut/cut.1
index b02ee555df1..1edeb04e0b7 100644
--- a/usr.bin/cut/cut.1
+++ b/usr.bin/cut/cut.1
@@ -1,4 +1,4 @@
-.\" $OpenBSD: cut.1,v 1.24 2014/07/10 14:11:56 jmc Exp $
+.\" $OpenBSD: cut.1,v 1.25 2015/12/02 00:56:46 schwarze Exp $
.\" $NetBSD: cut.1,v 1.6 1995/10/02 20:19:26 jtc Exp $
.\"
.\" Copyright (c) 1989, 1990, 1993
@@ -33,7 +33,7 @@
.\"
.\" @(#)cut.1 8.1 (Berkeley) 6/6/93
.\"
-.Dd $Mdocdate: July 10 2014 $
+.Dd $Mdocdate: December 2 2015 $
.Dt CUT 1
.Os
.Sh NAME
@@ -114,6 +114,8 @@ The selected fields are output,
separated by the field delimiter character.
.It Fl n
Do not split multi-byte characters.
+A character is written to standard output if and only if the byte
+position holding its last byte is selected.
.It Fl s
Suppresses lines with no field delimiter characters.
Unless specified, lines with no delimiters are passed through unmodified.
@@ -145,11 +147,19 @@ utility is compliant with the
.St -p1003.1-2008
specification.
.Sh CAVEATS
-The current implementation does not support multi-byte characters.
-Consequently
+The definition of a character depends on the current character set
+.Xr locale 1 .
+If
+.Ev LC_CTYPE
+is set to
+.Qq C
+or
+.Qq POSIX ,
.Fl c
does the same as
.Fl b ,
-and
.Fl n
-has no effect.
+has no effect, and
+.Fl d
+uses the first byte of
+.Ar delim .
diff --git a/usr.bin/cut/cut.c b/usr.bin/cut/cut.c
index 16d4e336719..8637bb4d95f 100644
--- a/usr.bin/cut/cut.c
+++ b/usr.bin/cut/cut.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: cut.c,v 1.22 2015/11/03 04:57:20 mmcc Exp $ */
+/* $OpenBSD: cut.c,v 1.23 2015/12/02 00:56:46 schwarze Exp $ */
/* $NetBSD: cut.c,v 1.9 1995/09/02 05:59:23 jtc Exp $ */
/*
@@ -33,6 +33,7 @@
* SUCH DAMAGE.
*/
+#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <errno.h>
@@ -43,12 +44,17 @@
#include <string.h>
#include <unistd.h>
+char dchar[5];
+int dlen;
+
+int bflag;
int cflag;
-char dchar;
int dflag;
int fflag;
+int nflag;
int sflag;
+void b_cut(FILE *, char *);
void c_cut(FILE *, char *);
void f_cut(FILE *, char *);
void get_list(char *);
@@ -61,37 +67,43 @@ main(int argc, char *argv[])
void (*fcn)(FILE *, char *);
int ch, rval;
- setlocale (LC_ALL, "");
+ setlocale(LC_CTYPE, "");
if (pledge("stdio rpath", NULL) == -1)
err(1, "pledge");
- dchar = '\t'; /* default delimiter is \t */
+ dchar[0] = '\t'; /* default delimiter */
+ dchar[1] = '\0';
+ dlen = 1;
- /* Since we don't support multi-byte characters, the -c and -b
- options are equivalent, and the -n option is meaningless. */
while ((ch = getopt(argc, argv, "b:c:d:f:sn")) != -1)
switch(ch) {
case 'b':
+ get_list(optarg);
+ bflag = 1;
+ break;
case 'c':
- fcn = c_cut;
get_list(optarg);
cflag = 1;
break;
case 'd':
- dchar = *optarg;
+ if ((dlen = mblen(optarg, MB_CUR_MAX)) == -1)
+ usage();
+ assert(dlen < sizeof(dchar));
+ (void)memcpy(dchar, optarg, dlen);
+ dchar[dlen] = '\0';
dflag = 1;
break;
case 'f':
get_list(optarg);
- fcn = f_cut;
fflag = 1;
break;
+ case 'n':
+ nflag = 1;
+ break;
case 's':
sflag = 1;
break;
- case 'n':
- break;
case '?':
default:
usage();
@@ -99,12 +111,21 @@ main(int argc, char *argv[])
argc -= optind;
argv += optind;
- if (fflag) {
- if (cflag)
- usage();
- } else if (!cflag || dflag || sflag)
+ if (bflag + cflag + fflag != 1 ||
+ (nflag && !bflag) ||
+ ((dflag || sflag) && !fflag))
usage();
+ if (MB_CUR_MAX == 1) {
+ nflag = 0;
+ if (cflag) {
+ bflag = 1;
+ cflag = 0;
+ }
+ }
+
+ fcn = fflag ? f_cut : (cflag || nflag) ? c_cut : b_cut;
+
rval = 0;
if (*argv)
for (; *argv; ++argv) {
@@ -192,7 +213,7 @@ get_list(char *list)
/* ARGSUSED */
void
-c_cut(FILE *fp, char *fname)
+b_cut(FILE *fp, char *fname)
{
int ch, col;
char *pos;
@@ -220,65 +241,82 @@ c_cut(FILE *fp, char *fname)
}
void
-f_cut(FILE *fp, char *fname)
+c_cut(FILE *fp, char *fname)
{
- int ch, field, isdelim;
- char *pos, *p, sep;
- int output;
- size_t len;
- char *lbuf, *tbuf;
+ static char *line = NULL;
+ static size_t linesz = 0;
+ ssize_t linelen;
+ char *cp, *pos, *maxpos;
+ int len;
- for (sep = dchar, tbuf = NULL; (lbuf = fgetln(fp, &len));) {
- output = 0;
- if (lbuf[len - 1] != '\n') {
- /* no newline at the end of the last line so add one */
- if ((tbuf = malloc(len + 1)) == NULL)
- err(1, NULL);
- memcpy(tbuf, lbuf, len);
- tbuf[len] = '\n';
- lbuf = tbuf;
- }
- for (isdelim = 0, p = lbuf;; ++p) {
- ch = *p;
- /* this should work if newline is delimiter */
- if (ch == sep)
- isdelim = 1;
- if (ch == '\n') {
- if (!isdelim && !sflag)
- (void)fwrite(lbuf, len, 1, stdout);
- break;
- }
+ while ((linelen = getline(&line, &linesz, fp)) != -1) {
+ if (line[linelen - 1] == '\n')
+ line[linelen - 1] = '\0';
+
+ cp = line;
+ pos = positions + 1;
+ maxpos = pos + maxval;
+ while(pos < maxpos && *cp != '\0') {
+ len = mblen(cp, MB_CUR_MAX);
+ if (len == -1)
+ len = 1;
+ pos += nflag ? len : 1;
+ if (pos[-1] == '\0')
+ cp += len;
+ else
+ while (len--)
+ putchar(*cp++);
}
- if (!isdelim)
+ if (autostop)
+ puts(cp);
+ else
+ putchar('\n');
+ }
+}
+
+void
+f_cut(FILE *fp, char *fname)
+{
+ static char *line = NULL;
+ static size_t linesz = 0;
+ ssize_t linelen;
+ char *sp, *ep, *pos, *maxpos;
+ int output;
+
+ while ((linelen = getline(&line, &linesz, fp)) != -1) {
+ if (line[linelen - 1] == '\n')
+ line[linelen - 1] = '\0';
+
+ if ((ep = strstr(line, dchar)) == NULL) {
+ if (!sflag)
+ puts(line);
continue;
+ }
pos = positions + 1;
- for (field = maxval, p = lbuf; field; --field, ++pos) {
- if (*pos) {
- if (output++)
- (void)putchar(sep);
- while ((ch = *p++) != '\n' && ch != sep)
- (void)putchar(ch);
- } else
- while ((ch = *p++) != '\n' && ch != sep)
- ;
- if (ch == '\n')
- break;
- }
- if (ch != '\n') {
- if (autostop) {
+ maxpos = pos + maxval;
+ output = 0;
+ sp = line;
+ for (;;) {
+ if (*pos++) {
if (output)
- (void)putchar(sep);
- for (; (ch = *p) != '\n'; ++p)
- (void)putchar(ch);
+ fputs(dchar, stdout);
+ while (sp < ep)
+ putchar(*sp++);
+ output = 1;
} else
- for (; (ch = *p) != '\n'; ++p)
- ;
+ sp = ep;
+ if (*sp == '\0' || pos == maxpos)
+ break;
+ sp += dlen;
+ if ((ep = strstr(sp, dchar)) == NULL)
+ ep = strchr(sp, '\0');
}
- (void)putchar('\n');
+ if (autostop)
+ puts(sp);
+ else
+ putchar('\n');
}
- if (tbuf)
- free(tbuf);
}
void