diff options
Diffstat (limited to 'usr.bin/wc')
-rw-r--r-- | usr.bin/wc/wc.1 | 31 | ||||
-rw-r--r-- | usr.bin/wc/wc.c | 99 |
2 files changed, 87 insertions, 43 deletions
diff --git a/usr.bin/wc/wc.1 b/usr.bin/wc/wc.1 index 2e9525e3caa..afd78b00567 100644 --- a/usr.bin/wc/wc.1 +++ b/usr.bin/wc/wc.1 @@ -1,4 +1,4 @@ -.\" $OpenBSD: wc.1,v 1.25 2015/04/21 10:46:48 schwarze Exp $ +.\" $OpenBSD: wc.1,v 1.26 2015/12/08 01:00:44 schwarze Exp $ .\" .\" Copyright (c) 1991, 1993 .\" The Regents of the University of California. All rights reserved. @@ -32,7 +32,7 @@ .\" .\" from: @(#)wc.1 8.2 (Berkeley) 4/19/94 .\" -.Dd $Mdocdate: April 21 2015 $ +.Dd $Mdocdate: December 8 2015 $ .Dt WC 1 .Os .Sh NAME @@ -72,9 +72,10 @@ using powers of 2 for sizes (K=1024, M=1048576, etc.). The number of lines in each input file is written to the standard output. .It Fl m -Intended to count characters instead of bytes; -currently an alias for -.Fl c . +Count characters instead of bytes, and use +.Xr iswspace 3 +instead of +.Xr isspace 3 . .It Fl w The number of words in each input file is written to the standard output. @@ -102,6 +103,20 @@ lines words bytes file_name The counts for lines, words, and bytes .Pq or characters are integers separated by spaces. +.Sh ENVIRONMENT +.Bl -tag -width LC_CTYPE +.It Ev LC_CTYPE +The character set +.Xr locale 1 , +defining which byte sequences form characters. +If unset or set to +.Qq C , +.Qq POSIX , +or an unsupported value, +.Fl m +has the same effect as +.Fl c . +.El .Sh EXIT STATUS .Ex -std wc .Sh SEE ALSO @@ -111,7 +126,7 @@ The .Nm utility is compliant with the .St -p1003.1-2008 -specification, except that it ignores the locale. +specification. .Pp The flag .Op Fl h @@ -121,7 +136,3 @@ A .Nm utility appeared in .At v1 . -.Sh BUGS -The -.Fl m -option counts bytes instead of characters. diff --git a/usr.bin/wc/wc.c b/usr.bin/wc/wc.c index cfeb9e4196d..5813f44a73d 100644 --- a/usr.bin/wc/wc.c +++ b/usr.bin/wc/wc.c @@ -1,4 +1,4 @@ -/* $OpenBSD: wc.c,v 1.19 2015/10/09 01:37:09 deraadt Exp $ */ +/* $OpenBSD: wc.c,v 1.20 2015/12/08 01:00:45 schwarze Exp $ */ /* * Copyright (c) 1980, 1987, 1991, 1993 @@ -40,9 +40,11 @@ #include <err.h> #include <unistd.h> #include <util.h> +#include <wchar.h> +#include <wctype.h> int64_t tlinect, twordct, tcharct; -int doline, doword, dochar, humanchar; +int doline, doword, dochar, humanchar, multibyte; int rval; extern char *__progname; @@ -55,7 +57,7 @@ main(int argc, char *argv[]) { int ch; - setlocale(LC_ALL, ""); + setlocale(LC_CTYPE, ""); if (pledge("stdio rpath", NULL) == -1) err(1, "pledge"); @@ -68,8 +70,11 @@ main(int argc, char *argv[]) case 'w': doword = 1; break; - case 'c': case 'm': + if (MB_CUR_MAX > 1) + multibyte = 1; + /* FALLTHROUGH */ + case 'c': dochar = 1; break; case 'h': @@ -112,15 +117,20 @@ main(int argc, char *argv[]) void cnt(char *file) { - u_char *C; + static char *buf; + static ssize_t bufsz; + + FILE *stream; + char *C; + wchar_t wc; short gotsp; - int len; + ssize_t len; int64_t linect, wordct, charct; struct stat sbuf; int fd; - u_char buf[MAXBSIZE]; linect = wordct = charct = 0; + stream = NULL; if (file) { if ((fd = open(file, O_RDONLY, 0)) < 0) { warn("%s", file); @@ -131,7 +141,10 @@ cnt(char *file) fd = STDIN_FILENO; } - if (!doword) { + if (!doword && !multibyte) { + if (bufsz < MAXBSIZE && + (buf = realloc(buf, MAXBSIZE)) == NULL) + err(1, NULL); /* * Line counting is split out because it's a lot * faster to get lines than to get words, since @@ -178,37 +191,57 @@ cnt(char *file) } } } else { - /* Do it the hard way... */ + if (file == NULL) + stream = stdin; + else if ((stream = fdopen(fd, "r")) == NULL) { + warn("%s", file); + close(fd); + rval = 1; + return; + } + + /* + * Do it the hard way. + * According to POSIX, a word is a "maximal string of + * characters delimited by whitespace." Nothing is said + * about a character being printing or non-printing. + */ gotsp = 1; - while ((len = read(fd, buf, MAXBSIZE)) > 0) { - /* - * This loses in the presence of multi-byte characters. - * To do it right would require a function to return a - * character while knowing how many bytes it consumed. - */ - charct += len; - for (C = buf; len--; ++C) { - if (isspace(*C)) { - gotsp = 1; - if (*C == '\n') - ++linect; - } else { - /* - * This line implements the POSIX - * spec, i.e. a word is a "maximal - * string of characters delimited by - * whitespace." Notice nothing was - * said about a character being - * printing or non-printing. - */ - if (gotsp) { + while ((len = getline(&buf, &bufsz, stream)) > 0) { + if (multibyte) { + for (C = buf; *C != '\0'; C += len) { + ++charct; + len = mbtowc(&wc, C, MB_CUR_MAX); + if (len == -1) { + (void)mbtowc(NULL, NULL, + MB_CUR_MAX); + len = 1; + wc = L' '; + } + if (iswspace(wc)) { + gotsp = 1; + if (wc == L'\n') + ++linect; + } else if (gotsp) { + gotsp = 0; + ++wordct; + } + } + } else { + charct += len; + for (C = buf; *C != '\0'; ++C) { + if (isspace((unsigned char)*C)) { + gotsp = 1; + if (*C == '\n') + ++linect; + } else if (gotsp) { gotsp = 0; ++wordct; } } } } - if (len == -1) { + if (ferror(stream)) { warn("%s", file); rval = 1; } @@ -224,7 +257,7 @@ cnt(char *file) twordct += wordct; tcharct += charct; - if (close(fd) != 0) { + if ((stream == NULL ? close(fd) : fclose(stream)) != 0) { warn("%s", file); rval = 1; } |