summary refs log tree commit diff
path: root/usr.bin/wc
diff options
context:
space:
mode:
Diffstat (limited to 'usr.bin/wc')
-rw-r--r-- usr.bin/wc/wc.1 31
-rw-r--r-- usr.bin/wc/wc.c 99
2 files changed, 87 insertions, 43 deletions
diff --git a/usr.bin/wc/wc.1 b/usr.bin/wc/wc.1
index 2e9525e3caa..afd78b00567 100644
--- a/usr.bin/wc/wc.1
+++ b/usr.bin/wc/wc.1
@@ -1,4 +1,4 @@
-.\" $OpenBSD: wc.1,v 1.25 2015/04/21 10:46:48 schwarze Exp $
+.\" $OpenBSD: wc.1,v 1.26 2015/12/08 01:00:44 schwarze Exp $
.\"
.\" Copyright (c) 1991, 1993
.\" The Regents of the University of California. All rights reserved.
@@ -32,7 +32,7 @@
.\"
.\" from: @(#)wc.1 8.2 (Berkeley) 4/19/94
.\"
-.Dd $Mdocdate: April 21 2015 $
+.Dd $Mdocdate: December 8 2015 $
.Dt WC 1
.Os
.Sh NAME
@@ -72,9 +72,10 @@ using powers of 2 for sizes (K=1024, M=1048576, etc.).
The number of lines in each input file
is written to the standard output.
.It Fl m
-Intended to count characters instead of bytes;
-currently an alias for
-.Fl c .
+Count characters instead of bytes, and use
+.Xr iswspace 3
+instead of
+.Xr isspace 3 .
.It Fl w
The number of words in each input file
is written to the standard output.
@@ -102,6 +103,20 @@ lines words bytes file_name
The counts for lines, words, and bytes
.Pq or characters
are integers separated by spaces.
+.Sh ENVIRONMENT
+.Bl -tag -width LC_CTYPE
+.It Ev LC_CTYPE
+The character set
+.Xr locale 1 ,
+defining which byte sequences form characters.
+If unset or set to
+.Qq C ,
+.Qq POSIX ,
+or an unsupported value,
+.Fl m
+has the same effect as
+.Fl c .
+.El
.Sh EXIT STATUS
.Ex -std wc
.Sh SEE ALSO
@@ -111,7 +126,7 @@ The
.Nm
utility is compliant with the
.St -p1003.1-2008
-specification, except that it ignores the locale.
+specification.
.Pp
The flag
.Op Fl h
@@ -121,7 +136,3 @@ A
.Nm
utility appeared in
.At v1 .
-.Sh BUGS
-The
-.Fl m
-option counts bytes instead of characters.
diff --git a/usr.bin/wc/wc.c b/usr.bin/wc/wc.c
index cfeb9e4196d..5813f44a73d 100644
--- a/usr.bin/wc/wc.c
+++ b/usr.bin/wc/wc.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: wc.c,v 1.19 2015/10/09 01:37:09 deraadt Exp $ */
+/* $OpenBSD: wc.c,v 1.20 2015/12/08 01:00:45 schwarze Exp $ */
/*
* Copyright (c) 1980, 1987, 1991, 1993
@@ -40,9 +40,11 @@
#include <err.h>
#include <unistd.h>
#include <util.h>
+#include <wchar.h>
+#include <wctype.h>
int64_t tlinect, twordct, tcharct;
-int doline, doword, dochar, humanchar;
+int doline, doword, dochar, humanchar, multibyte;
int rval;
extern char *__progname;
@@ -55,7 +57,7 @@ main(int argc, char *argv[])
{
int ch;
- setlocale(LC_ALL, "");
+ setlocale(LC_CTYPE, "");
if (pledge("stdio rpath", NULL) == -1)
err(1, "pledge");
@@ -68,8 +70,11 @@ main(int argc, char *argv[])
case 'w':
doword = 1;
break;
- case 'c':
case 'm':
+ if (MB_CUR_MAX > 1)
+ multibyte = 1;
+ /* FALLTHROUGH */
+ case 'c':
dochar = 1;
break;
case 'h':
@@ -112,15 +117,20 @@ main(int argc, char *argv[])
void
cnt(char *file)
{
- u_char *C;
+ static char *buf;
+ static ssize_t bufsz;
+
+ FILE *stream;
+ char *C;
+ wchar_t wc;
short gotsp;
- int len;
+ ssize_t len;
int64_t linect, wordct, charct;
struct stat sbuf;
int fd;
- u_char buf[MAXBSIZE];
linect = wordct = charct = 0;
+ stream = NULL;
if (file) {
if ((fd = open(file, O_RDONLY, 0)) < 0) {
warn("%s", file);
@@ -131,7 +141,10 @@ cnt(char *file)
fd = STDIN_FILENO;
}
- if (!doword) {
+ if (!doword && !multibyte) {
+ if (bufsz < MAXBSIZE &&
+ (buf = realloc(buf, MAXBSIZE)) == NULL)
+ err(1, NULL);
/*
* Line counting is split out because it's a lot
* faster to get lines than to get words, since
@@ -178,37 +191,57 @@ cnt(char *file)
}
}
} else {
- /* Do it the hard way... */
+ if (file == NULL)
+ stream = stdin;
+ else if ((stream = fdopen(fd, "r")) == NULL) {
+ warn("%s", file);
+ close(fd);
+ rval = 1;
+ return;
+ }
+
+ /*
+ * Do it the hard way.
+ * According to POSIX, a word is a "maximal string of
+ * characters delimited by whitespace." Nothing is said
+ * about a character being printing or non-printing.
+ */
gotsp = 1;
- while ((len = read(fd, buf, MAXBSIZE)) > 0) {
- /*
- * This loses in the presence of multi-byte characters.
- * To do it right would require a function to return a
- * character while knowing how many bytes it consumed.
- */
- charct += len;
- for (C = buf; len--; ++C) {
- if (isspace(*C)) {
- gotsp = 1;
- if (*C == '\n')
- ++linect;
- } else {
- /*
- * This line implements the POSIX
- * spec, i.e. a word is a "maximal
- * string of characters delimited by
- * whitespace." Notice nothing was
- * said about a character being
- * printing or non-printing.
- */
- if (gotsp) {
+ while ((len = getline(&buf, &bufsz, stream)) > 0) {
+ if (multibyte) {
+ for (C = buf; *C != '\0'; C += len) {
+ ++charct;
+ len = mbtowc(&wc, C, MB_CUR_MAX);
+ if (len == -1) {
+ (void)mbtowc(NULL, NULL,
+ MB_CUR_MAX);
+ len = 1;
+ wc = L' ';
+ }
+ if (iswspace(wc)) {
+ gotsp = 1;
+ if (wc == L'\n')
+ ++linect;
+ } else if (gotsp) {
+ gotsp = 0;
+ ++wordct;
+ }
+ }
+ } else {
+ charct += len;
+ for (C = buf; *C != '\0'; ++C) {
+ if (isspace((unsigned char)*C)) {
+ gotsp = 1;
+ if (*C == '\n')
+ ++linect;
+ } else if (gotsp) {
gotsp = 0;
++wordct;
}
}
}
}
- if (len == -1) {
+ if (ferror(stream)) {
warn("%s", file);
rval = 1;
}
@@ -224,7 +257,7 @@ cnt(char *file)
twordct += wordct;
tcharct += charct;
- if (close(fd) != 0) {
+ if ((stream == NULL ? close(fd) : fclose(stream)) != 0) {
warn("%s", file);
rval = 1;
}