From 11df82023be3eadafdd1c5a52cf0a5d890c968f4 Mon Sep 17 00:00:00 2001
From: Ingo Schwarze
Date: Sat, 19 Dec 2015 10:21:02 +0000
Subject: UTF-8 support: Let -f recognize non-ASCII blank characters and let -s count characters rather than bytes.

OK zhuk@ bentley@
---
 usr.bin/uniq/uniq.1 | 12 ++++++++++--
 usr.bin/uniq/uniq.c | 35 ++++++++++++++++++++++++++++-------
 2 files changed, 38 insertions(+), 9 deletions(-)

(limited to 'usr.bin/uniq')

diff --git a/usr.bin/uniq/uniq.1 b/usr.bin/uniq/uniq.1
index 7858404e913..d7e275372e2 100644
--- a/usr.bin/uniq/uniq.1
+++ b/usr.bin/uniq/uniq.1
@@ -1,4 +1,4 @@
-.\" $OpenBSD: uniq.1,v 1.17 2010/09/03 11:09:29 jmc Exp $
+.\" $OpenBSD: uniq.1,v 1.18 2015/12/19 10:21:01 schwarze Exp $
 .\" $NetBSD: uniq.1,v 1.5 1994/12/06 07:51:15 jtc Exp $
 .\"
 .\" Copyright (c) 1991, 1993
@@ -33,7 +33,7 @@
 .\"
 .\" @(#)uniq.1 8.1 (Berkeley) 6/6/93
 .\"
-.Dd $Mdocdate: September 3 2010 $
+.Dd $Mdocdate: December 19 2015 $
 .Dt UNIQ 1
 .Os
 .Sh NAME
@@ -114,6 +114,14 @@ A file name of
 .Ql -
 denotes the standard input or the standard output
 .Pq depending on its position on the command line .
+.Sh ENVIRONMENT
+.Bl -tag -width LC_CTYPE
+.It Ev LC_CTYPE
+The character set
+.Xr locale 1 .
+Determines which groups of bytes are treated as characters
+and which characters are considered blank.
+.El
 .Sh EXIT STATUS
 .Ex -std uniq
 .Sh SEE ALSO
diff --git a/usr.bin/uniq/uniq.c b/usr.bin/uniq/uniq.c
index 43e462e1463..7e4d7b86a11 100644
--- a/usr.bin/uniq/uniq.c
+++ b/usr.bin/uniq/uniq.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: uniq.c,v 1.23 2015/11/02 20:25:42 mmcc Exp $ */
+/* $OpenBSD: uniq.c,v 1.24 2015/12/19 10:21:01 schwarze Exp $ */
 /* $NetBSD: uniq.c,v 1.7 1995/08/31 22:03:48 jtc Exp $ */
 
 /*
@@ -37,10 +37,13 @@
 #include <err.h>
 #include <errno.h>
 #include <limits.h>
+#include <locale.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
+#include <wchar.h>
+#include <wctype.h>
 
 #define MAXLINELEN (8 * 1024)
 
@@ -61,6 +64,8 @@ main(int argc, char *argv[])
 	int ch;
 	char *prevline, *thisline;
 
+	setlocale(LC_CTYPE, "");
+
 	if (pledge("stdio rpath wpath cpath", NULL) == -1)
 		err(1, "pledge");
 
@@ -176,16 +181,32 @@ show(FILE *ofp, char *str)
 char *
 skip(char *str)
 {
+	wchar_t wc;
 	int nchars, nfields;
+	int len;
+	int field_started;
 
 	for (nfields = numfields; nfields && *str; nfields--) {
-		while (isblank((unsigned char)*str))
-			str++;
-		while (*str && !isblank((unsigned char)*str))
-			str++;
+		/* Skip one field, including preceding blanks. */
+		for (field_started = 0; *str != '\0'; str += len) {
+			if ((len = mbtowc(&wc, str, MB_CUR_MAX)) == -1) {
+				(void)mbtowc(NULL, NULL, MB_CUR_MAX);
+				wc = L'?';
+				len = 1;
+			}
+			if (iswblank(wc)) {
+				if (field_started)
+					break;
+			} else
+				field_started = 1;
+		}
 	}
-	for (nchars = numchars; nchars-- && *str && *str != '\n'; ++str)
-		;
+
+	/* Skip some additional characters. */
+	for (nchars = numchars; nchars-- && *str != '\0'; str += len)
+		if ((len = mblen(str, MB_CUR_MAX)) == -1)
+			len = 1;
+
 	return (str);
 }
 
-- 
cgit v1.2.3