diff options
author | Ingo Schwarze <schwarze@cvs.openbsd.org> | 2016-05-23 10:31:43 +0000 |
---|---|---|
committer | Ingo Schwarze <schwarze@cvs.openbsd.org> | 2016-05-23 10:31:43 +0000 |
commit | 7927ad60d752f4ce8ac9654ffe9adcaaf2ea83ad (patch) | |
tree | 8157fe84b44268a72c56d370bf6ff780973d90e6 | |
parent | fac140e6b81364cea22d53e9913d77ba0b0cb9fd (diff) | |
UTF-8 support.
Using feedback about bugs in earlier versions from Matthew Martin
<phy1729 at gmail dot com> and from tsg@ who tested it with afl(1).
OK czarkoff@ tsg@
-rw-r--r-- | regress/usr.bin/fold/fold.sh | 45 |
-rw-r--r-- | usr.bin/fold/fold.1 | 55 |
-rw-r--r-- | usr.bin/fold/fold.c | 203 |
3 files changed, 195 insertions, 108 deletions
diff --git a/regress/usr.bin/fold/fold.sh b/regress/usr.bin/fold/fold.sh index 393e8c7e6cc..091dd62d8de 100644 --- a/regress/usr.bin/fold/fold.sh +++ b/regress/usr.bin/fold/fold.sh @@ -14,11 +14,18 @@ # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +FOLD=/usr/bin/fold + +# Arguments of the test function: +# 1. command line arguments for fold(1) +# 2. standard input for fold, backslash-encoded +# 3. expected standard output, backslash-encoded +# 4. expected standard output of "fold -b", backslash-encoded +# (optional, by default the same as argument 3.) test_fold() { expect=`echo -n "$3" ; echo .` - if [ $SKIPUTF8 -eq 0 ]; then - result=`echo -n "$2" | fold $1 2>&1 ; echo .` + result=`echo -n "$2" | $FOLD $1 2>&1 ; echo .` if [ "$result" != "$expect" ]; then echo "fold $1 \"$2\":" echo -n "$2" | hexdump -C @@ -28,9 +35,8 @@ test_fold() echo -n "$result" | hexdump -C exit 1 fi - fi [ -n "$4" ] && expect=`echo -n "$4" ; echo .` - result=`echo -n "$2" | fold -b $1 2>&1 ; echo .` + result=`echo -n "$2" | $FOLD -b $1 2>&1 ; echo .` if [ "$result" != "$expect" ]; then echo "fold -b $1 \"$2\":" echo -n "$2" | hexdump -C @@ -44,17 +50,21 @@ test_fold() export LC_ALL=C -SKIPUTF8=0 - test_fold "" "" "" + +# newline test_fold "" "\n" "\n" test_fold "" "\n\n" "\n\n" test_fold "-w 1" "\n\n" "\n\n" +test_fold "-w 2" "1\n12\n123" "1\n12\n12\n3" +test_fold "-w 2" "12345" "12\n34\n5" +test_fold "-w 2" "12345\n" "12\n34\n5\n" # backspace test_fold "-w 2" "123" "12\n3" test_fold "-w 2" "1\b234" "1\b23\n4" "1\b\n23\n4" test_fold "-w 2" "\b1234" "\b12\n34" "\b1\n23\n4" +test_fold "-w 2" "12\b\b345" "12\b\b34\n5" "12\n\b\b\n34\n5" test_fold "-w 2" "12\r3" "12\r3" "12\n\r3" # tabulator @@ -66,20 +76,35 @@ test_fold "-w 9" "1\t9\b\b89012" "1\t9\b\b89\n012" "1\t9\b\b8901\n2" test_fold "-sw 4" "1 23 45" "1 \n23 \n45" test_fold "-sw 3" "1234 56" "123\n4 \n56" -export LC_ALL=en_US.UTF-8 - # invalid 
characters test_fold "-w 3" "1\037734" "1\03773\n4" test_fold "-w 3" "1\000734" "1\00073\n4" +test_fold "-w 3" "1\000034" "1\00003\n4" -SKIPUTF8=1 +export LC_ALL=en_US.UTF-8 # double width characters test_fold "-w 4" "1\0343\0201\020145" "1\0343\0201\02014\n5" \ "1\0343\0201\0201\n45" +test_fold "-w 3" "\0343\0201\0201\0343\0201\020134" \ + "\0343\0201\0201\n\0343\0201\02013\n4" \ + "\0343\0201\0201\n\0343\0201\0201\n34" +test_fold "-w 2" "\0343\0201\0201\b23" "\0343\0201\0201\b2\n3" \ + "\0343\0201\0201\n\b2\n3" +test_fold "-w 1" "1\0343\0201\02014" "1\n\0343\0201\0201\n4" # zero width characters -test_fold "-w 3" "1a\0314\020034" "1a\0314\02003\n4" "1a\0314\n\020034" +test_fold "-w 3" "1a\0314\020034" "1a\0314\02003\n4" "1a\n\0314\02003\n4" test_fold "-w 2" "1a\0314\02003" "1a\0314\0200\n3" "1a\n\0314\0200\n3" +# four byte UTF-8 encoding +test_fold "-w 3" "1\0360\0220\0200\020034" "1\0360\0220\0200\02003\n4" \ + "1\n\0360\0220\0200\0200\n34" + +# invalid UTF-8 +test_fold "-w 3" "\0343\0201\0201\0201\0201\0201\0201\0201\n" \ + "\0343\0201\0201\0201\n\0201\0201\0201\n\0201\n" \ + "\0343\0201\0201\n\0201\0201\0201\n\0201\0201\n" +test_fold "-w 2" "\0343\0343\0201\0201\n" "\0343\n\0343\0201\0201\n" + exit 0 diff --git a/usr.bin/fold/fold.1 b/usr.bin/fold/fold.1 index 98d149a595f..ab6e09027fc 100644 --- a/usr.bin/fold/fold.1 +++ b/usr.bin/fold/fold.1 @@ -1,4 +1,4 @@ -.\" $OpenBSD: fold.1,v 1.17 2016/01/05 12:44:55 schwarze Exp $ +.\" $OpenBSD: fold.1,v 1.18 2016/05/23 10:31:42 schwarze Exp $ .\" $NetBSD: fold.1,v 1.5 1995/09/01 01:42:42 jtc Exp $ .\" .\" Copyright (c) 1980, 1993 @@ -30,7 +30,7 @@ .\" .\" @(#)fold.1 8.1 (Berkeley) 6/6/93 .\" -.Dd $Mdocdate: January 5 2016 $ +.Dd $Mdocdate: May 23 2016 $ .Dt FOLD 1 .Os .Sh NAME @@ -48,7 +48,7 @@ or the standard input if no files are specified, breaking the lines to have a maximum of 80 display columns. 
.Pp The options are as follows: -.Bl -tag -width Ds +.Bl -tag -width 8n .It Fl b Count .Ar width @@ -62,10 +62,31 @@ possible. .It Fl w Ar width Specifies a line width to use instead of the default of 80. .El +.Pp +Unless +.Fl b +is specified, a backspace character decrements the column position +by one, a carriage return resets the column position to zero, and +a tab advances the column position to the next multiple of eight. +.Sh ENVIRONMENT +.Bl -tag -width 8n +.It Ev LC_CTYPE +The character set +.Xr locale 1 . +It is used to decide which byte sequences form characters and what +their display width is. +If it is unset or set to +.Qq C , +.Qq POSIX , +or an unsupported value, each byte except backspace, tab, newline, +and carriage return is assumed to represent a character of display +width 1. +.El .Sh EXIT STATUS .Ex -std fold .Sh SEE ALSO -.Xr expand 1 +.Xr expand 1 , +.Xr fmt 1 .Sh STANDARDS The .Nm @@ -100,15 +121,17 @@ rewrote the command in 1990, and .An J. T. Conklin added the missing options in 1993. .Sh BUGS -If underlining (see -.Xr ul 1 ) -is present it may be messed up by folding. -.Pp -.Ar width -should be a multiple of 8 if tabs are present, or the tabs should -be expanded using -.Xr expand 1 -before using -.Nm fold . -.Pp -Multibyte character support is missing. +Traditional +.Xr roff 7 +output semantics, implemented both by GNU nroff and by +.Xr mandoc 1 , +only uses a single backspace for backing up the previous character, +even for double-width characters. +The +.Nm +backspace semantics required by POSIX mishandles such backspace-encoded +sequences, breaking lines early. +The +.Xr fmt 1 +utility provides similar functionality and does not suffer from that +problem, but isn't standardized by POSIX. 
diff --git a/usr.bin/fold/fold.c b/usr.bin/fold/fold.c index 90d4ed592a9..cdb6e99cb4a 100644 --- a/usr.bin/fold/fold.c +++ b/usr.bin/fold/fold.c @@ -1,4 +1,4 @@ -/* $OpenBSD: fold.c,v 1.17 2015/10/09 01:37:07 deraadt Exp $ */ +/* $OpenBSD: fold.c,v 1.18 2016/05/23 10:31:42 schwarze Exp $ */ /* $NetBSD: fold.c,v 1.6 1995/09/01 01:42:44 jtc Exp $ */ /*- @@ -33,19 +33,22 @@ * SUCH DAMAGE. */ +#include <ctype.h> +#include <err.h> +#include <limits.h> +#include <locale.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> -#include <ctype.h> -#include <err.h> -#include <limits.h> +#include <wchar.h> #define DEFLINEWIDTH 80 static void fold(unsigned int); -static unsigned int new_column_position(unsigned int, int); +static int isu8cont(unsigned char); static __dead void usage(void); + int count_bytes = 0; int split_words = 0; @@ -56,6 +59,8 @@ main(int argc, char *argv[]) unsigned int width; const char *errstr; + setlocale(LC_CTYPE, ""); + if (pledge("stdio rpath", NULL) == -1) err(1, "pledge"); @@ -110,12 +115,11 @@ main(int argc, char *argv[]) for (; *argv; ++argv) { if (!freopen(*argv, "r", stdin)) err(1, "%s", *argv); - /* NOTREACHED */ else fold(width); } } - exit(0); + return 0; } /* @@ -130,100 +134,135 @@ main(int argc, char *argv[]) * returns embedded in the input stream. */ static void -fold(unsigned int width) +fold(unsigned int max_width) { - static char *buf = NULL; - static int buf_max = 0; - int ch; - unsigned int col, indx; - - col = indx = 0; - while ((ch = getchar()) != EOF) { - if (ch == '\n') { - if (indx != 0) - fwrite(buf, 1, indx, stdout); - putchar('\n'); - col = indx = 0; - continue; - } + static char *buf = NULL; + static size_t bufsz = 2048; + char *cp; /* Current mb character. */ + char *np; /* Next mb character. */ + char *sp; /* To search for the last space. */ + char *nbuf; /* For buffer reallocation. */ + wchar_t wc; /* Current wide character. */ + int ch; /* Last byte read. 
*/ + int len; /* Bytes in the current mb character. */ + unsigned int col; /* Current display position. */ + int width; /* Display width of wc. */ + + if (buf == NULL && (buf = malloc(bufsz)) == NULL) + err(1, NULL); - col = new_column_position(col, ch); - if (col > width) { - unsigned int i, last_space; + np = cp = buf; + ch = 0; + col = 0; - if (split_words) { - for (i = 0, last_space = -1; i < indx; i++) - if(buf[i] == ' ') - last_space = i; + while (ch != EOF) { /* Loop on input characters. */ + while ((ch = getchar()) != EOF) { /* Loop on input bytes. */ + if (np + 1 == buf + bufsz) { + nbuf = reallocarray(buf, 2, bufsz); + if (nbuf == NULL) + err(1, NULL); + bufsz *= 2; + cp = nbuf + (cp - buf); + np = nbuf + (np - buf); + buf = nbuf; } + *np++ = ch; - if (split_words && last_space != -1) { - last_space++; + /* + * Read up to and including the first byte of + * the next character, such that we are sure + * to have a complete character in the buffer. + * There is no need to read more than five bytes + * ahead, since UTF-8 characters are four bytes + * long at most. + */ - fwrite(buf, 1, last_space, stdout); - memmove(buf, buf+last_space, indx-last_space); + if (np - cp > 4 || (np - cp > 1 && !isu8cont(ch))) + break; + } + + while (cp < np) { /* Loop on output characters. */ + + /* Handle end of line and backspace. */ - indx -= last_space; + if (*cp == '\n' || (*cp == '\r' && !count_bytes)) { + fwrite(buf, 1, ++cp - buf, stdout); + memmove(buf, cp, np - cp); + np = buf + (np - cp); + cp = buf; col = 0; - for (i = 0; i < indx; i++) { - col = new_column_position(col, buf[i]); - } - } else { - fwrite(buf, 1, indx, stdout); - col = indx = 0; + continue; + } + if (*cp == '\b' && !count_bytes) { + if (col) + col--; + cp++; + continue; } - putchar('\n'); - /* calculate the column position for the next line. */ - col = new_column_position(col, ch); - } + /* + * Measure display width. + * Process the last byte only if + * end of file was reached. 
+ */ + + if (np - cp > (ch != EOF)) { + len = 1; + width = 1; - if (indx + 1 > buf_max) { - int newmax = buf_max + 2048; - char *newbuf; + if (*cp == '\t') { + if (count_bytes == 0) + width = 8 - (col & 7); + } else if ((len = mbtowc(&wc, cp, + np - cp)) < 1) + len = 1; + else if (count_bytes) + width = len; + else if ((width = wcwidth(wc)) < 0) + width = 1; - /* Allocate buffer in LINE_MAX increments */ - if ((newbuf = realloc(buf, newmax)) == NULL) { - err(1, NULL); - /* NOTREACHED */ + col += width; + if (col <= max_width || cp == buf) { + cp += len; + continue; + } } - buf = newbuf; - buf_max = newmax; - } - buf[indx++] = ch; - } - if (indx != 0) - fwrite(buf, 1, indx, stdout); -} + /* Line break required. */ + + if (col > max_width) { + if (split_words) { + for (sp = cp; sp > buf; sp--) { + if (sp[-1] == ' ') { + cp = sp; + break; + } + } + } + fwrite(buf, 1, cp - buf, stdout); + putchar('\n'); + memmove(buf, cp, np - cp); + np = buf + (np - cp); + cp = buf; + col = 0; + continue; + } + + /* Need more input. */ -/* - * calculate the column position - */ -static unsigned int -new_column_position(unsigned int col, int ch) -{ - if (!count_bytes) { - switch (ch) { - case '\b': - if (col > 0) - --col; - break; - case '\r': - col = 0; - break; - case '\t': - col = (col + 8) & ~7; - break; - default: - ++col; break; } - } else { - ++col; } + fwrite(buf, 1, np - buf, stdout); + + if (ferror(stdin)) + err(1, NULL); +} - return col; +static int +isu8cont(unsigned char c) +{ + return MB_CUR_MAX > 1 && (c & (0x80 | 0x40)) == 0x80; } static __dead void |