summary refs log tree commit diff
diff options
context:
space:
mode:
author Ingo Schwarze <schwarze@cvs.openbsd.org> 2016-05-23 10:31:43 +0000
committer Ingo Schwarze <schwarze@cvs.openbsd.org> 2016-05-23 10:31:43 +0000
commit 7927ad60d752f4ce8ac9654ffe9adcaaf2ea83ad (patch)
tree 8157fe84b44268a72c56d370bf6ff780973d90e6
parent fac140e6b81364cea22d53e9913d77ba0b0cb9fd (diff)
UTF-8 support.
Using feedback about bugs in earlier versions from Matthew Martin <phy1729 at gmail dot com> and from tsg@ who tested it with afl(1). OK czarkoff@ tsg@
-rw-r--r-- regress/usr.bin/fold/fold.sh 45
-rw-r--r-- usr.bin/fold/fold.1 55
-rw-r--r-- usr.bin/fold/fold.c 203
3 files changed, 195 insertions, 108 deletions
diff --git a/regress/usr.bin/fold/fold.sh b/regress/usr.bin/fold/fold.sh
index 393e8c7e6cc..091dd62d8de 100644
--- a/regress/usr.bin/fold/fold.sh
+++ b/regress/usr.bin/fold/fold.sh
@@ -14,11 +14,18 @@
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+FOLD=/usr/bin/fold
+
+# Arguments of the test function:
+# 1. command line arguments for fold(1)
+# 2. standard input for fold, backslash-encoded
+# 3. expected standard output, backslash-encoded
+# 4. expected standard output of "fold -b", backslash-encoded
+# (optional, by default the same as argument 3.)
test_fold()
{
expect=`echo -n "$3" ; echo .`
- if [ $SKIPUTF8 -eq 0 ]; then
- result=`echo -n "$2" | fold $1 2>&1 ; echo .`
+ result=`echo -n "$2" | $FOLD $1 2>&1 ; echo .`
if [ "$result" != "$expect" ]; then
echo "fold $1 \"$2\":"
echo -n "$2" | hexdump -C
@@ -28,9 +35,8 @@ test_fold()
echo -n "$result" | hexdump -C
exit 1
fi
- fi
[ -n "$4" ] && expect=`echo -n "$4" ; echo .`
- result=`echo -n "$2" | fold -b $1 2>&1 ; echo .`
+ result=`echo -n "$2" | $FOLD -b $1 2>&1 ; echo .`
if [ "$result" != "$expect" ]; then
echo "fold -b $1 \"$2\":"
echo -n "$2" | hexdump -C
@@ -44,17 +50,21 @@ test_fold()
export LC_ALL=C
-SKIPUTF8=0
-
test_fold "" "" ""
+
+# newline
test_fold "" "\n" "\n"
test_fold "" "\n\n" "\n\n"
test_fold "-w 1" "\n\n" "\n\n"
+test_fold "-w 2" "1\n12\n123" "1\n12\n12\n3"
+test_fold "-w 2" "12345" "12\n34\n5"
+test_fold "-w 2" "12345\n" "12\n34\n5\n"
# backspace
test_fold "-w 2" "123" "12\n3"
test_fold "-w 2" "1\b234" "1\b23\n4" "1\b\n23\n4"
test_fold "-w 2" "\b1234" "\b12\n34" "\b1\n23\n4"
+test_fold "-w 2" "12\b\b345" "12\b\b34\n5" "12\n\b\b\n34\n5"
test_fold "-w 2" "12\r3" "12\r3" "12\n\r3"
# tabulator
@@ -66,20 +76,35 @@ test_fold "-w 9" "1\t9\b\b89012" "1\t9\b\b89\n012" "1\t9\b\b8901\n2"
test_fold "-sw 4" "1 23 45" "1 \n23 \n45"
test_fold "-sw 3" "1234 56" "123\n4 \n56"
-export LC_ALL=en_US.UTF-8
-
# invalid characters
test_fold "-w 3" "1\037734" "1\03773\n4"
test_fold "-w 3" "1\000734" "1\00073\n4"
+test_fold "-w 3" "1\000034" "1\00003\n4"
-SKIPUTF8=1
+export LC_ALL=en_US.UTF-8
# double width characters
test_fold "-w 4" "1\0343\0201\020145" "1\0343\0201\02014\n5" \
"1\0343\0201\0201\n45"
+test_fold "-w 3" "\0343\0201\0201\0343\0201\020134" \
+ "\0343\0201\0201\n\0343\0201\02013\n4" \
+ "\0343\0201\0201\n\0343\0201\0201\n34"
+test_fold "-w 2" "\0343\0201\0201\b23" "\0343\0201\0201\b2\n3" \
+ "\0343\0201\0201\n\b2\n3"
+test_fold "-w 1" "1\0343\0201\02014" "1\n\0343\0201\0201\n4"
# zero width characters
-test_fold "-w 3" "1a\0314\020034" "1a\0314\02003\n4" "1a\0314\n\020034"
+test_fold "-w 3" "1a\0314\020034" "1a\0314\02003\n4" "1a\n\0314\02003\n4"
test_fold "-w 2" "1a\0314\02003" "1a\0314\0200\n3" "1a\n\0314\0200\n3"
+# four byte UTF-8 encoding
+test_fold "-w 3" "1\0360\0220\0200\020034" "1\0360\0220\0200\02003\n4" \
+ "1\n\0360\0220\0200\0200\n34"
+
+# invalid UTF-8
+test_fold "-w 3" "\0343\0201\0201\0201\0201\0201\0201\0201\n" \
+ "\0343\0201\0201\0201\n\0201\0201\0201\n\0201\n" \
+ "\0343\0201\0201\n\0201\0201\0201\n\0201\0201\n"
+test_fold "-w 2" "\0343\0343\0201\0201\n" "\0343\n\0343\0201\0201\n"
+
exit 0
diff --git a/usr.bin/fold/fold.1 b/usr.bin/fold/fold.1
index 98d149a595f..ab6e09027fc 100644
--- a/usr.bin/fold/fold.1
+++ b/usr.bin/fold/fold.1
@@ -1,4 +1,4 @@
-.\" $OpenBSD: fold.1,v 1.17 2016/01/05 12:44:55 schwarze Exp $
+.\" $OpenBSD: fold.1,v 1.18 2016/05/23 10:31:42 schwarze Exp $
.\" $NetBSD: fold.1,v 1.5 1995/09/01 01:42:42 jtc Exp $
.\"
.\" Copyright (c) 1980, 1993
@@ -30,7 +30,7 @@
.\"
.\" @(#)fold.1 8.1 (Berkeley) 6/6/93
.\"
-.Dd $Mdocdate: January 5 2016 $
+.Dd $Mdocdate: May 23 2016 $
.Dt FOLD 1
.Os
.Sh NAME
@@ -48,7 +48,7 @@ or the standard input if no files are specified,
breaking the lines to have a maximum of 80 display columns.
.Pp
The options are as follows:
-.Bl -tag -width Ds
+.Bl -tag -width 8n
.It Fl b
Count
.Ar width
@@ -62,10 +62,31 @@ possible.
.It Fl w Ar width
Specifies a line width to use instead of the default of 80.
.El
+.Pp
+Unless
+.Fl b
+is specified, a backspace character decrements the column position
+by one, a carriage return resets the column position to zero, and
+a tab advances the column position to the next multiple of eight.
+.Sh ENVIRONMENT
+.Bl -tag -width 8n
+.It Ev LC_CTYPE
+The character set
+.Xr locale 1 .
+It is used to decide which byte sequences form characters and what
+their display width is.
+If it is unset or set to
+.Qq C ,
+.Qq POSIX ,
+or an unsupported value, each byte except backspace, tab, newline,
+and carriage return is assumed to represent a character of display
+width 1.
+.El
.Sh EXIT STATUS
.Ex -std fold
.Sh SEE ALSO
-.Xr expand 1
+.Xr expand 1 ,
+.Xr fmt 1
.Sh STANDARDS
The
.Nm
@@ -100,15 +121,17 @@ rewrote the command in 1990, and
.An J. T. Conklin
added the missing options in 1993.
.Sh BUGS
-If underlining (see
-.Xr ul 1 )
-is present it may be messed up by folding.
-.Pp
-.Ar width
-should be a multiple of 8 if tabs are present, or the tabs should
-be expanded using
-.Xr expand 1
-before using
-.Nm fold .
-.Pp
-Multibyte character support is missing.
+Traditional
+.Xr roff 7
+output semantics, implemented both by GNU nroff and by
+.Xr mandoc 1 ,
+only uses a single backspace for backing up the previous character,
+even for double-width characters.
+The
+.Nm
+backspace semantics required by POSIX mishandles such backspace-encoded
+sequences, breaking lines early.
+The
+.Xr fmt 1
+utility provides similar functionality and does not suffer from that
+problem, but isn't standardized by POSIX.
diff --git a/usr.bin/fold/fold.c b/usr.bin/fold/fold.c
index 90d4ed592a9..cdb6e99cb4a 100644
--- a/usr.bin/fold/fold.c
+++ b/usr.bin/fold/fold.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: fold.c,v 1.17 2015/10/09 01:37:07 deraadt Exp $ */
+/* $OpenBSD: fold.c,v 1.18 2016/05/23 10:31:42 schwarze Exp $ */
/* $NetBSD: fold.c,v 1.6 1995/09/01 01:42:44 jtc Exp $ */
/*-
@@ -33,19 +33,22 @@
* SUCH DAMAGE.
*/
+#include <ctype.h>
+#include <err.h>
+#include <limits.h>
+#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
-#include <ctype.h>
-#include <err.h>
-#include <limits.h>
+#include <wchar.h>
#define DEFLINEWIDTH 80
static void fold(unsigned int);
-static unsigned int new_column_position(unsigned int, int);
+static int isu8cont(unsigned char);
static __dead void usage(void);
+
int count_bytes = 0;
int split_words = 0;
@@ -56,6 +59,8 @@ main(int argc, char *argv[])
unsigned int width;
const char *errstr;
+ setlocale(LC_CTYPE, "");
+
if (pledge("stdio rpath", NULL) == -1)
err(1, "pledge");
@@ -110,12 +115,11 @@ main(int argc, char *argv[])
for (; *argv; ++argv) {
if (!freopen(*argv, "r", stdin))
err(1, "%s", *argv);
- /* NOTREACHED */
else
fold(width);
}
}
- exit(0);
+ return 0;
}
/*
@@ -130,100 +134,135 @@ main(int argc, char *argv[])
* returns embedded in the input stream.
*/
static void
-fold(unsigned int width)
+fold(unsigned int max_width)
{
- static char *buf = NULL;
- static int buf_max = 0;
- int ch;
- unsigned int col, indx;
-
- col = indx = 0;
- while ((ch = getchar()) != EOF) {
- if (ch == '\n') {
- if (indx != 0)
- fwrite(buf, 1, indx, stdout);
- putchar('\n');
- col = indx = 0;
- continue;
- }
+ static char *buf = NULL;
+ static size_t bufsz = 2048;
+ char *cp; /* Current mb character. */
+ char *np; /* Next mb character. */
+ char *sp; /* To search for the last space. */
+ char *nbuf; /* For buffer reallocation. */
+ wchar_t wc; /* Current wide character. */
+ int ch; /* Last byte read. */
+ int len; /* Bytes in the current mb character. */
+ unsigned int col; /* Current display position. */
+ int width; /* Display width of wc. */
+
+ if (buf == NULL && (buf = malloc(bufsz)) == NULL)
+ err(1, NULL);
- col = new_column_position(col, ch);
- if (col > width) {
- unsigned int i, last_space;
+ np = cp = buf;
+ ch = 0;
+ col = 0;
- if (split_words) {
- for (i = 0, last_space = -1; i < indx; i++)
- if(buf[i] == ' ')
- last_space = i;
+ while (ch != EOF) { /* Loop on input characters. */
+ while ((ch = getchar()) != EOF) { /* Loop on input bytes. */
+ if (np + 1 == buf + bufsz) {
+ nbuf = reallocarray(buf, 2, bufsz);
+ if (nbuf == NULL)
+ err(1, NULL);
+ bufsz *= 2;
+ cp = nbuf + (cp - buf);
+ np = nbuf + (np - buf);
+ buf = nbuf;
}
+ *np++ = ch;
- if (split_words && last_space != -1) {
- last_space++;
+ /*
+ * Read up to and including the first byte of
+ * the next character, such that we are sure
+ * to have a complete character in the buffer.
+ * There is no need to read more than five bytes
+ * ahead, since UTF-8 characters are four bytes
+ * long at most.
+ */
- fwrite(buf, 1, last_space, stdout);
- memmove(buf, buf+last_space, indx-last_space);
+ if (np - cp > 4 || (np - cp > 1 && !isu8cont(ch)))
+ break;
+ }
+
+ while (cp < np) { /* Loop on output characters. */
+
+ /* Handle end of line and backspace. */
- indx -= last_space;
+ if (*cp == '\n' || (*cp == '\r' && !count_bytes)) {
+ fwrite(buf, 1, ++cp - buf, stdout);
+ memmove(buf, cp, np - cp);
+ np = buf + (np - cp);
+ cp = buf;
col = 0;
- for (i = 0; i < indx; i++) {
- col = new_column_position(col, buf[i]);
- }
- } else {
- fwrite(buf, 1, indx, stdout);
- col = indx = 0;
+ continue;
+ }
+ if (*cp == '\b' && !count_bytes) {
+ if (col)
+ col--;
+ cp++;
+ continue;
}
- putchar('\n');
- /* calculate the column position for the next line. */
- col = new_column_position(col, ch);
- }
+ /*
+ * Measure display width.
+ * Process the last byte only if
+ * end of file was reached.
+ */
+
+ if (np - cp > (ch != EOF)) {
+ len = 1;
+ width = 1;
- if (indx + 1 > buf_max) {
- int newmax = buf_max + 2048;
- char *newbuf;
+ if (*cp == '\t') {
+ if (count_bytes == 0)
+ width = 8 - (col & 7);
+ } else if ((len = mbtowc(&wc, cp,
+ np - cp)) < 1)
+ len = 1;
+ else if (count_bytes)
+ width = len;
+ else if ((width = wcwidth(wc)) < 0)
+ width = 1;
- /* Allocate buffer in LINE_MAX increments */
- if ((newbuf = realloc(buf, newmax)) == NULL) {
- err(1, NULL);
- /* NOTREACHED */
+ col += width;
+ if (col <= max_width || cp == buf) {
+ cp += len;
+ continue;
+ }
}
- buf = newbuf;
- buf_max = newmax;
- }
- buf[indx++] = ch;
- }
- if (indx != 0)
- fwrite(buf, 1, indx, stdout);
-}
+ /* Line break required. */
+
+ if (col > max_width) {
+ if (split_words) {
+ for (sp = cp; sp > buf; sp--) {
+ if (sp[-1] == ' ') {
+ cp = sp;
+ break;
+ }
+ }
+ }
+ fwrite(buf, 1, cp - buf, stdout);
+ putchar('\n');
+ memmove(buf, cp, np - cp);
+ np = buf + (np - cp);
+ cp = buf;
+ col = 0;
+ continue;
+ }
+
+ /* Need more input. */
-/*
- * calculate the column position
- */
-static unsigned int
-new_column_position(unsigned int col, int ch)
-{
- if (!count_bytes) {
- switch (ch) {
- case '\b':
- if (col > 0)
- --col;
- break;
- case '\r':
- col = 0;
- break;
- case '\t':
- col = (col + 8) & ~7;
- break;
- default:
- ++col;
break;
}
- } else {
- ++col;
}
+ fwrite(buf, 1, np - buf, stdout);
+
+ if (ferror(stdin))
+ err(1, NULL);
+}
- return col;
+static int
+isu8cont(unsigned char c)
+{
+ return MB_CUR_MAX > 1 && (c & (0x80 | 0x40)) == 0x80;
}
static __dead void