summary | refs | log | tree | commit | diff
path: root/usr.bin/fmt
diff options
context:
space:
mode:
author Ingo Schwarze <schwarze@cvs.openbsd.org> 2015-12-15 16:26:18 +0000
committer Ingo Schwarze <schwarze@cvs.openbsd.org> 2015-12-15 16:26:18 +0000
commit fab4481f0f9cdbd985fac855476970295c7a231e (patch)
tree 0ed33ff9bbc506cf2d13ff5dc8a1af46bd04d62d /usr.bin/fmt
parent d3aed62f977a1d496624d727366dee2f9ca0ca04 (diff)
UTF-8 support; does not yet handle the -c option.
No longer expand tabs up front in get_line(), their width depends on the width of characters earlier on the line. Always NUL-terminate the input buffer for easier and safer handling. Get rid of the hand-rolled output buffer, just let stdio do its work. OK tedu@
Diffstat (limited to 'usr.bin/fmt')
-rw-r--r-- usr.bin/fmt/fmt.1 17
-rw-r--r-- usr.bin/fmt/fmt.c 239
2 files changed, 130 insertions, 126 deletions
diff --git a/usr.bin/fmt/fmt.1 b/usr.bin/fmt/fmt.1
index 6df6ae8ed31..408d5c8e519 100644
--- a/usr.bin/fmt/fmt.1
+++ b/usr.bin/fmt/fmt.1
@@ -1,4 +1,4 @@
-.\" $OpenBSD: fmt.1,v 1.27 2015/09/11 19:20:18 schwarze Exp $
+.\" $OpenBSD: fmt.1,v 1.28 2015/12/15 16:26:17 schwarze Exp $
.\"
.\" Copyright (c) 1980, 1990, 1993
.\" The Regents of the University of California. All rights reserved.
@@ -29,7 +29,7 @@
.\"
.\" @(#)fmt.1 8.1 (Berkeley) 6/6/93
.\"
-.Dd $Mdocdate: September 11 2015 $
+.Dd $Mdocdate: December 15 2015 $
.Dt FMT 1
.Os
.Sh NAME
@@ -139,6 +139,19 @@ will reformat a paragraph,
evening the lines:
.Pp
.Dl !}fmt
+.Sh ENVIRONMENT
+.Bl -tag -width LC_CTYPE
+.It Ev LC_CTYPE
+The character set
+.Xr locale 1 .
+It is used to decide which byte sequences form characters and what
+their display width is.
+If it is unset or set to
+.Qq C ,
+.Qq POSIX ,
+or an unsupported value, each byte except the tab is assumed
+to represent a character of display width 1.
+.El
.Sh EXIT STATUS
.Ex -std
The latter happens with invalid options, insufficient memory,
diff --git a/usr.bin/fmt/fmt.c b/usr.bin/fmt/fmt.c
index 7bf480a1ade..7d2925e8d43 100644
--- a/usr.bin/fmt/fmt.c
+++ b/usr.bin/fmt/fmt.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: fmt.c,v 1.33 2015/10/09 01:37:07 deraadt Exp $ */
+/* $OpenBSD: fmt.c,v 1.34 2015/12/15 16:26:17 schwarze Exp $ */
/* Sensible version of fmt
*
@@ -176,6 +176,8 @@
#include <string.h>
#include <sysexits.h>
#include <unistd.h>
+#include <wchar.h>
+#include <wctype.h>
/* Something that, we hope, will never be a genuine line length,
* indentation etc.
@@ -222,7 +224,6 @@ static int grok_mail_headers = 0; /* treat embedded mail headers magically? */
static int format_troff = 0; /* Format troff? */
static int n_errors = 0; /* Number of failed files. */
-static char *output_buffer = NULL; /* Output line will be built here */
static size_t x; /* Horizontal position in output line */
static size_t x0; /* Ditto, ignoring leading whitespace */
static size_t pending_spaces; /* Spaces to add before next word */
@@ -232,17 +233,16 @@ static int output_in_paragraph = 0; /* Any of current para written out yet? */
static void process_named_file(const char *);
static void process_stream(FILE *, const char *);
-static size_t indent_length(const char *, size_t);
+static size_t indent_length(const char *);
static int might_be_header(const char *);
-static void new_paragraph(size_t, size_t);
-static void output_word(size_t, size_t, const char *, size_t, size_t);
+static void new_paragraph(size_t);
+static void output_word(size_t, size_t, const char *, int, int, int);
static void output_indent(size_t);
static void center_stream(FILE *, const char *);
-static char *get_line(FILE *, size_t *);
+static char *get_line(FILE *);
static void *xrealloc(void *, size_t);
void usage(void);
-#define XMALLOC(x) xrealloc(0, x)
#define ERRS(x) (x >= 127 ? 127 : ++x)
/* Here is perhaps the right place to mention that this code is
@@ -332,7 +332,6 @@ main(int argc, char *argv[])
goal_length = 65;
if (max_length == 0)
max_length = goal_length+10;
- output_buffer = XMALLOC(max_length+1); /* really needn't be longer */
/* 2. Process files. */
@@ -381,25 +380,31 @@ typedef enum {
static void
process_stream(FILE *stream, const char *name)
{
- size_t n;
+ const char *wordp, *cp;
+ wchar_t wc;
size_t np;
size_t last_indent = SILLY; /* how many spaces in last indent? */
size_t para_line_number = 0; /* how many lines already read in this para? */
size_t first_indent = SILLY; /* indentation of line 0 of paragraph */
+ int wcl; /* number of bytes in wide character */
+ int wcw; /* display width of wide character */
+ int word_length; /* number of bytes in word */
+ int word_width; /* display width of word */
+ int space_width; /* display width of space after word */
+ int line_width; /* display width of line */
HdrType prev_header_type = hdr_ParagraphStart;
HdrType header_type;
/* ^-- header_type of previous line; -1 at para start */
const char *line;
- size_t length;
if (centerP) {
center_stream(stream, name);
return;
}
- while ((line = get_line(stream, &length)) != NULL) {
- np = indent_length(line, length);
+ while ((line = get_line(stream)) != NULL) {
+ np = indent_length(line);
header_type = hdr_NonHeader;
if (grok_mail_headers && prev_header_type != hdr_NonHeader) {
if (np == 0 && might_be_header(line))
@@ -417,24 +422,24 @@ process_stream(FILE *stream, const char *name)
* AND the line isn't a mail header continuation line
* AND this isn't the second line of an indented paragraph.
*/
- if (length == 0 || (line[0] == '.' && !format_troff) ||
+ if (*line == '\0' || (*line == '.' && !format_troff) ||
header_type == hdr_Header ||
(header_type == hdr_NonHeader && prev_header_type > hdr_NonHeader) ||
(np != last_indent && header_type != hdr_Continuation &&
(!allow_indented_paragraphs || para_line_number != 1)) ) {
- new_paragraph(output_in_paragraph ? last_indent : first_indent, np);
+ new_paragraph(np);
para_line_number = 0;
first_indent = np;
last_indent = np;
/* nroff compatibility */
- if (length > 0 && line[0] == '.' && !format_troff) {
- printf("%.*s\n", (int)length, line);
+ if (*line == '.' && !format_troff) {
+ puts(line);
continue;
}
if (header_type == hdr_Header)
last_indent = 2; /* for cont. lines */
- if (length == 0) {
+ if (*line == '\0') {
putchar('\n');
prev_header_type = hdr_ParagraphStart;
continue;
@@ -448,24 +453,49 @@ process_stream(FILE *stream, const char *name)
prev_header_type = header_type;
}
- n = np;
- while (n < length) {
- /* Find word end and count spaces after it */
- size_t word_length = 0, space_length = 0;
- while (n+word_length < length && line[n+word_length] != ' ')
- ++word_length;
- space_length = word_length;
- while (n+space_length < length && line[n+space_length] == ' ')
- ++space_length;
+ line_width = np;
+ for (wordp = line; *wordp != '\0'; wordp = cp) {
+ word_length = 0;
+ word_width = space_width = 0;
+ for (cp = wordp; *cp != '\0'; cp += wcl) {
+ wcl = mbtowc(&wc, cp, MB_CUR_MAX);
+ if (wcl == -1) {
+ (void)mbtowc(NULL, NULL, MB_CUR_MAX);
+ wc = L'?';
+ wcl = 1;
+ wcw = 1;
+ } else if (wc == L'\t')
+ wcw = (line_width / tab_width + 1) *
+ tab_width - line_width;
+ else if ((wcw = wcwidth(wc)) == -1)
+ wcw = 1;
+ if (iswblank(wc)) {
+ /* Skip whitespace at start of line. */
+ if (word_length == 0) {
+ wordp += wcl;
+ continue;
+ }
+ /* Count whitespace after word. */
+ space_width += wcw;
+ } else {
+ /* Detect end of word. */
+ if (space_width > 0)
+ break;
+ /* Measure word. */
+ word_length += wcl;
+ word_width += wcw;
+ }
+ line_width += wcw;
+ }
+
/* Send the word to the output machinery. */
- output_word(first_indent, last_indent,
- line+n, word_length, space_length-word_length);
- n += space_length;
+ output_word(first_indent, last_indent, wordp,
+ word_length, word_width, space_width);
}
++para_line_number;
}
- new_paragraph(output_in_paragraph ? last_indent : first_indent, 0);
+ new_paragraph(0);
if (ferror(stream)) {
warn("%s", name);
ERRS(n_errors);
@@ -475,12 +505,23 @@ process_stream(FILE *stream, const char *name)
/* How long is the indent on this line?
*/
static size_t
-indent_length(const char *line, size_t length)
+indent_length(const char *line)
{
size_t n = 0;
- while (n < length && *line++ == ' ')
- ++n;
+ for (;;) {
+ switch(*line++) {
+ case ' ':
+ ++n;
+ continue;
+ case '\t':
+ n = (n / tab_width + 1) * tab_width;
+ continue;
+ default:
+ break;
+ }
+ break;
+ }
return n;
}
@@ -504,15 +545,11 @@ might_be_header(const char *line)
/* Begin a new paragraph with an indent of |indent| spaces.
*/
static void
-new_paragraph(size_t old_indent, size_t indent)
+new_paragraph(size_t indent)
{
- if (x0) {
- if (old_indent > 0)
- output_indent(old_indent);
- fwrite(output_buffer, 1, x0, stdout);
+ if (x0 > 0)
putchar('\n');
- }
x = indent;
x0 = 0;
pending_spaces = 0;
@@ -525,6 +562,8 @@ static void
output_indent(size_t n_spaces)
{
+ if (n_spaces == 0)
+ return;
if (output_tab_width) {
while (n_spaces >= output_tab_width) {
putchar('\t');
@@ -535,15 +574,15 @@ output_indent(size_t n_spaces)
putchar(' ');
}
-/* Output a single word, or add it to the buffer.
+/* Output a single word.
* indent0 and indent1 are the indents to use on the first and subsequent
* lines of a paragraph. They'll often be the same, of course.
*/
static void
-output_word(size_t indent0, size_t indent1, const char *word, size_t length, size_t spaces)
+output_word(size_t indent0, size_t indent1, const char *word,
+ int length, int width, int spaces)
{
- size_t new_x = x + pending_spaces + length;
- size_t indent = output_in_paragraph ? indent1 : indent0;
+ size_t new_x = x + pending_spaces + width;
/* If either |spaces==0| (at end of line) or |coalesce_spaces_P|
* (squashing internal whitespace), then add just one space;
@@ -553,54 +592,26 @@ output_word(size_t indent0, size_t indent1, const char *word, size_t length, siz
if (coalesce_spaces_P || spaces == 0)
spaces = strchr(sentence_enders, word[length-1]) ? 2 : 1;
- if (new_x <= goal_length) {
- /* After adding the word we still aren't at the goal length,
- * so clearly we add it to the buffer rather than outputing it.
- */
- memset(output_buffer+x0, ' ', pending_spaces);
+ if (x0 == 0)
+ output_indent(output_in_paragraph ? indent1 : indent0);
+ else if (new_x > max_length || x >= goal_length ||
+ (new_x > goal_length && new_x-goal_length > goal_length-x)) {
+ putchar('\n');
+ output_indent(indent1);
+ x0 = 0;
+ x = indent1;
+ } else {
x0 += pending_spaces;
x += pending_spaces;
- memcpy(output_buffer+x0, word, length);
- x0 += length;
- x += length;
- pending_spaces = spaces;
- } else {
- /* Adding the word takes us past the goal. Print the line-so-far,
- * and the word too iff either (1) the lsf is empty or (2) that
- * makes us nearer the goal but doesn't take us over the limit,
- * or (3) the word on its own takes us over the limit.
- * In case (3) we put a newline in between.
- */
- if (indent > 0)
- output_indent(indent);
- fwrite(output_buffer, 1, x0, stdout);
- if (x0 == 0 || (new_x <= max_length && new_x-goal_length <= goal_length-x)) {
- printf("%*s", (int)pending_spaces, "");
- goto write_out_word;
- } else {
- /* If the word takes us over the limit on its own, just
- * spit it out and don't bother buffering it.
- */
- if (indent+length > max_length) {
- putchar('\n');
- if (indent > 0)
- output_indent(indent);
-write_out_word:
- fwrite(word, 1, length, stdout);
- x0 = 0;
- x = indent1;
- pending_spaces = 0;
- } else {
- memcpy(output_buffer, word, length);
- x0 = length;
- x = length+indent1;
- pending_spaces = spaces;
- }
- }
-
- putchar('\n');
- output_in_paragraph = 1;
+ while (pending_spaces--)
+ putchar(' ');
}
+ x0 += width;
+ x += width;
+ while(length--)
+ putchar(*word++);
+ pending_spaces = spaces;
+ output_in_paragraph = 1;
}
/* Process a stream, but just center its lines rather than trying to
@@ -610,25 +621,17 @@ static void
center_stream(FILE *stream, const char *name)
{
char *line;
- size_t length;
size_t l;
- while ((line = get_line(stream, &length)) != 0) {
- l = length;
- while (l > 0 && isspace(*line)) {
+ while ((line = get_line(stream)) != NULL) {
+ while (isspace((unsigned char)*line))
++line;
- --l;
- }
-
- length = l;
-
+ l = strlen(line);
while (l < goal_length) {
putchar(' ');
l += 2;
}
-
- fwrite(line, 1, length, stdout);
- putchar('\n');
+ puts(line);
}
if (ferror(stream)) {
@@ -637,58 +640,46 @@ center_stream(FILE *stream, const char *name)
}
}
-/* Get a single line from a stream. Expand tabs, strip control
+/* Get a single line from a stream. Strip control
* characters and trailing whitespace, and handle backspaces.
- * Return the address of the buffer containing the line, and
- * put the length of the line in |lengthp|.
+ * Return the address of the buffer containing the line.
* This can cope with arbitrarily long lines, and with lines
* without terminating \n.
* If there are no characters left or an error happens, we
- * return 0.
- * Don't confuse |spaces_pending| here with the global
- * |pending_spaces|.
+ * return NULL.
*/
static char *
-get_line(FILE *stream, size_t *lengthp)
+get_line(FILE *stream)
{
int ch;
int troff = 0;
static char *buf = NULL;
static size_t length = 0;
size_t len = 0;
- size_t spaces_pending = 0;
if (buf == NULL) {
length = 100;
- buf = XMALLOC(length);
+ buf = xrealloc(NULL, length);
}
while ((ch = getc(stream)) != '\n' && ch != EOF) {
- if ((len + spaces_pending == 0) && (ch == '.' && !format_troff))
+ if ((len == 0) && (ch == '.' && !format_troff))
troff = 1;
- if (ch == ' ') {
- ++spaces_pending;
- } else if (troff || !iscntrl(ch)) {
- while (len + spaces_pending >= length) {
+ if (troff || ch == '\t' || !iscntrl(ch)) {
+ if (len >= length) {
length *= 2;
buf = xrealloc(buf, length);
}
-
- while (spaces_pending > 0) {
- --spaces_pending;
- buf[len++] = ' ';
- }
buf[len++] = ch;
- } else if (ch == '\t') {
- spaces_pending += tab_width - (len+spaces_pending)%tab_width;
} else if (ch == '\b') {
if (len)
--len;
}
}
-
- *lengthp = len;
- return (len > 0 || ch != EOF) ? buf : 0;
+ while (len > 0 && isspace((unsigned char)buf[len-1]))
+ --len;
+ buf[len] = '\0';
+ return (len > 0 || ch != EOF) ? buf : NULL;
}
/* (Re)allocate some memory, exiting with an error if we can't.