/* usr.bin/fmt/fmt.c */

/*	$OpenBSD: fmt.c,v 1.11 1998/04/25 23:07:16 millert Exp $	*/

/* Sensible version of fmt
 *
 * Syntax: fmt [ options ] [ goal [ max ] ] [ filename ... ]
 *
 * Since the documentation for the original fmt is so poor, here
 * is an accurate description of what this one does. It's usually
 * the same. The *mechanism* used may differ from that suggested
 * here. Note that we are *not* entirely compatible with fmt,
 * because fmt gets so many things wrong.
 *
 * 1. Tabs are expanded, assuming 8-space tab stops.
 *    If the `-t <n>' option is given, we assume <n>-space
 *    tab stops instead.
 *    Trailing blanks are removed from all lines.
 *    x\b == nothing, for any x other than \b.
 *    Other control characters are simply stripped. This
 *    includes \r.
 * 2. Each line is split into leading whitespace and
 *    everything else. Maximal consecutive sequences of
 *    lines with the same leading whitespace are considered
 *    to form paragraphs, except that a blank line is always
 *    a paragraph to itself.
 *    If the `-p' option is given then the first line of a
 *    paragraph is permitted to have indentation different
 *    from that of the other lines.
 *    If the `-m' option is given then a line that looks
 *    like a mail message header, if it is not immediately
 *    preceded by a non-blank non-message-header line, is
 *    taken to start a new paragraph, which also contains
 *    any subsequent lines with non-empty leading whitespace.
 * 3. The "everything else" is split into words; a word
 *    includes its trailing whitespace, and a word at the
 *    end of a line is deemed to be followed by a single
 *    space, or two spaces if it ends with a sentence-end
 *    character. (See the `-d' option for how to change that.)
 *    If the `-s' option has been given, then a word's trailing
 *    whitespace is replaced by what it would have had if it
 *    had occurred at end of line.
 * 4. Each paragraph is sent to standard output as follows.
 *    We output the leading whitespace, and then enough words
 *    to make the line length as near as possible to the goal
 *    without exceeding the maximum. (If a single word would
 *    exceed the maximum, we output that anyway.) Of course
 *    the trailing whitespace of the last word is ignored.
 *    We then emit a newline and start again if there are any
 *    words left.
 *    Note that for a blank line this translates as "We emit
 *    a newline".
 *    If the `-l <n>' option is given, then leading whitespace
 *    is modified slightly: <n> spaces are replaced by a tab.
 *    Indented paragraphs (see above under `-p') make matters
 *    more complicated than this suggests. Actually every paragraph
 *    has two `leading whitespace' values; the value for the first
 *    line, and the value for the most recent line. (While processing
 *    the first line, the two are equal. When `-p' has not been
 *    given, they are always equal.) The leading whitespace
 *    actually output is that of the first line (for the first
 *    line of *output*) or that of the most recent line (for
 *    all other lines of output).
 *    When `-m' has been given, message header paragraphs are
 *    taken as having first-leading-whitespace empty and
 *    subsequent-leading-whitespace two spaces.
 *
 * Multiple input files are formatted one at a time, so that a file
 * never ends in the middle of a line.
 *
 * There's an alternative mode of operation, invoked by giving
 * the `-c' option. In that case we just center every line,
 * and most of the other options are ignored. This should
 * really be in a separate program, but we must stay compatible
 * with old `fmt'.
 *
 * QUERY: Should `-m' also try to do the right thing with quoted text?
 * QUERY: `-b' to treat backslashed whitespace as old `fmt' does?
 * QUERY: Option meaning `never join lines'?
 * QUERY: Option meaning `split in mid-word to avoid overlong lines'?
 * (Those last two might not be useful, since we have `fold'.)
 *
 * Differences from old `fmt':
 *
 *   - We have many more options. Options that aren't understood
 *     generate a lengthy usage message, rather than being
 *     treated as filenames.
 *   - Even with `-m', our handling of message headers is
 *     significantly different. (And much better.)
 *   - We don't treat `\ ' as non-word-breaking.
 *   - Downward changes of indentation start new paragraphs
 *     for us, as well as upward. (I think old `fmt' behaves
 *     in the way it does in order to allow indented paragraphs,
 *     but this is a broken way of making indented paragraphs
 *     behave right.)
 *   - Given the choice of going over or under |goal_length|
 *     by the same amount, we go over; old `fmt' goes under.
 *   - We treat `?' as ending a sentence, and not `:'. Old `fmt'
 *     does the reverse.
 *   - We return approved return codes. Old `fmt' returns
 *     1 for some errors, and *the number of unopenable files*
 *     when that was all that went wrong.
 *   - We have fewer crashes and more helpful error messages.
 *   - We don't turn spaces into tabs at starts of lines unless
 *     specifically requested.
 *   - New `fmt' is somewhat smaller and slightly faster than
 *     old `fmt'.
 *
 * Bugs:
 *
 *   None known. There probably are some, though.
 *
 * Portability:
 *
 *   I believe this code to be pretty portable. It does require
 *   that you have `getopt'. If you need to include "getopt.h"
 *   for this (e.g., if your system didn't come with `getopt'
 *   and you installed it yourself) then you should arrange for
 *   NEED_getopt_h to be #defined.
 *
 *   Everything here should work OK even on nasty 16-bit
 *   machines and nice 64-bit ones. However, it's only really
 *   been tested on my FreeBSD machine. Your mileage may vary.
 */

/* Copyright (c) 1997 Gareth McCaughan. All rights reserved.
 *
 * Redistribution and use of this code, in source or binary forms,
 * with or without modification, are permitted subject to the following
 * conditions:
 *
 *  - Redistribution of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 *  - If you distribute modified source code it must also include
 *    a notice saying that it has been modified, and giving a brief
 *    description of what changes have been made.
 *
 * Disclaimer: I am not responsible for the results of using this code.
 *             If it formats your hard disc, sends obscene messages to
 *             your boss and kills your children then that's your problem
 *             not mine. I give absolutely no warranty of any sort as to
 *             what the program will do, and absolutely refuse to be held
 *             liable for any consequences of your using it.
 *             Thank you. Have a nice day.
 */

/* RCS change log:
 * Revision 1.5  1998/03/02 18:02:21  gjm11
 * Minor changes for portability.
 *
 * Revision 1.4  1997/10/01 11:51:28  gjm11
 * Repair broken indented-paragraph handling.
 * Add mail message header stuff.
 * Improve comments and layout.
 * Make usable with non-BSD systems.
 * Add revision display to usage message.
 *
 * Revision 1.3  1997/09/30 16:24:47  gjm11
 * Add copyright notice, rcsid string and log message.
 *
 * Revision 1.2  1997/09/30 16:13:39  gjm11
 * Add options: -d <chars>, -l <width>, -p, -s, -t <width>, -h .
 * Parse options with `getopt'. Clean up code generally.
 * Make comments more accurate.
 *
 * Revision 1.1  1997/09/30 11:29:57  gjm11
 * Initial revision
 */

/* Revision and copyright identification strings.
 * Nothing in this file reads them; they are left out entirely when
 * `lint' is defined.
 */
#ifndef lint
static const char rcsid[] =
  "$OpenBSD: fmt.c,v 1.11 1998/04/25 23:07:16 millert Exp $";
static const char copyright[] =
  "Copyright (c) 1997 Gareth McCaughan. All rights reserved.\n";
#endif /* not lint */

/* Cater for BSD and non-BSD systems.
 * I hate the C preprocessor.
 *
 * HAVE_errx     - <err.h> supplies errx(); otherwise a two-argument
 *                 macro stands in for it (message only, no formatting).
 * HAVE_sysexits - <sysexits.h> supplies the EX_* exit codes; otherwise
 *                 every one of them is plain 1.
 */

#undef HAVE_errx
#undef HAVE_sysexits

#ifdef unix
# include <sys/param.h>
# ifdef BSD
#  define HAVE_errx
#  if BSD >= 199306
#   define HAVE_sysexits
#  endif
# endif
#endif

#ifdef HAVE_errx
# include <err.h>
#else
/* Wrapped in do/while(0) so that the macro behaves as one statement,
 * e.g. in `if (bad) errx(...); else ...'.
 */
# define errx(rc,str) \
    do { fprintf(stderr,"fmt: %s\n",(str)); exit(rc); } while (0)
#endif

#ifdef HAVE_sysexits
# include <sysexits.h>
#else
# define EX_USAGE 1
# define EX_NOINPUT 1
# define EX_SOFTWARE 1
# define EX_OSERR 1
#endif

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#ifdef NEED_getopt_h
# include "getopt.h"
#endif

/* Something that, we hope, will never be a genuine line length,
 * indentation etc.
 * It is the largest value a size_t can hold; process_stream() uses it
 * as the initial "no indent seen yet" value.
 */
#define SILLY ((size_t)-1)

/* Parse |s| as a strictly positive integer (|strtol| with base 0, so
 * decimal, 0x... hex and 0... octal are all accepted).
 *
 * I used to use |strtoul| for this, but (1) not all systems have it
 * and (2) it's probably better to use |strtol| to detect negative
 * numbers better.
 *
 * If |s| is not entirely a number: with |fussyP| nonzero we exit via
 * |errx|, with |fussyP==0| we quietly return 0 instead.
 * If |s| is a number but is zero or negative we always exit via |errx|.
 * |err_mess| is handed to |errx| as its format, so it must not contain
 * any `%' directives.
 */
size_t get_positive(const char *s, const char *err_mess, int fussyP) {
  char *end;
  long value = strtol(s,&end,0);
  int trailing_junk = (*end != '\0');

  if (trailing_junk && !fussyP) return 0;
  if (trailing_junk || value<=0) {
    errx(EX_USAGE, err_mess);
  }
  return (size_t) value;
}

/* Just for the sake of linguistic purity:
 * CENTER is the verb spliced into the usage message printed by main();
 * define BRITISH to get the other spelling.
 */

#ifdef BRITISH
# define CENTER "centre"
#else
# define CENTER "center"
#endif

/* Global variables.
 * The first group holds settings that main() fills in from the command
 * line; the second group is running state shared by the output routines.
 */

static int centerP=0;		/* -c: center lines instead of formatting? */
static size_t goal_length=0;	/* Target length for output lines */
static size_t max_length=0;	/* Maximum length for output lines */
static int coalesce_spaces_P=0;	/* -s: coalesce multiple whitespace -> ' ' ? */
static int allow_indented_paragraphs=0;	/* -p: can first line have diff. ind.? */
static int tab_width=8;		/* -t: number of spaces per tab stop on input */
static int output_tab_width=0;	/* -l: ditto, when squashing leading spaces; 0 = never emit tabs */
static char *sentence_enders=".?!";	/* -d: double-space after these */
static int grok_mail_headers=0;	/* -m: treat embedded mail headers magically? */

static int n_errors=0;		/* Files that failed to open or read; nonzero makes main() return EX_NOINPUT */
static char *output_buffer=0;	/* Output line (without its indent) is built here */
static size_t x;		/* Horizontal position in output line, indent included */
static size_t x0;		/* Ditto, ignoring leading whitespace; also the byte count held in output_buffer */
static size_t pending_spaces;	/* Spaces to add before next word */
static int output_in_paragraph=0;	/* Any of current para written out yet? */

/* Prototypes.
 * Everything declared here is file-local and is defined further down,
 * after main().
 */

static void process_named_file (const char *);
static void     process_stream (FILE *, const char *);
static size_t    indent_length (const char *, size_t);
static int     might_be_header (const char *);
static void      new_paragraph (size_t, size_t);
static void        output_word (size_t, size_t, const char *, size_t, size_t);
static void      output_indent (size_t);
static void      center_stream (FILE *, const char *);
static char *         get_line (FILE *, size_t *);
static void *         xrealloc (void *, size_t);

/* A fresh allocation is just a reallocation of the null pointer, so it
 * gets xrealloc()'s exit-on-failure behaviour.
 */
#define XMALLOC(x) xrealloc(0,x)

/* Here is perhaps the right place to mention that this code is
 * all in top-down order. Hence, |main| comes first.
 *
 * |main| parses the options, then the optional `goal' and `maximum'
 * line lengths, and finally formats each named file in turn (or
 * standard input when no file is named).
 * Returns 0 on success and EX_NOINPUT if any file could not be opened
 * or read. Bad usage exits with EX_USAGE; `-h' prints the usage text
 * and exits with 0.
 */
int
main(int argc, char *argv[]) {
  int ch;			/* used for |getopt| processing */

  /* 1. Grok parameters. */

  while ((ch = getopt(argc, argv, "cd:hl:mpst:")) != -1) switch(ch) {
    case 'c':
      centerP = 1;
      continue;
    case 'd':
      sentence_enders = XMALLOC(strlen(optarg)+1);
      strcpy(sentence_enders, optarg);
      continue;
    case 'l':
      output_tab_width
        = get_positive(optarg, "output tab width must be positive", 1);
      continue;
    case 'm':
      grok_mail_headers = 1;
      continue;
    case 'p':
      allow_indented_paragraphs = 1;
      continue;
    case 's':
      coalesce_spaces_P = 1;
      continue;
    case 't':
      tab_width = get_positive(optarg, "tab width must be positive", 1);
      continue;
    case 'h': default:
      fprintf(stderr,
"Usage:   fmt [-cmps] [-d chars] [-l num] [-t num] [goal [maximum]] [file...]\n"
"Options: -c     " CENTER " each line instead of formatting\n"
"         -d <chars> double-space after <chars> at line end\n"
"         -l <n> turn each <n> spaces at start of line into a tab\n"
"         -m     try to make sure mail header lines stay separate\n"
"         -p     allow indented paragraphs\n"
"         -s     coalesce whitespace inside lines\n"
"         -t <n> have tabs every <n> columns\n");
      exit(ch=='h' ? 0 : EX_USAGE);
  }
  argc -= optind; argv += optind;

  /* [ goal [ maximum ] ]
   * |get_positive| is called non-fussily here: an argument that is not
   * a number yields 0 and is left in place to be treated as a filename.
   */

  if (argc>0
      && (goal_length=get_positive(*argv,"goal length must be positive", 0))
         != 0) {
    --argc; ++argv;
    if (argc>0
        && (max_length=get_positive(*argv,"max length must be positive", 0))
           != 0) {
      /* Consume the maximum too, so it isn't later opened as a file. */
      --argc; ++argv;
      if (max_length<goal_length)
        errx(EX_USAGE, "max length must be >= goal length");
    }
  }
  if (goal_length==0) goal_length = 65;
  if (max_length==0) max_length = goal_length+10;
  output_buffer = XMALLOC(max_length+1);	/* really needn't be longer */

  /* 2. Process files. */

  if (argc>0) {
    while (argc-->0) process_named_file(*argv++);
  }
  else {
    process_stream(stdin, "standard input");
  }

  /* We're done. */

  return n_errors ? EX_NOINPUT : 0;

}

/* Open the file called |name| for reading and run it through
 * |process_stream|. If it can't be opened, report that with |perror|
 * and count it in |n_errors|; nothing else happens in that case.
 */
static void
process_named_file(const char *name) {
  FILE *f;

  f = fopen(name, "r");
  if (f == NULL) {
    perror(name);
    ++n_errors;
    return;
  }
  process_stream(f, name);
  fclose(f);
}

/* Types of mail header continuation lines:
 */
typedef enum {
  hdr_ParagraphStart = -1,
  hdr_NonHeader      = 0,
  hdr_Header         = 1,
  hdr_Continuation   = 2
} HdrType;

/* Process a stream. This is where the real work happens,
 * except that centering is handled separately.
 */
static void
process_stream(FILE *stream, const char *name) {
  size_t last_indent=SILLY;	/* how many spaces in last indent? */
  size_t para_line_number=0;	/* how many lines already read in this para? */
  size_t first_indent=SILLY;	/* indentation of line 0 of paragraph */
  HdrType prev_header_type=hdr_ParagraphStart;
	/* ^-- header_type of previous line; -1 at para start */
  char *line;
  size_t length;

  if (centerP) { center_stream(stream, name); return; }
  while ((line=get_line(stream,&length)) != NULL) {
    size_t np=indent_length(line, length);
    { HdrType header_type=hdr_NonHeader;
      if (grok_mail_headers && prev_header_type!=hdr_NonHeader) {
        if (np==0 && might_be_header(line))
          header_type = hdr_Header;
        else if (np>0 && prev_header_type>hdr_NonHeader)
          header_type = hdr_Continuation;
      }
      /* We need a new paragraph if and only if:
       *   this line is blank,
       *   OR it's a mail header,
       *   OR it's not a mail header AND the last line was one,
       *   OR the indentation has changed
       *      AND the line isn't a mail header continuation line
       *      AND this isn't the second line of an indented paragraph.
       */
      if ( length==0
           || header_type==hdr_Header
           || (header_type==hdr_NonHeader && prev_header_type>hdr_NonHeader)
           || (np!=last_indent
               && header_type != hdr_Continuation
               && (!allow_indented_paragraphs || para_line_number != 1)) ) {
        new_paragraph(output_in_paragraph ? last_indent : first_indent, np);
        para_line_number = 0;
        first_indent = np;
        last_indent = np;
        if (header_type==hdr_Header) last_indent=2;	/* for cont. lines */
        if (length==0) {
          putchar('\n');
          prev_header_type=hdr_ParagraphStart;
          continue;
        }
      }
      else {
        /* If this is an indented paragraph other than a mail header
         * continuation, set |last_indent|.
         */
        if (np != last_indent && header_type != hdr_Continuation)
          last_indent=np;
      }
      prev_header_type = header_type;
    }

    { size_t n=np;
      while (n<length) {
        /* Find word end and count spaces after it */
        size_t word_length=0, space_length=0;
        while (n+word_length < length && line[n+word_length] != ' ')
          ++word_length;
        space_length = word_length;
        while (n+space_length < length && line[n+space_length] == ' ')
          ++space_length;
        /* Send the word to the output machinery. */
        output_word(first_indent, last_indent,
                    line+n, word_length, space_length-word_length);
        n += space_length;
      }
    }
    ++para_line_number;
  }
  new_paragraph(output_in_paragraph ? last_indent : first_indent, 0);
  if (ferror(stream)) { perror(name); ++n_errors; }
}

/* Count the spaces at the start of |line|, looking at no more than
 * its first |length| characters.
 */
static size_t
indent_length(const char *line, size_t length) {
  size_t count;
  for (count=0; count<length; ++count) {
    if (line[count] != ' ') break;
  }
  return count;
}

/* Might this line be a mail header?
 * We deem a line to be a possible header if it matches the
 * Perl regexp /^[A-Z][-A-Za-z0-9]*:\s/. This is *not* the same
 * as the header syntax in the mail RFCs; we want to be gratuitously
 * conservative to avoid mangling ordinary civilised text.
 *
 * |line| is scanned as a C string (no length is passed in), so it
 * must be NUL-terminated. Returns 1 for a possible header, else 0.
 */
static int
might_be_header(const char *line) {
  /* The <ctype.h> functions are only defined for values representable
   * as unsigned char (or EOF), so never hand them a plain char.
   */
  const unsigned char *p = (const unsigned char *)line;

  if (!isupper(*p)) return 0;
  ++p;
  while (*p && (isalnum(*p) || *p=='-')) ++p;
  return (*p==':' && isspace(p[1]));
}

/* Finish the current paragraph and get ready for the next one.
 * Any partial line still sitting in |output_buffer| is written out,
 * preceded by |old_indent| spaces' worth of indentation. The output
 * state is then reset so that the next paragraph starts at column
 * |indent|.
 */
static void
new_paragraph(size_t old_indent, size_t indent) {
  if (x0 != 0) {
    if (old_indent != 0) output_indent(old_indent);
    fwrite(output_buffer, 1, x0, stdout);
    putchar('\n');
  }
  output_in_paragraph = 0;
  pending_spaces = 0;
  x0 = 0;
  x = indent;
}

/* Write |n_spaces| columns of leading indentation to standard output.
 * When |output_tab_width| is nonzero, each full group of that many
 * spaces is written as one tab; whatever is left over is written as
 * spaces.
 */
static void
output_indent(size_t n_spaces) {
  size_t i;

  if (output_tab_width) {
    for (; n_spaces >= output_tab_width; n_spaces -= output_tab_width)
      putchar('\t');
  }
  for (i=0; i<n_spaces; ++i)
    putchar(' ');
}

/* Output a single word, or add it to the buffer.
 * indent0 and indent1 are the indents to use on the first and subsequent
 * lines of a paragraph. They'll often be the same, of course.
 * |word| points at |length| characters (|length| must be at least 1;
 * the last character is inspected) and was followed by |spaces| spaces
 * in the input, 0 meaning it ended its line.
 */
static void
output_word(size_t indent0, size_t indent1, const char *word, size_t length, size_t spaces) {
  size_t new_x = x+pending_spaces+length;
  size_t indent = output_in_paragraph ? indent1 : indent0;
  size_t i;

  /* If either |spaces==0| (at end of line) or |coalesce_spaces_P|
   * (squashing internal whitespace), then add just one space;
   * except that if the last character was a sentence-ender we
   * actually add two spaces.
   */
  if (coalesce_spaces_P || spaces==0)
    spaces = strchr(sentence_enders, word[length-1]) ? 2 : 1;

  if (new_x<=goal_length) {
    /* After adding the word we still aren't past the goal length,
     * so clearly we add it to the buffer rather than outputting it.
     */
    memset(output_buffer+x0, ' ', pending_spaces);
    x0 += pending_spaces; x += pending_spaces;
    memcpy(output_buffer+x0, word, length);
    x0 += length; x += length;
    pending_spaces = spaces;
    return;
  }

  /* Adding the word takes us past the goal. Print the line-so-far,
   * and the word too iff either (1) the lsf is empty or (2) that
   * makes us nearer the goal but doesn't take us over the limit,
   * or (3) the word on its own takes us over the limit.
   * In case (3) we put a newline in between.
   */
  if (indent>0) output_indent(indent);
  fwrite(output_buffer, 1, x0, stdout);
  if (x0==0 || (new_x <= max_length && new_x-goal_length <= goal_length-x)) {
    /* Cases (1) and (2): the word goes on the end of this line.
     * The separating spaces are written one by one: printf's `*'
     * field width wants an int, and |pending_spaces| is a size_t.
     */
    for (i=0; i<pending_spaces; ++i) putchar(' ');
    fwrite(word, 1, length, stdout);
    x0 = 0; x = indent1; pending_spaces = 0;
  }
  else if (indent+length > max_length) {
    /* Case (3): the word takes us over the limit on its own, so just
     * spit it out on a line of its own and don't bother buffering it.
     */
    putchar('\n');
    if (indent>0) output_indent(indent);
    fwrite(word, 1, length, stdout);
    x0 = 0; x = indent1; pending_spaces = 0;
  }
  else {
    /* Otherwise the word starts the next line, in the buffer. */
    memcpy(output_buffer, word, length);
    x0 = length; x = length+indent1; pending_spaces = spaces;
  }
  putchar('\n');
  output_in_paragraph = 1;
}

/* Process a stream, but just center its lines rather than trying to
 * format them neatly.
 * Leading whitespace is dropped from each line, and the line is then
 * preceded by half of (|goal_length| minus its length) spaces, rounded
 * up; lines at least |goal_length| long get no padding.
 * |name| is used only when reporting a read error.
 */
static void
center_stream(FILE *stream, const char *name) {
  char *line;
  size_t length;
  while ((line=get_line(stream, &length)) != NULL) {
    size_t col;
    /* <ctype.h> functions need an unsigned char value, not plain char. */
    while (length>0 && isspace((unsigned char)*line)) { ++line; --length; }
    /* One space of padding for every two columns we fall short. */
    for (col=length; col<goal_length; col+=2) putchar(' ');
    fwrite(line, 1, length, stdout);
    putchar('\n');
  }
  if (ferror(stream)) { perror(name); ++n_errors; }
}

/* Get a single line from a stream. Expand tabs, strip control
 * characters and trailing whitespace, and handle backspaces.
 * Return the address of the buffer containing the line, and
 * put the length of the line in |lengthp|.
 * The buffer is static and is reused (and possibly moved) by the
 * next call. It is always NUL-terminated at position |*lengthp|, so
 * callers that scan it as a C string never run into stale data.
 * This can cope with arbitrarily long lines, and with lines
 * without terminating \n.
 * If there are no characters left or an error happens, we
 * return 0.
 * Don't confuse |spaces_pending| here with the global
 * |pending_spaces|.
 */
static char *
get_line(FILE *stream, size_t *lengthp) {
  static char *buf=NULL;
  static size_t length=0;
  size_t len=0;
  int ch;
  size_t spaces_pending=0;

  if (buf==NULL) { length=100; buf=XMALLOC(length); }
  while ((ch=getc(stream)) != '\n' && ch != EOF) {
    if (ch==' ') ++spaces_pending;
    else if (isprint(ch)) {
      /* Make room for the held-back spaces, this character, and the
       * terminating NUL.
       */
      while (len+spaces_pending+1 >= length) {
        length*=2; buf=xrealloc(buf, length);
      }
      /* Spaces are only committed once something printable follows
       * them; that is how trailing blanks get dropped.
       */
      while (spaces_pending > 0) { --spaces_pending; buf[len++]=' '; }
      buf[len++] = ch;
    }
    else if (ch=='\t')
      spaces_pending += tab_width - (len+spaces_pending)%tab_width;
    else if (ch=='\b') { if (len) --len; }
    /* Any other control character is silently discarded. */
  }
  buf[len] = '\0';	/* |len| is always less than |length| here */
  *lengthp=len;
  return (len>0 || ch!=EOF) ? buf : NULL;
}

/* |realloc| wrapper that never hands back a null pointer: if the
 * (re)allocation fails we leave the program via |errx| with EX_OSERR.
 * Passing a null |ptr| makes this a plain allocation.
 */
static void *
xrealloc(void *ptr, size_t nbytes) {
  void *fresh;

  fresh = realloc(ptr, nbytes);
  if (!fresh) {
    errx(EX_OSERR, "out of memory");
  }
  return fresh;
}