src - OpenBSD base system

diff options


context:
space:
mode:

author	Ingo Schwarze <schwarze@cvs.openbsd.org>	2014-10-30 00:05:03 +0000
committer	Ingo Schwarze <schwarze@cvs.openbsd.org>	2014-10-30 00:05:03 +0000
commit	e4b08b14b91d398693871c7a44fa6160dd7726b7 (patch)
tree	4894c108ff19655ce5786be057cec1216407541f /usr.bin
parent	4f6719c761be9d89aeb2445b4b338c015abc01c5 (diff)

support UTF-8 and ISO-8859-1 input by integrating modified parts

of kristaps@'s version of the preconv(1) utility into mandoc(1); positive feedback from bentley@ and no concern raised when shown on tech@

Diffstat (limited to 'usr.bin')

-rw-r--r--

usr.bin/mandoc/Makefile

-rw-r--r--

usr.bin/mandoc/apropos.1

-rw-r--r--

usr.bin/mandoc/libmandoc.h

-rw-r--r--

usr.bin/mandoc/main.c

-rw-r--r--

usr.bin/mandoc/man.1

-rw-r--r--

usr.bin/mandoc/mandoc.1

-rw-r--r--

usr.bin/mandoc/mandoc.h

-rw-r--r--

usr.bin/mandoc/preconv.c

211

-rw-r--r--

usr.bin/mandoc/read.c

9 files changed, 341 insertions, 35 deletions

diff --git a/usr.bin/mandoc/Makefile b/usr.bin/mandoc/Makefile
index fad30f9d2a0..84f11e5729f 100644
--- a/usr.bin/mandoc/Makefile
+++ b/usr.bin/mandoc/Makefile

@@ -1,4 +1,4 @@

-# $OpenBSD: Makefile,v 1.82 2014/08/27 00:06:08 schwarze Exp $

+# $OpenBSD: Makefile,v 1.83 2014/10/30 00:05:02 schwarze Exp $

.include <bsd.own.mk>

@@ -7,7 +7,7 @@ CFLAGS += -W -Wall -Wstrict-prototypes -Wno-unused-parameter

DPADD += ${LIBUTIL}

LDADD += -lsqlite3 -lutil

-SRCS= mandoc.c mandoc_aux.c read.c \

+SRCS= mandoc.c mandoc_aux.c preconv.c read.c \

roff.c tbl.c tbl_opts.c tbl_layout.c tbl_data.c eqn.c

SRCS+= mdoc_macro.c mdoc.c mdoc_hash.c \

mdoc_argv.c mdoc_validate.c lib.c att.c \

diff --git a/usr.bin/mandoc/apropos.1 b/usr.bin/mandoc/apropos.1
index a38700bed89..294db4c739e 100644
--- a/usr.bin/mandoc/apropos.1
+++ b/usr.bin/mandoc/apropos.1

@@ -1,4 +1,4 @@

-.\" $OpenBSD: apropos.1,v 1.27 2014/09/03 05:17:08 schwarze Exp $

+.\" $OpenBSD: apropos.1,v 1.28 2014/10/30 00:05:02 schwarze Exp $

.\"

@@ -15,7 +15,7 @@

.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF

.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

.\"

-.Dd $Mdocdate: September 3 2014 $

+.Dd $Mdocdate: October 30 2014 $

.Dt APROPOS 1

.Os

.Sh NAME

@@ -79,7 +79,7 @@ to paginate them.

.Fl a

mode, the options

-.Fl IOTW

+.Fl IKOTW

described in the

.Xr mandoc 1

manual are also available.

diff --git a/usr.bin/mandoc/libmandoc.h b/usr.bin/mandoc/libmandoc.h
index e7484c7b708..ccd4597b1d7 100644
--- a/usr.bin/mandoc/libmandoc.h
+++ b/usr.bin/mandoc/libmandoc.h

@@ -1,4 +1,4 @@

-/* $OpenBSD: libmandoc.h,v 1.31 2014/10/28 17:35:42 schwarze Exp $ */

+/* $OpenBSD: libmandoc.h,v 1.32 2014/10/30 00:05:02 schwarze Exp $ */

@@ -30,6 +30,12 @@ enum rofferr {

ROFF_ERR /* badness: puke and stop */

};

+struct buf {

+ char *buf;

+ size_t sz;

+ size_t offs;

+};

__BEGIN_DECLS

struct roff;

@@ -63,6 +69,9 @@ int man_endparse(struct man *);

int man_addspan(struct man *, const struct tbl_span *);

int man_addeqn(struct man *, const struct eqn *);

+int preconv_cue(const struct buf *);

+int preconv_encode(struct buf *, struct buf *, int *);

void roff_free(struct roff *);

struct roff *roff_alloc(struct mparse *, const struct mchars *, int);

void roff_reset(struct roff *);

diff --git a/usr.bin/mandoc/main.c b/usr.bin/mandoc/main.c
index c2b8bb032ed..8a14d547f81 100644
--- a/usr.bin/mandoc/main.c
+++ b/usr.bin/mandoc/main.c

@@ -1,4 +1,4 @@

-/* $OpenBSD: main.c,v 1.102 2014/10/28 17:35:42 schwarze Exp $ */

+/* $OpenBSD: main.c,v 1.103 2014/10/30 00:05:02 schwarze Exp $ */

@@ -75,6 +75,7 @@ struct curparse {

char outopts[BUFSIZ]; /* buf of output opts */

};

+static int koptions(int *, char *);

int mandocdb(int, char**);

static int moptions(int *, char *);

static void mmsg(enum mandocerr, enum mandoclevel,

@@ -145,14 +146,15 @@ main(int argc, char *argv[])

memset(&curp, 0, sizeof(struct curparse));

curp.outtype = OUTT_ASCII;

curp.wlevel = MANDOCLEVEL_FATAL;

- options = MPARSE_SO;

+ options = MPARSE_SO | MPARSE_UTF8 | MPARSE_LATIN1;

defos = NULL;

use_pager = 1;

show_usage = 0;

outmode = OUTMODE_DEF;

- while (-1 != (c = getopt(argc, argv, "aC:cfhI:iklM:m:O:S:s:T:VW:w"))) {

+ while (-1 != (c = getopt(argc, argv,

+ "aC:cfhI:iK:klM:m:O:S:s:T:VW:w"))) {

switch (c) {

case 'a':

outmode = OUTMODE_ALL;

@@ -188,6 +190,10 @@ main(int argc, char *argv[])

case 'i':

outmode = OUTMODE_INT;

break;

+ case 'K':

+ if ( ! koptions(&options, optarg))

+ return((int)MANDOCLEVEL_BADARG);

+ break;

case 'k':

search.argmode = ARG_EXPR;

break;

@@ -584,6 +590,26 @@ fail:

}

static int

+koptions(int *options, char *arg)

+ if ( ! strcmp(arg, "utf-8")) {

+ *options |= MPARSE_UTF8;

+ *options &= ~MPARSE_LATIN1;

+ } else if ( ! strcmp(arg, "iso-8859-1")) {

+ *options |= MPARSE_LATIN1;

+ *options &= ~MPARSE_UTF8;

+ } else if ( ! strcmp(arg, "us-ascii")) {

+ *options &= ~(MPARSE_UTF8 | MPARSE_LATIN1);

+ } else {

+ fprintf(stderr, "%s: -K%s: Bad argument\n",

+ progname, arg);

+ return(0);

+ }

+ return(1);

+static int

moptions(int *options, char *arg)

{

diff --git a/usr.bin/mandoc/man.1 b/usr.bin/mandoc/man.1
index 47bf6589fe4..e78a385a8a8 100644
--- a/usr.bin/mandoc/man.1
+++ b/usr.bin/mandoc/man.1

@@ -1,4 +1,4 @@

-.\" $OpenBSD: man.1,v 1.3 2014/09/03 05:17:08 schwarze Exp $

+.\" $OpenBSD: man.1,v 1.4 2014/10/30 00:05:02 schwarze Exp $

.\"

@@ -31,7 +31,7 @@

.\"

.\" @(#)man.1 8.2 (Berkeley) 1/2/94

.\"

-.Dd $Mdocdate: September 3 2014 $

+.Dd $Mdocdate: October 30 2014 $

.Dt MAN 1

.Os

.Sh NAME

@@ -255,7 +255,7 @@ combination.

The

.Nm

utility also supports the options

-.Fl IOTW

+.Fl IKOTW

described in the

.Xr mandoc 1

manual.

diff --git a/usr.bin/mandoc/mandoc.1 b/usr.bin/mandoc/mandoc.1
index 5c7f99c27db..625ea0bdbba 100644
--- a/usr.bin/mandoc/mandoc.1
+++ b/usr.bin/mandoc/mandoc.1

@@ -1,4 +1,4 @@

-.\" $OpenBSD: mandoc.1,v 1.63 2014/10/07 18:17:05 schwarze Exp $

+.\" $OpenBSD: mandoc.1,v 1.64 2014/10/30 00:05:02 schwarze Exp $

.\"

@@ -15,7 +15,7 @@

.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF

.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

.\"

-.Dd $Mdocdate: October 7 2014 $

+.Dd $Mdocdate: October 30 2014 $

.Dt MANDOC 1

.Os

.Sh NAME

@@ -27,6 +27,7 @@

.Sm off

.Op Fl I Cm os Li = Ar name

.Sm on

+.Op Fl K Ns Ar encoding

.Op Fl m Ns Ar format

.Op Fl O Ns Ar option

.Op Fl T Ns Ar output

@@ -89,6 +90,31 @@ macro.

Display only the SYNOPSIS lines.

Implies

.Fl a .

+.It Fl K Ns Ar encoding

+Specify the input encoding.

+The supported

+.Ar encoding

+arguments are

+.Cm us-ascii ,

+.Cm iso-8859-1 ,

+and

+.Cm utf-8 .

+If not specified, autodetection uses the first match:

+.Bl -tag -width iso-8859-1

+.It Cm utf-8

+if the first three bytes of the input file

+are the UTF-8 byte order mark (BOM, 0xefbbbf)

+.It Ar encoding

+if the first or second line of the input file matches the

+.Sy emacs

+mode line format

+.Pp

+.D1 .\e" -*- Oo ...; Oc coding: Ar encoding ; No -*-

+.It Cm utf-8

+if the first non-ASCII byte in the file introduces a valid UTF-8 sequence

+.It Cm iso-8859-1

+otherwise

+.El

.It Fl k

A synonym for

.Xr apropos 1 .

diff --git a/usr.bin/mandoc/mandoc.h b/usr.bin/mandoc/mandoc.h
index dad792ddc09..8832b8db56e 100644
--- a/usr.bin/mandoc/mandoc.h
+++ b/usr.bin/mandoc/mandoc.h

@@ -1,4 +1,4 @@

-/* $OpenBSD: mandoc.h,v 1.110 2014/10/29 00:17:01 schwarze Exp $ */

+/* $OpenBSD: mandoc.h,v 1.111 2014/10/30 00:05:02 schwarze Exp $ */

@@ -393,6 +393,8 @@ struct eqn {

#define MPARSE_MAN 2 /* assume -man */

#define MPARSE_SO 4 /* honour .so requests */

#define MPARSE_QUICK 8 /* abort the parse early */

+#define MPARSE_UTF8 16 /* accept UTF-8 input */

+#define MPARSE_LATIN1 32 /* accept ISO-LATIN-1 input */

enum mandoc_esc {

ESCAPE_ERROR = 0, /* bail! unparsable escape */

diff --git a/usr.bin/mandoc/preconv.c b/usr.bin/mandoc/preconv.c
new file mode 100644
index 00000000000..6ed72f1bccf
--- /dev/null
+++ b/usr.bin/mandoc/preconv.c

@@ -0,0 +1,211 @@

+/* $OpenBSD: preconv.c,v 1.1 2014/10/30 00:05:02 schwarze Exp $ */

+/*

+ *

+ * Permission to use, copy, modify, and distribute this software for any

+ * purpose with or without fee is hereby granted, provided that the above

+ * copyright notice and this permission notice appear in all copies.

+ *

+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES

+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF

+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR

+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES

+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN

+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF

+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

+ */

+#include <sys/types.h>

+#include <stdio.h>

+#include <string.h>

+#include "mandoc.h"

+#include "libmandoc.h"

+int

+preconv_encode(struct buf *ib, struct buf *ob, int *filenc)

+ size_t i;

+ const long one = 1L;

+ int state, be;

+ unsigned int accum;

+ unsigned char cu;

+ if ( ! (*filenc & MPARSE_UTF8))

+ goto latin;

+ state = 0;

+ accum = 0U;

+ be = 0;

+ /* Quick test for big-endian value. */

+ if ( ! (*((const char *)(&one))))

+ be = 1;

+ for (i = ib->offs; i < ib->sz; i++) {

+ cu = ib->buf[i];

+ if (state) {

+ if ( ! (cu & 128) || (cu & 64)) {

+ /* Bad sequence header. */

+ break;

+ }

+ /* Accept only legitimate bit patterns. */

+ if (cu > 191 || cu < 128) {

+ /* Bad in-sequence bits. */

+ break;

+ }

+ accum |= (cu & 63) << --state * 6;

+ if (state)

+ continue;

+ /*

+ * Accum is held in little-endian order as

+ * stipulated by the UTF-8 sequence coding. We

+ * need to convert to a native big-endian if our

+ * architecture requires it.

+ */

+ if (be)

+ accum = (accum >> 24) |

+ ((accum << 8) & 0x00FF0000) |

+ ((accum >> 8) & 0x0000FF00) |

+ (accum << 24);

+ if (accum < 0x80)

+ ob->buf[ob->offs++] = accum;

+ else

+ ob->offs += snprintf(ob->buf + ob->offs,

+ 11, "\\[u%.4X]", accum);

+ ib->offs = i + 1;

+ *filenc &= ~MPARSE_LATIN1;

+ return(1);

+ } else {

+ /*

+ * Entering a UTF-8 state: if we encounter a

+ * UTF-8 bitmask, calculate the expected UTF-8

+ * state from it.

+ */

+ for (state = 0; state < 7; state++)

+ if ( ! (cu & (1 << (7 - state))))

+ break;

+ /* Accept only legitimate bit patterns. */

+ switch (state--) {

+ case (4):

+ if (cu <= 244 && cu >= 240) {

+ accum = (cu & 7) << 18;

+ continue;

+ }

+ /* Bad 4-sequence start bits. */

+ break;

+ case (3):

+ if (cu <= 239 && cu >= 224) {

+ accum = (cu & 15) << 12;

+ continue;

+ }

+ /* Bad 3-sequence start bits. */

+ break;

+ case (2):

+ if (cu <= 223 && cu >= 194) {

+ accum = (cu & 31) << 6;

+ continue;

+ }

+ /* Bad 2-sequence start bits. */

+ break;

+ default:

+ /* Bad sequence bit mask. */

+ break;

+ }

+ break;

+ }

+ /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */

+latin:

+ if ( ! (*filenc & MPARSE_LATIN1))

+ return(0);

+ ob->offs += snprintf(ob->buf + ob->offs, 11,

+ "\\[u%.4X]", (unsigned char)ib->buf[ib->offs++]);

+ *filenc &= ~MPARSE_UTF8;

+ return(1);

+int

+preconv_cue(const struct buf *b)

+ const char *ln, *eoln, *eoph;

+ size_t sz, phsz;

+ ln = b->buf + b->offs;

+ sz = b->sz - b->offs;

+ /* Look for the end-of-line. */

+ if (NULL == (eoln = memchr(ln, '\n', sz)))

+ eoln = ln + sz;

+ /* Check if we have the correct header/trailer. */

+ if ((sz = (size_t)(eoln - ln)) < 10 ||

+ memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3))

+ return(MPARSE_UTF8 | MPARSE_LATIN1);

+ /* Move after the header and adjust for the trailer. */

+ ln += 7;

+ sz -= 10;

+ while (sz > 0) {

+ while (sz > 0 && ' ' == *ln) {

+ ln++;

+ sz--;

+ }

+ if (0 == sz)

+ break;

+ /* Find the end-of-phrase marker (or eoln). */

+ if (NULL == (eoph = memchr(ln, ';', sz)))

+ eoph = eoln - 3;

+ else

+ eoph++;

+ /* Only account for the "coding" phrase. */

+ if ((phsz = eoph - ln) < 7 ||

+ strncasecmp(ln, "coding:", 7)) {

+ sz -= phsz;

+ ln += phsz;

+ continue;

+ }

+ sz -= 7;

+ ln += 7;

+ while (sz > 0 && ' ' == *ln) {

+ ln++;

+ sz--;

+ }

+ if (0 == sz)

+ return(0);

+ /* Check us against known encodings. */

+ if (phsz > 4 && !strncasecmp(ln, "utf-8", 5))

+ return(MPARSE_UTF8);

+ if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11))

+ return(MPARSE_LATIN1);

+ return(0);

+ }

+ return(MPARSE_UTF8 | MPARSE_LATIN1);

diff --git a/usr.bin/mandoc/read.c b/usr.bin/mandoc/read.c
index 4dc8374273d..8cec99488ac 100644
--- a/usr.bin/mandoc/read.c
+++ b/usr.bin/mandoc/read.c

@@ -1,4 +1,4 @@

-/* $OpenBSD: read.c,v 1.69 2014/10/28 17:35:42 schwarze Exp $ */

+/* $OpenBSD: read.c,v 1.70 2014/10/30 00:05:02 schwarze Exp $ */

@@ -39,11 +39,6 @@

#define REPARSE_LIMIT 1000

-struct buf {

- char *buf; /* binary input buffer */

- size_t sz; /* size of binary buffer */

-};

struct mparse {

struct man *pman; /* persistent man parser */

struct mdoc *pmdoc; /* persistent mdoc parser */

@@ -60,6 +55,7 @@ struct mparse {

enum mandoclevel file_status; /* status of current parse */

enum mandoclevel wlevel; /* ignore messages below this */

int options; /* parser options */

+ int filenc; /* encoding of the current file */

int reparse_count; /* finite interp. stack */

int line; /* line number in the file */

};

@@ -321,13 +317,20 @@ mparse_buf_r(struct mparse *curp, struct buf blk, int start)

lnn = curp->line;

pos = 0;

- for (i = 0; i < (int)blk.sz; ) {

+ for (i = blk.offs; i < (int)blk.sz; ) {

if (0 == pos && '\0' == blk.buf[i])

break;

if (start) {

curp->line = lnn;

curp->reparse_count = 0;

+ if (lnn < 3 &&

+ curp->filenc & MPARSE_UTF8 &&

+ curp->filenc & MPARSE_LATIN1) {

+ blk.offs = i;

+ curp->filenc = preconv_cue(&blk);

+ }

}

while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) {

@@ -348,27 +351,40 @@ mparse_buf_r(struct mparse *curp, struct buf blk, int start)

}

- * Make sure we have space for at least

- * one backslash and one other character

- * and the trailing NUL byte.

+ * Make sure we have space for the worst

+ * case of 11 bytes: "\\[u10ffff]\0"

- if (pos + 2 >= (int)ln.sz)

+ if (pos + 11 > (int)ln.sz)

resize_buf(&ln, 256);

- * Warn about bogus characters. If you're using

- * non-ASCII encoding, you're screwing your

- * readers. Since I'd rather this not happen,

- * I'll be helpful and replace these characters

- * with "?", so we don't display gibberish.

- * Note to manual writers: use special characters.

+ * Encode 8-bit input.

- c = (unsigned char) blk.buf[i];

+ c = blk.buf[i];

+ if (c & 0x80) {

+ blk.offs = i;

+ ln.offs = pos;

+ if (curp->filenc && preconv_encode(

+ &blk, &ln, &curp->filenc)) {

+ pos = ln.offs;

+ i = blk.offs;

+ } else {

+ mandoc_vmsg(MANDOCERR_BADCHAR,

+ curp, curp->line, pos,

+ "0x%x", c);

+ ln.buf[pos++] = '?';

+ i++;

+ }

+ continue;

+ }

+ /*

+ * Exclude control characters.

+ */

- if ( ! (isascii(c) &&

- (isgraph(c) || isblank(c)))) {

+ if (c == 0x7f || (c < 0x20 && c != 0x09)) {

mandoc_vmsg(MANDOCERR_BADCHAR, curp,

curp->line, pos, "0x%x", c);

i++;

@@ -627,6 +643,7 @@ read_whole_file(struct mparse *curp, const char *file, int fd,

return(0);

}

*with_mmap = 1;

+ fb->offs = 0;

fb->sz = (size_t)st.st_size;

fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);

if (fb->buf != MAP_FAILED)

@@ -657,6 +674,7 @@ read_whole_file(struct mparse *curp, const char *file, int fd,

ssz = read(fd, fb->buf + (int)off, fb->sz - off);

if (ssz == 0) {

fb->sz = off;

+ fb->offs = 0;

return(1);

}

if (ssz == -1) {

@@ -728,6 +746,15 @@ mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file)

curp->line = 1;

recursion_depth++;

+ /* Skip a UTF-8 byte order mark. */

+ if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&

+ (unsigned char)blk.buf[0] == 0xef &&

+ (unsigned char)blk.buf[1] == 0xbb &&

+ (unsigned char)blk.buf[2] == 0xbf) {

+ blk.offs = 3;

+ curp->filenc &= ~MPARSE_LATIN1;

+ }

mparse_buf_r(curp, blk, 1);

if (0 == --recursion_depth && MANDOCLEVEL_FATAL > curp->file_status)

@@ -742,6 +769,7 @@ mparse_readfd(struct mparse *curp, int fd, const char *file)

{

struct buf blk;

int with_mmap;

+ int save_filenc;

if (-1 == fd && -1 == (fd = open(file, O_RDONLY, 0))) {

curp->file_status = MANDOCLEVEL_SYSERR;

@@ -760,7 +788,11 @@ mparse_readfd(struct mparse *curp, int fd, const char *file)

if (read_whole_file(curp, file, fd, &blk, &with_mmap)) {

+ save_filenc = curp->filenc;

+ curp->filenc = curp->options &

+ (MPARSE_UTF8 | MPARSE_LATIN1);

mparse_parse_buffer(curp, blk, file);

+ curp->filenc = save_filenc;

if (with_mmap)

munmap(blk.buf, blk.sz);

else