diff options
author | Ingo Schwarze <schwarze@cvs.openbsd.org> | 2014-10-30 00:05:03 +0000 |
---|---|---|
committer | Ingo Schwarze <schwarze@cvs.openbsd.org> | 2014-10-30 00:05:03 +0000 |
commit | e4b08b14b91d398693871c7a44fa6160dd7726b7 (patch) | |
tree | 4894c108ff19655ce5786be057cec1216407541f /usr.bin | |
parent | 4f6719c761be9d89aeb2445b4b338c015abc01c5 (diff) |
support UTF-8 and ISO-8859-1 input by integrating modified parts
of kristaps@'s version of the preconv(1) utility into mandoc(1);
positive feedback from bentley@ and no concern raised when shown on tech@
Diffstat (limited to 'usr.bin')
-rw-r--r-- | usr.bin/mandoc/Makefile | 4 | ||||
-rw-r--r-- | usr.bin/mandoc/apropos.1 | 6 | ||||
-rw-r--r-- | usr.bin/mandoc/libmandoc.h | 11 | ||||
-rw-r--r-- | usr.bin/mandoc/main.c | 32 | ||||
-rw-r--r-- | usr.bin/mandoc/man.1 | 6 | ||||
-rw-r--r-- | usr.bin/mandoc/mandoc.1 | 30 | ||||
-rw-r--r-- | usr.bin/mandoc/mandoc.h | 4 | ||||
-rw-r--r-- | usr.bin/mandoc/preconv.c | 211 | ||||
-rw-r--r-- | usr.bin/mandoc/read.c | 72 |
9 files changed, 341 insertions, 35 deletions
diff --git a/usr.bin/mandoc/Makefile b/usr.bin/mandoc/Makefile index fad30f9d2a0..84f11e5729f 100644 --- a/usr.bin/mandoc/Makefile +++ b/usr.bin/mandoc/Makefile @@ -1,4 +1,4 @@ -# $OpenBSD: Makefile,v 1.82 2014/08/27 00:06:08 schwarze Exp $ +# $OpenBSD: Makefile,v 1.83 2014/10/30 00:05:02 schwarze Exp $ .include <bsd.own.mk> @@ -7,7 +7,7 @@ CFLAGS += -W -Wall -Wstrict-prototypes -Wno-unused-parameter DPADD += ${LIBUTIL} LDADD += -lsqlite3 -lutil -SRCS= mandoc.c mandoc_aux.c read.c \ +SRCS= mandoc.c mandoc_aux.c preconv.c read.c \ roff.c tbl.c tbl_opts.c tbl_layout.c tbl_data.c eqn.c SRCS+= mdoc_macro.c mdoc.c mdoc_hash.c \ mdoc_argv.c mdoc_validate.c lib.c att.c \ diff --git a/usr.bin/mandoc/apropos.1 b/usr.bin/mandoc/apropos.1 index a38700bed89..294db4c739e 100644 --- a/usr.bin/mandoc/apropos.1 +++ b/usr.bin/mandoc/apropos.1 @@ -1,4 +1,4 @@ -.\" $OpenBSD: apropos.1,v 1.27 2014/09/03 05:17:08 schwarze Exp $ +.\" $OpenBSD: apropos.1,v 1.28 2014/10/30 00:05:02 schwarze Exp $ .\" .\" Copyright (c) 2011, 2012 Kristaps Dzonsons <kristaps@bsd.lv> .\" Copyright (c) 2011, 2012, 2014 Ingo Schwarze <schwarze@openbsd.org> @@ -15,7 +15,7 @@ .\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF .\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. .\" -.Dd $Mdocdate: September 3 2014 $ +.Dd $Mdocdate: October 30 2014 $ .Dt APROPOS 1 .Os .Sh NAME @@ -79,7 +79,7 @@ to paginate them. In .Fl a mode, the options -.Fl IOTW +.Fl IKOTW described in the .Xr mandoc 1 manual are also available. 
diff --git a/usr.bin/mandoc/libmandoc.h b/usr.bin/mandoc/libmandoc.h index e7484c7b708..ccd4597b1d7 100644 --- a/usr.bin/mandoc/libmandoc.h +++ b/usr.bin/mandoc/libmandoc.h @@ -1,4 +1,4 @@ -/* $OpenBSD: libmandoc.h,v 1.31 2014/10/28 17:35:42 schwarze Exp $ */ +/* $OpenBSD: libmandoc.h,v 1.32 2014/10/30 00:05:02 schwarze Exp $ */ /* * Copyright (c) 2009, 2010, 2011, 2012 Kristaps Dzonsons <kristaps@bsd.lv> * Copyright (c) 2013, 2014 Ingo Schwarze <schwarze@openbsd.org> @@ -30,6 +30,12 @@ enum rofferr { ROFF_ERR /* badness: puke and stop */ }; +struct buf { + char *buf; + size_t sz; + size_t offs; +}; + __BEGIN_DECLS struct roff; @@ -63,6 +69,9 @@ int man_endparse(struct man *); int man_addspan(struct man *, const struct tbl_span *); int man_addeqn(struct man *, const struct eqn *); +int preconv_cue(const struct buf *); +int preconv_encode(struct buf *, struct buf *, int *); + void roff_free(struct roff *); struct roff *roff_alloc(struct mparse *, const struct mchars *, int); void roff_reset(struct roff *); diff --git a/usr.bin/mandoc/main.c b/usr.bin/mandoc/main.c index c2b8bb032ed..8a14d547f81 100644 --- a/usr.bin/mandoc/main.c +++ b/usr.bin/mandoc/main.c @@ -1,4 +1,4 @@ -/* $OpenBSD: main.c,v 1.102 2014/10/28 17:35:42 schwarze Exp $ */ +/* $OpenBSD: main.c,v 1.103 2014/10/30 00:05:02 schwarze Exp $ */ /* * Copyright (c) 2008-2012 Kristaps Dzonsons <kristaps@bsd.lv> * Copyright (c) 2010, 2011, 2012, 2014 Ingo Schwarze <schwarze@openbsd.org> @@ -75,6 +75,7 @@ struct curparse { char outopts[BUFSIZ]; /* buf of output opts */ }; +static int koptions(int *, char *); int mandocdb(int, char**); static int moptions(int *, char *); static void mmsg(enum mandocerr, enum mandoclevel, @@ -145,14 +146,15 @@ main(int argc, char *argv[]) memset(&curp, 0, sizeof(struct curparse)); curp.outtype = OUTT_ASCII; curp.wlevel = MANDOCLEVEL_FATAL; - options = MPARSE_SO; + options = MPARSE_SO | MPARSE_UTF8 | MPARSE_LATIN1; defos = NULL; use_pager = 1; show_usage = 0; outmode = OUTMODE_DEF; 
- while (-1 != (c = getopt(argc, argv, "aC:cfhI:iklM:m:O:S:s:T:VW:w"))) { + while (-1 != (c = getopt(argc, argv, + "aC:cfhI:iK:klM:m:O:S:s:T:VW:w"))) { switch (c) { case 'a': outmode = OUTMODE_ALL; @@ -188,6 +190,10 @@ main(int argc, char *argv[]) case 'i': outmode = OUTMODE_INT; break; + case 'K': + if ( ! koptions(&options, optarg)) + return((int)MANDOCLEVEL_BADARG); + break; case 'k': search.argmode = ARG_EXPR; break; @@ -584,6 +590,26 @@ fail: } static int +koptions(int *options, char *arg) +{ + + if ( ! strcmp(arg, "utf-8")) { + *options |= MPARSE_UTF8; + *options &= ~MPARSE_LATIN1; + } else if ( ! strcmp(arg, "iso-8859-1")) { + *options |= MPARSE_LATIN1; + *options &= ~MPARSE_UTF8; + } else if ( ! strcmp(arg, "us-ascii")) { + *options &= ~(MPARSE_UTF8 | MPARSE_LATIN1); + } else { + fprintf(stderr, "%s: -K%s: Bad argument\n", + progname, arg); + return(0); + } + return(1); +} + +static int moptions(int *options, char *arg) { diff --git a/usr.bin/mandoc/man.1 b/usr.bin/mandoc/man.1 index 47bf6589fe4..e78a385a8a8 100644 --- a/usr.bin/mandoc/man.1 +++ b/usr.bin/mandoc/man.1 @@ -1,4 +1,4 @@ -.\" $OpenBSD: man.1,v 1.3 2014/09/03 05:17:08 schwarze Exp $ +.\" $OpenBSD: man.1,v 1.4 2014/10/30 00:05:02 schwarze Exp $ .\" .\" Copyright (c) 1989, 1990, 1993 .\" The Regents of the University of California. All rights reserved. @@ -31,7 +31,7 @@ .\" .\" @(#)man.1 8.2 (Berkeley) 1/2/94 .\" -.Dd $Mdocdate: September 3 2014 $ +.Dd $Mdocdate: October 30 2014 $ .Dt MAN 1 .Os .Sh NAME @@ -255,7 +255,7 @@ combination. The .Nm utility also supports the options -.Fl IOTW +.Fl IKOTW described in the .Xr mandoc 1 manual. 
diff --git a/usr.bin/mandoc/mandoc.1 b/usr.bin/mandoc/mandoc.1 index 5c7f99c27db..625ea0bdbba 100644 --- a/usr.bin/mandoc/mandoc.1 +++ b/usr.bin/mandoc/mandoc.1 @@ -1,4 +1,4 @@ -.\" $OpenBSD: mandoc.1,v 1.63 2014/10/07 18:17:05 schwarze Exp $ +.\" $OpenBSD: mandoc.1,v 1.64 2014/10/30 00:05:02 schwarze Exp $ .\" .\" Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> .\" Copyright (c) 2012, 2014 Ingo Schwarze <schwarze@openbsd.org> @@ -15,7 +15,7 @@ .\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF .\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. .\" -.Dd $Mdocdate: October 7 2014 $ +.Dd $Mdocdate: October 30 2014 $ .Dt MANDOC 1 .Os .Sh NAME @@ -27,6 +27,7 @@ .Sm off .Op Fl I Cm os Li = Ar name .Sm on +.Op Fl K Ns Ar encoding .Op Fl m Ns Ar format .Op Fl O Ns Ar option .Op Fl T Ns Ar output @@ -89,6 +90,31 @@ macro. Display only the SYNOPSIS lines. Implies .Fl a . +.It Fl K Ns Ar encoding +Specify the input encoding. +The supported +.Ar encoding +arguments are +.Cm us-ascii , +.Cm iso-8859-1 , +and +.Cm utf-8 . +If not specified, autodetection uses the first match: +.Bl -tag -width iso-8859-1 +.It Cm utf-8 +if the first three bytes of the input file +are the UTF-8 byte order mark (BOM, 0xefbbbf) +.It Ar encoding +if the first or second line of the input file matches the +.Sy emacs +mode line format +.Pp +.D1 .\e" -*- Oo ...; Oc coding: Ar encoding ; No -*- +.It Cm utf-8 +if the first non-ASCII byte in the file introduces a valid UTF-8 sequence +.It Cm iso-8859-1 +otherwise +.El .It Fl k A synonym for .Xr apropos 1 . 
diff --git a/usr.bin/mandoc/mandoc.h b/usr.bin/mandoc/mandoc.h index dad792ddc09..8832b8db56e 100644 --- a/usr.bin/mandoc/mandoc.h +++ b/usr.bin/mandoc/mandoc.h @@ -1,4 +1,4 @@ -/* $OpenBSD: mandoc.h,v 1.110 2014/10/29 00:17:01 schwarze Exp $ */ +/* $OpenBSD: mandoc.h,v 1.111 2014/10/30 00:05:02 schwarze Exp $ */ /* * Copyright (c) 2010, 2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv> * Copyright (c) 2010-2014 Ingo Schwarze <schwarze@openbsd.org> @@ -393,6 +393,8 @@ struct eqn { #define MPARSE_MAN 2 /* assume -man */ #define MPARSE_SO 4 /* honour .so requests */ #define MPARSE_QUICK 8 /* abort the parse early */ +#define MPARSE_UTF8 16 /* accept UTF-8 input */ +#define MPARSE_LATIN1 32 /* accept ISO-LATIN-1 input */ enum mandoc_esc { ESCAPE_ERROR = 0, /* bail! unparsable escape */ diff --git a/usr.bin/mandoc/preconv.c b/usr.bin/mandoc/preconv.c new file mode 100644 index 00000000000..6ed72f1bccf --- /dev/null +++ b/usr.bin/mandoc/preconv.c @@ -0,0 +1,211 @@ +/* $OpenBSD: preconv.c,v 1.1 2014/10/30 00:05:02 schwarze Exp $ */ +/* + * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv> + * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include <sys/types.h> + +#include <stdio.h> +#include <string.h> +#include "mandoc.h" +#include "libmandoc.h" + +int +preconv_encode(struct buf *ib, struct buf *ob, int *filenc) +{ + size_t i; + const long one = 1L; + int state, be; + unsigned int accum; + unsigned char cu; + + if ( ! (*filenc & MPARSE_UTF8)) + goto latin; + + state = 0; + accum = 0U; + be = 0; + + /* Quick test for big-endian value. */ + + if ( ! (*((const char *)(&one)))) + be = 1; + + for (i = ib->offs; i < ib->sz; i++) { + cu = ib->buf[i]; + if (state) { + if ( ! (cu & 128) || (cu & 64)) { + /* Bad sequence header. */ + break; + } + + /* Accept only legitimate bit patterns. */ + + if (cu > 191 || cu < 128) { + /* Bad in-sequence bits. */ + break; + } + + accum |= (cu & 63) << --state * 6; + + if (state) + continue; + + /* + * Accum is held in little-endian order as + * stipulated by the UTF-8 sequence coding. We + * need to convert to a native big-endian if our + * architecture requires it. + */ + + if (be) + accum = (accum >> 24) | + ((accum << 8) & 0x00FF0000) | + ((accum >> 8) & 0x0000FF00) | + (accum << 24); + + if (accum < 0x80) + ob->buf[ob->offs++] = accum; + else + ob->offs += snprintf(ob->buf + ob->offs, + 11, "\\[u%.4X]", accum); + ib->offs = i + 1; + *filenc &= ~MPARSE_LATIN1; + return(1); + } else { + /* + * Entering a UTF-8 state: if we encounter a + * UTF-8 bitmask, calculate the expected UTF-8 + * state from it. + */ + for (state = 0; state < 7; state++) + if ( ! (cu & (1 << (7 - state)))) + break; + + /* Accept only legitimate bit patterns. */ + + switch (state--) { + case (4): + if (cu <= 244 && cu >= 240) { + accum = (cu & 7) << 18; + continue; + } + /* Bad 4-sequence start bits. */ + break; + case (3): + if (cu <= 239 && cu >= 224) { + accum = (cu & 15) << 12; + continue; + } + /* Bad 3-sequence start bits. */ + break; + case (2): + if (cu <= 223 && cu >= 194) { + accum = (cu & 31) << 6; + continue; + } + /* Bad 2-sequence start bits. 
*/ + break; + default: + /* Bad sequence bit mask. */ + break; + } + break; + } + } + + /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */ + +latin: + if ( ! (*filenc & MPARSE_LATIN1)) + return(0); + + ob->offs += snprintf(ob->buf + ob->offs, 11, + "\\[u%.4X]", (unsigned char)ib->buf[ib->offs++]); + + *filenc &= ~MPARSE_UTF8; + return(1); +} + +int +preconv_cue(const struct buf *b) +{ + const char *ln, *eoln, *eoph; + size_t sz, phsz; + + ln = b->buf + b->offs; + sz = b->sz - b->offs; + + /* Look for the end-of-line. */ + + if (NULL == (eoln = memchr(ln, '\n', sz))) + eoln = ln + sz; + + /* Check if we have the correct header/trailer. */ + + if ((sz = (size_t)(eoln - ln)) < 10 || + memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3)) + return(MPARSE_UTF8 | MPARSE_LATIN1); + + /* Move after the header and adjust for the trailer. */ + + ln += 7; + sz -= 10; + + while (sz > 0) { + while (sz > 0 && ' ' == *ln) { + ln++; + sz--; + } + if (0 == sz) + break; + + /* Find the end-of-phrase marker (or eoln). */ + + if (NULL == (eoph = memchr(ln, ';', sz))) + eoph = eoln - 3; + else + eoph++; + + /* Only account for the "coding" phrase. */ + + if ((phsz = eoph - ln) < 7 || + strncasecmp(ln, "coding:", 7)) { + sz -= phsz; + ln += phsz; + continue; + } + + sz -= 7; + ln += 7; + + while (sz > 0 && ' ' == *ln) { + ln++; + sz--; + } + if (0 == sz) + return(0); + + /* Check us against known encodings. 
*/ + + if (phsz > 4 && !strncasecmp(ln, "utf-8", 5)) + return(MPARSE_UTF8); + if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11)) + return(MPARSE_LATIN1); + return(0); + } + return(MPARSE_UTF8 | MPARSE_LATIN1); +} diff --git a/usr.bin/mandoc/read.c b/usr.bin/mandoc/read.c index 4dc8374273d..8cec99488ac 100644 --- a/usr.bin/mandoc/read.c +++ b/usr.bin/mandoc/read.c @@ -1,4 +1,4 @@ -/* $OpenBSD: read.c,v 1.69 2014/10/28 17:35:42 schwarze Exp $ */ +/* $OpenBSD: read.c,v 1.70 2014/10/30 00:05:02 schwarze Exp $ */ /* * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> * Copyright (c) 2010-2014 Ingo Schwarze <schwarze@openbsd.org> @@ -39,11 +39,6 @@ #define REPARSE_LIMIT 1000 -struct buf { - char *buf; /* binary input buffer */ - size_t sz; /* size of binary buffer */ -}; - struct mparse { struct man *pman; /* persistent man parser */ struct mdoc *pmdoc; /* persistent mdoc parser */ @@ -60,6 +55,7 @@ struct mparse { enum mandoclevel file_status; /* status of current parse */ enum mandoclevel wlevel; /* ignore messages below this */ int options; /* parser options */ + int filenc; /* encoding of the current file */ int reparse_count; /* finite interp. stack */ int line; /* line number in the file */ }; @@ -321,13 +317,20 @@ mparse_buf_r(struct mparse *curp, struct buf blk, int start) lnn = curp->line; pos = 0; - for (i = 0; i < (int)blk.sz; ) { + for (i = blk.offs; i < (int)blk.sz; ) { if (0 == pos && '\0' == blk.buf[i]) break; if (start) { curp->line = lnn; curp->reparse_count = 0; + + if (lnn < 3 && + curp->filenc & MPARSE_UTF8 && + curp->filenc & MPARSE_LATIN1) { + blk.offs = i; + curp->filenc = preconv_cue(&blk); + } } while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) { @@ -348,27 +351,40 @@ mparse_buf_r(struct mparse *curp, struct buf blk, int start) } /* - * Make sure we have space for at least - * one backslash and one other character - * and the trailing NUL byte. 
+ * Make sure we have space for the worst + * case of 11 bytes: "\\[u10ffff]\0" */ - if (pos + 2 >= (int)ln.sz) + if (pos + 11 > (int)ln.sz) resize_buf(&ln, 256); /* - * Warn about bogus characters. If you're using - * non-ASCII encoding, you're screwing your - * readers. Since I'd rather this not happen, - * I'll be helpful and replace these characters - * with "?", so we don't display gibberish. - * Note to manual writers: use special characters. + * Encode 8-bit input. */ - c = (unsigned char) blk.buf[i]; + c = blk.buf[i]; + if (c & 0x80) { + blk.offs = i; + ln.offs = pos; + if (curp->filenc && preconv_encode( + &blk, &ln, &curp->filenc)) { + pos = ln.offs; + i = blk.offs; + } else { + mandoc_vmsg(MANDOCERR_BADCHAR, + curp, curp->line, pos, + "0x%x", c); + ln.buf[pos++] = '?'; + i++; + } + continue; + } + + /* + * Exclude control characters. + */ - if ( ! (isascii(c) && - (isgraph(c) || isblank(c)))) { + if (c == 0x7f || (c < 0x20 && c != 0x09)) { mandoc_vmsg(MANDOCERR_BADCHAR, curp, curp->line, pos, "0x%x", c); i++; @@ -627,6 +643,7 @@ read_whole_file(struct mparse *curp, const char *file, int fd, return(0); } *with_mmap = 1; + fb->offs = 0; fb->sz = (size_t)st.st_size; fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0); if (fb->buf != MAP_FAILED) @@ -657,6 +674,7 @@ read_whole_file(struct mparse *curp, const char *file, int fd, ssz = read(fd, fb->buf + (int)off, fb->sz - off); if (ssz == 0) { fb->sz = off; + fb->offs = 0; return(1); } if (ssz == -1) { @@ -728,6 +746,15 @@ mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file) curp->line = 1; recursion_depth++; + /* Skip an UTF-8 byte order mark. 
*/ + if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 && + (unsigned char)blk.buf[0] == 0xef && + (unsigned char)blk.buf[1] == 0xbb && + (unsigned char)blk.buf[2] == 0xbf) { + blk.offs = 3; + curp->filenc &= ~MPARSE_LATIN1; + } + mparse_buf_r(curp, blk, 1); if (0 == --recursion_depth && MANDOCLEVEL_FATAL > curp->file_status) @@ -742,6 +769,7 @@ mparse_readfd(struct mparse *curp, int fd, const char *file) { struct buf blk; int with_mmap; + int save_filenc; if (-1 == fd && -1 == (fd = open(file, O_RDONLY, 0))) { curp->file_status = MANDOCLEVEL_SYSERR; @@ -760,7 +788,11 @@ mparse_readfd(struct mparse *curp, int fd, const char *file) */ if (read_whole_file(curp, file, fd, &blk, &with_mmap)) { + save_filenc = curp->filenc; + curp->filenc = curp->options & + (MPARSE_UTF8 | MPARSE_LATIN1); mparse_parse_buffer(curp, blk, file); + curp->filenc = save_filenc; if (with_mmap) munmap(blk.buf, blk.sz); else |