summaryrefslogtreecommitdiff
path: root/usr.bin
diff options
context:
space:
mode:
authorIngo Schwarze <schwarze@cvs.openbsd.org>2014-10-30 00:05:03 +0000
committerIngo Schwarze <schwarze@cvs.openbsd.org>2014-10-30 00:05:03 +0000
commite4b08b14b91d398693871c7a44fa6160dd7726b7 (patch)
tree4894c108ff19655ce5786be057cec1216407541f /usr.bin
parent4f6719c761be9d89aeb2445b4b338c015abc01c5 (diff)
support UTF-8 and ISO-8859-1 input by integrating modified parts
of kristaps@'s version of the preconv(1) utility into mandoc(1); positive feedback from bentley@ and no concern raised when shown on tech@
Diffstat (limited to 'usr.bin')
-rw-r--r--usr.bin/mandoc/Makefile4
-rw-r--r--usr.bin/mandoc/apropos.16
-rw-r--r--usr.bin/mandoc/libmandoc.h11
-rw-r--r--usr.bin/mandoc/main.c32
-rw-r--r--usr.bin/mandoc/man.16
-rw-r--r--usr.bin/mandoc/mandoc.130
-rw-r--r--usr.bin/mandoc/mandoc.h4
-rw-r--r--usr.bin/mandoc/preconv.c211
-rw-r--r--usr.bin/mandoc/read.c72
9 files changed, 341 insertions, 35 deletions
diff --git a/usr.bin/mandoc/Makefile b/usr.bin/mandoc/Makefile
index fad30f9d2a0..84f11e5729f 100644
--- a/usr.bin/mandoc/Makefile
+++ b/usr.bin/mandoc/Makefile
@@ -1,4 +1,4 @@
-# $OpenBSD: Makefile,v 1.82 2014/08/27 00:06:08 schwarze Exp $
+# $OpenBSD: Makefile,v 1.83 2014/10/30 00:05:02 schwarze Exp $
.include <bsd.own.mk>
@@ -7,7 +7,7 @@ CFLAGS += -W -Wall -Wstrict-prototypes -Wno-unused-parameter
DPADD += ${LIBUTIL}
LDADD += -lsqlite3 -lutil
-SRCS= mandoc.c mandoc_aux.c read.c \
+SRCS= mandoc.c mandoc_aux.c preconv.c read.c \
roff.c tbl.c tbl_opts.c tbl_layout.c tbl_data.c eqn.c
SRCS+= mdoc_macro.c mdoc.c mdoc_hash.c \
mdoc_argv.c mdoc_validate.c lib.c att.c \
diff --git a/usr.bin/mandoc/apropos.1 b/usr.bin/mandoc/apropos.1
index a38700bed89..294db4c739e 100644
--- a/usr.bin/mandoc/apropos.1
+++ b/usr.bin/mandoc/apropos.1
@@ -1,4 +1,4 @@
-.\" $OpenBSD: apropos.1,v 1.27 2014/09/03 05:17:08 schwarze Exp $
+.\" $OpenBSD: apropos.1,v 1.28 2014/10/30 00:05:02 schwarze Exp $
.\"
.\" Copyright (c) 2011, 2012 Kristaps Dzonsons <kristaps@bsd.lv>
.\" Copyright (c) 2011, 2012, 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -15,7 +15,7 @@
.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
.\"
-.Dd $Mdocdate: September 3 2014 $
+.Dd $Mdocdate: October 30 2014 $
.Dt APROPOS 1
.Os
.Sh NAME
@@ -79,7 +79,7 @@ to paginate them.
In
.Fl a
mode, the options
-.Fl IOTW
+.Fl IKOTW
described in the
.Xr mandoc 1
manual are also available.
diff --git a/usr.bin/mandoc/libmandoc.h b/usr.bin/mandoc/libmandoc.h
index e7484c7b708..ccd4597b1d7 100644
--- a/usr.bin/mandoc/libmandoc.h
+++ b/usr.bin/mandoc/libmandoc.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: libmandoc.h,v 1.31 2014/10/28 17:35:42 schwarze Exp $ */
+/* $OpenBSD: libmandoc.h,v 1.32 2014/10/30 00:05:02 schwarze Exp $ */
/*
* Copyright (c) 2009, 2010, 2011, 2012 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -30,6 +30,12 @@ enum rofferr {
ROFF_ERR /* badness: puke and stop */
};
+struct buf { /* byte buffer plus a cursor, passed to preconv_cue() and preconv_encode() */
+ char *buf; /* the bytes themselves */
+ size_t sz; /* size of the buffer in bytes */
+ size_t offs; /* index of the next byte to read from or write to */
+};
+
__BEGIN_DECLS
struct roff;
@@ -63,6 +69,9 @@ int man_endparse(struct man *);
int man_addspan(struct man *, const struct tbl_span *);
int man_addeqn(struct man *, const struct eqn *);
+int preconv_cue(const struct buf *);
+int preconv_encode(struct buf *, struct buf *, int *);
+
void roff_free(struct roff *);
struct roff *roff_alloc(struct mparse *, const struct mchars *, int);
void roff_reset(struct roff *);
diff --git a/usr.bin/mandoc/main.c b/usr.bin/mandoc/main.c
index c2b8bb032ed..8a14d547f81 100644
--- a/usr.bin/mandoc/main.c
+++ b/usr.bin/mandoc/main.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: main.c,v 1.102 2014/10/28 17:35:42 schwarze Exp $ */
+/* $OpenBSD: main.c,v 1.103 2014/10/30 00:05:02 schwarze Exp $ */
/*
* Copyright (c) 2008-2012 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2010, 2011, 2012, 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -75,6 +75,7 @@ struct curparse {
char outopts[BUFSIZ]; /* buf of output opts */
};
+static int koptions(int *, char *);
int mandocdb(int, char**);
static int moptions(int *, char *);
static void mmsg(enum mandocerr, enum mandoclevel,
@@ -145,14 +146,15 @@ main(int argc, char *argv[])
memset(&curp, 0, sizeof(struct curparse));
curp.outtype = OUTT_ASCII;
curp.wlevel = MANDOCLEVEL_FATAL;
- options = MPARSE_SO;
+ options = MPARSE_SO | MPARSE_UTF8 | MPARSE_LATIN1;
defos = NULL;
use_pager = 1;
show_usage = 0;
outmode = OUTMODE_DEF;
- while (-1 != (c = getopt(argc, argv, "aC:cfhI:iklM:m:O:S:s:T:VW:w"))) {
+ while (-1 != (c = getopt(argc, argv,
+ "aC:cfhI:iK:klM:m:O:S:s:T:VW:w"))) {
switch (c) {
case 'a':
outmode = OUTMODE_ALL;
@@ -188,6 +190,10 @@ main(int argc, char *argv[])
case 'i':
outmode = OUTMODE_INT;
break;
+ case 'K':
+ if ( ! koptions(&options, optarg))
+ return((int)MANDOCLEVEL_BADARG);
+ break;
case 'k':
search.argmode = ARG_EXPR;
break;
@@ -584,6 +590,26 @@ fail:
}
static int
+koptions(int *options, char *arg)
+{
+	const int	 encmask = MPARSE_UTF8 | MPARSE_LATIN1;
+	int		 enc;
+
+	/*
+	 * Translate the -K argument into the set of input encodings
+	 * that remain enabled; an unknown name is reported on stderr
+	 * and rejected by returning 0.
+	 */
+
+	if (strcmp(arg, "utf-8") == 0)
+		enc = MPARSE_UTF8;
+	else if (strcmp(arg, "iso-8859-1") == 0)
+		enc = MPARSE_LATIN1;
+	else if (strcmp(arg, "us-ascii") == 0)
+		enc = 0;
+	else {
+		fprintf(stderr, "%s: -K%s: Bad argument\n",
+		    progname, arg);
+		return(0);
+	}
+
+	/* Replace the previous encoding selection, keep all other bits. */
+
+	*options = (*options & ~encmask) | enc;
+	return(1);
+}
+
+static int
moptions(int *options, char *arg)
{
diff --git a/usr.bin/mandoc/man.1 b/usr.bin/mandoc/man.1
index 47bf6589fe4..e78a385a8a8 100644
--- a/usr.bin/mandoc/man.1
+++ b/usr.bin/mandoc/man.1
@@ -1,4 +1,4 @@
-.\" $OpenBSD: man.1,v 1.3 2014/09/03 05:17:08 schwarze Exp $
+.\" $OpenBSD: man.1,v 1.4 2014/10/30 00:05:02 schwarze Exp $
.\"
.\" Copyright (c) 1989, 1990, 1993
.\" The Regents of the University of California. All rights reserved.
@@ -31,7 +31,7 @@
.\"
.\" @(#)man.1 8.2 (Berkeley) 1/2/94
.\"
-.Dd $Mdocdate: September 3 2014 $
+.Dd $Mdocdate: October 30 2014 $
.Dt MAN 1
.Os
.Sh NAME
@@ -255,7 +255,7 @@ combination.
The
.Nm
utility also supports the options
-.Fl IOTW
+.Fl IKOTW
described in the
.Xr mandoc 1
manual.
diff --git a/usr.bin/mandoc/mandoc.1 b/usr.bin/mandoc/mandoc.1
index 5c7f99c27db..625ea0bdbba 100644
--- a/usr.bin/mandoc/mandoc.1
+++ b/usr.bin/mandoc/mandoc.1
@@ -1,4 +1,4 @@
-.\" $OpenBSD: mandoc.1,v 1.63 2014/10/07 18:17:05 schwarze Exp $
+.\" $OpenBSD: mandoc.1,v 1.64 2014/10/30 00:05:02 schwarze Exp $
.\"
.\" Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
.\" Copyright (c) 2012, 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -15,7 +15,7 @@
.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
.\"
-.Dd $Mdocdate: October 7 2014 $
+.Dd $Mdocdate: October 30 2014 $
.Dt MANDOC 1
.Os
.Sh NAME
@@ -27,6 +27,7 @@
.Sm off
.Op Fl I Cm os Li = Ar name
.Sm on
+.Op Fl K Ns Ar encoding
.Op Fl m Ns Ar format
.Op Fl O Ns Ar option
.Op Fl T Ns Ar output
@@ -89,6 +90,31 @@ macro.
Display only the SYNOPSIS lines.
Implies
.Fl a .
+.It Fl K Ns Ar encoding
+Specify the input encoding.
+The supported
+.Ar encoding
+arguments are
+.Cm us-ascii ,
+.Cm iso-8859-1 ,
+and
+.Cm utf-8 .
+If not specified, autodetection uses the first match:
+.Bl -tag -width iso-8859-1
+.It Cm utf-8
+if the first three bytes of the input file
+are the UTF-8 byte order mark (BOM, 0xefbbbf)
+.It Ar encoding
+if the first or second line of the input file matches the
+.Sy emacs
+mode line format
+.Pp
+.D1 .\e" -*- Oo ...; Oc coding: Ar encoding ; No -*-
+.It Cm utf-8
+if the first non-ASCII byte in the file introduces a valid UTF-8 sequence
+.It Cm iso-8859-1
+otherwise
+.El
.It Fl k
A synonym for
.Xr apropos 1 .
diff --git a/usr.bin/mandoc/mandoc.h b/usr.bin/mandoc/mandoc.h
index dad792ddc09..8832b8db56e 100644
--- a/usr.bin/mandoc/mandoc.h
+++ b/usr.bin/mandoc/mandoc.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: mandoc.h,v 1.110 2014/10/29 00:17:01 schwarze Exp $ */
+/* $OpenBSD: mandoc.h,v 1.111 2014/10/30 00:05:02 schwarze Exp $ */
/*
* Copyright (c) 2010, 2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2010-2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -393,6 +393,8 @@ struct eqn {
#define MPARSE_MAN 2 /* assume -man */
#define MPARSE_SO 4 /* honour .so requests */
#define MPARSE_QUICK 8 /* abort the parse early */
+#define MPARSE_UTF8 16 /* accept UTF-8 input */
+#define MPARSE_LATIN1 32 /* accept ISO-LATIN-1 input */
enum mandoc_esc {
ESCAPE_ERROR = 0, /* bail! unparsable escape */
diff --git a/usr.bin/mandoc/preconv.c b/usr.bin/mandoc/preconv.c
new file mode 100644
index 00000000000..6ed72f1bccf
--- /dev/null
+++ b/usr.bin/mandoc/preconv.c
@@ -0,0 +1,211 @@
+/* $OpenBSD: preconv.c,v 1.1 2014/10/30 00:05:02 schwarze Exp $ */
+/*
+ * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
+ * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/types.h>
+
+#include <stdio.h>
+#include <string.h>
+#include "mandoc.h"
+#include "libmandoc.h"
+
+/*
+ * Convert the non-ASCII character starting at ib->buf[ib->offs]
+ * into a roff(7) "\[uXXXX]" escape appended at ob->buf[ob->offs].
+ *
+ * If *filenc still allows UTF-8, try to decode one well-formed UTF-8
+ * sequence; on success, advance ib->offs past the whole sequence and
+ * clear MPARSE_LATIN1 in *filenc.  Otherwise, if *filenc allows
+ * ISO-8859-1, encode the single byte at ib->offs, advance ib->offs
+ * by one, and clear MPARSE_UTF8 in *filenc.
+ *
+ * The output is written with a limit of 11 bytes ("\[u10FFFF]" plus
+ * the terminating NUL), so the caller has to make sure that much room
+ * is available at ob->buf + ob->offs.
+ *
+ * Returns 1 if an escape sequence was written, or 0 if nothing was
+ * consumed because no permitted encoding matched.
+ */
+int
+preconv_encode(struct buf *ib, struct buf *ob, int *filenc)
+{
+	size_t		 i, nby;
+	unsigned int	 accum, minval;
+	unsigned char	 cu;
+
+	/* Never look at bytes beyond the end of the input. */
+
+	if (ib->offs >= ib->sz)
+		return(0);
+
+	if ( ! (*filenc & MPARSE_UTF8))
+		goto latin;
+
+	/*
+	 * The lead byte determines the length of the sequence, the
+	 * initial payload bits, and the smallest code point that
+	 * legitimately requires a sequence of that length.
+	 */
+
+	cu = ib->buf[ib->offs];
+	if (cu >= 194 && cu <= 223) {
+		nby = 2;
+		accum = cu & 31;
+		minval = 0x80;
+	} else if (cu >= 224 && cu <= 239) {
+		nby = 3;
+		accum = cu & 15;
+		minval = 0x800;
+	} else if (cu >= 240 && cu <= 244) {
+		nby = 4;
+		accum = cu & 7;
+		minval = 0x10000;
+	} else {
+		/* Continuation byte or invalid lead byte. */
+		goto latin;
+	}
+
+	/*
+	 * Collect six payload bits from each continuation byte.
+	 * The shifts operate on the value of accum, so the result
+	 * is the code point in native representation on any
+	 * architecture; no byte swapping is required.
+	 */
+
+	for (i = 1; i < nby; i++) {
+		if (ib->offs + i >= ib->sz) {
+			/* Incomplete sequence at the end of input. */
+			goto latin;
+		}
+		cu = ib->buf[ib->offs + i];
+		if ((cu & 0xC0) != 0x80) {
+			/* Bad in-sequence bits. */
+			goto latin;
+		}
+		accum = (accum << 6) | (cu & 63);
+	}
+
+	/*
+	 * Reject overlong encodings (which could smuggle in ASCII
+	 * characters, including NUL), UTF-16 surrogates, and values
+	 * beyond the Unicode range.
+	 */
+
+	if (accum < minval ||
+	    (accum >= 0xD800 && accum <= 0xDFFF) ||
+	    accum > 0x10FFFF)
+		goto latin;
+
+	ob->offs += snprintf(ob->buf + ob->offs, 11, "\\[u%.4X]", accum);
+	ib->offs += nby;
+	*filenc &= ~MPARSE_LATIN1;
+	return(1);
+
+	/* Invalid or incomplete UTF-8 sequence, or UTF-8 not allowed. */
+
+latin:
+	if ( ! (*filenc & MPARSE_LATIN1))
+		return(0);
+
+	ob->offs += snprintf(ob->buf + ob->offs, 11, "\\[u%.4X]",
+	    (unsigned int)(unsigned char)ib->buf[ib->offs++]);
+
+	*filenc &= ~MPARSE_UTF8;
+	return(1);
+}
+
+/*
+ * Inspect the line starting at b->buf[b->offs] for an emacs-style
+ * mode line of the form
+ *	.\" -*- [...;] coding: encoding; -*-
+ * Returns MPARSE_UTF8 or MPARSE_LATIN1 if a recognized encoding is
+ * named, 0 if a coding phrase names anything else, and both flags
+ * if the line carries no coding phrase at all.
+ */
+int
+preconv_cue(const struct buf *b)
+{
+	const char	*cp, *eol, *end;
+	size_t		 left, len;
+
+	cp = b->buf + b->offs;
+	left = b->sz - b->offs;
+
+	/* The line ends at the newline or at the end of the buffer. */
+
+	eol = memchr(cp, '\n', left);
+	if (eol == NULL)
+		eol = cp + left;
+	left = (size_t)(eol - cp);
+
+	/* Without the proper header and trailer, there is no cue. */
+
+	if (left < 10)
+		return(MPARSE_UTF8 | MPARSE_LATIN1);
+	if (memcmp(cp, ".\\\" -*-", 7) != 0)
+		return(MPARSE_UTF8 | MPARSE_LATIN1);
+	if (memcmp(eol - 3, "-*-", 3) != 0)
+		return(MPARSE_UTF8 | MPARSE_LATIN1);
+
+	/* Restrict the view to what is between header and trailer. */
+
+	cp += 7;
+	left -= 10;
+
+	for (;;) {
+		for ( ; left > 0 && *cp == ' '; left--)
+			cp++;
+		if (left == 0)
+			break;
+
+		/* A phrase extends through the next ';' or to the trailer. */
+
+		end = memchr(cp, ';', left);
+		end = (end == NULL) ? eol - 3 : end + 1;
+		len = end - cp;
+
+		/* Step over any phrase other than "coding:". */
+
+		if (len < 7 || strncasecmp(cp, "coding:", 7) != 0) {
+			cp += len;
+			left -= len;
+			continue;
+		}
+
+		cp += 7;
+		left -= 7;
+
+		for ( ; left > 0 && *cp == ' '; left--)
+			cp++;
+		if (left == 0)
+			return(0);
+
+		/* Only these two encoding names are recognized. */
+
+		if (len > 4 && strncasecmp(cp, "utf-8", 5) == 0)
+			return(MPARSE_UTF8);
+		if (len > 10 && strncasecmp(cp, "iso-latin-1", 11) == 0)
+			return(MPARSE_LATIN1);
+		return(0);
+	}
+	return(MPARSE_UTF8 | MPARSE_LATIN1);
+}
diff --git a/usr.bin/mandoc/read.c b/usr.bin/mandoc/read.c
index 4dc8374273d..8cec99488ac 100644
--- a/usr.bin/mandoc/read.c
+++ b/usr.bin/mandoc/read.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: read.c,v 1.69 2014/10/28 17:35:42 schwarze Exp $ */
+/* $OpenBSD: read.c,v 1.70 2014/10/30 00:05:02 schwarze Exp $ */
/*
* Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2010-2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -39,11 +39,6 @@
#define REPARSE_LIMIT 1000
-struct buf {
- char *buf; /* binary input buffer */
- size_t sz; /* size of binary buffer */
-};
-
struct mparse {
struct man *pman; /* persistent man parser */
struct mdoc *pmdoc; /* persistent mdoc parser */
@@ -60,6 +55,7 @@ struct mparse {
enum mandoclevel file_status; /* status of current parse */
enum mandoclevel wlevel; /* ignore messages below this */
int options; /* parser options */
+ int filenc; /* encoding of the current file */
int reparse_count; /* finite interp. stack */
int line; /* line number in the file */
};
@@ -321,13 +317,20 @@ mparse_buf_r(struct mparse *curp, struct buf blk, int start)
lnn = curp->line;
pos = 0;
- for (i = 0; i < (int)blk.sz; ) {
+ for (i = blk.offs; i < (int)blk.sz; ) {
if (0 == pos && '\0' == blk.buf[i])
break;
if (start) {
curp->line = lnn;
curp->reparse_count = 0;
+
+ if (lnn < 3 &&
+ curp->filenc & MPARSE_UTF8 &&
+ curp->filenc & MPARSE_LATIN1) {
+ blk.offs = i;
+ curp->filenc = preconv_cue(&blk);
+ }
}
while (i < (int)blk.sz && (start || '\0' != blk.buf[i])) {
@@ -348,27 +351,40 @@ mparse_buf_r(struct mparse *curp, struct buf blk, int start)
}
/*
- * Make sure we have space for at least
- * one backslash and one other character
- * and the trailing NUL byte.
+ * Make sure we have space for the worst
+ * case of 11 bytes: "\\[u10ffff]\0"
*/
- if (pos + 2 >= (int)ln.sz)
+ if (pos + 11 > (int)ln.sz)
resize_buf(&ln, 256);
/*
- * Warn about bogus characters. If you're using
- * non-ASCII encoding, you're screwing your
- * readers. Since I'd rather this not happen,
- * I'll be helpful and replace these characters
- * with "?", so we don't display gibberish.
- * Note to manual writers: use special characters.
+ * Encode 8-bit input.
*/
- c = (unsigned char) blk.buf[i];
+ c = blk.buf[i];
+ if (c & 0x80) {
+ blk.offs = i;
+ ln.offs = pos;
+ if (curp->filenc && preconv_encode(
+ &blk, &ln, &curp->filenc)) {
+ pos = ln.offs;
+ i = blk.offs;
+ } else {
+ mandoc_vmsg(MANDOCERR_BADCHAR,
+ curp, curp->line, pos,
+ "0x%x", c);
+ ln.buf[pos++] = '?';
+ i++;
+ }
+ continue;
+ }
+
+ /*
+ * Exclude control characters.
+ */
- if ( ! (isascii(c) &&
- (isgraph(c) || isblank(c)))) {
+ if (c == 0x7f || (c < 0x20 && c != 0x09)) {
mandoc_vmsg(MANDOCERR_BADCHAR, curp,
curp->line, pos, "0x%x", c);
i++;
@@ -627,6 +643,7 @@ read_whole_file(struct mparse *curp, const char *file, int fd,
return(0);
}
*with_mmap = 1;
+ fb->offs = 0;
fb->sz = (size_t)st.st_size;
fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
if (fb->buf != MAP_FAILED)
@@ -657,6 +674,7 @@ read_whole_file(struct mparse *curp, const char *file, int fd,
ssz = read(fd, fb->buf + (int)off, fb->sz - off);
if (ssz == 0) {
fb->sz = off;
+ fb->offs = 0;
return(1);
}
if (ssz == -1) {
@@ -728,6 +746,15 @@ mparse_parse_buffer(struct mparse *curp, struct buf blk, const char *file)
curp->line = 1;
recursion_depth++;
+ /* Skip an UTF-8 byte order mark. */
+ if (curp->filenc & MPARSE_UTF8 && blk.sz > 2 &&
+ (unsigned char)blk.buf[0] == 0xef &&
+ (unsigned char)blk.buf[1] == 0xbb &&
+ (unsigned char)blk.buf[2] == 0xbf) {
+ blk.offs = 3;
+ curp->filenc &= ~MPARSE_LATIN1;
+ }
+
mparse_buf_r(curp, blk, 1);
if (0 == --recursion_depth && MANDOCLEVEL_FATAL > curp->file_status)
@@ -742,6 +769,7 @@ mparse_readfd(struct mparse *curp, int fd, const char *file)
{
struct buf blk;
int with_mmap;
+ int save_filenc;
if (-1 == fd && -1 == (fd = open(file, O_RDONLY, 0))) {
curp->file_status = MANDOCLEVEL_SYSERR;
@@ -760,7 +788,11 @@ mparse_readfd(struct mparse *curp, int fd, const char *file)
*/
if (read_whole_file(curp, file, fd, &blk, &with_mmap)) {
+ save_filenc = curp->filenc;
+ curp->filenc = curp->options &
+ (MPARSE_UTF8 | MPARSE_LATIN1);
mparse_parse_buffer(curp, blk, file);
+ curp->filenc = save_filenc;
if (with_mmap)
munmap(blk.buf, blk.sz);
else