summary | refs | log | tree | commit | diff
path: root/usr.bin/mandoc
diff options
context:
space:
mode:
author: Ingo Schwarze <schwarze@cvs.openbsd.org> 2014-12-19 04:57:12 +0000
committer: Ingo Schwarze <schwarze@cvs.openbsd.org> 2014-12-19 04:57:12 +0000
commit: 471959573eec2067dcde4dd2b29ca931a17c6983 (patch)
tree: f3c1c78ee8e3c7ebca409b6762ba714c943f1f02 /usr.bin/mandoc
parent: 40ac43966a02a9df27bd6166381399205a4db7a0 (diff)
Rewrite the low-level UTF-8 parser from scratch.
It accepted invalid byte sequences like 0xc080-c1bf, 0xe08080-e09fbf, 0xeda080-edbfbf, and 0xf0808080-f08fbfbf, produced valid roff Unicode escape sequences from them, and the algorithm contained strong defenses against any attempt to fix it. This cures an assertion failure in the terminal formatter caused by sneaking in ASCII 0x08 (backspace) by "encoding" it as an (invalid) multibyte UTF-8 sequence, found by jsg@ with afl. As a bonus, the new algorithm also reduces the code in the function by about 20%.
Diffstat (limited to 'usr.bin/mandoc')
-rw-r--r-- usr.bin/mandoc/preconv.c 135
1 files changed, 59 insertions, 76 deletions
diff --git a/usr.bin/mandoc/preconv.c b/usr.bin/mandoc/preconv.c
index 8e4a1739f76..3d5a30655f6 100644
--- a/usr.bin/mandoc/preconv.c
+++ b/usr.bin/mandoc/preconv.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: preconv.c,v 1.4 2014/11/28 19:25:03 schwarze Exp $ */
+/* $OpenBSD: preconv.c,v 1.5 2014/12/19 04:57:11 schwarze Exp $ */
/*
* Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -17,6 +17,7 @@
*/
#include <sys/types.h>
+#include <assert.h>
#include <stdio.h>
#include <string.h>
#include "mandoc.h"
@@ -26,88 +27,70 @@ int
preconv_encode(struct buf *ib, size_t *ii, struct buf *ob, size_t *oi,
int *filenc)
{
- size_t i;
- int state;
+ unsigned char *cu;
+ int nby;
unsigned int accum;
- unsigned char cu;
+
+ cu = ib->buf + *ii;
+ assert(*cu & 0x80);
if ( ! (*filenc & MPARSE_UTF8))
goto latin;
- state = 0;
- accum = 0U;
-
- for (i = *ii; i < ib->sz; i++) {
- cu = ib->buf[i];
- if (state) {
- if ( ! (cu & 128) || (cu & 64)) {
- /* Bad sequence header. */
- break;
- }
-
- /* Accept only legitimate bit patterns. */
-
- if (cu > 191 || cu < 128) {
- /* Bad in-sequence bits. */
- break;
- }
-
- accum |= (cu & 63) << --state * 6;
-
- if (state)
- continue;
-
- if (accum < 0x80)
- ob->buf[(*oi)++] = accum;
- else
- *oi += snprintf(ob->buf + *oi,
- 11, "\\[u%.4X]", accum);
- *ii = i + 1;
- *filenc &= ~MPARSE_LATIN1;
- return(1);
- } else {
- /*
- * Entering a UTF-8 state: if we encounter a
- * UTF-8 bitmask, calculate the expected UTF-8
- * state from it.
- */
- for (state = 0; state < 7; state++)
- if ( ! (cu & (1 << (7 - state))))
- break;
-
- /* Accept only legitimate bit patterns. */
-
- switch (state--) {
- case (4):
- if (cu <= 244 && cu >= 240) {
- accum = (cu & 7) << 18;
- continue;
- }
- /* Bad 4-sequence start bits. */
- break;
- case (3):
- if (cu <= 239 && cu >= 224) {
- accum = (cu & 15) << 12;
- continue;
- }
- /* Bad 3-sequence start bits. */
- break;
- case (2):
- if (cu <= 223 && cu >= 194) {
- accum = (cu & 31) << 6;
- continue;
- }
- /* Bad 2-sequence start bits. */
- break;
- default:
- /* Bad sequence bit mask. */
- break;
- }
- break;
- }
+ nby = 1;
+ while (nby < 5 && *cu & (1 << (7 - nby)))
+ nby++;
+
+ switch (nby) {
+ case 2:
+ accum = *cu & 0x1f;
+ if (accum < 0x02) /* Obfuscated ASCII. */
+ goto latin;
+ break;
+ case 3:
+ accum = *cu & 0x0f;
+ break;
+ case 4:
+ accum = *cu & 0x07;
+ if (accum > 0x04) /* Beyond Unicode. */
+ goto latin;
+ break;
+ default: /* Bad sequence header. */
+ goto latin;
+ }
+
+ cu++;
+ switch (nby) {
+ case 3:
+ if ((accum == 0x00 && ! (*cu & 0x20)) || /* Use 2-byte. */
+ (accum == 0x0d && *cu & 0x20)) /* Surrogates. */
+ goto latin;
+ break;
+ case 4:
+ if ((accum == 0x00 && ! (*cu & 0x30)) || /* Use 3-byte. */
+ (accum == 0x04 && *cu & 0x30)) /* Beyond Unicode. */
+ goto latin;
+ break;
+ default:
+ break;
+ }
+
+ while (--nby) {
+ if ((*cu & 0xc0) != 0x80) /* Invalid continuation. */
+ goto latin;
+ accum <<= 6;
+ accum += *cu & 0x3f;
+ cu++;
}
- /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
+ assert(accum > 0x7f);
+ assert(accum < 0x110000);
+ assert(accum < 0xd800 || accum > 0xdfff);
+
+ *oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum);
+ *ii = (char *)cu - ib->buf;
+ *filenc &= ~MPARSE_LATIN1;
+ return(1);
latin:
if ( ! (*filenc & MPARSE_LATIN1))