From 6e429a1541b24996bb20feffa4a20d1ef8881f2d Mon Sep 17 00:00:00 2001
From: Ingo Schwarze <schwarze@cvs.openbsd.org>
Date: Sun, 26 Oct 2014 17:11:19 +0000
Subject: Improve -Tascii output for Unicode escape sequences: For the first
 512 code points, provide ASCII approximations.  This is already much better
 than what groff does, which prints nothing for most code points.

A few minor fixes while here:
* Handle Unicode escape sequences in the ASCII range.
* In case of errors, use the REPLACEMENT CHARACTER U+FFFD for -Tutf8
and the string "<?>" for -Tascii output.
* Handle all one-character escape sequences in mchars_spec2{cp,str}()
and remove the workarounds on the higher level.
---
 usr.bin/mandoc/chars.c      | 23 ++++-------
 usr.bin/mandoc/html.c       | 16 ++++++--
 usr.bin/mandoc/term.c       | 94 +++++++++++++++++++--------------------------
 usr.bin/mandoc/term.h       |  4 +-
 usr.bin/mandoc/term_ascii.c | 77 ++++++++++++++++++++++++++++++++++++-
 5 files changed, 139 insertions(+), 75 deletions(-)

diff --git a/usr.bin/mandoc/chars.c b/usr.bin/mandoc/chars.c
index d213e3aabe0..8135c8d0ba1 100644
--- a/usr.bin/mandoc/chars.c
+++ b/usr.bin/mandoc/chars.c
@@ -1,7 +1,7 @@
-/*	$Id: chars.c,v 1.29 2014/07/23 15:00:00 schwarze Exp $ */
+/*	$OpenBSD: chars.c,v 1.30 2014/10/26 17:11:18 schwarze Exp $ */
 /*
  * Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
- * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2011, 2014 Ingo Schwarze <schwarze@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -100,9 +100,7 @@ mchars_spec2cp(const struct mchars *arg, const char *p, size_t sz)
 	const struct ln	*ln;
 
 	ln = find(arg, p, sz);
-	if (NULL == ln)
-		return(-1);
-	return(ln->unicode);
+	return(ln != NULL ? ln->unicode : sz == 1 ? *p : -1);
 }
 
 char
@@ -122,20 +120,13 @@ mchars_num2uc(const char *p, size_t sz)
 	int	 i;
 
 	if ((i = mandoc_strntoi(p, sz, 16)) < 0)
-		return('\0');
+		return(0xFFFD);
 
 	/*
-	 * Security warning:
-	 * Never extend the range of accepted characters
-	 * to overlap with the ASCII range, 0x00-0x7F
-	 * without re-auditing the callers of this function.
-	 * Some callers might relay on the fact that we never
-	 * return ASCII characters for their escaping decisions.
-	 *
 	 * XXX Code is missing here to exclude bogus ranges.
 	 */
 
-	return(i > 0x80 && i <= 0x10FFFF ? i : '\0');
+	return(i <= 0x10FFFF ? i : 0xFFFD);
 }
 
 const char *
@@ -145,9 +136,9 @@ mchars_spec2str(const struct mchars *arg,
 	const struct ln	*ln;
 
 	ln = find(arg, p, sz);
-	if (NULL == ln) {
+	if (ln == NULL) {
 		*rsz = 1;
-		return(NULL);
+		return(sz == 1 ? p : NULL);
 	}
 
 	*rsz = strlen(ln->ascii);
diff --git a/usr.bin/mandoc/html.c b/usr.bin/mandoc/html.c
index 4ec3fbdc8c6..219671ec597 100644
--- a/usr.bin/mandoc/html.c
+++ b/usr.bin/mandoc/html.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: html.c,v 1.46 2014/10/13 21:05:59 chl Exp $ */
+/*	$OpenBSD: html.c,v 1.47 2014/10/26 17:11:18 schwarze Exp $ */
 /*
  * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
  * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -435,8 +435,18 @@ print_encode(struct html *h, const char *p, int norecurse)
 		case ESCAPE_UNICODE:
 			/* Skip past "u" header. */
 			c = mchars_num2uc(seq + 1, len - 1);
-			if ('\0' != c)
-				printf("&#x%x;", c);
+
+			/*
+			 * XXX Security warning:
+			 * For now, forbid Unicode obfuscation of ASCII
+			 * characters.  An audit of the callers is
+			 * required before this can be removed.
+			 */
+
+			if (c < 0x80)
+				c = 0xFFFD;
+
+			printf("&#x%x;", c);
 			break;
 		case ESCAPE_NUMBERED:
 			c = mchars_num2char(seq, len);
diff --git a/usr.bin/mandoc/term.c b/usr.bin/mandoc/term.c
index 5ca55b69f3f..b64b49095f3 100644
--- a/usr.bin/mandoc/term.c
+++ b/usr.bin/mandoc/term.c
@@ -1,4 +1,4 @@
-/*	$Id: term.c,v 1.88 2014/08/18 22:21:52 schwarze Exp $ */
+/*	$OpenBSD: term.c,v 1.89 2014/10/26 17:11:18 schwarze Exp $ */
 /*
  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
  * Copyright (c) 2010-2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -442,27 +442,14 @@ term_word(struct termp *p, const char *word)
 		if (ESCAPE_ERROR == esc)
 			continue;
 
-		if (TERMENC_ASCII != p->enc)
-			switch (esc) {
-			case ESCAPE_UNICODE:
-				uc = mchars_num2uc(seq + 1, sz - 1);
-				if ('\0' == uc)
-					break;
-				encode1(p, uc);
-				continue;
-			case ESCAPE_SPECIAL:
-				uc = mchars_spec2cp(p->symtab, seq, sz);
-				if (uc <= 0)
-					break;
-				encode1(p, uc);
-				continue;
-			default:
-				break;
-			}
-
 		switch (esc) {
 		case ESCAPE_UNICODE:
-			encode1(p, '?');
+			uc = mchars_num2uc(seq + 1, sz - 1);
+			if (p->enc == TERMENC_ASCII) {
+				cp = ascii_uc2str(uc);
+				encode(p, cp, strlen(cp));
+			} else
+				encode1(p, uc);
 			break;
 		case ESCAPE_NUMBERED:
 			c = mchars_num2char(seq, sz);
@@ -470,11 +457,19 @@ term_word(struct termp *p, const char *word)
 				encode(p, &c, 1);
 			break;
 		case ESCAPE_SPECIAL:
-			cp = mchars_spec2str(p->symtab, seq, sz, &ssz);
-			if (NULL != cp)
-				encode(p, cp, ssz);
-			else if (1 == ssz)
-				encode(p, seq, sz);
+			if (p->enc == TERMENC_ASCII) {
+				cp = mchars_spec2str(p->symtab,
+				    seq, sz, &ssz);
+				if (cp == NULL)
+					encode(p, "<?>", 3);
+				else
+					encode(p, cp, ssz);
+			} else {
+				uc = mchars_spec2cp(p->symtab, seq, sz);
+				if (uc <= 0)
+					uc = 0xFFFD;
+				encode1(p, uc);
+			}
 			break;
 		case ESCAPE_FONTBOLD:
 			term_fontrepl(p, TERMFONT_BOLD);
@@ -681,31 +676,16 @@ term_strlen(const struct termp *p, const char *cp)
 			if (ESCAPE_ERROR == esc)
 				continue;
 
-			if (TERMENC_ASCII != p->enc)
-				switch (esc) {
-				case ESCAPE_UNICODE:
-					c = mchars_num2uc(seq + 1,
-					    ssz - 1);
-					if ('\0' == c)
-						break;
-					sz += cond_width(p, c, &skip);
-					continue;
-				case ESCAPE_SPECIAL:
-					c = mchars_spec2cp(p->symtab,
-					    seq, ssz);
-					if (c <= 0)
-						break;
-					sz += cond_width(p, c, &skip);
-					continue;
-				default:
-					break;
-				}
-
 			rhs = NULL;
 
 			switch (esc) {
 			case ESCAPE_UNICODE:
-				sz += cond_width(p, '?', &skip);
+				c = mchars_num2uc(seq + 1, ssz - 1);
+				if (p->enc == TERMENC_ASCII) {
+					rhs = ascii_uc2str(c);
+					rsz = strlen(rhs);
+				} else
+					sz += cond_width(p, c, &skip);
 				break;
 			case ESCAPE_NUMBERED:
 				c = mchars_num2char(seq, ssz);
@@ -713,14 +693,20 @@ term_strlen(const struct termp *p, const char *cp)
 					sz += cond_width(p, c, &skip);
 				break;
 			case ESCAPE_SPECIAL:
-				rhs = mchars_spec2str(p->symtab,
-				    seq, ssz, &rsz);
-
-				if (ssz != 1 || rhs)
-					break;
-
-				rhs = seq;
-				rsz = ssz;
+				if (p->enc == TERMENC_ASCII) {
+					rhs = mchars_spec2str(p->symtab,
+					    seq, ssz, &rsz);
+					if (rhs == NULL) {
+						rhs = "<?>";
+						rsz = 3;
+					}
+				} else {
+					c = mchars_spec2cp(p->symtab,
+					    seq, ssz);
+					if (c <= 0)
+						c = 0xFFFD;
+					sz += cond_width(p, c, &skip);
+				}
 				break;
 			case ESCAPE_SKIPCHAR:
 				skip = 1;
diff --git a/usr.bin/mandoc/term.h b/usr.bin/mandoc/term.h
index e79926f0f73..78d26fc6075 100644
--- a/usr.bin/mandoc/term.h
+++ b/usr.bin/mandoc/term.h
@@ -1,4 +1,4 @@
-/*	$OpenBSD: term.h,v 1.47 2014/09/17 20:17:55 schwarze Exp $ */
+/*	$OpenBSD: term.h,v 1.48 2014/10/26 17:11:18 schwarze Exp $ */
 /*
  * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
  * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -104,6 +104,8 @@ struct	termp {
 	struct termp_ps	 *ps;
 };
 
+const char	 *ascii_uc2str(int);
+
 void		  term_eqn(struct termp *, const struct eqn *);
 void		  term_tbl(struct termp *, const struct tbl_span *);
 void		  term_free(struct termp *);
diff --git a/usr.bin/mandoc/term_ascii.c b/usr.bin/mandoc/term_ascii.c
index d813a640766..8fbb398d1ee 100644
--- a/usr.bin/mandoc/term_ascii.c
+++ b/usr.bin/mandoc/term_ascii.c
@@ -1,4 +1,4 @@
-/*	$OpenBSD: term_ascii.c,v 1.21 2014/09/03 05:17:08 schwarze Exp $ */
+/*	$OpenBSD: term_ascii.c,v 1.22 2014/10/26 17:11:18 schwarze Exp $ */
 /*
  * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
  * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -255,6 +255,81 @@ ascii_hspan(const struct termp *p, const struct roffsu *su)
 	return(r);
 }
 
+const char *
+ascii_uc2str(int uc)
+{
+	static const char nbrsp[2] = { ASCII_NBRSP, '\0' };	/* U+00A0 */
+	static const char *tab[] = {	/* indexed by code point */
+	"<NUL>","<SOH>","<STX>","<ETX>","<EOT>","<ENQ>","<ACK>","<BEL>",
+	"<BS>",	"\t",	"<LF>",	"<VT>",	"<FF>",	"<CR>",	"<SO>",	"<SI>",
+	"<DLE>","<DC1>","<DC2>","<DC3>","<DC4>","<NAK>","<SYN>","<ETB>",
+	"<CAN>","<EM>",	"<SUB>","<ESC>","<FS>",	"<GS>",	"<RS>",	"<US>",
+	" ",	"!",	"\"",	"#",	"$",	"%",	"&",	"'",
+	"(",	")",	"*",	"+",	",",	"-",	".",	"/",
+	"0",	"1",	"2",	"3",	"4",	"5",	"6",	"7",
+	"8",	"9",	":",	";",	"<",	"=",	">",	"?",
+	"@",	"A",	"B",	"C",	"D",	"E",	"F",	"G",
+	"H",	"I",	"J",	"K",	"L",	"M",	"N",	"O",
+	"P",	"Q",	"R",	"S",	"T",	"U",	"V",	"W",
+	"X",	"Y",	"Z",	"[",	"\\",	"]",	"^",	"_",
+	"`",	"a",	"b",	"c",	"d",	"e",	"f",	"g",
+	"h",	"i",	"j",	"k",	"l",	"m",	"n",	"o",
+	"p",	"q",	"r",	"s",	"t",	"u",	"v",	"w",
+	"x",	"y",	"z",	"{",	"|",	"}",	"~",	"<DEL>",
+	"<80>",	"<81>",	"<82>",	"<83>",	"<84>",	"<85>",	"<86>",	"<87>",
+	"<88>",	"<89>",	"<8A>",	"<8B>",	"<8C>",	"<8D>",	"<8E>",	"<8F>",
+	"<90>",	"<91>",	"<92>",	"<93>",	"<94>",	"<95>",	"<96>",	"<97>",
+	"<98>",	"<99>",	"<9A>",	"<9B>",	"<9C>",	"<9D>",	"<9E>",	"<9F>",
+	nbrsp,	"!",	"c",	"GBP",	"$?",	"Y=",	"|",	"<sec>",
+	"\"",	"(C)",	"a.",	"<<",	"<not>","",	"(R)",	"-",
+	"<deg>","+-",	"^2",	"^3",	"'",	"<my>",	"<par>","*",
+	",",	"^1",	"o.",	">>",	"1/4",	"1/2",	"3/4",	"?",
+	"A",	"A",	"A",	"A",	"Ae",	"Aa",	"AE",	"C",
+	"E",	"E",	"E",	"E",	"I",	"I",	"I",	"I",
+	"D",	"N",	"O",	"O",	"O",	"O",	"Oe",	"*",
+	"Oe",	"U",	"U",	"U",	"Ue",	"Y",	"Th",	"ss",
+	"a",	"a",	"a",	"a",	"ae",	"aa",	"ae",	"c",
+	"e",	"e",	"e",	"e",	"i",	"i",	"i",	"i",
+	"d",	"n",	"o",	"o",	"o",	"o",	"oe",	"/",
+	"oe",	"u",	"u",	"u",	"ue",	"y",	"th",	"y",
+	"A",	"a",	"A",	"a",	"A",	"a",	"C",	"c",
+	"C",	"c",	"C",	"c",	"C",	"c",	"D",	"d",
+	"D",	"d",	"E",	"e",	"E",	"e",	"E",	"e",
+	"E",	"e",	"E",	"e",	"G",	"g",	"G",	"g",
+	"G",	"g",	"G",	"g",	"H",	"h",	"H",	"h",
+	"I",	"i",	"I",	"i",	"I",	"i",	"I",	"i",
+	"I",	"i",	"IJ",	"ij",	"J",	"j",	"K",	"k",
+	"q",	"L",	"l",	"L",	"l",	"L",	"l",	"L",
+	"l",	"L",	"l",	"N",	"n",	"N",	"n",	"N",
+	"n",	"'n",	"Ng",	"ng",	"O",	"o",	"O",	"o",
+	"O",	"o",	"OE",	"oe",	"R",	"r",	"R",	"r",
+	"R",	"r",	"S",	"s",	"S",	"s",	"S",	"s",
+	"S",	"s",	"T",	"t",	"T",	"t",	"T",	"t",
+	"U",	"u",	"U",	"u",	"U",	"u",	"U",	"u",
+	"U",	"u",	"U",	"u",	"W",	"w",	"Y",	"y",
+	"Y",	"Z",	"z",	"Z",	"z",	"Z",	"z",	"s",
+	"b",	"B",	"B",	"b",	"6",	"6",	"O",	"C",
+	"c",	"D",	"D",	"D",	"d",	"d",	"3",	"@",
+	"E",	"F",	"f",	"G",	"G",	"hv",	"I",	"I",
+	"K",	"k",	"l",	"l",	"W",	"N",	"n",	"O",
+	"O",	"o",	"OI",	"oi",	"P",	"p",	"YR",	"2",
+	"2",	"SH",	"sh",	"t",	"T",	"t",	"T",	"U",
+	"u",	"Y",	"V",	"Y",	"y",	"Z",	"z",	"ZH",
+	"ZH",	"zh",	"zh",	"2",	"5",	"5",	"ts",	"w",
+	"|",	"||",	"|=",	"!",	"DZ",	"Dz",	"dz",	"LJ",
+	"Lj",	"lj",	"NJ",	"Nj",	"nj",	"A",	"a",	"I",
+	"i",	"O",	"o",	"U",	"u",	"U",	"u",	"U",
+	"u",	"U",	"u",	"U",	"u",	"@",	"A",	"a",
+	"A",	"a",	"AE",	"ae",	"G",	"g",	"G",	"g",
+	"K",	"k",	"O",	"o",	"O",	"o",	"ZH",	"zh",
+	"j",	"DZ",	"D",	"dz",	"G",	"g",	"HV",	"W",
+	"N",	"n",	"A",	"a",	"AE",	"ae",	"O",	"o"};
+
+	if (uc < 0 || (size_t)uc >= sizeof(tab)/sizeof(tab[0]))
+		return("<?>");	/* outside the table */
+	return(tab[uc]);
+}
+
 static size_t
 locale_width(const struct termp *p, int c)
 {
-- 
cgit v1.2.3