src - OpenBSD base system

diff options


context:
space:
mode:

author	Ingo Schwarze <schwarze@cvs.openbsd.org>	2014-10-26 17:11:19 +0000
committer	Ingo Schwarze <schwarze@cvs.openbsd.org>	2014-10-26 17:11:19 +0000
commit	6e429a1541b24996bb20feffa4a20d1ef8881f2d (patch)
tree	6fd1c404b6d613e88aa331ac4a54c70546ac505b
parent	395b22669041af25c911b3efeda5f1f3cae19027 (diff)

Improve -Tascii output for Unicode escape sequences: For the first 512
code points, provide ASCII approximations.  This is already much better
than what groff does, which prints nothing for most code points.

A few minor fixes while here:
* Handle Unicode escape sequences in the ASCII range.
* In case of errors, use the REPLACEMENT CHARACTER U+FFFD for -Tutf8
  and the string "<?>" for -Tascii output.
* Handle all one-character escape sequences in mchars_spec2{cp,str}()
  and remove the workarounds on the higher level.

Diffstat

-rw-r--r--

usr.bin/mandoc/chars.c

-rw-r--r--

usr.bin/mandoc/html.c

-rw-r--r--

usr.bin/mandoc/term.c

-rw-r--r--

usr.bin/mandoc/term.h

-rw-r--r--

usr.bin/mandoc/term_ascii.c

5 files changed, 139 insertions, 75 deletions

diff --git a/usr.bin/mandoc/chars.c b/usr.bin/mandoc/chars.c
index d213e3aabe0..8135c8d0ba1 100644
--- a/usr.bin/mandoc/chars.c
+++ b/usr.bin/mandoc/chars.c

@@ -1,7 +1,7 @@

-/* $Id: chars.c,v 1.29 2014/07/23 15:00:00 schwarze Exp $ */

+/* $OpenBSD: chars.c,v 1.30 2014/10/26 17:11:18 schwarze Exp $ */

* Permission to use, copy, modify, and distribute this software for any

* purpose with or without fee is hereby granted, provided that the above

@@ -100,9 +100,7 @@ mchars_spec2cp(const struct mchars *arg, const char *p, size_t sz)

const struct ln *ln;

ln = find(arg, p, sz);

- if (NULL == ln)

- return(-1);

- return(ln->unicode);

+ return(ln != NULL ? ln->unicode : sz == 1 ? *p : -1);

}

char

@@ -122,20 +120,13 @@ mchars_num2uc(const char *p, size_t sz)

int i;

if ((i = mandoc_strntoi(p, sz, 16)) < 0)

- return('\0');

+ return(0xFFFD);

- * Security warning:

- * Never extend the range of accepted characters

- * to overlap with the ASCII range, 0x00-0x7F

- * without re-auditing the callers of this function.

- * Some callers might relay on the fact that we never

- * return ASCII characters for their escaping decisions.

- *

* XXX Code is missing here to exclude bogus ranges.

- return(i > 0x80 && i <= 0x10FFFF ? i : '\0');

+ return(i <= 0x10FFFF ? i : 0xFFFD);

}

const char *

@@ -145,9 +136,9 @@ mchars_spec2str(const struct mchars *arg,

const struct ln *ln;

ln = find(arg, p, sz);

- if (NULL == ln) {

+ if (ln == NULL) {

*rsz = 1;

- return(NULL);

+ return(sz == 1 ? p : NULL);

}

*rsz = strlen(ln->ascii);

diff --git a/usr.bin/mandoc/html.c b/usr.bin/mandoc/html.c
index 4ec3fbdc8c6..219671ec597 100644
--- a/usr.bin/mandoc/html.c
+++ b/usr.bin/mandoc/html.c

@@ -1,4 +1,4 @@

-/* $OpenBSD: html.c,v 1.46 2014/10/13 21:05:59 chl Exp $ */

+/* $OpenBSD: html.c,v 1.47 2014/10/26 17:11:18 schwarze Exp $ */

@@ -435,8 +435,18 @@ print_encode(struct html *h, const char *p, int norecurse)

case ESCAPE_UNICODE:

/* Skip past "u" header. */

c = mchars_num2uc(seq + 1, len - 1);

- if ('\0' != c)

- printf("&#x%x;", c);

+ /*

+ * XXX Security warning:

+ * For now, forbid Unicode obfuscation of ASCII

+ * characters. An audit of the callers is

+ * required before this can be removed.

+ */

+ if (c < 0x80)

+ c = 0xFFFD;

+ printf("&#x%x;", c);

break;

case ESCAPE_NUMBERED:

c = mchars_num2char(seq, len);

diff --git a/usr.bin/mandoc/term.c b/usr.bin/mandoc/term.c
index 5ca55b69f3f..b64b49095f3 100644
--- a/usr.bin/mandoc/term.c
+++ b/usr.bin/mandoc/term.c

@@ -1,4 +1,4 @@

-/* $Id: term.c,v 1.88 2014/08/18 22:21:52 schwarze Exp $ */

+/* $OpenBSD: term.c,v 1.89 2014/10/26 17:11:18 schwarze Exp $ */

@@ -442,27 +442,14 @@ term_word(struct termp *p, const char *word)

if (ESCAPE_ERROR == esc)

continue;

- if (TERMENC_ASCII != p->enc)

- switch (esc) {

- case ESCAPE_UNICODE:

- uc = mchars_num2uc(seq + 1, sz - 1);

- if ('\0' == uc)

- break;

- encode1(p, uc);

- continue;

- case ESCAPE_SPECIAL:

- uc = mchars_spec2cp(p->symtab, seq, sz);

- if (uc <= 0)

- break;

- encode1(p, uc);

- continue;

- default:

- break;

- }

switch (esc) {

case ESCAPE_UNICODE:

- encode1(p, '?');

+ uc = mchars_num2uc(seq + 1, sz - 1);

+ if (p->enc == TERMENC_ASCII) {

+ cp = ascii_uc2str(uc);

+ encode(p, cp, strlen(cp));

+ } else

+ encode1(p, uc);

break;

case ESCAPE_NUMBERED:

c = mchars_num2char(seq, sz);

@@ -470,11 +457,19 @@ term_word(struct termp *p, const char *word)

encode(p, &c, 1);

break;

case ESCAPE_SPECIAL:

- cp = mchars_spec2str(p->symtab, seq, sz, &ssz);

- if (NULL != cp)

- encode(p, cp, ssz);

- else if (1 == ssz)

- encode(p, seq, sz);

+ if (p->enc == TERMENC_ASCII) {

+ cp = mchars_spec2str(p->symtab,

+ seq, sz, &ssz);

+ if (cp == NULL)

+ encode(p, "<?>", 3);

+ else

+ encode(p, cp, ssz);

+ } else {

+ uc = mchars_spec2cp(p->symtab, seq, sz);

+ if (uc <= 0)

+ uc = 0xFFFD;

+ encode1(p, uc);

+ }

break;

case ESCAPE_FONTBOLD:

term_fontrepl(p, TERMFONT_BOLD);

@@ -681,31 +676,16 @@ term_strlen(const struct termp *p, const char *cp)

if (ESCAPE_ERROR == esc)

continue;

- if (TERMENC_ASCII != p->enc)

- switch (esc) {

- case ESCAPE_UNICODE:

- c = mchars_num2uc(seq + 1,

- ssz - 1);

- if ('\0' == c)

- break;

- sz += cond_width(p, c, &skip);

- continue;

- case ESCAPE_SPECIAL:

- c = mchars_spec2cp(p->symtab,

- seq, ssz);

- if (c <= 0)

- break;

- sz += cond_width(p, c, &skip);

- continue;

- default:

- break;

- }

rhs = NULL;

switch (esc) {

case ESCAPE_UNICODE:

- sz += cond_width(p, '?', &skip);

+ c = mchars_num2uc(seq + 1, sz - 1);

+ if (p->enc == TERMENC_ASCII) {

+ rhs = ascii_uc2str(c);

+ rsz = strlen(rhs);

+ } else

+ sz += cond_width(p, c, &skip);

break;

case ESCAPE_NUMBERED:

c = mchars_num2char(seq, ssz);

@@ -713,14 +693,20 @@ term_strlen(const struct termp *p, const char *cp)

sz += cond_width(p, c, &skip);

break;

case ESCAPE_SPECIAL:

- rhs = mchars_spec2str(p->symtab,

- seq, ssz, &rsz);

- if (ssz != 1 || rhs)

- break;

- rhs = seq;

- rsz = ssz;

+ if (p->enc == TERMENC_ASCII) {

+ rhs = mchars_spec2str(p->symtab,

+ seq, ssz, &rsz);

+ if (rhs == NULL) {

+ rhs = "<?>";

+ rsz = 3;

+ }

+ } else {

+ c = mchars_spec2cp(p->symtab,

+ seq, ssz);

+ if (c <= 0)

+ c = 0xFFFD;

+ sz += cond_width(p, c, &skip);

+ }

break;

case ESCAPE_SKIPCHAR:

skip = 1;

diff --git a/usr.bin/mandoc/term.h b/usr.bin/mandoc/term.h
index e79926f0f73..78d26fc6075 100644
--- a/usr.bin/mandoc/term.h
+++ b/usr.bin/mandoc/term.h

@@ -1,4 +1,4 @@

-/* $OpenBSD: term.h,v 1.47 2014/09/17 20:17:55 schwarze Exp $ */

+/* $OpenBSD: term.h,v 1.48 2014/10/26 17:11:18 schwarze Exp $ */

@@ -104,6 +104,8 @@ struct termp {

struct termp_ps *ps;

};

+const char *ascii_uc2str(int);

void term_eqn(struct termp *, const struct eqn *);

void term_tbl(struct termp *, const struct tbl_span *);

void term_free(struct termp *);

diff --git a/usr.bin/mandoc/term_ascii.c b/usr.bin/mandoc/term_ascii.c
index d813a640766..8fbb398d1ee 100644
--- a/usr.bin/mandoc/term_ascii.c
+++ b/usr.bin/mandoc/term_ascii.c

@@ -1,4 +1,4 @@

-/* $OpenBSD: term_ascii.c,v 1.21 2014/09/03 05:17:08 schwarze Exp $ */

+/* $OpenBSD: term_ascii.c,v 1.22 2014/10/26 17:11:18 schwarze Exp $ */

@@ -255,6 +255,81 @@ ascii_hspan(const struct termp *p, const struct roffsu *su)

return(r);

}

+/*
+ * Return an ASCII approximation for the Unicode code point uc.
+ * The table below has one entry for each of the first 512 code
+ * points (U+0000 to U+01FF); for a negative argument or one beyond
+ * the end of the table, the string "<?>" is returned instead.
+ * The result points to static storage.
+ */
+const char *
+ascii_uc2str(int uc)
+{
+	static const char nbrsp[2] = { ASCII_NBRSP, '\0' };
+	static const char *tab[] = {
+	"<NUL>","<SOH>","<STX>","<ETX>","<EOT>","<ENQ>","<ACK>","<BEL>",
+	"<BS>", "\t", "<LF>", "<VT>", "<FF>", "<CR>", "<SO>", "<SI>",
+	"<DLE>","<DC1>","<DC2>","<DC3>","<DC4>","<NAK>","<SYN>","<ETB>",
+	"<CAN>","<EM>", "<SUB>","<ESC>","<FS>", "<GS>", "<RS>", "<US>",
+	" ", "!", "\"", "#", "$", "%", "&", "'",
+	"(", ")", "*", "+", ",", "-", ".", "/",
+	"0", "1", "2", "3", "4", "5", "6", "7",
+	"8", "9", ":", ";", "<", "=", ">", "?",
+	"@", "A", "B", "C", "D", "E", "F", "G",
+	"H", "I", "J", "K", "L", "M", "N", "O",
+	"P", "Q", "R", "S", "T", "U", "V", "W",
+	"X", "Y", "Z", "[", "\\", "]", "^", "_",
+	"`", "a", "b", "c", "d", "e", "f", "g",
+	"h", "i", "j", "k", "l", "m", "n", "o",
+	"p", "q", "r", "s", "t", "u", "v", "w",
+	"x", "y", "z", "{", "|", "}", "~", "<DEL>",
+	"<80>", "<81>", "<82>", "<83>", "<84>", "<85>", "<86>", "<87>",
+	"<88>", "<89>", "<8A>", "<8B>", "<8C>", "<8D>", "<8E>", "<8F>",
+	"<90>", "<91>", "<92>", "<93>", "<94>", "<95>", "<96>", "<97>",
+	/* 0x98 previously duplicated the "<99>" label of 0x99. */
+	"<98>", "<99>", "<9A>", "<9B>", "<9C>", "<9D>", "<9E>", "<9F>",
+	nbrsp, "!", "c", "GBP", "$?", "Y=", "|", "<sec>",
+	"\"", "(C)", "a.", "<<", "<not>","", "(R)", "-",
+	"<deg>","+-", "^2", "^3", "'", "<my>", "<par>","*",
+	",", "^1", "o.", ">>", "1/4", "1/2", "3/4", "?",
+	"A", "A", "A", "A", "Ae", "Aa", "AE", "C",
+	"E", "E", "E", "E", "I", "I", "I", "I",
+	"D", "N", "O", "O", "O", "O", "Oe", "*",
+	"Oe", "U", "U", "U", "Ue", "Y", "Th", "ss",
+	"a", "a", "a", "a", "ae", "aa", "ae", "c",
+	"e", "e", "e", "e", "i", "i", "i", "i",
+	"d", "n", "o", "o", "o", "o", "oe", "/",
+	"oe", "u", "u", "u", "ue", "y", "th", "y",
+	"A", "a", "A", "a", "A", "a", "C", "c",
+	"C", "c", "C", "c", "C", "c", "D", "d",
+	"D", "d", "E", "e", "E", "e", "E", "e",
+	"E", "e", "E", "e", "G", "g", "G", "g",
+	"G", "g", "G", "g", "H", "h", "H", "h",
+	"I", "i", "I", "i", "I", "i", "I", "i",
+	"I", "i", "IJ", "ij", "J", "j", "K", "k",
+	"q", "L", "l", "L", "l", "L", "l", "L",
+	"l", "L", "l", "N", "n", "N", "n", "N",
+	"n", "'n", "Ng", "ng", "O", "o", "O", "o",
+	"O", "o", "OE", "oe", "R", "r", "R", "r",
+	"R", "r", "S", "s", "S", "s", "S", "s",
+	"S", "s", "T", "t", "T", "t", "T", "t",
+	"U", "u", "U", "u", "U", "u", "U", "u",
+	"U", "u", "U", "u", "W", "w", "Y", "y",
+	"Y", "Z", "z", "Z", "z", "Z", "z", "s",
+	"b", "B", "B", "b", "6", "6", "O", "C",
+	"c", "D", "D", "D", "d", "d", "3", "@",
+	"E", "F", "f", "G", "G", "hv", "I", "I",
+	"K", "k", "l", "l", "W", "N", "n", "O",
+	"O", "o", "OI", "oi", "P", "p", "YR", "2",
+	"2", "SH", "sh", "t", "T", "t", "T", "U",
+	"u", "Y", "V", "Y", "y", "Z", "z", "ZH",
+	"ZH", "zh", "zh", "2", "5", "5", "ts", "w",
+	"|", "||", "|=", "!", "DZ", "Dz", "dz", "LJ",
+	"Lj", "lj", "NJ", "Nj", "nj", "A", "a", "I",
+	"i", "O", "o", "U", "u", "U", "u", "U",
+	"u", "U", "u", "U", "u", "@", "A", "a",
+	"A", "a", "AE", "ae", "G", "g", "G", "g",
+	"K", "k", "O", "o", "O", "o", "ZH", "zh",
+	"j", "DZ", "D", "dz", "G", "g", "HV", "W",
+	"N", "n", "A", "a", "AE", "ae", "O", "o"};
+
+	/* Reject negative values and code points past the end of the table. */
+	if (uc < 0 || (size_t)uc >= sizeof(tab)/sizeof(tab[0]))
+		return("<?>");
+	return(tab[uc]);
+}

static size_t

locale_width(const struct termp *p, int c)

{