From 6e429a1541b24996bb20feffa4a20d1ef8881f2d Mon Sep 17 00:00:00 2001 From: Ingo Schwarze Date: Sun, 26 Oct 2014 17:11:19 +0000 Subject: Improve -Tascii output for Unicode escape sequences: For the first 512 code points, provide ASCII approximations. This is already much better than what groff does, which prints nothing for most code points. A few minor fixes while here: * Handle Unicode escape sequences in the ASCII range. * In case of errors, use the REPLACEMENT CHARACTER U+FFFD for -Tutf8 and the string "<?>" for -Tascii output. * Handle all one-character escape sequences in mchars_spec2{cp,str}() and remove the workarounds on the higher level. --- usr.bin/mandoc/chars.c | 23 ++++------- usr.bin/mandoc/html.c | 16 ++++++-- usr.bin/mandoc/term.c | 94 +++++++++++++++++++-------------------------- usr.bin/mandoc/term.h | 4 +- usr.bin/mandoc/term_ascii.c | 77 ++++++++++++++++++++++++++++++++++++- 5 files changed, 139 insertions(+), 75 deletions(-) diff --git a/usr.bin/mandoc/chars.c b/usr.bin/mandoc/chars.c index d213e3aabe0..8135c8d0ba1 100644 --- a/usr.bin/mandoc/chars.c +++ b/usr.bin/mandoc/chars.c @@ -1,7 +1,7 @@ -/* $Id: chars.c,v 1.29 2014/07/23 15:00:00 schwarze Exp $ */ +/* $OpenBSD: chars.c,v 1.30 2014/10/26 17:11:18 schwarze Exp $ */ /* * Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons - * Copyright (c) 2011 Ingo Schwarze + * Copyright (c) 2011, 2014 Ingo Schwarze * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -100,9 +100,7 @@ mchars_spec2cp(const struct mchars *arg, const char *p, size_t sz) const struct ln *ln; ln = find(arg, p, sz); - if (NULL == ln) - return(-1); - return(ln->unicode); + return(ln != NULL ? ln->unicode : sz == 1 ? 
*p : -1); } char @@ -122,20 +120,13 @@ mchars_num2uc(const char *p, size_t sz) int i; if ((i = mandoc_strntoi(p, sz, 16)) < 0) - return('\0'); + return(0xFFFD); /* - * Security warning: - * Never extend the range of accepted characters - * to overlap with the ASCII range, 0x00-0x7F - * without re-auditing the callers of this function. - * Some callers might relay on the fact that we never - * return ASCII characters for their escaping decisions. - * * XXX Code is missing here to exclude bogus ranges. */ - return(i > 0x80 && i <= 0x10FFFF ? i : '\0'); + return(i <= 0x10FFFF ? i : 0xFFFD); } const char * @@ -145,9 +136,9 @@ mchars_spec2str(const struct mchars *arg, const struct ln *ln; ln = find(arg, p, sz); - if (NULL == ln) { + if (ln == NULL) { *rsz = 1; - return(NULL); + return(sz == 1 ? p : NULL); } *rsz = strlen(ln->ascii); diff --git a/usr.bin/mandoc/html.c b/usr.bin/mandoc/html.c index 4ec3fbdc8c6..219671ec597 100644 --- a/usr.bin/mandoc/html.c +++ b/usr.bin/mandoc/html.c @@ -1,4 +1,4 @@ -/* $OpenBSD: html.c,v 1.46 2014/10/13 21:05:59 chl Exp $ */ +/* $OpenBSD: html.c,v 1.47 2014/10/26 17:11:18 schwarze Exp $ */ /* * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze @@ -435,8 +435,18 @@ print_encode(struct html *h, const char *p, int norecurse) case ESCAPE_UNICODE: /* Skip past "u" header. */ c = mchars_num2uc(seq + 1, len - 1); - if ('\0' != c) - printf("&#x%x;", c); + + /* + * XXX Security warning: + * For now, forbid Unicode obfuscation of ASCII + * characters. An audit of the callers is + * required before this can be removed. 
+ */ + + if (c < 0x80) + c = 0xFFFD; + + printf("&#x%x;", c); break; case ESCAPE_NUMBERED: c = mchars_num2char(seq, len); diff --git a/usr.bin/mandoc/term.c b/usr.bin/mandoc/term.c index 5ca55b69f3f..b64b49095f3 100644 --- a/usr.bin/mandoc/term.c +++ b/usr.bin/mandoc/term.c @@ -1,4 +1,4 @@ -/* $Id: term.c,v 1.88 2014/08/18 22:21:52 schwarze Exp $ */ +/* $OpenBSD: term.c,v 1.89 2014/10/26 17:11:18 schwarze Exp $ */ /* * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons * Copyright (c) 2010-2014 Ingo Schwarze @@ -442,27 +442,14 @@ term_word(struct termp *p, const char *word) if (ESCAPE_ERROR == esc) continue; - if (TERMENC_ASCII != p->enc) - switch (esc) { - case ESCAPE_UNICODE: - uc = mchars_num2uc(seq + 1, sz - 1); - if ('\0' == uc) - break; - encode1(p, uc); - continue; - case ESCAPE_SPECIAL: - uc = mchars_spec2cp(p->symtab, seq, sz); - if (uc <= 0) - break; - encode1(p, uc); - continue; - default: - break; - } - switch (esc) { case ESCAPE_UNICODE: - encode1(p, '?'); + uc = mchars_num2uc(seq + 1, sz - 1); + if (p->enc == TERMENC_ASCII) { + cp = ascii_uc2str(uc); + encode(p, cp, strlen(cp)); + } else + encode1(p, uc); break; case ESCAPE_NUMBERED: c = mchars_num2char(seq, sz); @@ -470,11 +457,19 @@ term_word(struct termp *p, const char *word) encode(p, &c, 1); break; case ESCAPE_SPECIAL: - cp = mchars_spec2str(p->symtab, seq, sz, &ssz); - if (NULL != cp) - encode(p, cp, ssz); - else if (1 == ssz) - encode(p, seq, sz); + if (p->enc == TERMENC_ASCII) { + cp = mchars_spec2str(p->symtab, + seq, sz, &ssz); + if (cp == NULL) + encode(p, "<?>", 3); + else + encode(p, cp, ssz); + } else { + uc = mchars_spec2cp(p->symtab, seq, sz); + if (uc <= 0) + uc = 0xFFFD; + encode1(p, uc); + } break; case ESCAPE_FONTBOLD: term_fontrepl(p, TERMFONT_BOLD); @@ -681,31 +676,16 @@ term_strlen(const struct termp *p, const char *cp) if (ESCAPE_ERROR == esc) continue; - if (TERMENC_ASCII != p->enc) - switch (esc) { - case ESCAPE_UNICODE: - c = mchars_num2uc(seq + 1, - ssz - 1); - if ('\0' == 
c) - break; - sz += cond_width(p, c, &skip); - continue; - case ESCAPE_SPECIAL: - c = mchars_spec2cp(p->symtab, - seq, ssz); - if (c <= 0) - break; - sz += cond_width(p, c, &skip); - continue; - default: - break; - } - rhs = NULL; switch (esc) { case ESCAPE_UNICODE: - sz += cond_width(p, '?', &skip); + c = mchars_num2uc(seq + 1, sz - 1); + if (p->enc == TERMENC_ASCII) { + rhs = ascii_uc2str(c); + rsz = strlen(rhs); + } else + sz += cond_width(p, c, &skip); break; case ESCAPE_NUMBERED: c = mchars_num2char(seq, ssz); @@ -713,14 +693,20 @@ term_strlen(const struct termp *p, const char *cp) sz += cond_width(p, c, &skip); break; case ESCAPE_SPECIAL: - rhs = mchars_spec2str(p->symtab, - seq, ssz, &rsz); - - if (ssz != 1 || rhs) - break; - - rhs = seq; - rsz = ssz; + if (p->enc == TERMENC_ASCII) { + rhs = mchars_spec2str(p->symtab, + seq, ssz, &rsz); + if (rhs == NULL) { + rhs = "<?>"; + rsz = 3; + } + } else { + c = mchars_spec2cp(p->symtab, + seq, ssz); + if (c <= 0) + c = 0xFFFD; + sz += cond_width(p, c, &skip); + } break; case ESCAPE_SKIPCHAR: skip = 1; diff --git a/usr.bin/mandoc/term.h b/usr.bin/mandoc/term.h index e79926f0f73..78d26fc6075 100644 --- a/usr.bin/mandoc/term.h +++ b/usr.bin/mandoc/term.h @@ -1,4 +1,4 @@ -/* $OpenBSD: term.h,v 1.47 2014/09/17 20:17:55 schwarze Exp $ */ +/* $OpenBSD: term.h,v 1.48 2014/10/26 17:11:18 schwarze Exp $ */ /* * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze @@ -104,6 +104,8 @@ struct termp { struct termp_ps *ps; }; +const char *ascii_uc2str(int); + void term_eqn(struct termp *, const struct eqn *); void term_tbl(struct termp *, const struct tbl_span *); void term_free(struct termp *); diff --git a/usr.bin/mandoc/term_ascii.c b/usr.bin/mandoc/term_ascii.c index d813a640766..8fbb398d1ee 100644 --- a/usr.bin/mandoc/term_ascii.c +++ b/usr.bin/mandoc/term_ascii.c @@ -1,4 +1,4 @@ -/* $OpenBSD: term_ascii.c,v 1.21 2014/09/03 05:17:08 schwarze Exp $ */ +/* $OpenBSD: 
term_ascii.c,v 1.22 2014/10/26 17:11:18 schwarze Exp $ */ /* * Copyright (c) 2010, 2011 Kristaps Dzonsons * Copyright (c) 2014 Ingo Schwarze @@ -255,6 +255,81 @@ ascii_hspan(const struct termp *p, const struct roffsu *su) return(r); } +const char * +ascii_uc2str(int uc) +{ + static const char nbrsp[2] = { ASCII_NBRSP, '\0' }; + static const char *tab[] = { + "<NUL>","<SOH>","<STX>","<ETX>","<EOT>","<ENQ>","<ACK>","<BEL>", + "<BS>", "\t", "<LF>", "<VT>", "<FF>", "<CR>", "<SO>", "<SI>", + "<DLE>","<DC1>","<DC2>","<DC3>","<DC4>","<NAK>","<SYN>","<ETB>", + "<CAN>","<EM>", "<SUB>","<ESC>","<FS>", "<GS>", "<RS>", "<US>", + " ", "!", "\"", "#", "$", "%", "&", "'", + "(", ")", "*", "+", ",", "-", ".", "/", + "0", "1", "2", "3", "4", "5", "6", "7", + "8", "9", ":", ";", "<", "=", ">", "?", + "@", "A", "B", "C", "D", "E", "F", "G", + "H", "I", "J", "K", "L", "M", "N", "O", + "P", "Q", "R", "S", "T", "U", "V", "W", + "X", "Y", "Z", "[", "\\", "]", "^", "_", + "`", "a", "b", "c", "d", "e", "f", "g", + "h", "i", "j", "k", "l", "m", "n", "o", + "p", "q", "r", "s", "t", "u", "v", "w", + "x", "y", "z", "{", "|", "}", "~", "<DEL>", + "<80>", "<81>", "<82>", "<83>", "<84>", "<85>", "<86>", "<87>", + "<88>", "<89>", "<8A>", "<8B>", "<8C>", "<8D>", "<8E>", "<8F>", + "<90>", "<91>", "<92>", "<93>", "<94>", "<95>", "<96>", "<97>", + "<99>", "<99>", "<9A>", "<9B>", "<9C>", "<9D>", "<9E>", "<9F>", + nbrsp, "!", "c", "GBP", "$?", "Y=", "|", "<sec>", + "\"", "(C)", "a.", "<<", "<not>","", "(R)", "-", + "<deg>","+-", "^2", "^3", "'", "<my>", "<par>","*", + ",", "^1", "o.", ">>", "1/4", "1/2", "3/4", "?", + "A", "A", "A", "A", "Ae", "Aa", "AE", "C", + "E", "E", "E", "E", "I", "I", "I", "I", + "D", "N", "O", "O", "O", "O", "Oe", "*", + "Oe", "U", "U", "U", "Ue", "Y", "Th", "ss", + "a", "a", "a", "a", "ae", "aa", "ae", "c", + "e", "e", "e", "e", "i", "i", "i", "i", + "d", "n", "o", "o", "o", "o", "oe", "/", + "oe", "u", "u", "u", "ue", "y", "th", "y", + "A", "a", "A", "a", "A", "a", "C", "c", + "C", "c", "C", "c", "C", "c", "D", "d", + "D", "d", "E", "e", "E", "e", "E", "e", + "E", "e", "E", "e", "G", "g", "G", "g", + "G", "g", "G", "g", "H", "h", "H", "h", + 
"I", "i", "I", "i", "I", "i", "I", "i", + "I", "i", "IJ", "ij", "J", "j", "K", "k", + "q", "L", "l", "L", "l", "L", "l", "L", + "l", "L", "l", "N", "n", "N", "n", "N", + "n", "'n", "Ng", "ng", "O", "o", "O", "o", + "O", "o", "OE", "oe", "R", "r", "R", "r", + "R", "r", "S", "s", "S", "s", "S", "s", + "S", "s", "T", "t", "T", "t", "T", "t", + "U", "u", "U", "u", "U", "u", "U", "u", + "U", "u", "U", "u", "W", "w", "Y", "y", + "Y", "Z", "z", "Z", "z", "Z", "z", "s", + "b", "B", "B", "b", "6", "6", "O", "C", + "c", "D", "D", "D", "d", "d", "3", "@", + "E", "F", "f", "G", "G", "hv", "I", "I", + "K", "k", "l", "l", "W", "N", "n", "O", + "O", "o", "OI", "oi", "P", "p", "YR", "2", + "2", "SH", "sh", "t", "T", "t", "T", "U", + "u", "Y", "V", "Y", "y", "Z", "z", "ZH", + "ZH", "zh", "zh", "2", "5", "5", "ts", "w", + "|", "||", "|=", "!", "DZ", "Dz", "dz", "LJ", + "Lj", "lj", "NJ", "Nj", "nj", "A", "a", "I", + "i", "O", "o", "U", "u", "U", "u", "U", + "u", "U", "u", "U", "u", "@", "A", "a", + "A", "a", "AE", "ae", "G", "g", "G", "g", + "K", "k", "O", "o", "O", "o", "ZH", "zh", + "j", "DZ", "D", "dz", "G", "g", "HV", "W", + "N", "n", "A", "a", "AE", "ae", "O", "o"}; + + if (uc < 0 || (size_t)uc >= sizeof(tab)/sizeof(tab[0])) + return("<?>"); + return(tab[uc]); +} + static size_t locale_width(const struct termp *p, int c) { -- cgit v1.2.3