diff options
author | Yang Zhao <yang@yangman.ca> | 2009-10-20 19:50:28 -0400 |
---|---|---|
committer | James Cloos <cloos@jhcloos.com> | 2009-10-21 14:55:14 -0400 |
commit | eb64005382abf6262c913a46e6186e019a179d59 (patch) | |
tree | 1ad74b0f50efb2c80bc6a92ba30a42fc79d2642a | |
parent | 681d16ec9843b5c001cff1c9bc0637b8ba0c9abd (diff) |
Print UTF8_STRING type as UTF-8 when locale supports it
Introduces 'u' format character, which behaves like 's', but leaves
UTF-8 encoding intact.
Property value is checked for UTF-8 validity according to RFC 3629.
If invalid, an error string is printed, followed by the string formatted
using 's'. For example:
PROP(UTF8_STRING) = <Invalid UTF-8 string: Forbidden value> "\374\233"
Signed-off-by: Yang Zhao <yang@yangman.ca>
Signed-off-by: James Cloos <cloos@jhcloos.com>
-rw-r--r-- | xprop.c | 148 | ||||
-rw-r--r-- | xprop.man | 8 |
2 files changed, 147 insertions, 9 deletions
@@ -409,6 +409,7 @@ static propertyRec windowPropTable[] = { {"RECTANGLE", XA_RECTANGLE, "16iicc", RECTANGLE_DFORMAT }, {"RGB_COLOR_MAP", XA_RGB_COLOR_MAP,"32xcccccccxx",RGB_COLOR_MAP_DFORMAT}, {"STRING", XA_STRING, "8s", 0 }, + {"UTF8_STRING", 0, "8u", 0 }, {"WINDOW", XA_WINDOW, "32x", ": window id # $0+\n" }, {"VISUALID", XA_VISUALID, "32x", ": visual id # $0\n" }, {"WM_COLORMAP_WINDOWS", 0, "32x", ": window id # $0+\n"}, @@ -683,7 +684,7 @@ _put_char (char c) } static void -_format_char (char c) +_format_char (char c, int unicode) { switch (c) { case '\\': @@ -701,17 +702,21 @@ _format_char (char c) break; default: if (!c_isprint(c)) { - _put_char('\\'); - snprintf(_buf_ptr, _buf_len, "%03o", (unsigned char) c); - _buf_ptr += 3; - _buf_len -= 3; + if (unicode && (c & 0x80)) { + _put_char(c); + } else { + _put_char('\\'); + snprintf(_buf_ptr, _buf_len, "%03o", (unsigned char) c); + _buf_ptr += 3; + _buf_len -= 3; + } } else _put_char(c); } } static const char * -Format_String (const char *string) +Format_String (const char *string, int unicode) { char c; @@ -720,7 +725,7 @@ Format_String (const char *string) _put_char('\"'); while ((c = string++[0])) - _format_char(c); + _format_char(c, unicode); *_buf_ptr++ = '"'; *_buf_ptr++ = '\0'; @@ -738,7 +743,7 @@ Format_Len_String (const char *string, int len) memcpy(data, string, len); data[len] = '\0'; - result = Format_String(data); + result = Format_String(data, 0); free(data); return result; @@ -905,6 +910,129 @@ Format_Len_Text (const char *string, int len, Atom encoding) } /* + * Validate a string as UTF-8 encoded according to RFC 3629 + * + * Simply, a unicode code point (up to 21-bits long) is encoded as follows: + * + * Char. 
number range | UTF-8 octet sequence + * (hexadecimal) | (binary) + * --------------------+--------------------------------------------- + * 0000 0000-0000 007F | 0xxxxxxx + * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx + * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx + * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + * + * Validation is done left-to-right, and an error condition, if any, refers to + * only the left-most problem in the string. + * + * Return values: + * UTF8_VALID: Valid UTF-8 encoded string + * UTF8_OVERLONG: Using more bytes than needed for a code point + * UTF8_SHORT_TAIL: Not enough bytes in a multi-byte sequence + * UTF8_LONG_TAIL: Too many bytes in a multi-byte sequence + * UTF8_FORBIDDEN_VALUE: Forbidden prefix or code point outside 0x10FFFF + */ +#define UTF8_VALID 0 +#define UTF8_FORBIDDEN_VALUE 1 +#define UTF8_OVERLONG 2 +#define UTF8_SHORT_TAIL 3 +#define UTF8_LONG_TAIL 4 +static int +is_valid_utf8 (const char *string, int len) +{ + unsigned long codepoint; + int rem, i; + unsigned char c; + + rem = 0; + for (i = 0; i < len; i++) { + c = (unsigned char) string[i]; + + /* Order of type check: + * - Single byte code point + * - Non-starting byte of multi-byte sequence + * - Start of 2-byte sequence + * - Start of 3-byte sequence + * - Start of 4-byte sequence + */ + if (!(c & 0x80)) { + if (rem > 0) return UTF8_SHORT_TAIL; + rem = 0; + codepoint = c; + } else if ((c & 0xC0) == 0x80) { + if (rem == 0) return UTF8_LONG_TAIL; + rem--; + codepoint |= (c & 0x3F) << (rem * 6); + if (codepoint == 0) return UTF8_OVERLONG; + } else if ((c & 0xE0) == 0xC0) { + if (rem > 0) return UTF8_SHORT_TAIL; + rem = 1; + codepoint = (c & 0x1F) << 6; + if (codepoint == 0) return UTF8_OVERLONG; + } else if ((c & 0xF0) == 0xE0) { + if (rem > 0) return UTF8_SHORT_TAIL; + rem = 2; + codepoint = (c & 0x0F) << 12; + } else if ((c & 0xF8) == 0xF0) { + if (rem > 0) return UTF8_SHORT_TAIL; + rem = 3; + codepoint = (c & 0x07) << 18; + if (codepoint > 0x10FFFF) 
return UTF8_FORBIDDEN_VALUE; + } else + return UTF8_FORBIDDEN_VALUE; + } + + return UTF8_VALID; +} + +static const char * +Format_Len_Unicode (const char *string, int len) +{ + char *data; + const char *result, *error; + int len2; + + int validity = is_valid_utf8(string, len); + + if (validity != UTF8_VALID) { + switch (validity) { + case UTF8_FORBIDDEN_VALUE: + error = "<Invalid UTF-8 string: Forbidden value> "; break; + case UTF8_OVERLONG: + error = "<Invalid UTF-8 string: Overlong encoding> "; break; + case UTF8_SHORT_TAIL: + error = "<Invalid UTF-8 string: Tail too short> "; break; + case UTF8_LONG_TAIL: + error = "<Invalid UTF-8 string: Tail too long> "; break; + } + + result = Format_Len_String(string, len); + len2 = strlen(result); + data = (char *) Malloc(len2+1); + memcpy(data, result, len2+1); + + memcpy(_formatting_buffer, error, strlen(error)+1); + strcat(_formatting_buffer, data); + free(data); + + return _formatting_buffer; + } + + if (!is_utf8_locale()) + return Format_Len_String(string, len); + + data = (char *) Malloc(len+1); + + memcpy(data, string, len); + data[len] = '\0'; + + result = Format_String(data, 1); + free(data); + + return result; +} + +/* * * The Format Manager: a group of routines to manage "formats" * @@ -956,6 +1084,8 @@ Format_Thunk (thunk t, char format_char) switch (format_char) { case 's': return Format_Len_String(t.extra_value, (int)t.value); + case 'u': + return Format_Len_Unicode(t.extra_value, (int)t.value); case 't': return Format_Len_Text(t.extra_value, (int)t.value, t.extra_encoding); case 'x': @@ -1252,7 +1382,7 @@ Break_Down_Property (const char *pointer, int length, Atom type, const char *for while (length >= size/8) { format_char = Get_Format_Char(format, i); - if (format_char == 's') + if (format_char == 's' || format_char == 'u') t.value = Extract_Len_String(&pointer,&length,size,&t.extra_value); else if (format_char == 't') { t.extra_encoding = type; @@ -234,6 +234,14 @@ usable with a field size of 8. 
The string is assumed to be in an ICCCM compliant encoding and is converted to the current locale encoding before being output. .TP +u +This field and the next ones until either a 0 or the end of the property +represent a UTF-8 encoded Unicode string. This format character is only +usable with a field size of 8. If the string is found to be invalid +UTF-8, the type of encoding violation is printed instead, followed by +the string formatted using 's'. When in an environment not capable of +displaying UTF-8 encoded strings, behaviour is identical to 's'. +.TP x The field is a hex number (like 'c' but displayed in hex - most useful for displaying window ids and the like) |