summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYang Zhao <yang@yangman.ca>2009-10-20 19:50:28 -0400
committerJames Cloos <cloos@jhcloos.com>2009-10-21 14:55:14 -0400
commiteb64005382abf6262c913a46e6186e019a179d59 (patch)
tree1ad74b0f50efb2c80bc6a92ba30a42fc79d2642a
parent681d16ec9843b5c001cff1c9bc0637b8ba0c9abd (diff)
Print UTF8_STRING type as UTF-8 when locale supports it
Introduces 'u' format character, which behaves like 's', but leaves UTF-8 encoding intact. Property value is checked for UTF-8 validity according to RFC 3629. If invalid, an error string is printed, followed by the string formatted using 's'. ie: PROP(UTF8_STRING) = <Invalid UTF-8 string: Forbidden value> "\374\233" Signed-off-by: Yang Zhao <yang@yangman.ca> Signed-off-by: James Cloos <cloos@jhcloos.com>
-rw-r--r--xprop.c148
-rw-r--r--xprop.man8
2 files changed, 147 insertions, 9 deletions
diff --git a/xprop.c b/xprop.c
index 8261b15..ea65013 100644
--- a/xprop.c
+++ b/xprop.c
@@ -409,6 +409,7 @@ static propertyRec windowPropTable[] = {
{"RECTANGLE", XA_RECTANGLE, "16iicc", RECTANGLE_DFORMAT },
{"RGB_COLOR_MAP", XA_RGB_COLOR_MAP,"32xcccccccxx",RGB_COLOR_MAP_DFORMAT},
{"STRING", XA_STRING, "8s", 0 },
+ {"UTF8_STRING", 0, "8u", 0 },
{"WINDOW", XA_WINDOW, "32x", ": window id # $0+\n" },
{"VISUALID", XA_VISUALID, "32x", ": visual id # $0\n" },
{"WM_COLORMAP_WINDOWS", 0, "32x", ": window id # $0+\n"},
@@ -683,7 +684,7 @@ _put_char (char c)
}
static void
-_format_char (char c)
+_format_char (char c, int unicode)
{
switch (c) {
case '\\':
@@ -701,17 +702,21 @@ _format_char (char c)
break;
default:
if (!c_isprint(c)) {
- _put_char('\\');
- snprintf(_buf_ptr, _buf_len, "%03o", (unsigned char) c);
- _buf_ptr += 3;
- _buf_len -= 3;
+ if (unicode && (c & 0x80)) {
+ _put_char(c);
+ } else {
+ _put_char('\\');
+ snprintf(_buf_ptr, _buf_len, "%03o", (unsigned char) c);
+ _buf_ptr += 3;
+ _buf_len -= 3;
+ }
} else
_put_char(c);
}
}
static const char *
-Format_String (const char *string)
+Format_String (const char *string, int unicode)
{
char c;
@@ -720,7 +725,7 @@ Format_String (const char *string)
_put_char('\"');
while ((c = string++[0]))
- _format_char(c);
+ _format_char(c, unicode);
*_buf_ptr++ = '"';
*_buf_ptr++ = '\0';
@@ -738,7 +743,7 @@ Format_Len_String (const char *string, int len)
memcpy(data, string, len);
data[len] = '\0';
- result = Format_String(data);
+ result = Format_String(data, 0);
free(data);
return result;
@@ -905,6 +910,129 @@ Format_Len_Text (const char *string, int len, Atom encoding)
}
/*
+ * Validate a string as UTF-8 encoded according to RFC 3629
+ *
+ * Simply, a Unicode code point (up to 21 bits long) is encoded as follows:
+ *
+ *    Char. number range  |        UTF-8 octet sequence
+ *       (hexadecimal)    |              (binary)
+ *    --------------------+---------------------------------------------
+ *    0000 0000-0000 007F | 0xxxxxxx
+ *    0000 0080-0000 07FF | 110xxxxx 10xxxxxx
+ *    0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+ *    0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * RFC 3629 additionally requires the shortest possible encoding and
+ * excludes the surrogates D800-DFFF and anything above 10FFFF.  Both rules
+ * can be enforced without decoding, by restricting the lead byte and the
+ * first tail byte of each sequence:
+ *
+ *    C0, C1              overlong 2-byte form, never valid
+ *    E0 then 80-9F       overlong 3-byte form
+ *    ED then A0-BF       surrogate
+ *    F0 then 80-8F       overlong 4-byte form
+ *    F4 then 90-BF       above 10FFFF
+ *    F5-FF               never valid
+ *
+ * All len bytes are examined; a NUL byte is treated as an ordinary
+ * single-byte code point.
+ *
+ * Validation is done left-to-right, and an error condition, if any, refers to
+ * only the left-most problem in the string.
+ *
+ * Return values:
+ *   UTF8_VALID: Valid UTF-8 encoded string
+ *   UTF8_OVERLONG: Using more bytes than needed for a code point
+ *   UTF8_SHORT_TAIL: Not enough bytes in a multi-byte sequence, including
+ *                    a sequence cut off by the end of the string
+ *   UTF8_LONG_TAIL: Too many bytes in a multi-byte sequence (a tail byte
+ *                   with no sequence in progress)
+ *   UTF8_FORBIDDEN_VALUE: Forbidden prefix, surrogate, or code point
+ *                         outside 0x10FFFF
+ */
+#define UTF8_VALID 0
+#define UTF8_FORBIDDEN_VALUE 1
+#define UTF8_OVERLONG 2
+#define UTF8_SHORT_TAIL 3
+#define UTF8_LONG_TAIL 4
+static int
+is_valid_utf8 (const char *string, int len)
+{
+    /* Inclusive bounds for the next tail byte.  They are narrowed only for
+     * the first tail byte after an E0/ED/F0/F4 lead byte. */
+    unsigned char lo = 0x80, hi = 0xBF;
+    int rem = 0;    /* tail bytes still expected in the current sequence */
+    int i;
+
+    for (i = 0; i < len; i++) {
+        unsigned char c = (unsigned char) string[i];
+
+        if ((c & 0xC0) == 0x80) {
+            /* Tail byte (10xxxxxx) */
+            if (rem == 0) return UTF8_LONG_TAIL;
+            if (c < lo) return UTF8_OVERLONG;
+            if (c > hi) return UTF8_FORBIDDEN_VALUE;
+            lo = 0x80;
+            hi = 0xBF;
+            rem--;
+            continue;
+        }
+
+        /* Anything else starts a new code point, so the previous sequence
+         * must already be complete. */
+        if (rem > 0) return UTF8_SHORT_TAIL;
+
+        if (c < 0x80)
+            continue;                       /* single byte code point */
+
+        if (c < 0xC2)                       /* C0, C1: always overlong */
+            return UTF8_OVERLONG;
+
+        if (c < 0xE0) {                     /* start of 2-byte sequence */
+            rem = 1;
+        } else if (c < 0xF0) {              /* start of 3-byte sequence */
+            rem = 2;
+            if (c == 0xE0) lo = 0xA0;       /* below U+0800 is overlong */
+            else if (c == 0xED) hi = 0x9F;  /* U+D800-DFFF are surrogates */
+        } else if (c < 0xF5) {              /* start of 4-byte sequence */
+            rem = 3;
+            if (c == 0xF0) lo = 0x90;       /* below U+10000 is overlong */
+            else if (c == 0xF4) hi = 0x8F;  /* above U+10FFFF is forbidden */
+        } else
+            return UTF8_FORBIDDEN_VALUE;    /* F5-FF */
+    }
+
+    /* The string ended in the middle of a multi-byte sequence. */
+    if (rem > 0) return UTF8_SHORT_TAIL;
+
+    return UTF8_VALID;
+}
+
+/*
+ * Format a property value of len bytes for the 'u' format character.
+ *
+ * The value is first checked with is_valid_utf8().  If it is not valid
+ * UTF-8, the result is a description of the violation followed by the value
+ * formatted by Format_Len_String().  If it is valid but is_utf8_locale()
+ * reports false, the value is formatted by Format_Len_String().  Otherwise
+ * it is formatted by Format_String() with the unicode flag set.
+ *
+ * In the invalid case the returned pointer is _formatting_buffer; in the
+ * other cases it is whatever the called formatter returns.
+ */
+static const char *
+Format_Len_Unicode (const char *string, int len)
+{
+    char *data;
+    const char *result, *error;
+    int len2;
+
+    int validity = is_valid_utf8(string, len);
+
+    if (validity != UTF8_VALID) {
+        switch (validity) {
+          case UTF8_FORBIDDEN_VALUE:
+            error = "<Invalid UTF-8 string: Forbidden value> "; break;
+          case UTF8_OVERLONG:
+            error = "<Invalid UTF-8 string: Overlong encoding> "; break;
+          case UTF8_SHORT_TAIL:
+            error = "<Invalid UTF-8 string: Tail too short> "; break;
+          case UTF8_LONG_TAIL:
+            error = "<Invalid UTF-8 string: Tail too long> "; break;
+          default:
+            /* Keeps 'error' initialised if the validator ever returns a
+             * code this switch does not know about. */
+            error = "<Invalid UTF-8 string> "; break;
+        }
+
+        /* Take a private copy of the formatted value before writing the
+         * prefix; presumably 'result' points into _formatting_buffer, which
+         * is overwritten below -- TODO confirm. */
+        result = Format_Len_String(string, len);
+        len2 = strlen(result);
+        data = (char *) Malloc(len2+1);
+        memcpy(data, result, len2+1);
+
+        /* NOTE(review): prefix + value is written with no bounds check; the
+         * size of _formatting_buffer is not visible here -- verify that it
+         * has room for the longest prefix on top of a full-length value. */
+        memcpy(_formatting_buffer, error, strlen(error)+1);
+        strcat(_formatting_buffer, data);
+        free(data);
+
+        return _formatting_buffer;
+    }
+
+    if (!is_utf8_locale())
+        return Format_Len_String(string, len);
+
+    /* Format_String() takes a NUL-terminated string, so copy and terminate. */
+    data = (char *) Malloc(len+1);
+
+    memcpy(data, string, len);
+    data[len] = '\0';
+
+    result = Format_String(data, 1);
+    free(data);
+
+    return result;
+}
+
+/*
*
* The Format Manager: a group of routines to manage "formats"
*
@@ -956,6 +1084,8 @@ Format_Thunk (thunk t, char format_char)
switch (format_char) {
case 's':
return Format_Len_String(t.extra_value, (int)t.value);
+ case 'u':
+ return Format_Len_Unicode(t.extra_value, (int)t.value);
case 't':
return Format_Len_Text(t.extra_value, (int)t.value, t.extra_encoding);
case 'x':
@@ -1252,7 +1382,7 @@ Break_Down_Property (const char *pointer, int length, Atom type, const char *for
while (length >= size/8) {
format_char = Get_Format_Char(format, i);
- if (format_char == 's')
+ if (format_char == 's' || format_char == 'u')
t.value = Extract_Len_String(&pointer,&length,size,&t.extra_value);
else if (format_char == 't') {
t.extra_encoding = type;
diff --git a/xprop.man b/xprop.man
index 498faa9..310812f 100644
--- a/xprop.man
+++ b/xprop.man
@@ -234,6 +234,14 @@ usable with a field size of 8. The string is assumed to be in an ICCCM
compliant encoding and is converted to the current locale encoding before
being output.
.TP
+u
+This field and the next ones until either a 0 or the end of the property
+represent a UTF-8 encoded Unicode string. This format character is only
+usable with a field size of 8. If the string is not valid UTF-8, the type
+of encoding violation is printed, followed by the string formatted using
+'s'. When the environment is not capable of displaying UTF-8 encoded
+strings, behaviour is identical to 's'.
+.TP
x
The field is a hex number (like 'c' but displayed in hex - most useful
for displaying window ids and the like)