diff options
Diffstat (limited to 'usr.bin/file/ascmagic.c')
-rw-r--r-- | usr.bin/file/ascmagic.c | 225 |
1 files changed, 151 insertions, 74 deletions
diff --git a/usr.bin/file/ascmagic.c b/usr.bin/file/ascmagic.c index b5cc8b991d6..7dcf0d3d99a 100644 --- a/usr.bin/file/ascmagic.c +++ b/usr.bin/file/ascmagic.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ascmagic.c,v 1.9 2008/05/08 01:40:56 chl Exp $ */ +/* $OpenBSD: ascmagic.c,v 1.10 2009/04/24 18:54:34 chl Exp $ */ /* * Copyright (c) Ian F. Darwin 1986-1995. * Software written by Ian F. Darwin and others; @@ -50,33 +50,35 @@ #include "names.h" #ifndef lint -FILE_RCSID("@(#)$Id: ascmagic.c,v 1.9 2008/05/08 01:40:56 chl Exp $") +FILE_RCSID("@(#)$Id: ascmagic.c,v 1.10 2009/04/24 18:54:34 chl Exp $") #endif /* lint */ -typedef unsigned long unichar; - #define MAXLINELEN 300 /* longest sane line length */ #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \ || (x) == 0x85 || (x) == '\f') private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *); -private int looks_utf8(const unsigned char *, size_t, unichar *, size_t *); -private int looks_unicode(const unsigned char *, size_t, unichar *, size_t *); +private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *, + size_t *); +protected int file_looks_utf8(const unsigned char *, size_t, unichar *, size_t *); +private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *); private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); private int looks_extended(const unsigned char *, size_t, unichar *, size_t *); private void from_ebcdic(const unsigned char *, size_t, unsigned char *); private int ascmatch(const unsigned char *, const unichar *, size_t); +private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t); protected int file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) { size_t i; - unsigned char *nbuf = NULL; + unsigned char *nbuf = NULL, *utf8_buf = NULL, *utf8_end; unichar *ubuf = NULL; - size_t ulen; - struct names *p; + size_t ulen, mlen; + const struct names *p; int rv = -1; + int mime = ms->flags & MAGIC_MIME; const char *code = NULL; const char *code_mime = NULL; @@ -118,11 +120,15 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) code = "ASCII"; code_mime = "us-ascii"; type = "text"; - } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) { + } else if (looks_utf8_with_BOM(buf, nbytes, ubuf, &ulen) > 0) { + code = "UTF-8 Unicode (with BOM)"; + code_mime = "utf-8"; + type = "text"; + } else if (file_looks_utf8(buf, nbytes, ubuf, &ulen) > 1) { code = "UTF-8 Unicode"; code_mime = "utf-8"; type = "text"; - } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) { + } else if ((i = looks_ucs16(buf, nbytes, ubuf, &ulen)) != 0) { if (i == 1) code = "Little-endian UTF-16 Unicode"; else @@ -160,40 +166,25 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) goto done; } - /* - * for troff, look for . + letter + letter or .\"; - * this must be done to disambiguate tar archives' ./file - * and other trash from real troff input. - * - * I believe Plan 9 troff allows non-ASCII characters in the names - * of macros, so this test might possibly fail on such a file. - */ - if ((ms->flags & MAGIC_NO_CHECK_TROFF) == 0 && *ubuf == '.') { - unichar *tp = ubuf + 1; - - while (ISSPC(*tp)) - ++tp; /* skip leading whitespace */ - if ((tp[0] == '\\' && tp[1] == '\"') || - (isascii((unsigned char)tp[0]) && - isalnum((unsigned char)tp[0]) && - isascii((unsigned char)tp[1]) && - isalnum((unsigned char)tp[1]) && - ISSPC(tp[2]))) { - subtype_mime = "text/troff"; - subtype = "troff or preprocessor input"; - goto subtype_identified; - } + /* Convert ubuf to UTF-8 and try text soft magic */ + /* If original was ASCII or UTF-8, could use nbuf instead of + re-converting. */ + /* malloc size is a conservative overestimate; could be + re-converting improved, or at least realloced after + re-converting conversion. */ + mlen = ulen * 6; + if ((utf8_buf = malloc(mlen)) == NULL) { + file_oomem(ms, mlen); + goto done; } - - if ((ms->flags & MAGIC_NO_CHECK_FORTRAN) == 0 && - (*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) { - subtype_mime = "text/fortran"; - subtype = "fortran program"; - goto subtype_identified; + if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen)) == NULL) + goto done; + if (file_softmagic(ms, utf8_buf, utf8_end - utf8_buf, TEXTTEST) != 0) { + rv = 1; + goto done; } /* look for tokens from names.h - this is expensive! */ - if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0) goto subtype_identified; @@ -201,24 +192,18 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) while (i < ulen) { size_t end; - /* - * skip past any leading space - */ + /* skip past any leading space */ while (i < ulen && ISSPC(ubuf[i])) i++; if (i >= ulen) break; - /* - * find the next whitespace - */ + /* find the next whitespace */ for (end = i + 1; end < nbytes; end++) if (ISSPC(ubuf[end])) break; - /* - * compare the word thus isolated against the token list - */ + /* compare the word thus isolated against the token list */ for (p = names; p < names + NNAMES; p++) { if (ascmatch((const unsigned char *)p->name, ubuf + i, end - i)) { @@ -233,9 +218,7 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes) subtype_identified: - /* - * Now try to discover other details about the file. - */ + /* Now try to discover other details about the file. */ for (i = 0; i < ulen; i++) { if (ubuf[i] == '\n') { if (seen_cr) @@ -272,21 +255,27 @@ subtype_identified: if (seen_cr && nbytes < HOWMANY) n_cr++; - if ((ms->flags & MAGIC_MIME)) { - if (subtype_mime) { - if (file_printf(ms, subtype_mime) == -1) - goto done; - } else { - if (file_printf(ms, "text/plain") == -1) - goto done; + if (mime) { + if (mime & MAGIC_MIME_TYPE) { + if (subtype_mime) { + if (file_printf(ms, subtype_mime) == -1) + goto done; + } else { + if (file_printf(ms, "text/plain") == -1) + goto done; + } } - if (code_mime) { - if (file_printf(ms, "; charset=") == -1) + if ((mime == 0 || mime == MAGIC_MIME) && code_mime) { + if ((mime & MAGIC_MIME_TYPE) && + file_printf(ms, " charset=") == -1) goto done; if (file_printf(ms, code_mime) == -1) goto done; } + + if (mime == MAGIC_MIME_ENCODING) + file_printf(ms, "binary"); } else { if (file_printf(ms, code) == -1) goto done; @@ -363,6 +352,8 @@ done: free(nbuf); if (ubuf) free(ubuf); + if (utf8_buf) + free(utf8_buf); return rv; } @@ -521,15 +512,84 @@ looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf, return 1; } -private int -looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) +/* + * Encode Unicode string as UTF-8, returning pointer to character + * after end of string, or NULL if an invalid character is found. + */ +private unsigned char * +encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen) +{ + size_t i; + unsigned char *end = buf + len; + + for (i = 0; i < ulen; i++) { + if (ubuf[i] <= 0x7f) { + if (end - buf < 1) + return NULL; + *buf++ = (unsigned char)ubuf[i]; + } else if (ubuf[i] <= 0x7ff) { + if (end - buf < 2) + return NULL; + *buf++ = (unsigned char)((ubuf[i] >> 6) + 0xc0); + *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); + } else if (ubuf[i] <= 0xffff) { + if (end - buf < 3) + return NULL; + *buf++ = (unsigned char)((ubuf[i] >> 12) + 0xe0); + *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); + *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); + } else if (ubuf[i] <= 0x1fffff) { + if (end - buf < 4) + return NULL; + *buf++ = (unsigned char)((ubuf[i] >> 18) + 0xf0); + *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); + *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); + } else if (ubuf[i] <= 0x3ffffff) { + if (end - buf < 5) + return NULL; + *buf++ = (unsigned char)((ubuf[i] >> 24) + 0xf8); + *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); + *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); + } else if (ubuf[i] <= 0x7fffffff) { + if (end - buf < 6) + return NULL; + *buf++ = (unsigned char)((ubuf[i] >> 30) + 0xfc); + *buf++ = (unsigned char)(((ubuf[i] >> 24) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80); + *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80); + *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80); + } else /* Invalid character */ + return NULL; + } + + return buf; +} + +/* + * Decide whether some text looks like UTF-8. Returns: + * + * -1: invalid UTF-8 + * 0: uses odd control characters, so doesn't look like text + * 1: 7-bit text + * 2: definitely UTF-8 text (valid high-bit set bytes) + * + * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen; + * ubuf must be big enough! + */ +protected int +file_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) { size_t i; int n; unichar c; - int gotone = 0; + int gotone = 0, ctrl = 0; - *ulen = 0; + if (ubuf) + *ulen = 0; for (i = 0; i < nbytes; i++) { if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */ @@ -539,11 +599,12 @@ looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) */ if (text_chars[buf[i]] != T) - return 0; + ctrl = 1; - ubuf[(*ulen)++] = buf[i]; + if (ubuf) + ubuf[(*ulen)++] = buf[i]; } else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */ - return 0; + return -1; } else { /* 11xxxxxx begins UTF-8 */ int following; @@ -563,7 +624,7 @@ looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) c = buf[i] & 0x01; following = 5; } else - return 0; + return -1; for (n = 0; n < following; n++) { i++; @@ -571,21 +632,37 @@ looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) goto done; if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40)) - return 0; + return -1; c = (c << 6) + (buf[i] & 0x3f); } - ubuf[(*ulen)++] = c; + if (ubuf) + ubuf[(*ulen)++] = c; gotone = 1; } } done: - return gotone; /* don't claim it's UTF-8 if it's all 7-bit */ + return ctrl ? 0 : (gotone ? 2 : 1); +} + +/* + * Decide whether some text looks like UTF-8 with BOM. If there is no + * BOM, return -1; otherwise return the result of looks_utf8 on the + * rest of the text. + */ +private int +looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf, + size_t *ulen) +{ + if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf) + return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen); + else + return -1; } private int -looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf, +looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen) { int bigend; |