summaryrefslogtreecommitdiff
path: root/usr.bin/file/ascmagic.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr.bin/file/ascmagic.c')
-rw-r--r--usr.bin/file/ascmagic.c225
1 files changed, 151 insertions, 74 deletions
diff --git a/usr.bin/file/ascmagic.c b/usr.bin/file/ascmagic.c
index b5cc8b991d6..7dcf0d3d99a 100644
--- a/usr.bin/file/ascmagic.c
+++ b/usr.bin/file/ascmagic.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ascmagic.c,v 1.9 2008/05/08 01:40:56 chl Exp $ */
+/* $OpenBSD: ascmagic.c,v 1.10 2009/04/24 18:54:34 chl Exp $ */
/*
* Copyright (c) Ian F. Darwin 1986-1995.
* Software written by Ian F. Darwin and others;
@@ -50,33 +50,35 @@
#include "names.h"
#ifndef lint
-FILE_RCSID("@(#)$Id: ascmagic.c,v 1.9 2008/05/08 01:40:56 chl Exp $")
+FILE_RCSID("@(#)$Id: ascmagic.c,v 1.10 2009/04/24 18:54:34 chl Exp $")
#endif /* lint */
-typedef unsigned long unichar;
-
#define MAXLINELEN 300 /* longest sane line length */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
|| (x) == 0x85 || (x) == '\f')
private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
-private int looks_utf8(const unsigned char *, size_t, unichar *, size_t *);
-private int looks_unicode(const unsigned char *, size_t, unichar *, size_t *);
+private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
+ size_t *);
+protected int file_looks_utf8(const unsigned char *, size_t, unichar *, size_t *);
+private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
private int ascmatch(const unsigned char *, const unichar *, size_t);
+private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t);
protected int
file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
{
size_t i;
- unsigned char *nbuf = NULL;
+ unsigned char *nbuf = NULL, *utf8_buf = NULL, *utf8_end;
unichar *ubuf = NULL;
- size_t ulen;
- struct names *p;
+ size_t ulen, mlen;
+ const struct names *p;
int rv = -1;
+ int mime = ms->flags & MAGIC_MIME;
const char *code = NULL;
const char *code_mime = NULL;
@@ -118,11 +120,15 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
code = "ASCII";
code_mime = "us-ascii";
type = "text";
- } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
+ } else if (looks_utf8_with_BOM(buf, nbytes, ubuf, &ulen) > 0) {
+ code = "UTF-8 Unicode (with BOM)";
+ code_mime = "utf-8";
+ type = "text";
+ } else if (file_looks_utf8(buf, nbytes, ubuf, &ulen) > 1) {
code = "UTF-8 Unicode";
code_mime = "utf-8";
type = "text";
- } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
+ } else if ((i = looks_ucs16(buf, nbytes, ubuf, &ulen)) != 0) {
if (i == 1)
code = "Little-endian UTF-16 Unicode";
else
@@ -160,40 +166,25 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
goto done;
}
- /*
- * for troff, look for . + letter + letter or .\";
- * this must be done to disambiguate tar archives' ./file
- * and other trash from real troff input.
- *
- * I believe Plan 9 troff allows non-ASCII characters in the names
- * of macros, so this test might possibly fail on such a file.
- */
- if ((ms->flags & MAGIC_NO_CHECK_TROFF) == 0 && *ubuf == '.') {
- unichar *tp = ubuf + 1;
-
- while (ISSPC(*tp))
- ++tp; /* skip leading whitespace */
- if ((tp[0] == '\\' && tp[1] == '\"') ||
- (isascii((unsigned char)tp[0]) &&
- isalnum((unsigned char)tp[0]) &&
- isascii((unsigned char)tp[1]) &&
- isalnum((unsigned char)tp[1]) &&
- ISSPC(tp[2]))) {
- subtype_mime = "text/troff";
- subtype = "troff or preprocessor input";
- goto subtype_identified;
- }
+ /* Convert ubuf to UTF-8 and try text soft magic */
+ /* If original was ASCII or UTF-8, could use nbuf instead of
+ re-converting. */
+ /* malloc size is a conservative overestimate; could be
+ re-converting improved, or at least realloced after
+ re-converting conversion. */
+ mlen = ulen * 6;
+ if ((utf8_buf = malloc(mlen)) == NULL) {
+ file_oomem(ms, mlen);
+ goto done;
}
-
- if ((ms->flags & MAGIC_NO_CHECK_FORTRAN) == 0 &&
- (*buf == 'c' || *buf == 'C') && ISSPC(buf[1])) {
- subtype_mime = "text/fortran";
- subtype = "fortran program";
- goto subtype_identified;
+ if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen)) == NULL)
+ goto done;
+ if (file_softmagic(ms, utf8_buf, utf8_end - utf8_buf, TEXTTEST) != 0) {
+ rv = 1;
+ goto done;
}
/* look for tokens from names.h - this is expensive! */
-
if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0)
goto subtype_identified;
@@ -201,24 +192,18 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
while (i < ulen) {
size_t end;
- /*
- * skip past any leading space
- */
+ /* skip past any leading space */
while (i < ulen && ISSPC(ubuf[i]))
i++;
if (i >= ulen)
break;
- /*
- * find the next whitespace
- */
+ /* find the next whitespace */
for (end = i + 1; end < nbytes; end++)
if (ISSPC(ubuf[end]))
break;
- /*
- * compare the word thus isolated against the token list
- */
+ /* compare the word thus isolated against the token list */
for (p = names; p < names + NNAMES; p++) {
if (ascmatch((const unsigned char *)p->name, ubuf + i,
end - i)) {
@@ -233,9 +218,7 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
subtype_identified:
- /*
- * Now try to discover other details about the file.
- */
+ /* Now try to discover other details about the file. */
for (i = 0; i < ulen; i++) {
if (ubuf[i] == '\n') {
if (seen_cr)
@@ -272,21 +255,27 @@ subtype_identified:
if (seen_cr && nbytes < HOWMANY)
n_cr++;
- if ((ms->flags & MAGIC_MIME)) {
- if (subtype_mime) {
- if (file_printf(ms, subtype_mime) == -1)
- goto done;
- } else {
- if (file_printf(ms, "text/plain") == -1)
- goto done;
+ if (mime) {
+ if (mime & MAGIC_MIME_TYPE) {
+ if (subtype_mime) {
+ if (file_printf(ms, subtype_mime) == -1)
+ goto done;
+ } else {
+ if (file_printf(ms, "text/plain") == -1)
+ goto done;
+ }
}
- if (code_mime) {
- if (file_printf(ms, "; charset=") == -1)
+ if ((mime == 0 || mime == MAGIC_MIME) && code_mime) {
+ if ((mime & MAGIC_MIME_TYPE) &&
+ file_printf(ms, " charset=") == -1)
goto done;
if (file_printf(ms, code_mime) == -1)
goto done;
}
+
+ if (mime == MAGIC_MIME_ENCODING)
+ file_printf(ms, "binary");
} else {
if (file_printf(ms, code) == -1)
goto done;
@@ -363,6 +352,8 @@ done:
free(nbuf);
if (ubuf)
free(ubuf);
+ if (utf8_buf)
+ free(utf8_buf);
return rv;
}
@@ -521,15 +512,84 @@ looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
return 1;
}
-private int
-looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
+/*
+ * Encode Unicode string as UTF-8, returning pointer to character
+ * after end of string, or NULL if an invalid character is found.
+ */
+private unsigned char *
+encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen)
+{
+ size_t i;
+ unsigned char *end = buf + len;
+
+ for (i = 0; i < ulen; i++) {
+ if (ubuf[i] <= 0x7f) {
+ if (end - buf < 1)
+ return NULL;
+ *buf++ = (unsigned char)ubuf[i];
+ } else if (ubuf[i] <= 0x7ff) {
+ if (end - buf < 2)
+ return NULL;
+ *buf++ = (unsigned char)((ubuf[i] >> 6) + 0xc0);
+ *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
+ } else if (ubuf[i] <= 0xffff) {
+ if (end - buf < 3)
+ return NULL;
+ *buf++ = (unsigned char)((ubuf[i] >> 12) + 0xe0);
+ *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
+ } else if (ubuf[i] <= 0x1fffff) {
+ if (end - buf < 4)
+ return NULL;
+ *buf++ = (unsigned char)((ubuf[i] >> 18) + 0xf0);
+ *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
+ } else if (ubuf[i] <= 0x3ffffff) {
+ if (end - buf < 5)
+ return NULL;
+ *buf++ = (unsigned char)((ubuf[i] >> 24) + 0xf8);
+ *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
+ } else if (ubuf[i] <= 0x7fffffff) {
+ if (end - buf < 6)
+ return NULL;
+ *buf++ = (unsigned char)((ubuf[i] >> 30) + 0xfc);
+ *buf++ = (unsigned char)(((ubuf[i] >> 24) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
+ } else /* Invalid character */
+ return NULL;
+ }
+
+ return buf;
+}
+
+/*
+ * Decide whether some text looks like UTF-8. Returns:
+ *
+ * -1: invalid UTF-8
+ * 0: uses odd control characters, so doesn't look like text
+ * 1: 7-bit text
+ * 2: definitely UTF-8 text (valid high-bit set bytes)
+ *
+ * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
+ * ubuf must be big enough!
+ */
+protected int
+file_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
{
size_t i;
int n;
unichar c;
- int gotone = 0;
+ int gotone = 0, ctrl = 0;
- *ulen = 0;
+ if (ubuf)
+ *ulen = 0;
for (i = 0; i < nbytes; i++) {
if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
@@ -539,11 +599,12 @@ looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
*/
if (text_chars[buf[i]] != T)
- return 0;
+ ctrl = 1;
- ubuf[(*ulen)++] = buf[i];
+ if (ubuf)
+ ubuf[(*ulen)++] = buf[i];
} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
- return 0;
+ return -1;
} else { /* 11xxxxxx begins UTF-8 */
int following;
@@ -563,7 +624,7 @@ looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
c = buf[i] & 0x01;
following = 5;
} else
- return 0;
+ return -1;
for (n = 0; n < following; n++) {
i++;
@@ -571,21 +632,37 @@ looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
goto done;
if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
- return 0;
+ return -1;
c = (c << 6) + (buf[i] & 0x3f);
}
- ubuf[(*ulen)++] = c;
+ if (ubuf)
+ ubuf[(*ulen)++] = c;
gotone = 1;
}
}
done:
- return gotone; /* don't claim it's UTF-8 if it's all 7-bit */
+ return ctrl ? 0 : (gotone ? 2 : 1);
+}
+
+/*
+ * Decide whether some text looks like UTF-8 with BOM. If there is no
+ * BOM, return -1; otherwise return the result of looks_utf8 on the
+ * rest of the text.
+ */
+private int
+looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+ size_t *ulen)
+{
+ if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
+ return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
+ else
+ return -1;
}
private int
-looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
size_t *ulen)
{
int bigend;