summary refs log tree commit diff stats
path: root/ascmagic.c
diff options
context:
space:
mode:
Diffstat (limited to 'ascmagic.c')
-rw-r--r-- ascmagic.c 198
1 files changed, 138 insertions, 60 deletions
diff --git a/ascmagic.c b/ascmagic.c
index 8d2d9a7..c374e02 100644
--- a/ascmagic.c
+++ b/ascmagic.c
@@ -49,32 +49,32 @@
#include "names.h"
#ifndef lint
-FILE_RCSID("@(#)$File: ascmagic.c,v 1.53 2007/10/29 00:54:08 christos Exp $")
+FILE_RCSID("@(#)$File: ascmagic.c,v 1.64 2008/07/16 18:00:57 christos Exp $")
#endif /* lint */
-typedef unsigned long unichar;
-
#define MAXLINELEN 300 /* longest sane line length */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
|| (x) == 0x85 || (x) == '\f')
private int looks_ascii(const unsigned char *, size_t, unichar *, size_t *);
-private int looks_utf8(const unsigned char *, size_t, unichar *, size_t *);
-private int looks_unicode(const unsigned char *, size_t, unichar *, size_t *);
+private int looks_utf8_with_BOM(const unsigned char *, size_t, unichar *,
+ size_t *);
+private int looks_ucs16(const unsigned char *, size_t, unichar *, size_t *);
private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
private int ascmatch(const unsigned char *, const unichar *, size_t);
+private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t);
protected int
file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
{
size_t i;
- unsigned char *nbuf = NULL;
+ unsigned char *nbuf = NULL, *utf8_buf = NULL, *utf8_end;
unichar *ubuf = NULL;
- size_t ulen;
- struct names *p;
+ size_t ulen, mlen;
+ const struct names *p;
int rv = -1;
int mime = ms->flags & MAGIC_MIME;
@@ -103,9 +103,11 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
while (nbytes > 1 && buf[nbytes - 1] == '\0')
nbytes--;
- if ((nbuf = calloc(1, (nbytes + 1) * sizeof(nbuf[0]))) == NULL)
+ if ((nbuf = CAST(unsigned char *, calloc((size_t)1,
+ (nbytes + 1) * sizeof(nbuf[0])))) == NULL)
goto done;
- if ((ubuf = calloc(1, (nbytes + 1) * sizeof(ubuf[0]))) == NULL)
+ if ((ubuf = CAST(unichar *, calloc((size_t)1,
+ (nbytes + 1) * sizeof(ubuf[0])))) == NULL)
goto done;
/*
@@ -118,11 +120,15 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
code = "ASCII";
code_mime = "us-ascii";
type = "text";
- } else if (looks_utf8(buf, nbytes, ubuf, &ulen)) {
+ } else if (looks_utf8_with_BOM(buf, nbytes, ubuf, &ulen) > 0) {
+ code = "UTF-8 Unicode (with BOM)";
+ code_mime = "utf-8";
+ type = "text";
+ } else if (file_looks_utf8(buf, nbytes, ubuf, &ulen) > 1) {
code = "UTF-8 Unicode";
code_mime = "utf-8";
type = "text";
- } else if ((i = looks_unicode(buf, nbytes, ubuf, &ulen)) != 0) {
+ } else if ((i = looks_ucs16(buf, nbytes, ubuf, &ulen)) != 0) {
if (i == 1)
code = "Little-endian UTF-16 Unicode";
else
@@ -160,33 +166,25 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
goto done;
}
- /*
- * for troff, look for . + letter + letter or .\";
- * this must be done to disambiguate tar archives' ./file
- * and other trash from real troff input.
- *
- * I believe Plan 9 troff allows non-ASCII characters in the names
- * of macros, so this test might possibly fail on such a file.
- */
- if ((ms->flags & MAGIC_NO_CHECK_TROFF) == 0 && *ubuf == '.') {
- unichar *tp = ubuf + 1;
-
- while (ISSPC(*tp))
- ++tp; /* skip leading whitespace */
- if ((tp[0] == '\\' && tp[1] == '\"') ||
- (isascii((unsigned char)tp[0]) &&
- isalnum((unsigned char)tp[0]) &&
- isascii((unsigned char)tp[1]) &&
- isalnum((unsigned char)tp[1]) &&
- ISSPC(tp[2]))) {
- subtype_mime = "text/troff";
- subtype = "troff or preprocessor input";
- goto subtype_identified;
- }
+ /* Convert ubuf to UTF-8 and try text soft magic */
+ /* If original was ASCII or UTF-8, could use nbuf instead of
+ re-converting. */
+ /* malloc size is a conservative overestimate; could be
improved, or at least realloced after
conversion. */
+ mlen = ulen * 6;
+ if ((utf8_buf = CAST(unsigned char *, malloc(mlen))) == NULL) {
+ file_oomem(ms, mlen);
+ goto done;
+ }
+ if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen)) == NULL)
+ goto done;
+ if (file_softmagic(ms, utf8_buf, utf8_end - utf8_buf, TEXTTEST) != 0) {
+ rv = 1;
+ goto done;
}
/* look for tokens from names.h - this is expensive! */
-
if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0)
goto subtype_identified;
@@ -194,24 +192,18 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
while (i < ulen) {
size_t end;
- /*
- * skip past any leading space
- */
+ /* skip past any leading space */
while (i < ulen && ISSPC(ubuf[i]))
i++;
if (i >= ulen)
break;
- /*
- * find the next whitespace
- */
+ /* find the next whitespace */
for (end = i + 1; end < nbytes; end++)
if (ISSPC(ubuf[end]))
break;
- /*
- * compare the word thus isolated against the token list
- */
+ /* compare the word thus isolated against the token list */
for (p = names; p < names + NNAMES; p++) {
if (ascmatch((const unsigned char *)p->name, ubuf + i,
end - i)) {
@@ -226,9 +218,7 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
subtype_identified:
- /*
- * Now try to discover other details about the file.
- */
+ /* Now try to discover other details about the file. */
for (i = 0; i < ulen; i++) {
if (ubuf[i] == '\n') {
if (seen_cr)
@@ -362,6 +352,8 @@ done:
free(nbuf);
if (ubuf)
free(ubuf);
+ if (utf8_buf)
+ free(utf8_buf);
return rv;
}
@@ -520,15 +512,84 @@ looks_extended(const unsigned char *buf, size_t nbytes, unichar *ubuf,
return 1;
}
-private int
-looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
+/*
+ * Encode the ulen values held in ubuf as UTF-8 into buf, which has
+ * room for len bytes.
+ *
+ * Returns a pointer to the byte just past the last one written, or
+ * NULL if a value is greater than 0x7fffffff or if the sequence for
+ * the next value would not fit in the space remaining in buf.  When
+ * NULL is returned, the sequences for earlier values have already
+ * been stored in buf.  No terminator is appended.
+ *
+ * Sequences of up to six bytes are produced, so values above
+ * 0x10ffff and values in the surrogate range 0xd800-0xdfff are
+ * encoded rather than rejected.
+ */
+private unsigned char *
+encode_utf8(unsigned char *buf, size_t len, unichar *ubuf, size_t ulen)
+{
+ size_t i;
+ unsigned char *end = buf + len; /* one past the last writable byte */
+
+ for (i = 0; i < ulen; i++) {
+ if (ubuf[i] <= 0x7f) { /* 1 byte: 0xxxxxxx */
+ if (end - buf < 1)
+ return NULL;
+ *buf++ = (unsigned char)ubuf[i];
+ } else if (ubuf[i] <= 0x7ff) { /* 2 bytes: 110xxxxx 10xxxxxx */
+ if (end - buf < 2)
+ return NULL;
+ *buf++ = (unsigned char)((ubuf[i] >> 6) + 0xc0);
+ *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
+ } else if (ubuf[i] <= 0xffff) { /* 3 bytes, lead byte 1110xxxx */
+ if (end - buf < 3)
+ return NULL;
+ *buf++ = (unsigned char)((ubuf[i] >> 12) + 0xe0);
+ *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
+ } else if (ubuf[i] <= 0x1fffff) { /* 4 bytes, lead byte 11110xxx */
+ if (end - buf < 4)
+ return NULL;
+ *buf++ = (unsigned char)((ubuf[i] >> 18) + 0xf0);
+ *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
+ } else if (ubuf[i] <= 0x3ffffff) { /* 5 bytes, lead byte 111110xx */
+ if (end - buf < 5)
+ return NULL;
+ *buf++ = (unsigned char)((ubuf[i] >> 24) + 0xf8);
+ *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
+ } else if (ubuf[i] <= 0x7fffffff) { /* 6 bytes, lead byte 1111110x */
+ if (end - buf < 6)
+ return NULL;
+ *buf++ = (unsigned char)((ubuf[i] >> 30) + 0xfc);
+ *buf++ = (unsigned char)(((ubuf[i] >> 24) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)(((ubuf[i] >> 18) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)(((ubuf[i] >> 12) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)(((ubuf[i] >> 6) & 0x3f) + 0x80);
+ *buf++ = (unsigned char)((ubuf[i] & 0x3f) + 0x80);
+ } else /* Invalid character */
+ return NULL;
+ }
+
+ return buf;
+}
+
+/*
+ * Decide whether some text looks like UTF-8. Returns:
+ *
+ * -1: invalid UTF-8
+ * 0: uses odd control characters, so doesn't look like text
+ * 1: 7-bit text
+ * 2: definitely UTF-8 text (valid high-bit set bytes)
+ *
+ * If ubuf is non-NULL on entry, text is decoded into ubuf, *ulen;
+ * ubuf must be big enough!
+ */
+protected int
+file_looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
{
size_t i;
int n;
unichar c;
- int gotone = 0;
+ int gotone = 0, ctrl = 0;
- *ulen = 0;
+ if (ubuf)
+ *ulen = 0;
for (i = 0; i < nbytes; i++) {
if ((buf[i] & 0x80) == 0) { /* 0xxxxxxx is plain ASCII */
@@ -538,11 +599,12 @@ looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
*/
if (text_chars[buf[i]] != T)
- return 0;
+ ctrl = 1;
- ubuf[(*ulen)++] = buf[i];
+ if (ubuf)
+ ubuf[(*ulen)++] = buf[i];
} else if ((buf[i] & 0x40) == 0) { /* 10xxxxxx never 1st byte */
- return 0;
+ return -1;
} else { /* 11xxxxxx begins UTF-8 */
int following;
@@ -562,7 +624,7 @@ looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
c = buf[i] & 0x01;
following = 5;
} else
- return 0;
+ return -1;
for (n = 0; n < following; n++) {
i++;
@@ -570,21 +632,37 @@ looks_utf8(const unsigned char *buf, size_t nbytes, unichar *ubuf, size_t *ulen)
goto done;
if ((buf[i] & 0x80) == 0 || (buf[i] & 0x40))
- return 0;
+ return -1;
c = (c << 6) + (buf[i] & 0x3f);
}
- ubuf[(*ulen)++] = c;
+ if (ubuf)
+ ubuf[(*ulen)++] = c;
gotone = 1;
}
}
done:
- return gotone; /* don't claim it's UTF-8 if it's all 7-bit */
+ return ctrl ? 0 : (gotone ? 2 : 1);
+}
+
+/*
+ * Decide whether some text looks like UTF-8 with BOM. If the buffer
+ * does not start with the BOM bytes 0xef 0xbb 0xbf, or holds no more
+ * than 3 bytes, return -1; otherwise return the result of
+ * file_looks_utf8 on the text that follows the BOM (ubuf and ulen
+ * are passed through to it unchanged).
+ */
+private int
+looks_utf8_with_BOM(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+ size_t *ulen)
+{
+ if (nbytes > 3 && buf[0] == 0xef && buf[1] == 0xbb && buf[2] == 0xbf)
+ return file_looks_utf8(buf + 3, nbytes - 3, ubuf, ulen);
+ else
+ return -1;
}
private int
-looks_unicode(const unsigned char *buf, size_t nbytes, unichar *ubuf,
+looks_ucs16(const unsigned char *buf, size_t nbytes, unichar *ubuf,
size_t *ulen)
{
int bigend;
OpenPOWER on IntegriCloud