1 files changed, 53 insertions, 83 deletions
diff --git a/contrib/file/ascmagic.c b/contrib/file/ascmagic.c
index 9236fb4..5a1caac 100644
--- a/contrib/file/ascmagic.c
+++ b/contrib/file/ascmagic.c
@@ -26,8 +26,7 @@
  * SUCH DAMAGE.
  */
 /*
- * ASCII magic -- file types that we know based on keywords
- * that can appear anywhere in the file.
+ * ASCII magic -- try to detect text encoding.
  *
  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
  * to handle character codes other than ASCII on a unified basis.
@@ -36,7 +35,7 @@
 #include "file.h"
 
 #ifndef	lint
-FILE_RCSID("@(#)$File: ascmagic.c,v 1.75 2009/02/03 20:27:51 christos Exp $")
+FILE_RCSID("@(#)$File: ascmagic.c,v 1.84 2011/12/08 12:38:24 rrt Exp $")
 #endif	/* lint */
 
 #include "magic.h"
@@ -47,13 +46,11 @@ FILE_RCSID("@(#)$File: ascmagic.c,v 1.75 2009/02/03 20:27:51 christos Exp $")
 #ifdef HAVE_UNISTD_H
 #include <unistd.h>
 #endif
-#include "names.h"
 
 #define MAXLINELEN 300	/* longest sane line length */
 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
 		  || (x) == 0x85 || (x) == '\f')
 
-private int ascmatch(const unsigned char *, const unichar *, size_t);
 private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t);
 private size_t trim_nuls(const unsigned char *, size_t);
 
@@ -71,7 +68,8 @@ trim_nuls(const unsigned char *buf, size_t nbytes)
 }
 
 protected int
-file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
+file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes,
+	int text)
 {
 	unichar *ubuf = NULL;
 	size_t ulen;
@@ -88,17 +86,13 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
 
 	/* If file doesn't look like any sort of text, give up. */
 	if (file_encoding(ms, buf, nbytes, &ubuf, &ulen, &code, &code_mime,
-	    &type) == 0) {
+	    &type) == 0)
 		rv = 0;
-		goto done;
-	}
-
-	rv = file_ascmagic_with_encoding(ms, buf, nbytes, ubuf, ulen, code, 
-	    type);
+        else
+		rv = file_ascmagic_with_encoding(ms, buf, nbytes, ubuf, ulen, code,
+						 type, text);
 
- done:
-	if (ubuf)
-		free(ubuf);
+	free(ubuf);
 
 	return rv;
 }
@@ -106,11 +100,10 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
 protected int
 file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
     size_t nbytes, unichar *ubuf, size_t ulen, const char *code,
-    const char *type)
+    const char *type, int text)
 {
 	unsigned char *utf8_buf = NULL, *utf8_end;
 	size_t mlen, i;
-	const struct names *p;
 	int rv = -1;
 	int mime = ms->flags & MAGIC_MIME;
 
@@ -125,6 +118,7 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
 	int n_lf = 0;
 	int n_cr = 0;
 	int n_nel = 0;
+	int executable = 0;
 
 	size_t last_line_end = (size_t)-1;
 	int has_long_lines = 0;
@@ -140,56 +134,23 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
 		goto done;
 	}
 
-	/* Convert ubuf to UTF-8 and try text soft magic */
-	/* malloc size is a conservative overestimate; could be
-	   improved, or at least realloced after conversion. */
-	mlen = ulen * 6;
-	if ((utf8_buf = CAST(unsigned char *, malloc(mlen))) == NULL) {
-		file_oomem(ms, mlen);
-		goto done;
-	}
-	if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen)) == NULL)
-		goto done;
-	if ((rv = file_softmagic(ms, utf8_buf, (size_t)(utf8_end - utf8_buf),
-	    TEXTTEST)) != 0)
-		goto done;
-	else
-		rv = -1;
-
-	/* look for tokens from names.h - this is expensive! */
-	if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0)
-		goto subtype_identified;
-
-	i = 0;
-	while (i < ulen) {
-		size_t end;
-
-		/* skip past any leading space */
-		while (i < ulen && ISSPC(ubuf[i]))
-			i++;
-		if (i >= ulen)
-			break;
-
-		/* find the next whitespace */
-		for (end = i + 1; end < nbytes; end++)
-			if (ISSPC(ubuf[end]))
-				break;
-
-		/* compare the word thus isolated against the token list */
-		for (p = names; p < names + NNAMES; p++) {
-			if (ascmatch((const unsigned char *)p->name, ubuf + i,
-			    end - i)) {
-				subtype = types[p->type].human;
-				subtype_mime = types[p->type].mime;
-				goto subtype_identified;
-			}
+	if ((ms->flags & MAGIC_NO_CHECK_SOFT) == 0) {
+		/* Convert ubuf to UTF-8 and try text soft magic */
+		/* malloc size is a conservative overestimate; could be
+		   improved, or at least realloced after conversion. */
+		mlen = ulen * 6;
+		if ((utf8_buf = CAST(unsigned char *, malloc(mlen))) == NULL) {
+			file_oomem(ms, mlen);
+			goto done;
 		}
-
-		i = end;
+		if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen))
+		    == NULL)
+			goto done;
+		if ((rv = file_softmagic(ms, utf8_buf,
+		    (size_t)(utf8_end - utf8_buf), TEXTTEST, text)) == 0)
+			rv = -1;
 	}
 
-subtype_identified:
-
 	/* Now try to discover other details about the file. */
 	for (i = 0; i < ulen; i++) {
 		if (ubuf[i] == '\n') {
@@ -232,7 +193,7 @@ subtype_identified:
 		goto done;
 	}
 	if (mime) {
-		if ((mime & MAGIC_MIME_TYPE) != 0) {
+		if (!file_printedlen(ms) && (mime & MAGIC_MIME_TYPE) != 0) {
 			if (subtype_mime) {
 				if (file_printf(ms, "%s", subtype_mime) == -1)
 					goto done;
@@ -242,6 +203,28 @@ subtype_identified:
 			}
 		}
 	} else {
+		if (file_printedlen(ms)) {
+			switch (file_replace(ms, " text$", ", ")) {
+			case 0:
+				switch (file_replace(ms, " text executable$",
+				    ", ")) {
+				case 0:
+					if (file_printf(ms, ", ") == -1)
+						goto done;
+				case -1:
+					goto done;
+				default:
+					executable = 1;
+					break;
+				}
+				break;
+			case -1:
+				goto done;
+			default:
+				break;
+			}
+		}
+
 		if (file_printf(ms, "%s", code) == -1)
 			goto done;
 
@@ -253,6 +236,10 @@ subtype_identified:
 		if (file_printf(ms, " %s", type) == -1)
 			goto done;
 
+		if (executable)
+			if (file_printf(ms, " executable") == -1)
+				goto done;
+
 		if (has_long_lines)
 			if (file_printf(ms, ", with very long lines") == -1)
 				goto done;
@@ -309,28 +296,11 @@ subtype_identified:
 	}
 	rv = 1;
 done:
-	if (utf8_buf)
-		free(utf8_buf);
+	free(utf8_buf);
 
 	return rv;
 }
 
-private int
-ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
-{
-	size_t i;
-
-	for (i = 0; i < ulen; i++) {
-		if (s[i] != us[i])
-			return 0;
-	}
-
-	if (s[i])
-		return 0;
-	else
-		return 1;
-}
-
 /*
  * Encode Unicode string as UTF-8, returning pointer to character
  * after end of string, or NULL if an invalid character is found.