1 files changed, 20 insertions, 4 deletions
diff --git a/contrib/file/encoding.c b/contrib/file/encoding.c
index 4e94f9b..dee57a6 100644
--- a/contrib/file/encoding.c
+++ b/contrib/file/encoding.c
@@ -35,7 +35,7 @@
 #include "file.h"
 
 #ifndef	lint
-FILE_RCSID("@(#)$File: encoding.c,v 1.3 2009/02/03 20:27:51 christos Exp $")
+FILE_RCSID("@(#)$File: encoding.c,v 1.7 2012/01/24 19:02:02 christos Exp $")
 #endif	/* lint */
 
 #include "magic.h"
@@ -52,6 +52,12 @@ private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
 private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
 private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
 
+#ifdef DEBUG_ENCODING
+#define DPRINTF(a) printf a
+#else
+#define DPRINTF(a)
+#endif
+
 /*
  * Try to determine whether text is in some character code we can
  * identify.  Each of these tests, if it succeeds, will leave
@@ -65,6 +71,7 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni
 	int rv = 1, ucs_type;
 	unsigned char *nbuf = NULL;
 
+	*type = "text";
 	mlen = (nbytes + 1) * sizeof(nbuf[0]);
 	if ((nbuf = CAST(unsigned char *, calloc((size_t)1, mlen))) == NULL) {
 		file_oomem(ms, mlen);
@@ -76,14 +83,17 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni
 		goto done;
 	}
 
-	*type = "text";
 	if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
+		DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
 		*code = "ASCII";
 		*code_mime = "us-ascii";
 	} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
+		DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
 		*code = "UTF-8 Unicode (with BOM)";
 		*code_mime = "utf-8";
 	} else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
+		DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
+		*code = "UTF-8 Unicode (with BOM)";
 		*code = "UTF-8 Unicode";
 		*code_mime = "utf-8";
 	} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
@@ -94,30 +104,36 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni
 			*code = "Big-endian UTF-16 Unicode";
 			*code_mime = "utf-16be";
 		}
+		DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
 	} else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
+		DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
 		*code = "ISO-8859";
 		*code_mime = "iso-8859-1";
 	} else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
+		DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
 		*code = "Non-ISO extended-ASCII";
 		*code_mime = "unknown-8bit";
 	} else {
 		from_ebcdic(buf, nbytes, nbuf);
 
 		if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
+			DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
 			*code = "EBCDIC";
 			*code_mime = "ebcdic";
 		} else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
+			DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
+			    *ulen));
 			*code = "International EBCDIC";
 			*code_mime = "ebcdic";
 		} else { /* Doesn't look like text at all */
+			DPRINTF(("binary\n"));
 			rv = 0;
 			*type = "binary";
 		}
 	}
 
  done:
-	if (nbuf)
-		free(nbuf);
+	free(nbuf);
 
 	return rv;
 }