summaryrefslogtreecommitdiffstats
path: root/contrib/file/encoding.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/file/encoding.c')
-rw-r--r--contrib/file/encoding.c24
1 files changed, 20 insertions, 4 deletions
diff --git a/contrib/file/encoding.c b/contrib/file/encoding.c
index 4e94f9b..dee57a6 100644
--- a/contrib/file/encoding.c
+++ b/contrib/file/encoding.c
@@ -35,7 +35,7 @@
#include "file.h"
#ifndef lint
-FILE_RCSID("@(#)$File: encoding.c,v 1.3 2009/02/03 20:27:51 christos Exp $")
+FILE_RCSID("@(#)$File: encoding.c,v 1.7 2012/01/24 19:02:02 christos Exp $")
#endif /* lint */
#include "magic.h"
@@ -52,6 +52,12 @@ private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *);
private int looks_extended(const unsigned char *, size_t, unichar *, size_t *);
private void from_ebcdic(const unsigned char *, size_t, unsigned char *);
+#ifdef DEBUG_ENCODING
+#define DPRINTF(a) printf a
+#else
+#define DPRINTF(a)
+#endif
+
/*
* Try to determine whether text is in some character code we can
* identify. Each of these tests, if it succeeds, will leave
@@ -65,6 +71,7 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni
int rv = 1, ucs_type;
unsigned char *nbuf = NULL;
+ *type = "text";
mlen = (nbytes + 1) * sizeof(nbuf[0]);
if ((nbuf = CAST(unsigned char *, calloc((size_t)1, mlen))) == NULL) {
file_oomem(ms, mlen);
@@ -76,14 +83,17 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni
goto done;
}
- *type = "text";
if (looks_ascii(buf, nbytes, *ubuf, ulen)) {
+ DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen));
*code = "ASCII";
*code_mime = "us-ascii";
} else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) {
+ DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen));
*code = "UTF-8 Unicode (with BOM)";
*code_mime = "utf-8";
} else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) {
+ DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen));
+ *code = "UTF-8 Unicode (with BOM)";
*code = "UTF-8 Unicode";
*code_mime = "utf-8";
} else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) {
@@ -94,30 +104,36 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni
*code = "Big-endian UTF-16 Unicode";
*code_mime = "utf-16be";
}
+ DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen));
} else if (looks_latin1(buf, nbytes, *ubuf, ulen)) {
+ DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen));
*code = "ISO-8859";
*code_mime = "iso-8859-1";
} else if (looks_extended(buf, nbytes, *ubuf, ulen)) {
+ DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen));
*code = "Non-ISO extended-ASCII";
*code_mime = "unknown-8bit";
} else {
from_ebcdic(buf, nbytes, nbuf);
if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) {
+ DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen));
*code = "EBCDIC";
*code_mime = "ebcdic";
} else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) {
+ DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n",
+ *ulen));
*code = "International EBCDIC";
*code_mime = "ebcdic";
} else { /* Doesn't look like text at all */
+ DPRINTF(("binary\n"));
rv = 0;
*type = "binary";
}
}
done:
- if (nbuf)
- free(nbuf);
+ free(nbuf);
return rv;
}
OpenPOWER on IntegriCloud