diff options
Diffstat (limited to 'contrib/file/encoding.c')
-rw-r--r-- | contrib/file/encoding.c | 24 |
1 files changed, 20 insertions, 4 deletions
diff --git a/contrib/file/encoding.c b/contrib/file/encoding.c index 4e94f9b..dee57a6 100644 --- a/contrib/file/encoding.c +++ b/contrib/file/encoding.c @@ -35,7 +35,7 @@ #include "file.h" #ifndef lint -FILE_RCSID("@(#)$File: encoding.c,v 1.3 2009/02/03 20:27:51 christos Exp $") +FILE_RCSID("@(#)$File: encoding.c,v 1.7 2012/01/24 19:02:02 christos Exp $") #endif /* lint */ #include "magic.h" @@ -52,6 +52,12 @@ private int looks_latin1(const unsigned char *, size_t, unichar *, size_t *); private int looks_extended(const unsigned char *, size_t, unichar *, size_t *); private void from_ebcdic(const unsigned char *, size_t, unsigned char *); +#ifdef DEBUG_ENCODING +#define DPRINTF(a) printf a +#else +#define DPRINTF(a) +#endif + /* * Try to determine whether text is in some character code we can * identify. Each of these tests, if it succeeds, will leave @@ -65,6 +71,7 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni int rv = 1, ucs_type; unsigned char *nbuf = NULL; + *type = "text"; mlen = (nbytes + 1) * sizeof(nbuf[0]); if ((nbuf = CAST(unsigned char *, calloc((size_t)1, mlen))) == NULL) { file_oomem(ms, mlen); @@ -76,14 +83,17 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni goto done; } - *type = "text"; if (looks_ascii(buf, nbytes, *ubuf, ulen)) { + DPRINTF(("ascii %" SIZE_T_FORMAT "u\n", *ulen)); *code = "ASCII"; *code_mime = "us-ascii"; } else if (looks_utf8_with_BOM(buf, nbytes, *ubuf, ulen) > 0) { + DPRINTF(("utf8/bom %" SIZE_T_FORMAT "u\n", *ulen)); *code = "UTF-8 Unicode (with BOM)"; *code_mime = "utf-8"; } else if (file_looks_utf8(buf, nbytes, *ubuf, ulen) > 1) { + DPRINTF(("utf8 %" SIZE_T_FORMAT "u\n", *ulen)); + *code = "UTF-8 Unicode (with BOM)"; *code = "UTF-8 Unicode"; *code_mime = "utf-8"; } else if ((ucs_type = looks_ucs16(buf, nbytes, *ubuf, ulen)) != 0) { @@ -94,30 +104,36 @@ file_encoding(struct magic_set *ms, const unsigned char *buf, size_t nbytes, uni *code = "Big-endian UTF-16 Unicode"; *code_mime = "utf-16be"; } + DPRINTF(("ucs16 %" SIZE_T_FORMAT "u\n", *ulen)); } else if (looks_latin1(buf, nbytes, *ubuf, ulen)) { + DPRINTF(("latin1 %" SIZE_T_FORMAT "u\n", *ulen)); *code = "ISO-8859"; *code_mime = "iso-8859-1"; } else if (looks_extended(buf, nbytes, *ubuf, ulen)) { + DPRINTF(("extended %" SIZE_T_FORMAT "u\n", *ulen)); *code = "Non-ISO extended-ASCII"; *code_mime = "unknown-8bit"; } else { from_ebcdic(buf, nbytes, nbuf); if (looks_ascii(nbuf, nbytes, *ubuf, ulen)) { + DPRINTF(("ebcdic %" SIZE_T_FORMAT "u\n", *ulen)); *code = "EBCDIC"; *code_mime = "ebcdic"; } else if (looks_latin1(nbuf, nbytes, *ubuf, ulen)) { + DPRINTF(("ebcdic/international %" SIZE_T_FORMAT "u\n", + *ulen)); *code = "International EBCDIC"; *code_mime = "ebcdic"; } else { /* Doesn't look like text at all */ + DPRINTF(("binary\n")); rv = 0; *type = "binary"; } } done: - if (nbuf) - free(nbuf); + free(nbuf); return rv; } |