summaryrefslogtreecommitdiffstats
path: root/contrib/file/ascmagic.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/file/ascmagic.c')
-rw-r--r--contrib/file/ascmagic.c136
1 files changed, 53 insertions, 83 deletions
diff --git a/contrib/file/ascmagic.c b/contrib/file/ascmagic.c
index 9236fb4..5a1caac 100644
--- a/contrib/file/ascmagic.c
+++ b/contrib/file/ascmagic.c
@@ -26,8 +26,7 @@
* SUCH DAMAGE.
*/
/*
- * ASCII magic -- file types that we know based on keywords
- * that can appear anywhere in the file.
+ * ASCII magic -- try to detect text encoding.
*
* Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
* to handle character codes other than ASCII on a unified basis.
@@ -36,7 +35,7 @@
#include "file.h"
#ifndef lint
-FILE_RCSID("@(#)$File: ascmagic.c,v 1.75 2009/02/03 20:27:51 christos Exp $")
+FILE_RCSID("@(#)$File: ascmagic.c,v 1.84 2011/12/08 12:38:24 rrt Exp $")
#endif /* lint */
#include "magic.h"
@@ -47,13 +46,11 @@ FILE_RCSID("@(#)$File: ascmagic.c,v 1.75 2009/02/03 20:27:51 christos Exp $")
#ifdef HAVE_UNISTD_H
#include <unistd.h>
#endif
-#include "names.h"
#define MAXLINELEN 300 /* longest sane line length */
#define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
|| (x) == 0x85 || (x) == '\f')
-private int ascmatch(const unsigned char *, const unichar *, size_t);
private unsigned char *encode_utf8(unsigned char *, size_t, unichar *, size_t);
private size_t trim_nuls(const unsigned char *, size_t);
@@ -71,7 +68,8 @@ trim_nuls(const unsigned char *buf, size_t nbytes)
}
protected int
-file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
+file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes,
+ int text)
{
unichar *ubuf = NULL;
size_t ulen;
@@ -88,17 +86,13 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
/* If file doesn't look like any sort of text, give up. */
if (file_encoding(ms, buf, nbytes, &ubuf, &ulen, &code, &code_mime,
- &type) == 0) {
+ &type) == 0)
rv = 0;
- goto done;
- }
-
- rv = file_ascmagic_with_encoding(ms, buf, nbytes, ubuf, ulen, code,
- type);
+ else
+ rv = file_ascmagic_with_encoding(ms, buf, nbytes, ubuf, ulen, code,
+ type, text);
- done:
- if (ubuf)
- free(ubuf);
+ free(ubuf);
return rv;
}
@@ -106,11 +100,10 @@ file_ascmagic(struct magic_set *ms, const unsigned char *buf, size_t nbytes)
protected int
file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
size_t nbytes, unichar *ubuf, size_t ulen, const char *code,
- const char *type)
+ const char *type, int text)
{
unsigned char *utf8_buf = NULL, *utf8_end;
size_t mlen, i;
- const struct names *p;
int rv = -1;
int mime = ms->flags & MAGIC_MIME;
@@ -125,6 +118,7 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
int n_lf = 0;
int n_cr = 0;
int n_nel = 0;
+ int executable = 0;
size_t last_line_end = (size_t)-1;
int has_long_lines = 0;
@@ -140,56 +134,23 @@ file_ascmagic_with_encoding(struct magic_set *ms, const unsigned char *buf,
goto done;
}
- /* Convert ubuf to UTF-8 and try text soft magic */
- /* malloc size is a conservative overestimate; could be
- improved, or at least realloced after conversion. */
- mlen = ulen * 6;
- if ((utf8_buf = CAST(unsigned char *, malloc(mlen))) == NULL) {
- file_oomem(ms, mlen);
- goto done;
- }
- if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen)) == NULL)
- goto done;
- if ((rv = file_softmagic(ms, utf8_buf, (size_t)(utf8_end - utf8_buf),
- TEXTTEST)) != 0)
- goto done;
- else
- rv = -1;
-
- /* look for tokens from names.h - this is expensive! */
- if ((ms->flags & MAGIC_NO_CHECK_TOKENS) != 0)
- goto subtype_identified;
-
- i = 0;
- while (i < ulen) {
- size_t end;
-
- /* skip past any leading space */
- while (i < ulen && ISSPC(ubuf[i]))
- i++;
- if (i >= ulen)
- break;
-
- /* find the next whitespace */
- for (end = i + 1; end < nbytes; end++)
- if (ISSPC(ubuf[end]))
- break;
-
- /* compare the word thus isolated against the token list */
- for (p = names; p < names + NNAMES; p++) {
- if (ascmatch((const unsigned char *)p->name, ubuf + i,
- end - i)) {
- subtype = types[p->type].human;
- subtype_mime = types[p->type].mime;
- goto subtype_identified;
- }
+ if ((ms->flags & MAGIC_NO_CHECK_SOFT) == 0) {
+ /* Convert ubuf to UTF-8 and try text soft magic */
+ /* malloc size is a conservative overestimate; could be
+ improved, or at least realloced after conversion. */
+ mlen = ulen * 6;
+ if ((utf8_buf = CAST(unsigned char *, malloc(mlen))) == NULL) {
+ file_oomem(ms, mlen);
+ goto done;
}
-
- i = end;
+ if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen))
+ == NULL)
+ goto done;
+ if ((rv = file_softmagic(ms, utf8_buf,
+ (size_t)(utf8_end - utf8_buf), TEXTTEST, text)) == 0)
+ rv = -1;
}
-subtype_identified:
-
/* Now try to discover other details about the file. */
for (i = 0; i < ulen; i++) {
if (ubuf[i] == '\n') {
@@ -232,7 +193,7 @@ subtype_identified:
goto done;
}
if (mime) {
- if ((mime & MAGIC_MIME_TYPE) != 0) {
+ if (!file_printedlen(ms) && (mime & MAGIC_MIME_TYPE) != 0) {
if (subtype_mime) {
if (file_printf(ms, "%s", subtype_mime) == -1)
goto done;
@@ -242,6 +203,28 @@ subtype_identified:
}
}
} else {
+ if (file_printedlen(ms)) {
+ switch (file_replace(ms, " text$", ", ")) {
+ case 0:
+ switch (file_replace(ms, " text executable$",
+ ", ")) {
+ case 0:
+ if (file_printf(ms, ", ") == -1)
+ goto done;
+ case -1:
+ goto done;
+ default:
+ executable = 1;
+ break;
+ }
+ break;
+ case -1:
+ goto done;
+ default:
+ break;
+ }
+ }
+
if (file_printf(ms, "%s", code) == -1)
goto done;
@@ -253,6 +236,10 @@ subtype_identified:
if (file_printf(ms, " %s", type) == -1)
goto done;
+ if (executable)
+ if (file_printf(ms, " executable") == -1)
+ goto done;
+
if (has_long_lines)
if (file_printf(ms, ", with very long lines") == -1)
goto done;
@@ -309,28 +296,11 @@ subtype_identified:
}
rv = 1;
done:
- if (utf8_buf)
- free(utf8_buf);
+ free(utf8_buf);
return rv;
}
-private int
-ascmatch(const unsigned char *s, const unichar *us, size_t ulen)
-{
- size_t i;
-
- for (i = 0; i < ulen; i++) {
- if (s[i] != us[i])
- return 0;
- }
-
- if (s[i])
- return 0;
- else
- return 1;
-}
-
/*
* Encode Unicode string as UTF-8, returning pointer to character
* after end of string, or NULL if an invalid character is found.
OpenPOWER on IntegriCloud