diff options
author | joerg <joerg@FreeBSD.org> | 1996-12-11 14:09:12 +0000 |
---|---|---|
committer | joerg <joerg@FreeBSD.org> | 1996-12-11 14:09:12 +0000 |
commit | 2f3249e30505ccc6b0eb2fd1b892f3e2813453ce (patch) | |
tree | 97aba5034f2e97275813d181d18b4b2128f57e57 /usr.bin/file | |
parent | ee57f46ce0a7ad65af6b01f68b94eb2b9d315d79 (diff) | |
download | FreeBSD-src-2f3249e30505ccc6b0eb2fd1b892f3e2813453ce.zip FreeBSD-src-2f3249e30505ccc6b0eb2fd1b892f3e2813453ce.tar.gz |
Add another matching algorithm to do heuristics for international
language text files.
Should finally close PR # bin/1925: file does not consider cyrillic
text..., though I've never got any response from the originator about
my suggestion.
While I was at it, also move the `magic' file out to /usr/share/misc;
there's nothing so magic about this file as to justify its life under
/etc.
Diffstat (limited to 'usr.bin/file')
-rw-r--r-- | usr.bin/file/Makefile | 8 | ||||
-rw-r--r-- | usr.bin/file/file.c | 6 | ||||
-rw-r--r-- | usr.bin/file/file.h | 3 | ||||
-rw-r--r-- | usr.bin/file/international.c | 72 |
4 files changed, 83 insertions, 6 deletions
diff --git a/usr.bin/file/Makefile b/usr.bin/file/Makefile index efac409..d1ff65d 100644 --- a/usr.bin/file/Makefile +++ b/usr.bin/file/Makefile @@ -1,6 +1,6 @@ # Makefile for file(1) cmd. # Copyright (c) Ian F. Darwin 86/09/01 - see LEGAL.NOTICE. -# @(#)$Id: Makefile,v 1.4 1995/07/25 00:36:03 bde Exp $ +# @(#)$Id: Makefile,v 1.5 1996/08/17 22:27:08 wosch Exp $ # # This software is not subject to any license of the American Telephone # and Telegraph Company or of the Regents of the University of California. @@ -23,7 +23,7 @@ # 4. This notice may not be removed or altered. # # Hacked and dismembered for bmake (Geoff Rehmet). -MAGIC= /etc/magic +MAGIC= /usr/share/misc/magic MAGICOWN= bin MAGICGRP= bin MAGICMODE= 444 @@ -33,7 +33,7 @@ CFLAGS+= -DMAGIC='"$(MAGIC)"' PROG= file SRCS= file.c apprentice.c fsmagic.c softmagic.c ascmagic.c \ - compress.c is_tar.c print.c + compress.c is_tar.c print.c international.c MAN1= file.1 MAN5= magic.5 @@ -51,7 +51,7 @@ magic: $(MAGFILES) cat $(MAGFILES) > $(.TARGET) # called from /usr/src/etc/Makefile -etc-magic: +beforeinstall: ${INSTALL} -c -o $(MAGICOWN) -g $(MAGICGRP) -m $(MAGICMODE) magic \ $(DESTDIR)$(MAGIC) diff --git a/usr.bin/file/file.c b/usr.bin/file/file.c index 5814255..5400083 100644 --- a/usr.bin/file/file.c +++ b/usr.bin/file/file.c @@ -26,7 +26,7 @@ */ #ifndef lint static char *moduleid = - "@(#)$Id: file.c,v 1.2 1995/05/30 06:30:01 rgrimes Exp $"; + "@(#)$Id: file.c,v 1.3 1996/01/23 12:40:11 mpp Exp $"; #endif /* lint */ #include <stdio.h> @@ -343,6 +343,10 @@ int nb, zflag; if (ascmagic(buf, nb)) return 'a'; + /* see if it's international language text */ + if (internatmagic(buf, nb)) + return 'i'; + /* abandon hope, all ye who remain here */ ckfputs("data", stdout); return '\0'; diff --git a/usr.bin/file/file.h b/usr.bin/file/file.h index 5c9888d..1ed6772 100644 --- a/usr.bin/file/file.h +++ b/usr.bin/file/file.h @@ -1,6 +1,6 @@ /* * file.h - definitions for file(1) program - * @(#)$Id: file.h,v 1.2 1995/05/30 
06:30:02 rgrimes Exp $ + * @(#)$Id: file.h,v 1.3 1996/01/23 12:40:13 mpp Exp $ * * Copyright (c) Ian F. Darwin, 1987. * Written by Ian F. Darwin. @@ -87,6 +87,7 @@ extern void error __P((const char *, ...)); extern void ckfputs __P((const char *, FILE *)); struct stat; extern int fsmagic __P((const char *, struct stat *)); +extern int internatmagic __P((unsigned char *, int)); extern int is_compress __P((const unsigned char *, int *)); extern int is_tar __P((unsigned char *, int)); extern void magwarn __P((const char *, ...)); diff --git a/usr.bin/file/international.c b/usr.bin/file/international.c new file mode 100644 index 0000000..59a508a --- /dev/null +++ b/usr.bin/file/international.c @@ -0,0 +1,72 @@ +#include "file.h" + +#include <string.h> + +#define F 0 +#define T 1 + +/* + * List of characters that look "reasonable" in international + * language texts. That's almost all characters :), except a + * few in the control range of ASCII (all the known international + * charactersets share the bottom half with ASCII). 
+ */ +static char maybe_internat[256] = { + F, F, F, F, F, F, F, F, T, T, T, T, T, T, F, F, /* 0x0X */ + F, F, F, F, F, F, F, F, F, F, F, T, F, F, F, F, /* 0x1X */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x2X */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x3X */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x4X */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x5X */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x6X */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, F, /* 0x7X */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x8X */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0x9X */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0xaX */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0xbX */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0xcX */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0xdX */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, /* 0xeX */ + T, T, T, T, T, T, T, T, T, T, T, T, T, T, T, T /* 0xfX */ +}; + +/* Maximal length of a line we consider "reasonable". */ +#define MAXLINELEN 300 + +int +internatmagic(buf, nbytes) + unsigned char *buf; + int nbytes; +{ + int i; + unsigned char *cp; + + nbytes--; + + /* First, look whether there are "unreasonable" characters. */ + for (i = 0, cp = buf; i < nbytes; i++, cp++) + if (!maybe_internat[*cp]) + return 0; + + /* + * Now, look whether the file consists of lines of + * "reasonable" length. + */ + + for (i = 0; i < nbytes;) { + cp = memchr(buf, '\n', nbytes - i); + if (cp == NULL) { + /* Don't fail if we hit the end of buffer. */ + if (i + MAXLINELEN >= nbytes) + break; + else + return 0; + } + if (cp - buf > MAXLINELEN) + return 0; + i += (cp - buf + 1); + buf = cp + 1; + } + ckfputs("International language text", stdout); + return 1; +} |