8-Bit character support.

Old locate(1) programs still work with the new database format; they print some garbage for 8-bit characters, but don't dump core (except possibly for char 30). 7-bit purists should not notice any difference: same speed, and the same database size if the database contains only ASCII characters. Reviewed by: ache
author: wosch <wosch@FreeBSD.org> 1996-10-13 01:44:43 +0000
committer: wosch <wosch@FreeBSD.org> 1996-10-13 01:44:43 +0000
commit: 1edf24275a071892cdbff4ba07879dea75987a51 (patch)
tree: ba20d97a5160d5236052b18ca5c02a3b4d24bd94 /usr.bin/locate
parent: 154b44fb0a407fda8021c14d48dc08625f97bdbc (diff)
download: FreeBSD-src-1edf24275a071892cdbff4ba07879dea75987a51.zip
FreeBSD-src-1edf24275a071892cdbff4ba07879dea75987a51.tar.gz
6 files changed, 147 insertions, 67 deletions
diff --git a/usr.bin/locate/bigram/locate.bigram.c b/usr.bin/locate/bigram/locate.bigram.c
index e7d86a4..2196957 100644
--- a/usr.bin/locate/bigram/locate.bigram.c
+++ b/usr.bin/locate/bigram/locate.bigram.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 1995 Wolfram Schneider <wosch@FreeBSD.org>. Berlin.
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
@@ -33,7 +34,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * 	$Id: locate.bigram.c,v 1.1 1996/09/13 13:23:48 wosch Exp wosch $
+ * 	$Id: locate.bigram.c,v 1.7 1996/09/14 20:15:49 wosch Exp $
  */
 
 #ifndef lint
@@ -60,7 +61,7 @@ static char sccsid[] = "@(#)locate.bigram.c	8.1 (Berkeley) 6/6/93";
 
 u_char buf1[MAXPATHLEN] = " ";
 u_char buf2[MAXPATHLEN];
-u_int bigram[UCHAR_MAX][UCHAR_MAX];
+u_int bigram[UCHAR_MAX + 1][UCHAR_MAX + 1];
 
 int
 main(void)
@@ -84,7 +85,7 @@ main(void)
 				break;
 
 		while (*cp != '\0' && *(cp + 1) != '\0') {
-			bigram[(u_int)*cp][(u_int)*(cp + 1)]++;
+			bigram[(u_char)*cp][(u_char)*(cp + 1)]++;
 			cp += 2;
 		}
 
diff --git a/usr.bin/locate/code/locate.code.c b/usr.bin/locate/code/locate.code.c
index 469b978..67228da 100644
--- a/usr.bin/locate/code/locate.code.c
+++ b/usr.bin/locate/code/locate.code.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 1995 Wolfram Schneider <wosch@FreeBSD.org>. Berlin.
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
@@ -33,7 +34,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * 	$Id: locate.code.c,v 1.4 1996/08/22 18:46:13 wosch Exp $
+ * 	$Id: locate.code.c,v 1.5 1996/08/31 14:51:18 wosch Exp $
  */
 
 #ifndef lint
@@ -72,13 +73,22 @@ static char sccsid[] = "@(#)locate.code.c	8.1 (Berkeley) 6/6/93";
  *
  *	0-28	likeliest differential counts + offset to make nonnegative
  *	30	switch code for out-of-range count to follow in next word
+ *      31      an 8-bit char follows
  *	128-255 bigram codes (128 most common, as determined by 'updatedb')
  *	32-127  single character (printable) ascii residue (ie, literal)
  *
- * SEE ALSO:	updatedb.csh, bigram.c
+ * The locate database stores any character except newline ('\n')
+ * and NUL ('\0'). The 8-bit character support doesn't waste extra
+ * space unless file names contain characters less than 32
+ * or greater than 127.
+ * 
+ *
+ * SEE ALSO:	updatedb.sh, ../bigram/locate.bigram.c
  *
  * AUTHOR:	James A. Woods, Informatics General Corp.,
  *		NASA Ames Research Center, 10/82
+ *              8-bit file name characters:
+ *              	Wolfram Schneider, Berlin September 1996
  */
 
 #include <sys/param.h>
@@ -93,14 +103,14 @@ static char sccsid[] = "@(#)locate.code.c	8.1 (Berkeley) 6/6/93";
 
 u_char buf1[MAXPATHLEN] = " ";	
 u_char buf2[MAXPATHLEN];
-char bigrams[BGBUFSIZE + 1] = { 0 };
+u_char bigrams[BGBUFSIZE + 1] = { 0 };
 
 #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */
 
 #ifdef LOOKUP
-#define BGINDEX(x) (big[(u_int)*x][(u_int)*(x+1)])
-typedef u_char bg_t;
-bg_t big[UCHAR_MAX][UCHAR_MAX];
+#define BGINDEX(x) (big[(u_char)*x][(u_char)*(x + 1)])
+typedef short bg_t;
+bg_t big[UCHAR_MAX + 1][UCHAR_MAX + 1];
 #else
 #define BGINDEX(x) bgindex(x)
 typedef int bg_t;
@@ -145,12 +155,13 @@ main(argc, argv)
 
 #ifdef LOOKUP
 	/* init lookup table */
-	for (i = 0; i < UCHAR_MAX; i++)
-	    	for (j = 0; j < UCHAR_MAX; j++) 
+	for (i = 0; i < UCHAR_MAX + 1; i++)
+	    	for (j = 0; j < UCHAR_MAX + 1; j++) 
 			big[i][j] = (bg_t)-1;
 
 	for (cp = bigrams, i = 0; *cp != '\0'; i += 2, cp += 2)
-	        big[(int)*cp][(int)*(cp + 1)] = (bg_t)i;
+	        big[(u_char)*cp][(u_char)*(cp + 1)] = (bg_t)i;
+
 #endif /* LOOKUP */
 
 	oldpath = buf1;
@@ -159,22 +170,21 @@ main(argc, argv)
 
 	while (fgets(path, sizeof(buf2), stdin) != NULL) {
 
-	    	/* skip empty lines */
+		/* skip empty lines */
 		if (*path == '\n')
 			continue;
 
-		/* Squelch characters that would botch the decoding. */
+		/* remove newline */
 		for (cp = path; *cp != '\0'; cp++) {
 			/* chop newline */
 			if (*cp == '\n')
 				*cp = '\0';
-			/* range */
-			else if (*cp < ASCII_MIN || *cp > ASCII_MAX)
-				*cp = '?';
 		}
 
 		/* Skip longest common prefix. */
-		for (cp = path; *cp == *oldpath && *cp != '\0'; cp++, oldpath++);
+		for (cp = path; *cp == *oldpath; cp++, oldpath++)
+			if (*cp == '\0')
+				break;
 
 		count = cp - path;
 		diffcount = count - oldcount + OFFSET;
@@ -188,22 +198,42 @@ main(argc, argv)
 				err(1, "stdout");
 
 		while (*cp != '\0') {
-			if (*(cp + 1) == '\0') {
-				if (putchar(*cp) == EOF)
-					err(1, "stdout");
-				break;
-			}
-			if ((code = BGINDEX(cp)) == (bg_t)-1) {
-				if (putchar(*cp++) == EOF ||
-				    putchar(*cp++) == EOF)
-					err(1, "stdout");
-			} else {
-				/* Found, so mark byte with parity bit. */
+			/* print *two* characters */
+
+			if ((code = BGINDEX(cp)) != (bg_t)-1) {
+				/*
+				 * print *one* as bigram
+				 * Found, so mark byte with 
+				 *  parity bit. 
+				 */
 				if (putchar((code / 2) | PARITY) == EOF)
 					err(1, "stdout");
 				cp += 2;
 			}
+
+			else {
+				for (i = 0; i < 2; i++) {
+					if (*cp == '\0')
+						break;
+
+					/* print umlauts in file names */
+					if (*cp < ASCII_MIN || 
+					    *cp > ASCII_MAX) {
+						if (putchar(UMLAUT) == EOF ||
+						    putchar(*cp++) == EOF)
+							err(1, "stdout");
+					} 
+
+					else {
+						/* normal character */
+						if(putchar(*cp++) == EOF)
+							err(1, "stdout");
+					}
+				}
+
+			}
 		}
+
 		if (path == buf1) {		/* swap pointers */
 			path = buf2;
 			oldpath = buf1;
diff --git a/usr.bin/locate/locate/fastfind.c b/usr.bin/locate/locate/fastfind.c
index a83b660..c175c18 100644
--- a/usr.bin/locate/locate/fastfind.c
+++ b/usr.bin/locate/locate/fastfind.c
@@ -34,7 +34,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $Id: fastfind.c,v 1.1 1996/08/31 23:14:52 wosch Exp $
+ * $Id: fastfind.c,v 1.2 1996/10/09 00:33:32 wosch Exp $
  */
 
 
@@ -46,10 +46,10 @@ statistic (fp, path_fcodes)
 	FILE *fp;               /* open database */
 	char *path_fcodes;  	/* for error message */
 {
-	register int lines, chars, size, big;
+	register int lines, chars, size, big, zwerg;
 	register u_char *p, *s;
 	register int c;
-	int count;
+	int count, umlaut;
 	u_char bigram1[NBG], bigram2[NBG], path[MAXPATHLEN];
 
 	for (c = 0, p = bigram1, s = bigram2; c < NBG; c++) {
@@ -57,20 +57,27 @@ statistic (fp, path_fcodes)
 		s[c] = check_bigram_char(getc(fp));
 	}
 
-	lines = chars = big = 0;
+	lines = chars = big = zwerg = umlaut = 0;
 	size = NBG + NBG;
 
 	for (c = getc(fp), count = 0; c != EOF; size++) {
 		if (c == SWITCH) {
 			count += getwf(fp) - OFFSET;
 			size += sizeof(int);
+			zwerg++;
 		} else
 			count += c - OFFSET;
 		
 		for (p = path + count; (c = getc(fp)) > SWITCH; size++)
-			if (c < PARITY)
+			if (c < PARITY) {
+				if (c == UMLAUT) {
+					c = getc(fp);
+					size++;
+					umlaut++;
+				}
 				p++;
-			else {
+			} else {
+				/* bigram char */
 				big++;
 				p += 2;
 			}
@@ -82,13 +89,16 @@ statistic (fp, path_fcodes)
 
 	(void)printf("\nDatabase: %s\n", path_fcodes);
 	(void)printf("Compression: Front: %2.2f%%, ",
-		     (float)(100 * (size + big)) / chars);
+		     (float)(100 * (size + big - (2 * NBG))) / chars);
 	(void)printf("Bigram: %2.2f%%, ", (float)(100 * (size - big)) / size);
-	(void)printf("Total: %2.2f%%\n", (float)(100 * size) / chars);
+	(void)printf("Total: %2.2f%%\n", 
+		     (float)(100 * (size - (2 * NBG))) / chars);
 	(void)printf("Filenames: %d, ", lines);
-	(void)printf("Chars: %d\n", chars);
-	(void)printf("Database size: %d, ", size);
-	(void)printf("Bigram chars: %d\n", big);
+	(void)printf("Characters: %d, ", chars);
+	(void)printf("Database size: %d\n", size);
+	(void)printf("Bigram characters: %d, ", big);
+	(void)printf("Integers: %d, ", zwerg);
+	(void)printf("8-Bit characters: %d\n", umlaut);
 
 }
 #endif /* _LOCATE_STATISTIC_ */
@@ -102,7 +112,7 @@ void
 fastfind_mmap_icase
 #else
 fastfind_mmap
-#endif
+#endif /* FF_ICASE */
 (pathpart, paddr, len, database)
 	char *pathpart; 	/* search string */
 	caddr_t paddr;  	/* mmap pointer */
@@ -115,7 +125,7 @@ fastfind_mmap
 
 #ifdef FF_ICASE
 fastfind_icase
-#else /* !FF_ICASE */
+#else
 fastfind
 #endif /* FF_ICASE */
 
@@ -136,10 +146,10 @@ fastfind
 
 #ifdef FF_ICASE
 	/* use a lookup table for case insensitive search */
-	u_char table[UCHAR_MAX];
+	u_char table[UCHAR_MAX + 1];
 
 	tolower_word(pathpart);
-#endif
+#endif /* FF_ICASE*/
 
 	/* init bigram table */
 #ifdef FF_MMAP
@@ -157,7 +167,7 @@ fastfind
 		p[c] = check_bigram_char(getc(fp));
 		s[c] = check_bigram_char(getc(fp));
 	}
-#endif
+#endif /* FF_MMAP */
 
 	/* find optimal (last) char for searching */
 	for (p = pathpart; *p != '\0'; p++)
@@ -177,7 +187,7 @@ fastfind
 	/* set patend char to true */
 	table[TOLOWER(*patend)] = 1;
 	table[toupper(*patend)] = 1;
-#endif
+#endif /* FF_ICASE */
 
 
 	/* main loop */
@@ -185,10 +195,12 @@ fastfind
 	foundchar = 0;
 
 #ifdef FF_MMAP
-	for (c = (u_char)*paddr++; len-- > 0; ) {
+	c = (u_char)*paddr++; len--;
+	for (; len > 0; ) {
 #else
-	for (c = getc(fp); c != EOF; ) {
-#endif
+	c = getc(fp);
+	for (; c != EOF; ) {
+#endif /* FF_MMAP */
 
 		/* go forward or backward */
 		if (c == SWITCH) { /* big step, an integer */
@@ -197,7 +209,7 @@ fastfind
 			len -= INTSIZE; paddr += INTSIZE;
 #else
 			count +=  getwf(fp) - OFFSET;
-#endif
+#endif /* FF_MMAP */
 		} else {	   /* slow step, =< 14 chars */
 			count += c - OFFSET;
 		}
@@ -205,18 +217,40 @@ fastfind
 		/* overlay old path */
 		p = path + count;
 		foundchar = p - 1;
+
+		for (;;) {
 #ifdef FF_MMAP
-		for (; (c = (u_char)*paddr++) > SWITCH; len--)
+			c = (u_char)*paddr++; 
+		        len--;
 #else
-		for (; (c = getc(fp)) > SWITCH; )
-#endif
-
+			c = getc(fp);
+#endif /* FF_MMAP */
+			/*
+			 * == UMLAUT: 8 bit char followed
+			 * <= SWITCH: offset
+			 * >= PARITY: bigram
+			 * rest:      single ascii char
+			 *
+			 * offset < SWITCH < UMLAUT < ascii < PARITY < bigram
+			 */
 			if (c < PARITY) {
+				if (c <= UMLAUT) {
+					if (c == UMLAUT) {
+#ifdef FF_MMAP
+						c = (u_char)*paddr++;
+						len--;
+#else
+						c = getc(fp);
+#endif /* FF_MMAP */
+						
+					} else
+						break; /* SWITCH */
+				}
 #ifdef FF_ICASE
 				if (table[c])
 #else
 				if (c == cc)
-#endif
+#endif /* FF_ICASE */
 					foundchar = p;
 				*p++ = c;
 			}
@@ -231,13 +265,13 @@ fastfind
 
 					if (table[bigram1[c]] ||
 					    table[bigram2[c]])
-#endif
+#endif /* FF_ICASE */
 						foundchar = p + 1;
 
 				*p++ = bigram1[c];
 				*p++ = bigram2[c];
 			}
-
+		}
 		
 		if (found) {                     /* previous line matched */
 			cutoff = path;
@@ -254,14 +288,14 @@ fastfind
 			if (*s == cc
 #ifdef FF_ICASE
 			    || TOLOWER(*s) == cc
-#endif
+#endif /* FF_ICASE */
 			    ) {	/* fast first char check */
 				for (p = patend - 1, q = s - 1; *p != '\0';
 				     p--, q--)
 					if (*q != *p
 #ifdef FF_ICASE
 					    && TOLOWER(*q) != *p
-#endif
+#endif /* FF_ICASE */
 					    )
 						break;
 				if (*p == '\0') {   /* fast match success */
diff --git a/usr.bin/locate/locate/locate.1 b/usr.bin/locate/locate/locate.1
index 1c19977..9d29c71 100644
--- a/usr.bin/locate/locate/locate.1
+++ b/usr.bin/locate/locate/locate.1
@@ -31,7 +31,7 @@
 .\" SUCH DAMAGE.
 .\"
 .\"	@(#)locate.1	8.1 (Berkeley) 6/6/93
-.\"	$Id$
+.\"	$Id: locate.1,v 1.4 1996/08/31 23:14:52 wosch Exp $
 .\"
 .Dd June 6, 1993
 .Dt LOCATE 1
@@ -66,6 +66,12 @@ including slashes (``/'').
 As a special case, a pattern containing no globbing characters (``foo'')
 is matched as though it were ``*foo*''.
 
+Historically, locate stored only characters between 32 and 127.  The
+current implementation stores any character except newline ('\\n') and
+NUL ('\\0'). The 8-bit character support doesn't waste extra space for
+plain ASCII file names. Characters less than 32 or greater than 127
+are stored in 2 bytes.
+
 The following options are available:
 .Bl -tag -width 10n indent
 .It Fl S
@@ -200,7 +206,8 @@ to share the databases between machines with different byte order.
 The current 
 .Nm
 implementation understand databases in host byte order or
-network byte order. So you can read on a FreeBSD/i386 machine 
+network byte order if both architectures use the same integer size. 
+So you can read on a FreeBSD/i386 machine 
 (little endian)
 a locate database which was built on SunOS/sparc machine
 (big endian, net).
diff --git a/usr.bin/locate/locate/locate.c b/usr.bin/locate/locate/locate.c
index c983ac5..0de5df0 100644
--- a/usr.bin/locate/locate/locate.c
+++ b/usr.bin/locate/locate/locate.c
@@ -34,7 +34,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- *      $Id: locate.c,v 1.4 1996/08/31 23:14:53 wosch Exp $
+ *      $Id: locate.c,v 1.5 1996/09/16 01:17:25 wosch Exp $
  */
 
 #ifndef lint
@@ -60,6 +60,7 @@ static char sccsid[] = "@(#)locate.c    8.1 (Berkeley) 6/6/93";
  *
  *      0-28    likeliest differential counts + offset to make nonnegative
  *      30      switch code for out-of-range count to follow in next word
+ *      31      an 8-bit char follows
  *      128-255 bigram codes (128 most common, as determined by 'updatedb')
  *      32-127  single character (printable) ascii residue (ie, literal)
  *
@@ -76,19 +77,22 @@ static char sccsid[] = "@(#)locate.c    8.1 (Berkeley) 6/6/93";
  */
 
 #include <sys/param.h>
+#include <ctype.h>
+#include <err.h>
 #include <fnmatch.h>
-#include <unistd.h>
+#include <locale.h>
 #include <stdio.h>
-#include <string.h>
 #include <stdlib.h>
-#include <ctype.h>
+#include <string.h>
+#include <unistd.h>
+
 #ifdef MMAP
 #  include <sys/types.h>
 #  include <sys/stat.h>
 #  include <sys/mman.h>
 #  include <fcntl.h>
 #endif
-#include <err.h>
+
 
 #ifdef sun
 #include <netinet/in.h> /* SunOS byteorder(3) htohl(3) */
@@ -148,6 +152,7 @@ main(argc, argv)
 #ifdef MMAP
         f_mmap = 1;		/* mmap is default */
 #endif
+	(void) setlocale(LC_ALL, "");
 
         while ((ch = getopt(argc, argv, "Scd:il:ms")) != EOF)
                 switch(ch) {
@@ -198,7 +203,7 @@ main(argc, argv)
         }
 
         if (f_icase && UCHAR_MAX < 4096) /* init tolower lookup table */
-                for (ch = 0; ch <= UCHAR_MAX; ch++)
+                for (ch = 0; ch < UCHAR_MAX + 1; ch++)
                         myctype[ch] = tolower(ch);
 
         /* foreach database ... */
diff --git a/usr.bin/locate/locate/locate.h b/usr.bin/locate/locate/locate.h
index cb7d087..9e997d4 100644
--- a/usr.bin/locate/locate/locate.h
+++ b/usr.bin/locate/locate/locate.h
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 1995 Wolfram Schneider <wosch@FreeBSD.org>. Berlin.
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
@@ -31,6 +32,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)locate.h	8.1 (Berkeley) 6/6/93
+ *	$Id$
  */
 
 /* Symbolic constants shared by locate.c and code.c */
@@ -39,6 +41,7 @@
 #define	OFFSET		14		/* abs value of max likely diff */
 #define	PARITY		0200		/* parity bit */
 #define	SWITCH		30		/* switch code */
+#define UMLAUT          31              /* an 8 bit char followed */
 
 /* 	0-28	likeliest differential counts + offset to make nonnegative */
 #define LDC_MIN         0
author	wosch <wosch@FreeBSD.org>	1996-10-13 01:44:43 +0000
committer	wosch <wosch@FreeBSD.org>	1996-10-13 01:44:43 +0000
commit	1edf24275a071892cdbff4ba07879dea75987a51 (patch)
tree	ba20d97a5160d5236052b18ca5c02a3b4d24bd94 /usr.bin/locate
parent	154b44fb0a407fda8021c14d48dc08625f97bdbc (diff)
download	FreeBSD-src-1edf24275a071892cdbff4ba07879dea75987a51.zip FreeBSD-src-1edf24275a071892cdbff4ba07879dea75987a51.tar.gz