From 1edf24275a071892cdbff4ba07879dea75987a51 Mon Sep 17 00:00:00 2001
From: wosch <wosch@FreeBSD.org>
Date: Sun, 13 Oct 1996 01:44:43 +0000
Subject: 8-Bit character support.

Old locate(1) programs still work with the new database format; they print
some garbage for 8-bit characters, but don't dump core (except maybe for char 30).

7-bit puritans should not notice any difference: same speed, and the
same database size if the database contains only ASCII characters.

Reviewed by: ache
---
 usr.bin/locate/code/locate.code.c | 82 ++++++++++++++++++++++++++-------------
 1 file changed, 56 insertions(+), 26 deletions(-)

(limited to 'usr.bin/locate/code')

diff --git a/usr.bin/locate/code/locate.code.c b/usr.bin/locate/code/locate.code.c
index 469b978..67228da 100644
--- a/usr.bin/locate/code/locate.code.c
+++ b/usr.bin/locate/code/locate.code.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 1995 Wolfram Schneider <wosch@FreeBSD.org>. Berlin.
  * Copyright (c) 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
@@ -33,7 +34,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * 	$Id: locate.code.c,v 1.4 1996/08/22 18:46:13 wosch Exp $
+ * 	$Id: locate.code.c,v 1.5 1996/08/31 14:51:18 wosch Exp $
  */
 
 #ifndef lint
@@ -72,13 +73,22 @@ static char sccsid[] = "@(#)locate.code.c	8.1 (Berkeley) 6/6/93";
  *
  *	0-28	likeliest differential counts + offset to make nonnegative
  *	30	switch code for out-of-range count to follow in next word
+ *      31      an 8-bit char follows
  *	128-255 bigram codes (128 most common, as determined by 'updatedb')
  *	32-127  single character (printable) ascii residue (ie, literal)
  *
- * SEE ALSO:	updatedb.csh, bigram.c
+ * The locate database stores any character except newline ('\n')
+ * and NUL ('\0'). The 8-bit character support doesn't waste extra
+ * space unless file names contain characters less than 32
+ * or greater than 127.
+ * 
+ *
+ * SEE ALSO:	updatedb.sh, ../bigram/locate.bigram.c
  *
  * AUTHOR:	James A. Woods, Informatics General Corp.,
  *		NASA Ames Research Center, 10/82
+ *              8-bit file name characters:
+ *              	Wolfram Schneider, Berlin September 1996
  */
 
 #include <sys/param.h>
@@ -93,14 +103,14 @@ static char sccsid[] = "@(#)locate.code.c	8.1 (Berkeley) 6/6/93";
 
 u_char buf1[MAXPATHLEN] = " ";	
 u_char buf2[MAXPATHLEN];
-char bigrams[BGBUFSIZE + 1] = { 0 };
+u_char bigrams[BGBUFSIZE + 1] = { 0 };
 
 #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */
 
 #ifdef LOOKUP
-#define BGINDEX(x) (big[(u_int)*x][(u_int)*(x+1)])
-typedef u_char bg_t;
-bg_t big[UCHAR_MAX][UCHAR_MAX];
+#define BGINDEX(x) (big[(u_char)*x][(u_char)*(x + 1)])
+typedef short bg_t;
+bg_t big[UCHAR_MAX + 1][UCHAR_MAX + 1];
 #else
 #define BGINDEX(x) bgindex(x)
 typedef int bg_t;
@@ -145,12 +155,13 @@ main(argc, argv)
 
 #ifdef LOOKUP
 	/* init lookup table */
-	for (i = 0; i < UCHAR_MAX; i++)
-	    	for (j = 0; j < UCHAR_MAX; j++) 
+	for (i = 0; i < UCHAR_MAX + 1; i++)
+	    	for (j = 0; j < UCHAR_MAX + 1; j++) 
 			big[i][j] = (bg_t)-1;
 
 	for (cp = bigrams, i = 0; *cp != '\0'; i += 2, cp += 2)
-	        big[(int)*cp][(int)*(cp + 1)] = (bg_t)i;
+	        big[(u_char)*cp][(u_char)*(cp + 1)] = (bg_t)i;
+
 #endif /* LOOKUP */
 
 	oldpath = buf1;
@@ -159,22 +170,21 @@ main(argc, argv)
 
 	while (fgets(path, sizeof(buf2), stdin) != NULL) {
 
-	    	/* skip empty lines */
+		/* skip empty lines */
 		if (*path == '\n')
 			continue;
 
-		/* Squelch characters that would botch the decoding. */
+		/* remove newline */
 		for (cp = path; *cp != '\0'; cp++) {
 			/* chop newline */
 			if (*cp == '\n')
 				*cp = '\0';
-			/* range */
-			else if (*cp < ASCII_MIN || *cp > ASCII_MAX)
-				*cp = '?';
 		}
 
 		/* Skip longest common prefix. */
-		for (cp = path; *cp == *oldpath && *cp != '\0'; cp++, oldpath++);
+		for (cp = path; *cp == *oldpath; cp++, oldpath++)
+			if (*cp == '\0')
+				break;
 
 		count = cp - path;
 		diffcount = count - oldcount + OFFSET;
@@ -188,22 +198,42 @@ main(argc, argv)
 				err(1, "stdout");
 
 		while (*cp != '\0') {
-			if (*(cp + 1) == '\0') {
-				if (putchar(*cp) == EOF)
-					err(1, "stdout");
-				break;
-			}
-			if ((code = BGINDEX(cp)) == (bg_t)-1) {
-				if (putchar(*cp++) == EOF ||
-				    putchar(*cp++) == EOF)
-					err(1, "stdout");
-			} else {
-				/* Found, so mark byte with parity bit. */
+			/* print *two* characters */
+
+			if ((code = BGINDEX(cp)) != (bg_t)-1) {
+				/*
+				 * print *one* as bigram
+				 * Found, so mark byte with 
+				 *  parity bit. 
+				 */
 				if (putchar((code / 2) | PARITY) == EOF)
 					err(1, "stdout");
 				cp += 2;
 			}
+
+			else {
+				for (i = 0; i < 2; i++) {
+					if (*cp == '\0')
+						break;
+
+					/* print umlauts in file names */
+					if (*cp < ASCII_MIN || 
+					    *cp > ASCII_MAX) {
+						if (putchar(UMLAUT) == EOF ||
+						    putchar(*cp++) == EOF)
+							err(1, "stdout");
+					} 
+
+					else {
+						/* normal character */
+						if(putchar(*cp++) == EOF)
+							err(1, "stdout");
+					}
+				}
+
+			}
 		}
+
 		if (path == buf1) {		/* swap pointers */
 			path = buf2;
 			oldpath = buf1;
-- 
cgit v1.1