From 1edf24275a071892cdbff4ba07879dea75987a51 Mon Sep 17 00:00:00 2001 From: wosch Date: Sun, 13 Oct 1996 01:44:43 +0000 Subject: 8-Bit character support. Old locate(1) programs still work with the new database format; they print some garbage for 8-bit characters but don't dump core (except possibly for char 30). 7-bit purists should not notice any difference: same speed, and the same database size if the database contains only ASCII characters. Reviewed by: ache --- usr.bin/locate/code/locate.code.c | 82 ++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 26 deletions(-) (limited to 'usr.bin/locate/code') diff --git a/usr.bin/locate/code/locate.code.c b/usr.bin/locate/code/locate.code.c index 469b978..67228da 100644 --- a/usr.bin/locate/code/locate.code.c +++ b/usr.bin/locate/code/locate.code.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 1995 Wolfram Schneider . Berlin. * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * @@ -33,7 +34,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: locate.code.c,v 1.4 1996/08/22 18:46:13 wosch Exp $ + * $Id: locate.code.c,v 1.5 1996/08/31 14:51:18 wosch Exp $ */ #ifndef lint @@ -72,13 +73,22 @@ static char sccsid[] = "@(#)locate.code.c 8.1 (Berkeley) 6/6/93"; * * 0-28 likeliest differential counts + offset to make nonnegative * 30 switch code for out-of-range count to follow in next word + * 31 an 8 bit char followed * 128-255 bigram codes (128 most common, as determined by 'updatedb') * 32-127 single character (printable) ascii residue (ie, literal) * - * SEE ALSO: updatedb.csh, bigram.c + * The locate database store any character except newline ('\n') + * and NUL ('\0'). The 8-bit character support don't wast extra + * space until you have characters in file names less than 32 + * or greather than 127. + * + * + * SEE ALSO: updatedb.sh, ../bigram/locate.bigram.c * * AUTHOR: James A. 
Woods, Informatics General Corp., * NASA Ames Research Center, 10/82 + * 8-bit file names characters: + * Wolfram Schneider, Berlin September 1996 */ #include @@ -93,14 +103,14 @@ static char sccsid[] = "@(#)locate.code.c 8.1 (Berkeley) 6/6/93"; u_char buf1[MAXPATHLEN] = " "; u_char buf2[MAXPATHLEN]; -char bigrams[BGBUFSIZE + 1] = { 0 }; +u_char bigrams[BGBUFSIZE + 1] = { 0 }; #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */ #ifdef LOOKUP -#define BGINDEX(x) (big[(u_int)*x][(u_int)*(x+1)]) -typedef u_char bg_t; -bg_t big[UCHAR_MAX][UCHAR_MAX]; +#define BGINDEX(x) (big[(u_char)*x][(u_char)*(x + 1)]) +typedef short bg_t; +bg_t big[UCHAR_MAX + 1][UCHAR_MAX + 1]; #else #define BGINDEX(x) bgindex(x) typedef int bg_t; @@ -145,12 +155,13 @@ main(argc, argv) #ifdef LOOKUP /* init lookup table */ - for (i = 0; i < UCHAR_MAX; i++) - for (j = 0; j < UCHAR_MAX; j++) + for (i = 0; i < UCHAR_MAX + 1; i++) + for (j = 0; j < UCHAR_MAX + 1; j++) big[i][j] = (bg_t)-1; for (cp = bigrams, i = 0; *cp != '\0'; i += 2, cp += 2) - big[(int)*cp][(int)*(cp + 1)] = (bg_t)i; + big[(u_char)*cp][(u_char)*(cp + 1)] = (bg_t)i; + #endif /* LOOKUP */ oldpath = buf1; @@ -159,22 +170,21 @@ main(argc, argv) while (fgets(path, sizeof(buf2), stdin) != NULL) { - /* skip empty lines */ + /* skip empty lines */ if (*path == '\n') continue; - /* Squelch characters that would botch the decoding. */ + /* remove newline */ for (cp = path; *cp != '\0'; cp++) { /* chop newline */ if (*cp == '\n') *cp = '\0'; - /* range */ - else if (*cp < ASCII_MIN || *cp > ASCII_MAX) - *cp = '?'; } /* Skip longest common prefix. 
*/ - for (cp = path; *cp == *oldpath && *cp != '\0'; cp++, oldpath++); + for (cp = path; *cp == *oldpath; cp++, oldpath++) + if (*cp == '\0') + break; count = cp - path; diffcount = count - oldcount + OFFSET; @@ -188,22 +198,42 @@ main(argc, argv) err(1, "stdout"); while (*cp != '\0') { - if (*(cp + 1) == '\0') { - if (putchar(*cp) == EOF) - err(1, "stdout"); - break; - } - if ((code = BGINDEX(cp)) == (bg_t)-1) { - if (putchar(*cp++) == EOF || - putchar(*cp++) == EOF) - err(1, "stdout"); - } else { - /* Found, so mark byte with parity bit. */ + /* print *two* characters */ + + if ((code = BGINDEX(cp)) != (bg_t)-1) { + /* + * print *one* as bigram + * Found, so mark byte with + * parity bit. + */ if (putchar((code / 2) | PARITY) == EOF) err(1, "stdout"); cp += 2; } + + else { + for (i = 0; i < 2; i++) { + if (*cp == '\0') + break; + + /* print umlauts in file names */ + if (*cp < ASCII_MIN || + *cp > ASCII_MAX) { + if (putchar(UMLAUT) == EOF || + putchar(*cp++) == EOF) + err(1, "stdout"); + } + + else { + /* normal character */ + if(putchar(*cp++) == EOF) + err(1, "stdout"); + } + } + + } } + if (path == buf1) { /* swap pointers */ path = buf2; oldpath = buf1; -- cgit v1.1