diff options
author | wosch <wosch@FreeBSD.org> | 1996-10-13 01:44:43 +0000 |
---|---|---|
committer | wosch <wosch@FreeBSD.org> | 1996-10-13 01:44:43 +0000 |
commit | 1edf24275a071892cdbff4ba07879dea75987a51 (patch) | |
tree | ba20d97a5160d5236052b18ca5c02a3b4d24bd94 /usr.bin/locate | |
parent | 154b44fb0a407fda8021c14d48dc08625f97bdbc (diff) | |
download | FreeBSD-src-1edf24275a071892cdbff4ba07879dea75987a51.zip FreeBSD-src-1edf24275a071892cdbff4ba07879dea75987a51.tar.gz |
8-Bit character support.
Old locate(1) programs still work with the new database format; they print
some garbage for 8-bit characters, but don't dump core (except maybe for char 30).
7-bit puritans should not notice any difference: same speed, and the
same database size if the database contains only ASCII characters.
Reviewed by: ache
Diffstat (limited to 'usr.bin/locate')
-rw-r--r-- | usr.bin/locate/bigram/locate.bigram.c | 7 | ||||
-rw-r--r-- | usr.bin/locate/code/locate.code.c | 82 | ||||
-rw-r--r-- | usr.bin/locate/locate/fastfind.c | 94 | ||||
-rw-r--r-- | usr.bin/locate/locate/locate.1 | 11 | ||||
-rw-r--r-- | usr.bin/locate/locate/locate.c | 17 | ||||
-rw-r--r-- | usr.bin/locate/locate/locate.h | 3 |
6 files changed, 147 insertions, 67 deletions
diff --git a/usr.bin/locate/bigram/locate.bigram.c b/usr.bin/locate/bigram/locate.bigram.c index e7d86a4..2196957 100644 --- a/usr.bin/locate/bigram/locate.bigram.c +++ b/usr.bin/locate/bigram/locate.bigram.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 1995 Wolfram Schneider <wosch@FreeBSD.org>. Berlin. * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * @@ -33,7 +34,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: locate.bigram.c,v 1.1 1996/09/13 13:23:48 wosch Exp wosch $ + * $Id: locate.bigram.c,v 1.7 1996/09/14 20:15:49 wosch Exp $ */ #ifndef lint @@ -60,7 +61,7 @@ static char sccsid[] = "@(#)locate.bigram.c 8.1 (Berkeley) 6/6/93"; u_char buf1[MAXPATHLEN] = " "; u_char buf2[MAXPATHLEN]; -u_int bigram[UCHAR_MAX][UCHAR_MAX]; +u_int bigram[UCHAR_MAX + 1][UCHAR_MAX + 1]; int main(void) @@ -84,7 +85,7 @@ main(void) break; while (*cp != '\0' && *(cp + 1) != '\0') { - bigram[(u_int)*cp][(u_int)*(cp + 1)]++; + bigram[(u_char)*cp][(u_char)*(cp + 1)]++; cp += 2; } diff --git a/usr.bin/locate/code/locate.code.c b/usr.bin/locate/code/locate.code.c index 469b978..67228da 100644 --- a/usr.bin/locate/code/locate.code.c +++ b/usr.bin/locate/code/locate.code.c @@ -1,4 +1,5 @@ /* + * Copyright (c) 1995 Wolfram Schneider <wosch@FreeBSD.org>. Berlin. * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * @@ -33,7 +34,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $Id: locate.code.c,v 1.4 1996/08/22 18:46:13 wosch Exp $ + * $Id: locate.code.c,v 1.5 1996/08/31 14:51:18 wosch Exp $ */ #ifndef lint @@ -72,13 +73,22 @@ static char sccsid[] = "@(#)locate.code.c 8.1 (Berkeley) 6/6/93"; * * 0-28 likeliest differential counts + offset to make nonnegative * 30 switch code for out-of-range count to follow in next word + * 31 an 8 bit char followed * 128-255 bigram codes (128 most common, as determined by 'updatedb') * 32-127 single character (printable) ascii residue (ie, literal) * - * SEE ALSO: updatedb.csh, bigram.c + * The locate database store any character except newline ('\n') + * and NUL ('\0'). The 8-bit character support don't wast extra + * space until you have characters in file names less than 32 + * or greather than 127. + * + * + * SEE ALSO: updatedb.sh, ../bigram/locate.bigram.c * * AUTHOR: James A. Woods, Informatics General Corp., * NASA Ames Research Center, 10/82 + * 8-bit file names characters: + * Wolfram Schneider, Berlin September 1996 */ #include <sys/param.h> @@ -93,14 +103,14 @@ static char sccsid[] = "@(#)locate.code.c 8.1 (Berkeley) 6/6/93"; u_char buf1[MAXPATHLEN] = " "; u_char buf2[MAXPATHLEN]; -char bigrams[BGBUFSIZE + 1] = { 0 }; +u_char bigrams[BGBUFSIZE + 1] = { 0 }; #define LOOKUP 1 /* use a lookup array instead a function, 3x faster */ #ifdef LOOKUP -#define BGINDEX(x) (big[(u_int)*x][(u_int)*(x+1)]) -typedef u_char bg_t; -bg_t big[UCHAR_MAX][UCHAR_MAX]; +#define BGINDEX(x) (big[(u_char)*x][(u_char)*(x + 1)]) +typedef short bg_t; +bg_t big[UCHAR_MAX + 1][UCHAR_MAX + 1]; #else #define BGINDEX(x) bgindex(x) typedef int bg_t; @@ -145,12 +155,13 @@ main(argc, argv) #ifdef LOOKUP /* init lookup table */ - for (i = 0; i < UCHAR_MAX; i++) - for (j = 0; j < UCHAR_MAX; j++) + for (i = 0; i < UCHAR_MAX + 1; i++) + for (j = 0; j < UCHAR_MAX + 1; j++) big[i][j] = (bg_t)-1; for (cp = bigrams, i = 0; *cp != '\0'; i += 2, cp += 2) - big[(int)*cp][(int)*(cp + 1)] = (bg_t)i; + big[(u_char)*cp][(u_char)*(cp + 
1)] = (bg_t)i; + #endif /* LOOKUP */ oldpath = buf1; @@ -159,22 +170,21 @@ main(argc, argv) while (fgets(path, sizeof(buf2), stdin) != NULL) { - /* skip empty lines */ + /* skip empty lines */ if (*path == '\n') continue; - /* Squelch characters that would botch the decoding. */ + /* remove newline */ for (cp = path; *cp != '\0'; cp++) { /* chop newline */ if (*cp == '\n') *cp = '\0'; - /* range */ - else if (*cp < ASCII_MIN || *cp > ASCII_MAX) - *cp = '?'; } /* Skip longest common prefix. */ - for (cp = path; *cp == *oldpath && *cp != '\0'; cp++, oldpath++); + for (cp = path; *cp == *oldpath; cp++, oldpath++) + if (*cp == '\0') + break; count = cp - path; diffcount = count - oldcount + OFFSET; @@ -188,22 +198,42 @@ main(argc, argv) err(1, "stdout"); while (*cp != '\0') { - if (*(cp + 1) == '\0') { - if (putchar(*cp) == EOF) - err(1, "stdout"); - break; - } - if ((code = BGINDEX(cp)) == (bg_t)-1) { - if (putchar(*cp++) == EOF || - putchar(*cp++) == EOF) - err(1, "stdout"); - } else { - /* Found, so mark byte with parity bit. */ + /* print *two* characters */ + + if ((code = BGINDEX(cp)) != (bg_t)-1) { + /* + * print *one* as bigram + * Found, so mark byte with + * parity bit. + */ if (putchar((code / 2) | PARITY) == EOF) err(1, "stdout"); cp += 2; } + + else { + for (i = 0; i < 2; i++) { + if (*cp == '\0') + break; + + /* print umlauts in file names */ + if (*cp < ASCII_MIN || + *cp > ASCII_MAX) { + if (putchar(UMLAUT) == EOF || + putchar(*cp++) == EOF) + err(1, "stdout"); + } + + else { + /* normal character */ + if(putchar(*cp++) == EOF) + err(1, "stdout"); + } + } + + } } + if (path == buf1) { /* swap pointers */ path = buf2; oldpath = buf1; diff --git a/usr.bin/locate/locate/fastfind.c b/usr.bin/locate/locate/fastfind.c index a83b660..c175c18 100644 --- a/usr.bin/locate/locate/fastfind.c +++ b/usr.bin/locate/locate/fastfind.c @@ -34,7 +34,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $Id: fastfind.c,v 1.1 1996/08/31 23:14:52 wosch Exp $ + * $Id: fastfind.c,v 1.2 1996/10/09 00:33:32 wosch Exp $ */ @@ -46,10 +46,10 @@ statistic (fp, path_fcodes) FILE *fp; /* open database */ char *path_fcodes; /* for error message */ { - register int lines, chars, size, big; + register int lines, chars, size, big, zwerg; register u_char *p, *s; register int c; - int count; + int count, umlaut; u_char bigram1[NBG], bigram2[NBG], path[MAXPATHLEN]; for (c = 0, p = bigram1, s = bigram2; c < NBG; c++) { @@ -57,20 +57,27 @@ statistic (fp, path_fcodes) s[c] = check_bigram_char(getc(fp)); } - lines = chars = big = 0; + lines = chars = big = zwerg = umlaut = 0; size = NBG + NBG; for (c = getc(fp), count = 0; c != EOF; size++) { if (c == SWITCH) { count += getwf(fp) - OFFSET; size += sizeof(int); + zwerg++; } else count += c - OFFSET; for (p = path + count; (c = getc(fp)) > SWITCH; size++) - if (c < PARITY) + if (c < PARITY) { + if (c == UMLAUT) { + c = getc(fp); + size++; + umlaut++; + } p++; - else { + } else { + /* bigram char */ big++; p += 2; } @@ -82,13 +89,16 @@ statistic (fp, path_fcodes) (void)printf("\nDatabase: %s\n", path_fcodes); (void)printf("Compression: Front: %2.2f%%, ", - (float)(100 * (size + big)) / chars); + (float)(100 * (size + big - (2 * NBG))) / chars); (void)printf("Bigram: %2.2f%%, ", (float)(100 * (size - big)) / size); - (void)printf("Total: %2.2f%%\n", (float)(100 * size) / chars); + (void)printf("Total: %2.2f%%\n", + (float)(100 * (size - (2 * NBG))) / chars); (void)printf("Filenames: %d, ", lines); - (void)printf("Chars: %d\n", chars); - (void)printf("Database size: %d, ", size); - (void)printf("Bigram chars: %d\n", big); + (void)printf("Characters: %d, ", chars); + (void)printf("Database size: %d\n", size); + (void)printf("Bigram characters: %d, ", big); + (void)printf("Integers: %d, ", zwerg); + (void)printf("8-Bit characters: %d\n", umlaut); } #endif /* _LOCATE_STATISTIC_ */ @@ -102,7 +112,7 @@ void fastfind_mmap_icase #else 
fastfind_mmap -#endif +#endif /* FF_ICASE */ (pathpart, paddr, len, database) char *pathpart; /* search string */ caddr_t paddr; /* mmap pointer */ @@ -115,7 +125,7 @@ fastfind_mmap #ifdef FF_ICASE fastfind_icase -#else /* !FF_ICASE */ +#else fastfind #endif /* FF_ICASE */ @@ -136,10 +146,10 @@ fastfind #ifdef FF_ICASE /* use a lookup table for case insensitive search */ - u_char table[UCHAR_MAX]; + u_char table[UCHAR_MAX + 1]; tolower_word(pathpart); -#endif +#endif /* FF_ICASE*/ /* init bigram table */ #ifdef FF_MMAP @@ -157,7 +167,7 @@ fastfind p[c] = check_bigram_char(getc(fp)); s[c] = check_bigram_char(getc(fp)); } -#endif +#endif /* FF_MMAP */ /* find optimal (last) char for searching */ for (p = pathpart; *p != '\0'; p++) @@ -177,7 +187,7 @@ fastfind /* set patend char to true */ table[TOLOWER(*patend)] = 1; table[toupper(*patend)] = 1; -#endif +#endif /* FF_ICASE */ /* main loop */ @@ -185,10 +195,12 @@ fastfind foundchar = 0; #ifdef FF_MMAP - for (c = (u_char)*paddr++; len-- > 0; ) { + c = (u_char)*paddr++; len--; + for (; len > 0; ) { #else - for (c = getc(fp); c != EOF; ) { -#endif + c = getc(fp); + for (; c != EOF; ) { +#endif /* FF_MMAP */ /* go forward or backward */ if (c == SWITCH) { /* big step, an integer */ @@ -197,7 +209,7 @@ fastfind len -= INTSIZE; paddr += INTSIZE; #else count += getwf(fp) - OFFSET; -#endif +#endif /* FF_MMAP */ } else { /* slow step, =< 14 chars */ count += c - OFFSET; } @@ -205,18 +217,40 @@ fastfind /* overlay old path */ p = path + count; foundchar = p - 1; + + for (;;) { #ifdef FF_MMAP - for (; (c = (u_char)*paddr++) > SWITCH; len--) + c = (u_char)*paddr++; + len--; #else - for (; (c = getc(fp)) > SWITCH; ) -#endif - + c = getc(fp); +#endif /* FF_MMAP */ + /* + * == UMLAUT: 8 bit char followed + * <= SWITCH: offset + * >= PARITY: bigram + * rest: single ascii char + * + * offset < SWITCH < UMLAUT < ascii < PARITY < bigram + */ if (c < PARITY) { + if (c <= UMLAUT) { + if (c == UMLAUT) { +#ifdef FF_MMAP + c = 
(u_char)*paddr++; + len--; +#else + c = getc(fp); +#endif /* FF_MMAP */ + + } else + break; /* SWITCH */ + } #ifdef FF_ICASE if (table[c]) #else if (c == cc) -#endif +#endif /* FF_ICASE */ foundchar = p; *p++ = c; } @@ -231,13 +265,13 @@ fastfind if (table[bigram1[c]] || table[bigram2[c]]) -#endif +#endif /* FF_ICASE */ foundchar = p + 1; *p++ = bigram1[c]; *p++ = bigram2[c]; } - + } if (found) { /* previous line matched */ cutoff = path; @@ -254,14 +288,14 @@ fastfind if (*s == cc #ifdef FF_ICASE || TOLOWER(*s) == cc -#endif +#endif /* FF_ICASE */ ) { /* fast first char check */ for (p = patend - 1, q = s - 1; *p != '\0'; p--, q--) if (*q != *p #ifdef FF_ICASE && TOLOWER(*q) != *p -#endif +#endif /* FF_ICASE */ ) break; if (*p == '\0') { /* fast match success */ diff --git a/usr.bin/locate/locate/locate.1 b/usr.bin/locate/locate/locate.1 index 1c19977..9d29c71 100644 --- a/usr.bin/locate/locate/locate.1 +++ b/usr.bin/locate/locate/locate.1 @@ -31,7 +31,7 @@ .\" SUCH DAMAGE. .\" .\" @(#)locate.1 8.1 (Berkeley) 6/6/93 -.\" $Id$ +.\" $Id: locate.1,v 1.4 1996/08/31 23:14:52 wosch Exp $ .\" .Dd June 6, 1993 .Dt LOCATE 1 @@ -66,6 +66,12 @@ including slashes (``/''). As a special case, a pattern containing no globbing characters (``foo'') is matched as though it were ``*foo*''. +Historically, locate store only characters between 32 and 127. The +current implementation store any character except newline ('\\n') and +NUL ('\\0'). The 8-bit character support don't wast extra space for +plain ASCII file names. Characters less than 32 or greather than 127 +are stored in 2 bytes. + The following options are available: .Bl -tag -width 10n indent .It Fl S @@ -200,7 +206,8 @@ to share the databases between machines with different byte order. The current .Nm implementation understand databases in host byte order or -network byte order. So you can read on a FreeBSD/i386 machine +network byte order if both architectures use the same integer size. 
+So you can read on a FreeBSD/i386 machine (little endian) a locate database which was built on SunOS/sparc machine (big endian, net). diff --git a/usr.bin/locate/locate/locate.c b/usr.bin/locate/locate/locate.c index c983ac5..0de5df0 100644 --- a/usr.bin/locate/locate/locate.c +++ b/usr.bin/locate/locate/locate.c @@ -34,7 +34,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $Id: locate.c,v 1.4 1996/08/31 23:14:53 wosch Exp $ + * $Id: locate.c,v 1.5 1996/09/16 01:17:25 wosch Exp $ */ #ifndef lint @@ -60,6 +60,7 @@ static char sccsid[] = "@(#)locate.c 8.1 (Berkeley) 6/6/93"; * * 0-28 likeliest differential counts + offset to make nonnegative * 30 switch code for out-of-range count to follow in next word + * 31 an 8 bit char followed * 128-255 bigram codes (128 most common, as determined by 'updatedb') * 32-127 single character (printable) ascii residue (ie, literal) * @@ -76,19 +77,22 @@ static char sccsid[] = "@(#)locate.c 8.1 (Berkeley) 6/6/93"; */ #include <sys/param.h> +#include <ctype.h> +#include <err.h> #include <fnmatch.h> -#include <unistd.h> +#include <locale.h> #include <stdio.h> -#include <string.h> #include <stdlib.h> -#include <ctype.h> +#include <string.h> +#include <unistd.h> + #ifdef MMAP # include <sys/types.h> # include <sys/stat.h> # include <sys/mman.h> # include <fcntl.h> #endif -#include <err.h> + #ifdef sun #include <netinet/in.h> /* SunOS byteorder(3) htohl(3) */ @@ -148,6 +152,7 @@ main(argc, argv) #ifdef MMAP f_mmap = 1; /* mmap is default */ #endif + (void) setlocale(LC_ALL, ""); while ((ch = getopt(argc, argv, "Scd:il:ms")) != EOF) switch(ch) { @@ -198,7 +203,7 @@ main(argc, argv) } if (f_icase && UCHAR_MAX < 4096) /* init tolower lookup table */ - for (ch = 0; ch <= UCHAR_MAX; ch++) + for (ch = 0; ch < UCHAR_MAX + 1; ch++) myctype[ch] = tolower(ch); /* foreach database ... 
*/ diff --git a/usr.bin/locate/locate/locate.h b/usr.bin/locate/locate/locate.h index cb7d087..9e997d4 100644 --- a/usr.bin/locate/locate/locate.h +++ b/usr.bin/locate/locate/locate.h @@ -1,4 +1,5 @@ /* + * Copyright (c) 1995 Wolfram Schneider <wosch@FreeBSD.org>. Berlin. * Copyright (c) 1989, 1993 * The Regents of the University of California. All rights reserved. * @@ -31,6 +32,7 @@ * SUCH DAMAGE. * * @(#)locate.h 8.1 (Berkeley) 6/6/93 + * $Id$ */ /* Symbolic constants shared by locate.c and code.c */ @@ -39,6 +41,7 @@ #define OFFSET 14 /* abs value of max likely diff */ #define PARITY 0200 /* parity bit */ #define SWITCH 30 /* switch code */ +#define UMLAUT 31 /* an 8 bit char followed */ /* 0-28 likeliest differential counts + offset to make nonnegative */ #define LDC_MIN 0 |