diff options
author | wosch <wosch@FreeBSD.org> | 1996-08-14 00:22:31 +0000 |
---|---|---|
committer | wosch <wosch@FreeBSD.org> | 1996-08-14 00:22:31 +0000 |
commit | 9c0ad6a2b3f4178ace4292cdb66f55997eddf45c (patch) | |
tree | 5ee2c3ccbe51dc57abaaffc3dd772e05c9efab8c /usr.bin/locate/bigram | |
parent | 521551a94c26abb72ce3538eb54e226c481f5dad (diff) | |
download | FreeBSD-src-9c0ad6a2b3f4178ace4292cdb66f55997eddf45c.zip FreeBSD-src-9c0ad6a2b3f4178ace4292cdb66f55997eddf45c.tar.gz |
bigram
Bigram does not remove the newline at the end of each filename. This
particularly breaks the bigram algorithm, and /var/db/locate.database
grows by 15%.
Bigram does not check for characters outside 32-127.
The bigram output is silly and needs ~1/2 of the CPU time of
database rebuilding.
old:
locate.bigram < $filelist | sort | uniq -c | sort -nr
^^^^^^^^^^^^^^
this can easily be done by bigram itself
new:
bigram < $filelist | sort -nr
code
Code does not check for char 31.
Use a lookup array instead of a function. 3x faster.
updatedb
rewritten
sync with bigram changes
read config file /etc/locate.rc if it exists
submitted by: guido@gvr.win.tue.nl (Guido van Rooij)
concatdb - concatenate locate databases
mklocatedb - build locate database
Diffstat (limited to 'usr.bin/locate/bigram')
-rw-r--r-- | usr.bin/locate/bigram/Makefile | 4 | ||||
-rw-r--r-- | usr.bin/locate/bigram/locate.bigram.c | 55 |
2 files changed, 47 insertions, 12 deletions
diff --git a/usr.bin/locate/bigram/Makefile b/usr.bin/locate/bigram/Makefile index d7d4348..fbba14d 100644 --- a/usr.bin/locate/bigram/Makefile +++ b/usr.bin/locate/bigram/Makefile @@ -2,6 +2,8 @@ PROG= locate.bigram NOMAN= noman -BINDIR= /usr/libexec +BINDIR= ${LIBEXECDIR} +CFLAGS+= -I${.CURDIR}/../locate +.include "../Makefile.inc" .include <bsd.prog.mk> diff --git a/usr.bin/locate/bigram/locate.bigram.c b/usr.bin/locate/bigram/locate.bigram.c index 149e437..dc95399 100644 --- a/usr.bin/locate/bigram/locate.bigram.c +++ b/usr.bin/locate/bigram/locate.bigram.c @@ -53,32 +53,65 @@ static char sccsid[] = "@(#)locate.bigram.c 8.1 (Berkeley) 6/6/93"; #include <stdio.h> #include <sys/param.h> /* for MAXPATHLEN */ +#include <string.h> /* memchr */ +#include "locate.h" -char buf1[MAXPATHLEN] = " "; -char buf2[MAXPATHLEN]; +u_char buf1[MAXPATHLEN] = " "; +u_char buf2[MAXPATHLEN]; +unsigned int bigram[UCHAR_MAX][UCHAR_MAX]; -main ( ) + +void main ( ) { - register char *cp; - register char *oldpath = buf1, *path = buf2; + register u_char *cp; + register u_char *oldpath = buf1, *path = buf2; + register int i, j; + + /* init bigram buffer */ + for (i = 0; i < UCHAR_MAX; i++) + for (j = 0; j < UCHAR_MAX; j++) + bigram[i][j] = 0; while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) { + /* skip empty lines */ + if (*path == '\n') + continue; + + /* Squelch characters that would botch the decoding. 
*/ + for (cp = path; *cp != NULL; cp++) { + /* chop newline */ + if (*cp == '\n') + *cp = NULL; + /* range */ + else if (*cp < ASCII_MIN || *cp > ASCII_MAX) + *cp = '?'; + } + + /* skip longest common prefix */ - for ( cp = path; *cp == *oldpath; cp++, oldpath++ ) - if ( *oldpath == NULL ) - break; + for (cp = path; *cp == *oldpath && *cp; cp++, oldpath++); + /* * output post-residue bigrams only */ + + /* check later for boundary */ while ( *cp != NULL && *(cp + 1) != NULL ) { - putchar ( *cp++ ); - putchar ( *cp++ ); - putchar ( '\n' ); + bigram[*cp][*(cp+1)]++; + cp += 2; } + if ( path == buf1 ) /* swap pointers */ path = buf2, oldpath = buf1; else path = buf1, oldpath = buf2; } + + /* output, boundary check */ + for (i = ASCII_MIN; i <= ASCII_MAX; i++) + for (j = ASCII_MIN; j <= ASCII_MAX; j++) + if (bigram[i][j] != 0) + fprintf(stdout, "%4d %c%c\n", + bigram[i][j], i, j); } |