bigram

Bigram does not remove the newline at the end of each filename. This in particular breaks the bigram algorithm and makes /var/db/locate.database grow by 15%. Bigram does not check for characters outside the range 32-127. The bigram output is silly and needs ~1/2 of the CPU time of a database rebuild. old: locate.bigram < $filelist | sort | uniq -c | sort -nr (the `sort | uniq -c` stage can easily be done by bigram itself) new: bigram < $filelist | sort -nr code: Code does not check for char 31. Use a lookup array instead of a function; 3x faster. updatedb: rewritten; synced with bigram changes; reads config file /etc/locate.rc if it exists; submitted by: guido@gvr.win.tue.nl (Guido van Rooij). concatdb - concatenate locate databases. mklocatedb - build locate database.
author: wosch <wosch@FreeBSD.org> 1996-08-14 00:22:31 +0000
committer: wosch <wosch@FreeBSD.org> 1996-08-14 00:22:31 +0000
commit: 9c0ad6a2b3f4178ace4292cdb66f55997eddf45c (patch)
tree: 5ee2c3ccbe51dc57abaaffc3dd772e05c9efab8c /usr.bin/locate/bigram
parent: 521551a94c26abb72ce3538eb54e226c481f5dad (diff)
download: FreeBSD-src-9c0ad6a2b3f4178ace4292cdb66f55997eddf45c.zip
FreeBSD-src-9c0ad6a2b3f4178ace4292cdb66f55997eddf45c.tar.gz
2 files changed, 47 insertions, 12 deletions
diff --git a/usr.bin/locate/bigram/Makefile b/usr.bin/locate/bigram/Makefile
index d7d4348..fbba14d 100644
--- a/usr.bin/locate/bigram/Makefile
+++ b/usr.bin/locate/bigram/Makefile
@@ -2,6 +2,8 @@
 
 PROG=	locate.bigram
 NOMAN=	noman
-BINDIR=	/usr/libexec
+BINDIR=	${LIBEXECDIR}
+CFLAGS+= -I${.CURDIR}/../locate
 
+.include "../Makefile.inc"
 .include <bsd.prog.mk>
diff --git a/usr.bin/locate/bigram/locate.bigram.c b/usr.bin/locate/bigram/locate.bigram.c
index 149e437..dc95399 100644
--- a/usr.bin/locate/bigram/locate.bigram.c
+++ b/usr.bin/locate/bigram/locate.bigram.c
@@ -53,32 +53,65 @@ static char sccsid[] = "@(#)locate.bigram.c	8.1 (Berkeley) 6/6/93";
 
 #include <stdio.h>
 #include <sys/param.h>			/* for MAXPATHLEN */
+#include <string.h>			/* memchr */
+#include "locate.h"
 
-char buf1[MAXPATHLEN] = " ";
-char buf2[MAXPATHLEN];
+u_char buf1[MAXPATHLEN] = " ";
+u_char buf2[MAXPATHLEN];
+unsigned int bigram[UCHAR_MAX][UCHAR_MAX];
 
-main ( )
+
+void main ( )
 {
-  	register char *cp;
-	register char *oldpath = buf1, *path = buf2;
+  	register u_char *cp;
+	register u_char *oldpath = buf1, *path = buf2;
+	register int i, j;
+
+	/* init bigram buffer */
+	for (i = 0; i < UCHAR_MAX; i++)
+	    	for (j = 0; j < UCHAR_MAX; j++)
+			bigram[i][j] = 0;
 
      	while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) {
 
+	    	/* skip empty lines */
+		if (*path == '\n')
+			continue;
+
+		/* Squelch characters that would botch the decoding. */
+		for (cp = path; *cp != NULL; cp++) {
+			/* chop newline */
+			if (*cp == '\n')
+				*cp = NULL;
+			/* range */
+			else if (*cp < ASCII_MIN || *cp > ASCII_MAX)
+				*cp = '?';
+		}
+
+
 		/* skip longest common prefix */
-		for ( cp = path; *cp == *oldpath; cp++, oldpath++ )
-			if ( *oldpath == NULL )
-				break;
+		for (cp = path; *cp == *oldpath && *cp; cp++, oldpath++);
+
 		/*
 		 * output post-residue bigrams only
 		 */
+
+		/* check later for boundary */
 		while ( *cp != NULL && *(cp + 1) != NULL ) {
-			putchar ( *cp++ );
-			putchar ( *cp++ );
-			putchar ( '\n' );
+			bigram[*cp][*(cp+1)]++;
+			cp += 2;
 		}
+
 		if ( path == buf1 )		/* swap pointers */
 			path = buf2, oldpath = buf1;
 		else
 			path = buf1, oldpath = buf2;
    	}
+
+	/* output, boundary check */
+	for (i = ASCII_MIN; i <= ASCII_MAX; i++)
+		for (j = ASCII_MIN; j <= ASCII_MAX; j++)
+			if (bigram[i][j] != 0)
+				fprintf(stdout, "%4d %c%c\n",
+					bigram[i][j], i, j);
 }
author	wosch <wosch@FreeBSD.org>	1996-08-14 00:22:31 +0000
committer	wosch <wosch@FreeBSD.org>	1996-08-14 00:22:31 +0000
commit	9c0ad6a2b3f4178ace4292cdb66f55997eddf45c (patch)
tree	5ee2c3ccbe51dc57abaaffc3dd772e05c9efab8c /usr.bin/locate/bigram
parent	521551a94c26abb72ce3538eb54e226c481f5dad (diff)
download	FreeBSD-src-9c0ad6a2b3f4178ace4292cdb66f55997eddf45c.zip FreeBSD-src-9c0ad6a2b3f4178ace4292cdb66f55997eddf45c.tar.gz