diff options
author | wosch <wosch@FreeBSD.org> | 1996-08-14 00:22:31 +0000 |
---|---|---|
committer | wosch <wosch@FreeBSD.org> | 1996-08-14 00:22:31 +0000 |
commit | 9c0ad6a2b3f4178ace4292cdb66f55997eddf45c (patch) | |
tree | 5ee2c3ccbe51dc57abaaffc3dd772e05c9efab8c /usr.bin/locate | |
parent | 521551a94c26abb72ce3538eb54e226c481f5dad (diff) | |
download | FreeBSD-src-9c0ad6a2b3f4178ace4292cdb66f55997eddf45c.zip FreeBSD-src-9c0ad6a2b3f4178ace4292cdb66f55997eddf45c.tar.gz |
bigram
Bigram does not remove the newline at the end of a filename. This
breaks the bigram algorithm in particular, and /var/db/locate.database
grows by 15 %.
Bigram does not check for characters outside 32-127.
The bigram output is silly and needs ~1/2 of the CPU time of
a database rebuild.
old:
locate.bigram < $filelist | sort | uniq -c | sort -nr
^^^^^^^^^^^^^^
this can easily be done by bigram itself
new:
bigram < $filelist | sort -nr
code
Code does not check for char 31.
Use a lookup array instead of a function. 3 x faster.
updatedb
rewritten
sync with bigram changes
read config file /etc/locate.rc if it exists
submitted by: guido@gvr.win.tue.nl (Guido van Rooij)
concatdb - concatenate locate databases
mklocatedb - build locate database
Diffstat (limited to 'usr.bin/locate')
-rw-r--r-- | usr.bin/locate/Makefile | 1 | ||||
-rw-r--r-- | usr.bin/locate/Makefile.inc | 3 | ||||
-rw-r--r-- | usr.bin/locate/bigram/Makefile | 4 | ||||
-rw-r--r-- | usr.bin/locate/bigram/locate.bigram.c | 55 | ||||
-rw-r--r-- | usr.bin/locate/code/Makefile | 5 | ||||
-rw-r--r-- | usr.bin/locate/code/locate.code.c | 58 | ||||
-rw-r--r-- | usr.bin/locate/locate/Makefile | 12 | ||||
-rw-r--r-- | usr.bin/locate/locate/concatdb.sh | 49 | ||||
-rw-r--r-- | usr.bin/locate/locate/locate.h | 26 | ||||
-rw-r--r-- | usr.bin/locate/locate/locate.rc | 23 | ||||
-rw-r--r-- | usr.bin/locate/locate/mklocatedb.sh | 52 | ||||
-rw-r--r-- | usr.bin/locate/locate/updatedb.sh | 117 |
12 files changed, 307 insertions, 98 deletions
diff --git a/usr.bin/locate/Makefile b/usr.bin/locate/Makefile index bc55dab..45f82b9 100644 --- a/usr.bin/locate/Makefile +++ b/usr.bin/locate/Makefile @@ -1,4 +1,5 @@ # @(#)Makefile 8.1 (Berkeley) 6/6/93 +# $Id$ SUBDIR= bigram code locate diff --git a/usr.bin/locate/Makefile.inc b/usr.bin/locate/Makefile.inc new file mode 100644 index 0000000..0f80876 --- /dev/null +++ b/usr.bin/locate/Makefile.inc @@ -0,0 +1,3 @@ +# $Id$ + +LIBEXECDIR?= /usr/libexec diff --git a/usr.bin/locate/bigram/Makefile b/usr.bin/locate/bigram/Makefile index d7d4348..fbba14d 100644 --- a/usr.bin/locate/bigram/Makefile +++ b/usr.bin/locate/bigram/Makefile @@ -2,6 +2,8 @@ PROG= locate.bigram NOMAN= noman -BINDIR= /usr/libexec +BINDIR= ${LIBEXECDIR} +CFLAGS+= -I${.CURDIR}/../locate +.include "../Makefile.inc" .include <bsd.prog.mk> diff --git a/usr.bin/locate/bigram/locate.bigram.c b/usr.bin/locate/bigram/locate.bigram.c index 149e437..dc95399 100644 --- a/usr.bin/locate/bigram/locate.bigram.c +++ b/usr.bin/locate/bigram/locate.bigram.c @@ -53,32 +53,65 @@ static char sccsid[] = "@(#)locate.bigram.c 8.1 (Berkeley) 6/6/93"; #include <stdio.h> #include <sys/param.h> /* for MAXPATHLEN */ +#include <string.h> /* memchr */ +#include "locate.h" -char buf1[MAXPATHLEN] = " "; -char buf2[MAXPATHLEN]; +u_char buf1[MAXPATHLEN] = " "; +u_char buf2[MAXPATHLEN]; +unsigned int bigram[UCHAR_MAX][UCHAR_MAX]; -main ( ) + +void main ( ) { - register char *cp; - register char *oldpath = buf1, *path = buf2; + register u_char *cp; + register u_char *oldpath = buf1, *path = buf2; + register int i, j; + + /* init bigram buffer */ + for (i = 0; i < UCHAR_MAX; i++) + for (j = 0; j < UCHAR_MAX; j++) + bigram[i][j] = 0; while ( fgets ( path, sizeof(buf2), stdin ) != NULL ) { + /* skip empty lines */ + if (*path == '\n') + continue; + + /* Squelch characters that would botch the decoding. 
*/ + for (cp = path; *cp != NULL; cp++) { + /* chop newline */ + if (*cp == '\n') + *cp = NULL; + /* range */ + else if (*cp < ASCII_MIN || *cp > ASCII_MAX) + *cp = '?'; + } + + /* skip longest common prefix */ - for ( cp = path; *cp == *oldpath; cp++, oldpath++ ) - if ( *oldpath == NULL ) - break; + for (cp = path; *cp == *oldpath && *cp; cp++, oldpath++); + /* * output post-residue bigrams only */ + + /* check later for boundary */ while ( *cp != NULL && *(cp + 1) != NULL ) { - putchar ( *cp++ ); - putchar ( *cp++ ); - putchar ( '\n' ); + bigram[*cp][*(cp+1)]++; + cp += 2; } + if ( path == buf1 ) /* swap pointers */ path = buf2, oldpath = buf1; else path = buf1, oldpath = buf2; } + + /* output, boundary check */ + for (i = ASCII_MIN; i <= ASCII_MAX; i++) + for (j = ASCII_MIN; j <= ASCII_MAX; j++) + if (bigram[i][j] != 0) + fprintf(stdout, "%4d %c%c\n", + bigram[i][j], i, j); } diff --git a/usr.bin/locate/code/Makefile b/usr.bin/locate/code/Makefile index 743e968..a7d8e80 100644 --- a/usr.bin/locate/code/Makefile +++ b/usr.bin/locate/code/Makefile @@ -1,8 +1,9 @@ # @(#)Makefile 8.1 (Berkeley) 6/6/93 PROG= locate.code -CFLAGS+=-I${.CURDIR}/../locate +CFLAGS+=-I${.CURDIR}/../locate NOMAN= noman -BINDIR= /usr/libexec +BINDIR= ${LIBEXECDIR} +.include "../Makefile.inc" .include <bsd.prog.mk> diff --git a/usr.bin/locate/code/locate.code.c b/usr.bin/locate/code/locate.code.c index a7506ec..60be32a 100644 --- a/usr.bin/locate/code/locate.code.c +++ b/usr.bin/locate/code/locate.code.c @@ -89,25 +89,38 @@ static char sccsid[] = "@(#)locate.code.c 8.1 (Berkeley) 6/6/93"; #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */ -char buf1[MAXPATHLEN + 1] = " "; -char buf2[MAXPATHLEN + 1]; +u_char buf1[MAXPATHLEN] = " "; +u_char buf2[MAXPATHLEN]; char bigrams[BGBUFSIZE + 1] = { 0 }; +#define LOOKUP 1 +#ifdef LOOKUP +#define BGINDEX(x) (big[(u_int)*x][(u_int)*(x+1)]) +typedef u_char bg_t; +bg_t big[UCHAR_MAX][UCHAR_MAX]; + +#else +#define BGINDEX(x) bgindex(x) +typedef int bg_t; 
+#endif + int bgindex __P((char *)); void usage __P((void)); +extern int optind; +extern int optopt; int main(argc, argv) int argc; char *argv[]; { - register char *cp, *oldpath, *path; + register u_char *cp, *oldpath, *path; int ch, code, count, diffcount, oldcount; FILE *fp; + register int i, j; while ((ch = getopt(argc, argv, "")) != EOF) switch(ch) { - case '?': default: usage(); } @@ -126,27 +139,38 @@ main(argc, argv) err(1, "stdout"); (void)fclose(fp); +#ifdef LOOKUP + /* init lookup table */ + for (i = 0; i < UCHAR_MAX; i++) + for (j = 0; j < UCHAR_MAX; j++) + big[i][j] = (bg_t)-1; + + for (cp = bigrams, i = 0; *cp != NULL; i += 2, cp += 2) + big[(int)*cp][(int)*(cp + 1)] = (bg_t)i; +#endif + oldpath = buf1; path = buf2; oldcount = 0; - while (fgets(path, sizeof(buf2) - 1, stdin) != NULL) { - /* Truncate newline. */ - cp = path + strlen(path) - 1; - if (cp > path && *cp == '\n') - *cp = '\0'; + while (fgets(path, sizeof(buf2), stdin) != NULL) { + + /* skip empty lines */ + if (*path == '\n') + continue; /* Squelch characters that would botch the decoding. */ for (cp = path; *cp != NULL; cp++) { - if ((u_char)*cp >= PARITY) - *cp &= PARITY-1; - if (*cp <= SWITCH) + /* chop newline */ + if (*cp == '\n') + *cp = NULL; + /* range */ + else if (*cp < ASCII_MIN || *cp > ASCII_MAX) *cp = '?'; } /* Skip longest common prefix. */ - for (cp = path; *cp == *oldpath; cp++, oldpath++) - if (*oldpath == NULL) - break; + for (cp = path; *cp == *oldpath && *cp; cp++, oldpath++); + count = cp - path; diffcount = count - oldcount + OFFSET; oldcount = count; @@ -164,7 +188,7 @@ main(argc, argv) err(1, "stdout"); break; } - if ((code = bgindex(cp)) < 0) { + if ((code = BGINDEX(cp)) == (bg_t)-1) { if (putchar(*cp++) == EOF || putchar(*cp++) == EOF) err(1, "stdout"); @@ -189,6 +213,7 @@ main(argc, argv) exit(0); } +#ifndef LOOKUP int bgindex(bg) /* Return location of bg in bigrams or -1. */ char *bg; @@ -202,6 +227,7 @@ bgindex(bg) /* Return location of bg in bigrams or -1. 
*/ break; return (*p == NULL ? -1 : --p - bigrams); } +#endif /* !LOOKUP */ void usage() diff --git a/usr.bin/locate/locate/Makefile b/usr.bin/locate/locate/Makefile index fd1bae8..e4ebc2f 100644 --- a/usr.bin/locate/locate/Makefile +++ b/usr.bin/locate/locate/Makefile @@ -1,12 +1,22 @@ # @(#)Makefile 8.1 (Berkeley) 6/6/93 +# $Id: Makefile,v 1.3 1996/04/25 15:54:22 wosch Exp wosch $ PROG= locate MAN1= locate.1 MAN8= locate.updatedb.8 +SCRIPTS= updatedb mklocatedb concatdb +MLINKS+= locate.updatedb.8 updatedb.8 beforeinstall: +.for script in ${SCRIPTS} ${INSTALL} -c -o ${BINOWN} -g ${BINGRP} -m ${BINMODE} \ - ${.CURDIR}/updatedb.sh ${DESTDIR}/usr/libexec/locate.updatedb + ${.CURDIR}/${script}.sh ${DESTDIR}${LIBEXECDIR}/locate.${script} +.endfor + +# only /usr/src/etc/Makefile install files in /etc +# ${INSTALL} -c -o root -g wheel -m 644 \ +# ${.CURDIR}/locate.rc ${DESTDIR}/etc .include "../../Makefile.inc" +.include "../Makefile.inc" .include <bsd.prog.mk> diff --git a/usr.bin/locate/locate/concatdb.sh b/usr.bin/locate/locate/concatdb.sh new file mode 100644 index 0000000..7b1aafd --- /dev/null +++ b/usr.bin/locate/locate/concatdb.sh @@ -0,0 +1,49 @@ +#!/bin/sh +# +# (c) Wolfram Schneider, Berlin. September 1995. Public domain. +# +# concatdb - concatenate locate databases +# +# usage: concatdb database1 ... databaseN > newdb +# +# Sequence of databases is important. +# +# $Id: concatdb.sh,v 1.2 1996/04/20 21:55:21 wosch Exp wosch $ + +# The directory containing locate subprograms +: ${LIBEXECDIR=/usr/libexec}; export LIBEXECDIR + +PATH=$LIBEXECDIR:/bin:/usr/bin:$PATH; export PATH + +umask 077 # protect temp files + +: ${TMPDIR=/tmp}; export TMPDIR; +if test X"$TMPDIR" = X -o ! -d "$TMPDIR"; then + TMPDIR=/tmp; export TMPDIR +fi + +# utilities to built locate database +: ${bigram=locate.bigram} +: ${code=locate.code} +: ${sort=sort} + + +case $# in + [01]) echo 'usage: concatdb databases1 ... 
databaseN > newdb' + exit 1 + ;; +esac + + +bigrams=$TMPDIR/_concatdb$$.bigrams +trap 'rm -f $bigrams' 0 1 2 3 5 10 15 + +for db +do + $locate -d $db / +done | $bigram | $sort -nr | awk 'NR <= 128 { printf $2 }' > $bigrams + +for db +do + $locate -d $db / +done | $code $bigrams diff --git a/usr.bin/locate/locate/locate.h b/usr.bin/locate/locate/locate.h index fe4da28..c3a7845 100644 --- a/usr.bin/locate/locate/locate.h +++ b/usr.bin/locate/locate/locate.h @@ -39,3 +39,29 @@ #define OFFSET 14 /* abs value of max likely diff */ #define PARITY 0200 /* parity bit */ #define SWITCH 30 /* switch code */ + +/* 0-28 likeliest differential counts + offset to make nonnegative */ +#define LDC_MIN 0 +#define LDC_MAX 28 + +/* 128-255 bigram codes (128 most common, as determined by 'updatedb') */ +#define BIGRAM_MIN (UCHAR_MAX - CHAR_MAX) +#define BIGRAM_MAX UCHAR_MAX + +/* 32-127 single character (printable) ascii residue (ie, literal) */ +#define ASCII_MIN 32 +#define ASCII_MAX CHAR_MAX + +/* #define TO7BIT(x) (x = ( ((u_char)x) & CHAR_MAX )) */ +#define TO7BIT(x) (x = x & CHAR_MAX ) + + +#if UCHAR_MAX >= 4096 + define TOLOWER(ch) tolower(ch) +#else + +u_char myctype[UCHAR_MAX + 1]; +#define TOLOWER(ch) (myctype[ch]) +#endif + +#define INTSIZE (sizeof(int)) diff --git a/usr.bin/locate/locate/locate.rc b/usr.bin/locate/locate/locate.rc new file mode 100644 index 0000000..4b52669 --- /dev/null +++ b/usr.bin/locate/locate/locate.rc @@ -0,0 +1,23 @@ +# +# /etc/locate.rc - command script for updatedb(8) +# +# $Id: locate.rc,v 1.1 1996/04/26 15:25:23 wosch Exp wosch $ + +# temp directory +#TMPDIR="/tmp" + +# the actual database +#FCODES="/var/db/locate.database" + +# directories to be put in the database +#SEARCHPATHS="/" + +# directories unwanted in output +#PRUNEPATHS="/tmp /usr/tmp /var/tmp" + +# filesystems allowed. 
Beware: a non-listed filesystem will be pruned +# and is the SEARCHPATHS starts in such a filesystem locate will build +# an empty database +# +# be carefully if you add 'nfs' +#FILESYSTEMS="ufs" diff --git a/usr.bin/locate/locate/mklocatedb.sh b/usr.bin/locate/locate/mklocatedb.sh new file mode 100644 index 0000000..52555a0 --- /dev/null +++ b/usr.bin/locate/locate/mklocatedb.sh @@ -0,0 +1,52 @@ +#!/bin/sh +# +# (c) Wolfram Schneider, September 1995. Public domain. +# +# mklocatedb - build locate database +# +# usage: mklocatedb [-presort] < filelist > database +# +# $Id: mklocatedb.sh,v 1.2 1996/04/20 21:55:21 wosch Exp wosch $ + + +# The directory containing locate subprograms +: ${LIBEXECDIR=/usr/libexec}; export LIBEXECDIR + +PATH=$LIBEXECDIR:/bin:/usr/bin:$PATH; export PATH + +umask 077 # protect temp files + +: ${TMPDIR=/tmp}; export TMPDIR; +if test X"$TMPDIR" = X -o ! -d "$TMPDIR"; then + TMPDIR=/tmp; export TMPDIR +fi + +# utilities to built locate database +: ${bigram=locate.bigram} +: ${code=locate.code} +: ${sort=sort} + + +sortopt="-u -T $TMPDIR" +sortcmd=$sort + +# Input already sorted +case X"$1" in + X-nosort|X-presort) sortcmd=cat; sortopt=;shift;; +esac + + +bigrams=$TMPDIR/_mklocatedb$$.bigrams +filelist=$TMPDIR/_mklocatedb$$.list + +trap 'rm -f $bigrams $filelist' 0 1 2 3 5 10 15 + + +if $sortcmd $sortopt > $filelist; then + $bigram < $filelist | $sort -nr | + awk 'NR <= 128 { printf $2 }' > $bigrams && + $code $bigrams < $filelist +else + echo "`basename $0`: cannot build locate database" >&2 + exit 1 +fi diff --git a/usr.bin/locate/locate/updatedb.sh b/usr.bin/locate/locate/updatedb.sh index af9eb47..02d303e 100644 --- a/usr.bin/locate/locate/updatedb.sh +++ b/usr.bin/locate/locate/updatedb.sh @@ -1,79 +1,62 @@ #!/bin/sh # -# Copyright (c) 1989, 1993 -# The Regents of the University of California. All rights reserved. +# (c) Wolfram Schneider, Berlin. September 1995. Public domain. 
# -# This code is derived from software contributed to Berkeley by -# James A. Woods. -# -# Modified to be a /bin/sh script by Nate Williams -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# 1. Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# 2. Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# 3. All advertising materials mentioning features or use of this software -# must display the following acknowledgement: -# This product includes software developed by the University of -# California, Berkeley and its contributors. -# 4. Neither the name of the University nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND -# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS -# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT -# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY -# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF -# SUCH DAMAGE. 
-# -# @(#)updatedb.csh 8.3 (Berkeley) 3/19/94 +# updatedb - update locate database for local mounted filesystems # +# $Id: updatedb.sh,v 1.3 1996/04/20 21:55:21 wosch Exp wosch $ -SRCHPATHS="/" # directories to be put in the database -LIBDIR="/usr/libexec" # for subprograms -FCODES="/var/db/locate.database" # the database -if [ "$TMPDIR" = "" ]; then - TMPDIR="/var/tmp" # for temp files +LOCATE_CONFIG="/etc/locate.rc" +if [ -f "$LOCATE_CONFIG" -a -r "$LOCATE_CONFIG" ]; then + . $LOCATE_CONFIG fi -PATH=/bin:/usr/bin -BIGRAMS="$TMPDIR/locate.bigrams.$$" -FILELIST="$TMPDIR/locate.list.$$" -ERRS="$TMPDIR/locate.errs.$$" +# The directory containing locate subprograms +: ${LIBEXECDIR=/usr/libexec}; export LIBEXECDIR + +PATH=$LIBEXECDIR:/bin:/usr/bin:$PATH; export PATH + + +: ${mklocatedb=locate.mklocatedb} # make locate database program +: ${FCODES=/var/db/locate.database} # the database +: ${SEARCHPATHS="/"} # directories to be put in the database +: ${PRUNEPATHS="/tmp /usr/tmp /var/tmp"} # unwanted directories +: ${FILESYSTEMS="ufs"} # allowed filesystems +: ${find=find} -# Make a file list and compute common bigrams. -# Alphabetize '/' before any other char with 'tr'. -# If the system is very short of sort space, 'bigram' can be made -# smarter to accumulate common bigrams directly without sorting -# ('awk', with its associative memory capacity, can do this in several -# lines, but is too slow, and runs out of string space on small machines). +case X"$SEARCHPATHS" in + X) echo "$0: empty variable SEARCHPATHS"; exit 1;; esac +case X"$FILESYSTEMS" in + X) echo "$0: empty variable FILESYSTEMS"; exit 1;; esac -# search locally or everything -# find ${SRCHPATHS} -print | \ -find ${SRCHPATHS} ! -fstype ufs -prune -or -print | \ - tr '/' '\001' | \ - (sort -T $TMPDIR -f; echo $? > $ERRS) | tr '\001' '/' > $FILELIST +# Make a list a paths to exclude in the locate run +excludes="! 
(" or="" +for fstype in $FILESYSTEMS +do + excludes="$excludes $or -fstype $fstype" + or="-or" +done +excludes="$excludes ) -prune" -$LIBDIR/locate.bigram < $FILELIST | \ - (sort -T $TMPDIR ; echo $? >> $ERRS) | \ - uniq -c | sort -T $TMPDIR -nr | \ - awk '{ if (NR <= 128) print $2 }' | tr -d '\012' > $BIGRAMS +case X"$PRUNEPATHS" in + X) ;; + *) for path in $PRUNEPATHS + do + excludes="$excludes -or -path $path -prune" + done;; +esac -# code the file list -if [ `sort -u $ERRS | grep -s -v 0` ]; then - printf 'locate: updatedb failed\n\n' -else - $LIBDIR/locate.code $BIGRAMS < $FILELIST > $FCODES - chmod 644 $FCODES - rm $BIGRAMS $FILELIST $ERRS +tmp=${TMPDIR=/tmp}/_updatedb$$ +trap 'rm -f $tmp' 0 1 2 3 5 10 15 + +# search locally +# echo $find $SEARCHPATHS $excludes -or -print && exit +if $find $SEARCHPATHS $excludes -or -print 2>/dev/null | + $mklocatedb > $tmp +then + case X"`$find $tmp -size -257c -print`" in + X) cat $tmp > $FCODES;; + *) echo "updatedb: locate database $tmp is empty" + exit 1 + esac fi |