Add private text file binary search API to libheimbase

2011-11-28 22:03:28 -06:00
parent aea02876e7
commit 659c761213
10 changed files with 1197 additions and 1 deletions
--- a/appl/Makefile.am
+++ b/appl/Makefile.am
@@ -10,6 +10,7 @@ dir_dce = dceutils
 endif
 SUBDIRS = 					\
 	  afsutil				\
+	  dbutils				\
 	  ftp					\
 	  login					\
 	  $(dir_otp)				\
--- a/appl/dbutils/Makefile.am
+++ b/appl/dbutils/Makefile.am
@@ -0,0 +1,13 @@
+# $Id$
+
+include $(top_srcdir)/Makefile.am.common
+
+bin_PROGRAMS = bsearch
+
+bsearch_SOURCES  = bsearch.c
+
+man_MANS = bsearch.1
+
+EXTRA_DIST = NTMakefile $(man_MANS)
+
+LDADD = $(LIB_roken) $(LIB_vers) $(LIB_heimbase)
--- a/appl/dbutils/NTMakefile
+++ b/appl/dbutils/NTMakefile
@@ -0,0 +1,35 @@
+########################################################################
+#
+# Copyright (c) 2009, Secure Endpoints Inc.
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 
+# - Redistributions of source code must retain the above copyright
+#   notice, this list of conditions and the following disclaimer.
+# 
+# - Redistributions in binary form must reproduce the above copyright
+#   notice, this list of conditions and the following disclaimer in
+#   the documentation and/or other materials provided with the
+#   distribution.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+# POSSIBILITY OF SUCH DAMAGE.
+# 
+
+RELDIR=appl\dbutils 
+
+!include ../../windows/NTMakefile.w32 
+
--- a/appl/dbutils/bsearch.1
+++ b/appl/dbutils/bsearch.1
@@ -0,0 +1,114 @@
+.\"
+.\" Copyright (c) 2011, Secure Endpoints Inc.
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\"
+.\" - Redistributions of source code must retain the above copyright
+.\"   notice, this list of conditions and the following disclaimer.
+.\"
+.\" - Redistributions in binary form must reproduce the above copyright
+.\"   notice, this list of conditions and the following disclaimer in
+.\"   the documentation and/or other materials provided with the
+.\"   distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+.\" "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+.\" LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+.\" FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+.\" COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+.\" INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+.\" (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+.\" SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+.\" STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+.\" OF THE POSSIBILITY OF SUCH DAMAGE.
+.\"
+.Dd November 30, 2011
+.Dt BSEARCH 1
+.Os KTH-KRB
+.Sh NAME
+.Nm bsearch
+.Nd binary search a sorted flat text file
+.Sh SYNOPSIS
+.Nm bsearch
+.Op Fl KVvh
+.Op Fl b Ar block-size
+.Op Fl m Ar max-cache-size
+.Ar file
+.Op Ar key ...
+.Sh DESCRIPTION
+The
+.Nm
+program performs binary searches of
+.Ar file
+which must be a sorted flat text file.
+.Pp
+Each line is a record.  Each record starts with a key
+that is optionally followed by whitespace and a value.
+Whitespace may be quoted with a backslash, but newline
+and carriage-return characters must be quoted in some
+other manner (e.g., as backslash-n and backslash-r).
+Escapes are not interpreted nor removed.
+.Pp
+If no key arguments are given on the command line, then
+keys will be read from standard input.
+.Pp
+By default only values are printed to standard output.
+Use the -K option to also print keys.  The exit status
+will be non-zero if any key lookups fail.
+.Pp
+Options are:
+.Bl -tag -width Ds
+.It Fl K
+Print keys.
+.It Fl V
+Don't print values.
+.It Fl h
+Print usage and exit.
+.It Fl v
+Print statistic and debug information to standard
+error.
+.It Ar file
+A sorted flat text file.  NOTE: use the "C" locale for
+sorting this file, as in "LC_ALL=C sort -u -o file
+file".
+.It Fl h
+For getting a help message.
+.It Fl m
+Set
+.Ar max-cache-size
+as the maximum cache size.  If the
+.Ar file
+is smaller than this size then the whole file will be
+read into memory, else the program will read blocks.
+Defaults to 1MB.
+.It Fl b
+Set
+.Ar block-size
+as the block size for block-wise I/O.  This must be a
+power of 2, must be no smaller than 512 and no larger
+than 1MB.  Defaults to the
+.Ar file's
+filesystem's preferred blocksize.
+.El
+.Sh EXAMPLES
+.Bd -literal -offset indent
+$ env LC_ALL=C sort -o /tmp/words /usr/share/dict/words
+$ bsearch -Kv /tmp/words day
+Using whole-file method
+Key day found at offset 327695 in 12 loops and 0 reads
+day
+$ 
+.Ed
+.Sh NOTES
+.Pp
+Records must not be longer than one block's size.
+.Pp
+Flat text files must be sorted in the "C" locale.  In
+some systems the default locale may result in
+case-insensitive sorting by the sort command.
+.Sh SEE ALSO
+.Xr sort 1
--- a/appl/dbutils/bsearch.c
+++ b/appl/dbutils/bsearch.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2011, Secure Endpoints Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <errno.h>
+#include <roken.h>
+#include <heimbase.h>
+#include <getarg.h>
+#include <vers.h>
+
+/*
+ * Option state, filled in by getarg() from the args[] table below.  This
+ * program is built from this one source file, so everything here has
+ * internal linkage.
+ */
+static int help_flag;		/* -h, --help */
+static int version_flag;	/* --version */
+static int verbose_flag;	/* -v, --verbose */
+static int print_keys_flag;	/* -K, --print-keys */
+static int no_values_flag;	/* -V, --no-values */
+static int block_size_int;	/* -b, --block-size; 0 means "not given" */
+static int max_size_int;	/* -m, --max-cache-size; 0 means "not given" */
+
+/* Option table handed to getarg() and arg_printusage() */
+static struct getargs args[] = {
+    { "print-keys",     'K',  arg_flag, &print_keys_flag,
+	"print keys", NULL },
+    { "no-values",      'V',  arg_flag, &no_values_flag,
+	"don't print values", NULL },
+    { "verbose",        'v',  arg_flag, &verbose_flag,
+	"print statistics and informative messages", NULL },
+    { "help",           'h',  arg_flag, &help_flag,
+	"print usage message", NULL },
+    { "block-size",     'b',  arg_integer, &block_size_int,
+	"block size", "integer" },
+    { "max-cache-size", 'm',  arg_integer, &max_size_int,
+	"maximum cache size", "integer" },
+    { "version",        '\0', arg_flag, &version_flag, NULL, NULL }
+};
+
+/* Number of entries in args[] */
+static int num_args = sizeof(args) / sizeof(args[0]);
+
+/*
+ * Print the option summary generated from args[] followed by a longer
+ * description of the program, then exit with the given status.  Does not
+ * return.
+ */
+static void
+usage(const char *progname, int status)
+{
+    arg_printusage(args, num_args, progname, "\n"
+"\tThis program does a binary search of the given file for the\n"
+"\tgiven keys.  Two binary search algorithms are implemented\n"
+"\twhole-file and block-wise.\n\n"
+"\tIf keys are not given as arguments keys are read from stdin.\n\n"
+"\tExit status will be 1 for errors, 2 if any keys are not found,\n"
+"\tand 0 if all keys are found.\n\n"
+"\tOptions:\n"
+"\t\t-K \tPrint keys\n"
+"\t\t-V \tDon't print values\n"
+"\t\t-b size\tUse block-wise search with given blocksize\n"
+"\t\t-m size\tRead DB in if its size is less than given\n"
+"\t\t-v \tVerbose (includes count of reads and comparisons)\n"
+"\t\t-h \tPrint usage message and exit\n"
+"\tIf blocksize is not given, empty, or zero then the\n"
+"\tfilesystem's block size (st_blksize) will be used.\n"
+"\tBlock sizes should be powers of two, and larger than 256.\n"
+"\tIf the max file size is not given or empty then the max\n"
+"\tfile size for non-block-wise search will be 1MB.\n"
+"\tKeys from stdin must not be longer than 1023 bytes.\n\n"
+	    );
+    exit(status);
+}
+
+#define MAX_BLOCK_SIZE (1024 * 1024)		/* 1MB; main() checks the -b value against this */
+#define DEFAULT_MAX_FILE_SIZE (1024 * 1024)	/* 1MB; not referenced by main() below */
+
+/*
+ * Look up keys in a sorted flat text file and print what was found.
+ *
+ * Usage: bsearch [-KVvh] [-b block-size] [-m max-cache-size] file [key ...]
+ *
+ * Keys are taken from the command line if any are given, else they are
+ * read from stdin, one per line.  By default the value of each key found
+ * is printed; -K also prints the key and -V suppresses the value.
+ *
+ * Exit status: 0 if every key was found, 1 on usage or other errors, 2
+ * if at least one key was not found.
+ */
+int
+main(int argc, char **argv)
+{
+    char keybuf[1024];
+    char *progname = argv[0];
+    char *fname;
+    char *key = keybuf;
+    char *value = NULL;
+    char *p;
+    bsearch_file_handle bfh = NULL;
+    size_t loc = 0;       /* index where record is located or to be inserted */
+    size_t loops = 0;     /* number of loops/comparisons needed for lookup */
+    size_t reads = 0;	  /* number of reads needed for a lookup */
+    size_t failures = 0;  /* number of lookup failures -- for exit status */
+    size_t block_size = 0;
+    size_t max_size = 0;
+    int optidx = 0;
+    int blockwise;
+    int ret = 0;
+
+    if (getarg(args, num_args, argc, argv, &optidx))
+	usage(progname, 1);
+
+    if (version_flag) {
+	print_version(NULL);
+	return 0;
+    }
+
+    if (help_flag)
+	usage(progname, 0);
+
+    /* Zero means "use the default"; this also rejects negative sizes */
+    if (block_size_int != 0 && block_size_int < 512) {
+	fprintf(stderr, "Invalid block size: too small\n");
+	return 1;
+    }
+    if (block_size_int > 0) {
+	/* A power of two has exactly one bit set */
+	if ((block_size_int & (block_size_int - 1)) != 0) {
+	    fprintf(stderr, "Invalid block size: must be power "
+		    "of two\n");
+	    return 1;
+	}
+	if (block_size_int > MAX_BLOCK_SIZE) {
+	    fprintf(stderr, "Invalid block size: too large\n");
+	    return 1;
+	}
+	block_size = block_size_int;
+    }
+    if (max_size_int < 0)
+	usage(progname, 1);
+    max_size = max_size_int;
+
+    /*
+     * getarg() reports the index of the first non-option argument via
+     * optidx; getopt()'s optind is never updated by it.
+     */
+    argc -= optidx;
+    argv += optidx;
+
+    if (argc == 0)
+	usage(progname, 1);
+
+    fname = argv[0];
+    argc--;
+    argv++;
+
+    ret = __bsearch_file_open(fname, max_size, block_size, &bfh, &reads);
+    if (ret != 0) {
+	/*
+	 * The error is in the return value (an errno value, or -1 for
+	 * an empty file); errno itself may have been overwritten.
+	 */
+	if (ret > 0)
+	    fprintf(stderr, "bsearch_file_open: %s\n", strerror(ret));
+	else
+	    fprintf(stderr, "bsearch_file_open: file is empty\n");
+	return 1;
+    }
+
+    __bsearch_file_info(bfh, &block_size, &max_size, &blockwise);
+    if (verbose_flag && blockwise) {
+	fprintf(stderr, "Using block-wise method with block size %lu and "
+		"cache size %lu\n",
+		(long unsigned)block_size, (long unsigned)max_size);
+    } else if (verbose_flag) {
+	fprintf(stderr, "Using whole-file method\n");
+    }
+
+    for (;;) {
+	loops = 0; /* reset stats */
+	if (argc) {
+	    /* Keys from the command line; argv[] is NULL-terminated */
+	    key = *(argv++);
+	    if (!key)
+		break;
+	} else {
+	    /* Keys from stdin, one per line; skip empty lines */
+	    if (!fgets(keybuf, sizeof (keybuf), stdin))
+		break;
+	    p = strchr(key, '\n');
+	    if (!p)
+		break; /* over-long or unterminated line */
+	    *p = '\0';
+	    if (!*key)
+		continue;
+	}
+	ret = __bsearch_file(bfh, key, &value, &loc, &loops, &reads);
+	if (ret != 0) {
+	    if (ret > 0) {
+		fprintf(stderr, "Error: %s\n", strerror(ret));
+		__bsearch_file_close(&bfh);
+		return 1;
+	    }
+	    if (verbose_flag)
+		fprintf(stderr, "Key %s not found in %lu loops and %lu reads; "
+			"insert at %lu\n", key, (long unsigned)loops,
+			(long unsigned)reads, (long unsigned)loc);
+	    failures++;
+	    continue;
+	}
+	if (verbose_flag)
+	    fprintf(stderr, "Key %s found at offset %lu in %lu loops and "
+		    "%lu reads\n", key, (long unsigned)loc,
+		    (long unsigned)loops, (long unsigned)reads);
+	if (print_keys_flag && !no_values_flag && value)
+	    printf("%s %s\n", key, value);
+	else if (print_keys_flag)
+	    printf("%s\n", key);
+	else if (!no_values_flag && value)
+	    printf("%s\n", value);
+	free(value);
+    }
+
+    __bsearch_file_close(&bfh);
+    return failures ? 2 : 0;
+}
--- a/base/Makefile.am
+++ b/base/Makefile.am
@@ -17,6 +17,7 @@ include_HEADERS	= heimbase.h
 dist_libheimbase_la_SOURCES =	\
 	array.c			\
 	baselocl.h		\
+	bsearch.c		\
 	bool.c			\
 	data.c			\
 	dict.c			\
--- a/base/bsearch.c
+++ b/base/bsearch.c
@@ -0,0 +1,786 @@
+/*
+ * Copyright (c) 2011, Secure Endpoints Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * - Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "baselocl.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <errno.h>
+#include <assert.h>
+#include <getopt.h>
+
+/*
+ * This file contains functions for binary searching flat text in memory
+ * and in text files where each line is a [variable length] record.
+ * Each record has a key and an optional value separated from the key by
+ * unquoted whitespace.  Whitespace in the key, and leading whitespace
+ * for the value, can be quoted with backslashes (but CR and LF must be
+ * quoted in such a way that they don't appear in the quoted result).
+ *
+ * Binary searching a tree is normally a dead simple algorithm.  It
+ * turns out that binary searching flat text with *variable* length
+ * records is... tricky.  There is no index of where records begin,
+ * thus any index selected during the search is likely to fall in the
+ * middle of a record.  When deciding to search a left sub-tree one
+ * might fail to find the last record in that sub-tree on account of the
+ * right boundary falling in the middle of it -- the chosen solution to
+ * this makes left sub-tree searches slightly less efficient than right
+ * sub-tree searches.
+ *
+ * If binary searching flat text in memory is tricky, using block-wise
+ * I/O instead is trickier!  But it's necessary in order to support
+ * large files (which we either can't or wouldn't want to read or map
+ * into memory).  Each block we read has to be large enough that the
+ * largest record can fit in it.  And each block might start and/or end
+ * in the middle of a record.  Here it is the right sub-tree searches
+ * that are less efficient than left sub-tree searches.
+ *
+ * bsearch_common() contains the common text block binary search code.
+ *
+ * __bsearch_text() is the interface for searching in-core text.
+ * __bsearch_file() is the interface for block-wise searching files.
+ */
+
+struct bsearch_file_handle {
+    int fd;          /* file descriptor; -1 once the whole file is cached */
+    char *cache;     /* whole file contents, or page cache (may be NULL) */
+    char *page;      /* one double-size page worth of bytes (scratch) */
+    size_t file_sz;  /* file size in bytes, from fstat() at open time */
+    size_t cache_sz; /* usable bytes at cache; 0 if there is no cache */
+    size_t page_sz;  /* page size in bytes (single, not double-size) */
+};
+
+/*
+ * Locate the start of a line at or after buf[i], looking no further
+ * than buf[right - 1].
+ *
+ * Offset 0 is taken to be a line start.  For any other offset the
+ * result is the byte following the first newline at or after buf[i].
+ * Returns NULL if there is no such newline or nothing follows it
+ * within the bound.
+ */
+static const char *
+find_line(const char *buf, size_t i, size_t right)
+{
+    size_t pos = i;
+
+    if (pos == 0)
+	return buf;
+
+    /* Advance to the first newline, or to the right bound */
+    while (pos < right && buf[pos] != '\n')
+	pos++;
+
+    /* Need a newline that has at least one byte after it */
+    if (pos < right && pos + 1 < right)
+	return buf + pos + 1;
+    return NULL;
+}
+
+/**
+ * Common routine for binary searching text in core.
+ *
+ * Perform a binary search of a char array containing a block from a
+ * text file where each line is a record.  Each record consists of a
+ * key followed by an optional value separated from the key by
+ * whitespace.  Whitespace can be quoted with backslashes.  It's the
+ * caller's responsibility to encode/decode keys/values if quoting is
+ * desired; newlines should be encoded such that a newline does not
+ * appear in the result.
+ *
+ * All output arguments are optional.
+ *
+ * Returns 0 if key is found, -1 if not found, or an error code such as
+ * ENOMEM in case of error.
+ *
+ * NOTE(review): a record is only treated as complete when the byte
+ * right after its key (or value) is '\n'.  A line ending in CR LF fails
+ * that test and is handled like a record cut off by the end of the
+ * block -- confirm whether CRLF input is really meant to be supported.
+ *
+ * Inputs:
+ *
+ * @buf          String to search
+ * @sz           Size of string to search
+ * @key          Key string to search for
+ * @buf_is_start True if the buffer starts with a record, false if it
+ *               starts in the middle of a record or if the caller
+ *               doesn't know.
+ *
+ * Outputs:
+ *
+ * @value        Location to store a copy of the value (caller must free)
+ * @location     Record location if found else the location where the
+ *               record should be inserted (index into @buf)
+ * @cmp	         Set to less than or greater than 0 to indicate that a
+ *               key not found would have fit in an earlier or later
+ *               part of a file.  Callers should use this to decide
+ *               whether to read a block to the left or to the right and
+ *               search that.  Zero if the key was found or if @sz is 0.
+ * @loops        Location to store a count of bisections required for
+ *               search (useful for confirming logarithmic performance)
+ */
+static int
+bsearch_common(const char *buf, size_t sz, const char *key,
+	       int buf_is_start, char **value, size_t *location,
+	       int *cmp, size_t *loops)
+{
+    const char *linep;
+    size_t key_start, key_len; /* key string in buf */
+    size_t val_start, val_len; /* value string in buf */
+    int key_cmp = 0; /* stays 0 if the loop never runs (sz == 0) */
+    size_t k;
+    size_t l;    /* left side of buffer for binary search */
+    size_t r;    /* right side of buffer for binary search */
+    size_t rmax; /* right side of buffer for binary search */
+    size_t i;    /* index into buffer, typically in the middle of l and r */
+    size_t loop_count = 0;
+    int ret = -1;
+
+    if (value)
+	*value = NULL;
+    if (cmp)
+	*cmp = 0;
+    if (loops)
+	*loops = 0;
+
+    /* Binary search; file should be sorted */
+    for (l = 0, r = rmax = sz, i = sz >> 1; i >= l && i < rmax; loop_count++) {
+	heim_assert(i >= 0 && i < sz, "invalid aname2lname db index");
+
+	/* buf[i] is likely in the middle of a line; find the next line */
+	linep = find_line(buf, i, rmax);
+	k = linep ? linep - buf : i;
+	if (linep == NULL || k >= rmax) {
+	    /*
+	     * No new line found to the right; search to the left then
+	     * but don't change rmax (this isn't optimal, but it's
+	     * simple).
+	     */
+	    if (i == l)
+		break;
+	    r = i;
+	    i = l + ((r - l) >> 1);
+	    continue;
+	}
+	i = k;
+	heim_assert(i >= l && i < rmax, "invalid aname2lname db index");
+
+	/* Got a line; check it */
+
+	/*
+	 * Search for and split on unquoted whitespace.  val_start gets
+	 * a defined value on every path so that it can be used for
+	 * *location below even when the record has no value.
+	 */
+	for (key_start = i, key_len = 0, val_start = i, val_len = 0, k = i;
+	     k < rmax; k++) {
+	    if (buf[k] == '\\') {
+		/* Skip the quoted character */
+		k++;
+		continue;
+	    }
+	    if (buf[k] == '\r' || buf[k] == '\n') {
+		/* We now know where the key ends, and there's no value */
+		key_len = k - i;
+		val_start = k; /* empty value, at the end of the record */
+		break;
+	    }
+	    /* <ctype.h> functions need an unsigned char value */
+	    if (!isspace((unsigned char)buf[k]))
+		continue;
+
+	    /*
+	     * NOTE(review): key_len is rewritten for every separator
+	     * character, so with more than one it ends up including all
+	     * but the last of them and the key comparison below cannot
+	     * match.  Left as is pending a look at the callers.
+	     */
+	    while (k < rmax && isspace((unsigned char)buf[k])) {
+		key_len = k - i;
+		k++;
+	    }
+	    if (k < rmax)
+		val_start = k;
+	    /* Find end of value */
+	    for (; k < rmax && buf[k] != '\0'; k++) {
+		if (buf[k] == '\r' || buf[k] == '\n') {
+		    val_len = k - val_start;
+		    break;
+		}
+	    }
+	    break;
+	}
+
+	/*
+	 * The following logic is for dealing with partial buffers,
+	 * which we use for block-wise binary searches of large files
+	 */
+	if (key_start == 0 && !buf_is_start) {
+	    /*
+	     * We're at the beginning of a block that might have started
+	     * in the middle of a record whose "key" might well compare
+	     * as greater than the key we're looking for, so we don't
+	     * bother comparing -- we know key_cmp must be -1 here.
+	     */
+	    key_cmp = -1;
+	    break;
+	}
+	if ((val_len && buf[val_start + val_len] != '\n') ||
+	    (!val_len && buf[key_start + key_len] != '\n')) {
+	    /*
+	     * We're at the end of a block that ends in the middle of a
+	     * record whose "key" might well compare as less than the
+	     * key we're looking for, so we don't bother comparing -- we
+	     * know key_cmp must be >= 0 but we can't tell.  Our caller
+	     * will end up reading a double-size block to handle this.
+	     */
+	    key_cmp = 1;
+	    break;
+	}
+
+	key_cmp = strncmp(key, &buf[key_start], key_len);
+	if (key_cmp == 0 && strlen(key) != key_len)
+	    key_cmp = 1; /* record key is a proper prefix of @key */
+	if (key_cmp < 0) {
+	    /* search left */
+	    r = rmax = (linep - buf);
+	    i = l + ((r - l) >> 1);
+	    if (location)
+		*location = key_start;
+	} else if (key_cmp > 0) {
+	    /* search right */
+	    if (l == i)
+		break; /* not found */
+	    l = i;
+	    i = l + ((r - l) >> 1);
+	    if (location)
+		*location = val_start + val_len;
+	} else {
+	    /* match! */
+	    if (location)
+		*location = key_start;
+	    ret = 0;
+	    if (val_len && value) {
+		*value = strndup(&buf[val_start], val_len);
+		if (!*value)
+		    ret = errno;
+	    }
+	    break;
+	}
+    }
+
+    if (cmp)
+	*cmp = key_cmp;
+    if (loops)
+	*loops = loop_count;
+
+    return ret;
+}
+
+/**
+ * Binary search a char array containing sorted text records separated
+ * by new-lines.  Each record consists of a key and an optional value
+ * following the key, separated from the key by unquoted whitespace.
+ *
+ * This is a thin wrapper around bsearch_common() that declares the
+ * buffer to start on a record boundary and does not ask for the
+ * left/right hint that is used for block-wise searches.
+ *
+ * All output arguments are optional.
+ *
+ * Returns 0 if key is found, -1 if not found, or an error code such as
+ * ENOMEM in case of error.
+ *
+ * Inputs:
+ *
+ * @buf      Char array pointer
+ * @buf_sz   Size of buf
+ * @key      Key to search for
+ *
+ * Outputs:
+ *
+ * @value    Location where to put the value, if any (caller must free)
+ * @location Record location if found else the location where the record
+ *           should be inserted (index into @buf)
+ * @loops    Location where to put a number of loops (or comparisons)
+ *           needed for the search (useful for benchmarking)
+ */
+int
+__bsearch_text(const char *buf, size_t buf_sz, const char *key,
+	       char **value, size_t *location, size_t *loops)
+{
+    return bsearch_common(buf, buf_sz, key, 1, value, location, NULL, loops);
+}
+
+#define MAX_BLOCK_SIZE (1024 * 1024)
+#define DEFAULT_MAX_FILE_SIZE (1024 * 1024)
+/**
+ * Open a file for binary searching.  The file will be read in entirely
+ * if it is no larger than @max_sz, else a cache of @max_sz bytes will
+ * be allocated.  If the memory for reading the whole file in cannot be
+ * allocated the block-wise method is used instead, and if the cache
+ * cannot be allocated the handle works without one.
+ *
+ * Returns 0 on success, else an error number or -1 if the file is empty.
+ * On failure *@bfh is NULL and the file has been closed.
+ *
+ * Inputs:
+ *
+ * @fname   Name of file to open
+ * @max_sz  Maximum size of cache to allocate, in bytes (if zero, default)
+ * @page_sz Page size; if zero or not a power of two the file's
+ *          st_blksize is used instead.  Capped at MAX_BLOCK_SIZE.
+ *
+ * Outputs:
+ *
+ * @bfh     Handle for use with __bsearch_file() and __bsearch_file_close()
+ * @reads   Number of reads performed (optional)
+ */
+int
+__bsearch_file_open(const char *fname, size_t max_sz, size_t page_sz,
+		    bsearch_file_handle *bfh, size_t *reads)
+{
+    bsearch_file_handle new_bfh = NULL; /* err: inspects this */
+    struct stat st;
+    ssize_t nread;
+    size_t i;
+    int fd;
+    int ret;
+
+    *bfh = NULL;
+
+    if (reads)
+	*reads = 0;
+
+    fd = open(fname, O_RDONLY);
+    if (fd == -1)
+	return errno;
+
+    if (fstat(fd, &st) == -1) {
+	ret = errno;
+	goto err;
+    }
+
+    if (st.st_size == 0) {
+	ret = -1; /* no data -> no binary search */
+	goto err;
+    }
+
+    /* Validate / default arguments */
+    if (max_sz == 0)
+	max_sz = DEFAULT_MAX_FILE_SIZE;
+    for (i = page_sz; i; i >>= 1) {
+	/* Make sure page_sz is a power of two */
+	if ((i % 2) && (i >> 1)) {
+	    page_sz = 0;
+	    break;
+	}
+    }
+    if (page_sz == 0)
+	page_sz = st.st_blksize;
+    for (i = page_sz; i; i >>= 1) {
+	/* Make sure page_sz is a power of two */
+	if ((i % 2) && (i >> 1)) {
+	    /* Can't happen! Filesystems always use powers of two! */
+	    page_sz = 4096;
+	    break;
+	}
+    }
+    if (page_sz > MAX_BLOCK_SIZE)
+	page_sz = MAX_BLOCK_SIZE;
+
+    new_bfh = calloc(1, sizeof (*new_bfh));
+    if (new_bfh == NULL) {
+	ret = ENOMEM;
+	goto err;
+    }
+
+    new_bfh->fd = fd;
+    new_bfh->page_sz = page_sz;
+    new_bfh->file_sz = st.st_size;
+
+    if (max_sz >= st.st_size) {
+	/* Whole-file method */
+	new_bfh->cache = malloc(st.st_size + 1);
+	if (new_bfh->cache) {
+	    /* NUL-terminate the in-core copy of the file */
+	    new_bfh->cache[st.st_size] = '\0';
+	    new_bfh->cache_sz = st.st_size;
+	    nread = read(fd, new_bfh->cache, st.st_size);
+	    if (nread < 0) {
+		ret = errno;
+		goto err;
+	    }
+	    if (nread != st.st_size) {
+		ret = EIO; /* XXX ??? */
+		goto err;
+	    }
+	    if (reads)
+		*reads = 1;
+	    /* Everything is in core; the descriptor is no longer needed */
+	    (void) close(fd);
+	    new_bfh->fd = -1;
+	    *bfh = new_bfh;
+	    return 0;
+	}
+    }
+
+    /* Block-size method, or above malloc() failed */
+    new_bfh->page = malloc(new_bfh->page_sz << 1);
+    if (new_bfh->page == NULL) {
+	/* Can't even allocate a single double-size page! */
+	ret = ENOMEM;
+	goto err;
+    }
+
+    new_bfh->cache_sz = max_sz < st.st_size ? max_sz : st.st_size;
+    new_bfh->cache = malloc(new_bfh->cache_sz);
+    *bfh = new_bfh;
+
+    /*
+     * malloc() may have failed because we were asking for a lot of
+     * memory, but we may still be able to operate without a cache,
+     * so let's not fail.
+     */
+    if (new_bfh->cache == NULL) {
+	new_bfh->cache_sz = 0;
+	return 0;
+    }
+
+    /*
+     * Initialize cache: a NUL first byte marks a cached page as not
+     * loaded, so put one at every page boundary.
+     */
+    for (i = 0; i < new_bfh->cache_sz; i += new_bfh->page_sz)
+	new_bfh->cache[i] = '\0';
+    return 0;
+
+err:
+    (void) close(fd);
+    if (new_bfh) {
+	free(new_bfh->page);
+	free(new_bfh->cache);
+	free(new_bfh);
+    }
+    return ret;
+}
+
+/**
+ * Report the parameters a binary search file handle ended up with.
+ *
+ * All output arguments are optional.
+ *
+ * Inputs:
+ *
+ * @bfh       Binary search file handle
+ *
+ * Outputs:
+ *
+ * @page_sz   The handle's page size
+ * @max_sz    The handle's cache size
+ * @blockwise Non-zero if the cache size differs from the file size,
+ *            i.e., the file is searched with the block-wise method
+ */
+void
+__bsearch_file_info(bsearch_file_handle bfh,
+		    size_t *page_sz, size_t *max_sz, int *blockwise)
+{
+    if (page_sz != NULL)
+	*page_sz = bfh->page_sz;
+
+    if (max_sz != NULL)
+	*max_sz = bfh->cache_sz;
+
+    if (blockwise != NULL) {
+	if (bfh->file_sz == bfh->cache_sz)
+	    *blockwise = 0;
+	else
+	    *blockwise = 1;
+    }
+}
+
+/**
+ * Release a binary search file handle: close its file descriptor if it
+ * still has one, free its buffers and the handle itself, and reset the
+ * caller's variable to NULL.  Does nothing if the variable is already
+ * NULL.
+ *
+ * Inputs:
+ *
+ * @bfh Pointer to variable containing handle to close.
+ */
+void
+__bsearch_file_close(bsearch_file_handle *bfh)
+{
+    bsearch_file_handle h = *bfh;
+
+    if (h == NULL)
+	return;
+
+    if (h->fd >= 0)
+	(void) close(h->fd);
+    /* free(NULL) is a no-op, so no need to test these first */
+    free(h->page);
+    free(h->cache);
+    free(h);
+    *bfh = NULL;
+}
+
+/**
+ * Private function to get a page from a cache.  The cache is a char
+ * array of slots, one per node of the top levels of the search tree,
+ * where level n has 2^n nodes.  The cache can be smaller than the
+ * number of levels implies.
+ *
+ * The page may or may not be valid.  If the first byte of it is NUL
+ * then it's not valid, else it is.
+ *
+ * Returns 1 if page is in cache and valid, 0 if the cache is too small
+ * or the page is invalid.  The page address is output in @buf if the
+ * cache is large enough to contain it regardless of whether the page is
+ * valid; otherwise @buf is set to NULL.
+ *
+ * Inputs:
+ *
+ * @bfh      Binary search file handle
+ * @level    Level in the tree that we want a page for
+ * @page_idx Page number in the given level (0..2^level - 1)
+ *
+ * Outputs:
+ *
+ * @buf      Set to address of page if the cache is large enough
+ */
+static int
+get_page_from_cache(bsearch_file_handle bfh, size_t level, size_t page_idx,
+		    char **buf)
+{
+    size_t idx = 0;
+    size_t page_sz;
+    size_t slot_off;
+
+    page_sz = bfh->page_sz << 1; /* we use double-size pages in the cache */
+
+    *buf = NULL;
+
+    /*
+     * Compute index into cache.  The cache is basically an array of
+     * slots for double-size pages.  The first (zeroth) slot is for the
+     * middle page of the file -- the root of the tree.  The next two
+     * slots are for the left and right pages of the second level in
+     * the tree.  The next four slots are for the four pages at the
+     * next level.  And so on for as many slots as fit in the cache.
+     *
+     * The page index is the number of the page at the given level, so
+     * the slot number is 2^level - 1 + page index.  The shift is done
+     * in size_t so that it cannot overflow an int for deep levels.
+     *
+     * Note that slots are spaced two double-size pages apart (page_sz
+     * here is already doubled and is multiplied by two once more).
+     */
+    if (level)
+	idx = ((size_t)1 << level) - 1 + page_idx;
+    if (((idx + 1) * page_sz * 2) > bfh->cache_sz)
+	return 0; /* slot does not fit in the cache (or no cache at all) */
+
+    slot_off = idx * page_sz * 2;
+    *buf = &bfh->cache[slot_off];
+    if (bfh->cache[slot_off] == '\0')
+	return 0; /* first byte is NUL -> page not loaded in cache */
+    return 1;
+}
+
+/**
+ * Private function to get a page (or double-size page) of the file
+ * starting at page number @page, either from the cache or by reading
+ * it from the file.  The number of bytes available is output in @bytes;
+ * it is the (double-)page size unless that would run past the end of
+ * the file, in which case it is the number of bytes remaining.
+ *
+ * If the cache has a slot for (@level, @page_idx) that is not yet
+ * valid, a double-size page is read straight into that slot whatever
+ * @want_double says.  If the cache has no slot the handle's own page
+ * buffer is used, which the next such read overwrites.
+ *
+ * Returns 0 on success or an errno value otherwise (EIO if reads are
+ * short, EOVERFLOW if the file offset comes out negative).
+ *
+ * NOTE(review): nothing here checks that @page lies within the file;
+ * presumably callers guarantee it -- confirm.
+ *
+ * Inputs:
+ *
+ * @bfh         Binary search file handle
+ * @level       Level in the binary search tree that we're at
+ * @page_idx    Page "index" at the @level of the tree that we want
+ * @page        Actual page number that we want
+ * @want_double Whether we need a page or double page read
+ *
+ * Outputs:
+ *
+ * @buf         Page read or cached
+ * @bytes       Bytes read (may be less than page or double page size in
+ *              the case of the last page, of course)
+ */
+static int
+read_page(bsearch_file_handle bfh, size_t level, size_t page_idx, size_t page,
+	  int want_double, const char **buf, size_t *bytes)
+{
+    int ret;
+    off_t off;
+    size_t expected;
+    size_t wanted;
+    char *page_buf;
+
+    /* Figure out where we're reading and how much */
+    off = page * bfh->page_sz;
+    if (off < 0)
+	return EOVERFLOW;
+
+    wanted = bfh->page_sz << want_double; /* want_double is 0 or 1 */
+    expected = ((bfh->file_sz - off) > wanted) ? wanted : bfh->file_sz - off;
+
+    if (get_page_from_cache(bfh, level, page_idx, &page_buf)) {
+	*buf = page_buf;
+	*bytes = expected; /* as sized by the caller's want_double */
+	return 0; /* found in cache */
+    }
+
+
+    *bytes = 0;
+    *buf = NULL;
+
+    /* OK, we have to read a page or double-size page */
+
+    if (page_buf)
+	want_double = 1; /* we'll be caching; we cache double-size pages */
+    else
+	page_buf = bfh->page; /* we won't cache this page */
+
+    /* Recompute the sizes: want_double may have just changed */
+    wanted = bfh->page_sz << want_double;
+    expected = ((bfh->file_sz - off) > wanted) ? wanted : bfh->file_sz - off;
+
+#ifdef HAVE_PREAD
+    ret = pread(bfh->fd, page_buf, expected, off);
+#else
+    if (lseek(bfh->fd, off, SEEK_SET) == (off_t)-1)
+	return errno;
+    ret = read(bfh->fd, page_buf, expected);
+#endif
+    if (ret < 0)
+	return errno;
+    
+    if (ret != expected)
+	return EIO; /* XXX ??? */
+
+    *buf = page_buf;
+    *bytes = expected;
+    return 0;
+}
+
+/**
+ * Perform a binary search of a file where each line is a record (LF and
+ * CRLF supported).  Each record consists of a key followed by an
+ * optional value separated from the key by whitespace.  Whitespace can
+ * be quoted with backslashes.  It's the caller's responsibility to
+ * encode/decode keys/values if quoting is desired; newlines should be
+ * encoded such that a newline does not appear in the result.
+ *
+ * If the handle's cache holds the whole file (file size equals cache
+ * size) the search is done in memory with __bsearch_text().  Otherwise
+ * the search is done with block-wise I/O (i.e., the whole file is not
+ * read into memory); pages are obtained through read_page(), which may
+ * serve them from the handle's cache.
+ *
+ * All output arguments are optional.
+ *
+ * Returns 0 if key is found, -1 if not found, or an error code such as
+ * ENOMEM in case of error.
+ *
+ * Inputs:
+ *
+ * @bfh          Binary search file handle (the file to search)
+ * @key          Key string to search for
+ *
+ * Outputs:
+ *
+ * @value        Location to store a copy of the value (caller must free)
+ * @location     Record location if found else the location where the
+ *               record should be inserted (in the block-wise case this
+ *               is the page's file offset plus the offset in the page)
+ * @loops        Location to store a count of bisections required for
+ *               search (useful for confirming logarithmic performance)
+ * @reads        Location to store a count of pages read during search
+ *               (useful for confirming logarithmic performance)
+ */
+int
+__bsearch_file(bsearch_file_handle bfh, const char *key,
+	       char **value, size_t *location, size_t *loops, size_t *reads)
+{
+    int ret;
+    const char *buf;
+    size_t buf_sz;
+    size_t page, l, r;
+    size_t my_reads = 0;
+    size_t my_loops_total = 0;
+    size_t my_loops;
+    size_t level;        /* level in the tree */
+    size_t page_idx = 0; /* page number in the tree level */
+    size_t buf_location;
+    int cmp;
+    int buf_ends_in_eol = 0;
+    int buf_is_start = 0;
+
+    if (reads)
+	*reads = 0;
+
+    /* If whole file is in memory then search that and we're done */
+    if (bfh->file_sz == bfh->cache_sz)
+	return __bsearch_text(bfh->cache, bfh->cache_sz, key, value, location, loops);
+
+    /* Else block-wise binary search */
+
+    if (value)
+	*value = NULL;
+    if (loops)
+	*loops = 0;
+
+    l = 0;
+    r = (bfh->file_sz / bfh->page_sz) + 1;
+    for (level = 0, page = r >> 1; page >= l && page < r ; level++) {
+	ret = read_page(bfh, level, page_idx, page, 0, &buf, &buf_sz);
+	if (ret != 0)
+	    return ret;
+	my_reads++;
+
+	if (buf_sz == 0) {
+	    /*
+	     * Empty page: when the file size is an exact multiple of
+	     * the page size, the page at the very end of the file has
+	     * no bytes in it.  There is nothing to search here (and
+	     * buf[buf_sz - 1] would be out of bounds), so any record
+	     * must be to the left.
+	     */
+	    if (reads)
+		*reads = my_reads;
+	    if (location)
+		*location = page * bfh->page_sz;
+	    page_idx <<= 1;
+	    r = page;
+	    page = l + ((r - l) >> 1);
+	    continue;
+	}
+
+	if (buf[buf_sz - 1] == '\r' || buf[buf_sz - 1] == '\n')
+	    buf_ends_in_eol = 1;
+	else
+	    buf_ends_in_eol = 0;
+
+	buf_is_start = page == 0 ? 1 : 0;
+	ret = bsearch_common(buf, buf_sz, key, buf_is_start,
+			     value, &buf_location, &cmp, &my_loops);
+	if (ret > 0)
+	    return ret;
+	/* Found or no we update stats */
+	my_loops_total += my_loops;
+	if (loops)
+	    *loops = my_loops_total;
+	if (reads)
+	    *reads = my_reads;
+	if (location)
+	    *location = page * bfh->page_sz + buf_location;
+	if (ret == 0)
+	    return 0; /* found! */
+	/* Not found */
+	if (cmp < 0) {
+	    /* Search left */
+	    page_idx <<= 1;
+	    r = page;
+	    page = l + ((r - l) >> 1);
+	    continue;
+	}
+
+	/*
+	 * Search right, but first search the current and next
+	 * blocks in case that the record we're looking for either
+	 * straddles the boundary between this and the next record,
+	 * or in case the record starts exactly at the next page.
+	 */
+	heim_assert(cmp > 0, "cmp > 0");
+
+	if (!buf_ends_in_eol || page == l || page == (r - 1)) {
+	    ret = read_page(bfh, level, page_idx, page, 1, &buf, &buf_sz);
+	    if (ret != 0)
+		return ret;
+	    my_reads++;
+
+	    /*
+	     * NOTE(review): the single-page search above uses
+	     * page == 0 here, this one uses page == l; confirm that
+	     * the difference is intended.
+	     */
+	    buf_is_start = page == l ? 1 : 0;
+
+	    ret = bsearch_common(buf, buf_sz, key, buf_is_start,
+				 value, &buf_location, &cmp, &my_loops);
+	    if (ret > 0)
+		return ret;
+	    my_loops_total += my_loops;
+	    if (loops)
+		*loops = my_loops_total;
+	    if (reads)
+		*reads = my_reads;
+	    if (location)
+		*location = page * bfh->page_sz + buf_location;
+	    if (ret == 0)
+		return 0;
+	}
+
+	/* Oh well, search right */
+	if (l == page && r == (l + 1))
+	    break; /* nothing left to bisect */
+	page_idx = (page_idx << 1) + 1;
+	l = page;
+	page = l + ((r - l) >> 1);
+    }
+    return -1;
+}
+
--- a/base/heimbase.h
+++ b/base/heimbase.h
@@ -235,5 +235,20 @@ const void *	heim_data_get_ptr(heim_data_t);
 size_t		heim_data_get_length(heim_data_t);


+/*
+ * Binary search of text, in a buffer (__bsearch_text) or via a file handle
+ * (__bsearch_file*); __bsearch_file() returns 0 if found, -1 if not found.
+ * Note: these are private until integrated into the heimbase object system.
+ */
+typedef struct bsearch_file_handle *bsearch_file_handle;
+int __bsearch_text(const char *buf, size_t buf_sz, const char *key,
+		   char **value, size_t *location, size_t *loops);
+int __bsearch_file_open(const char *fname, size_t max_sz, size_t page_sz,
+			bsearch_file_handle *bfh, size_t *reads);
+int __bsearch_file(bsearch_file_handle bfh, const char *key, char **value,
+		   size_t *location, size_t *loops, size_t *reads);
+void __bsearch_file_info(bsearch_file_handle bfh, size_t *page_sz,
+			 size_t *max_sz, int *blockwise);
+void __bsearch_file_close(bsearch_file_handle *bfh);

 #endif /* HEIM_BASE_H */
--- a/base/version-script.map
+++ b/base/version-script.map
@@ -39,6 +39,11 @@ HEIMDAL_BASE_1.0 {
 		heim_string_create_with_bytes;
 		heim_string_get_type_id;
 		heim_string_get_utf8;
+		__bsearch_text;
+		__bsearch_file_open;
+		__bsearch_file;
+		__bsearch_file_info;
+		__bsearch_file_close;
 	local:
 		*;
 };
--- a/configure.ac
+++ b/configure.ac
@@ -637,6 +637,7 @@ AC_CONFIG_FILES(Makefile 		\
 	kdc/Makefile			\
 	appl/Makefile			\
 	appl/afsutil/Makefile		\
+	appl/dbutils/Makefile		\
 	appl/ftp/Makefile		\
 	appl/ftp/common/Makefile	\
 	appl/ftp/ftp/Makefile		\