From 659c7612131814fde7acfffeeedc4877fa08b0b3 Mon Sep 17 00:00:00 2001 From: Nicolas Williams Date: Mon, 28 Nov 2011 22:03:28 -0600 Subject: [PATCH] Add private text file binary search API to libheimbase --- appl/Makefile.am | 3 +- appl/dbutils/Makefile.am | 13 + appl/dbutils/NTMakefile | 35 ++ appl/dbutils/bsearch.1 | 114 ++++++ appl/dbutils/bsearch.c | 225 +++++++++++ base/Makefile.am | 1 + base/bsearch.c | 786 +++++++++++++++++++++++++++++++++++++++ base/heimbase.h | 15 + base/version-script.map | 5 + configure.ac | 1 + 10 files changed, 1197 insertions(+), 1 deletion(-) create mode 100644 appl/dbutils/Makefile.am create mode 100644 appl/dbutils/NTMakefile create mode 100644 appl/dbutils/bsearch.1 create mode 100644 appl/dbutils/bsearch.c create mode 100644 base/bsearch.c diff --git a/appl/Makefile.am b/appl/Makefile.am index 5e4e320bc..8f3ae704e 100644 --- a/appl/Makefile.am +++ b/appl/Makefile.am @@ -10,6 +10,7 @@ dir_dce = dceutils endif SUBDIRS = \ afsutil \ + dbutils \ ftp \ login \ $(dir_otp) \ @@ -26,4 +27,4 @@ SUBDIRS = \ kf \ $(dir_dce) -EXTRA_DIST = NTMakefile \ No newline at end of file +EXTRA_DIST = NTMakefile diff --git a/appl/dbutils/Makefile.am b/appl/dbutils/Makefile.am new file mode 100644 index 000000000..a1fc3842a --- /dev/null +++ b/appl/dbutils/Makefile.am @@ -0,0 +1,13 @@ +# $Id$ + +include $(top_srcdir)/Makefile.am.common + +bin_PROGRAMS = bsearch + +bsearch_SOURCES = bsearch.c + +man_MANS = bsearch.1 + +EXTRA_DIST = NTMakefile $(man_MANS) + +LDADD = $(LIB_roken) $(LIB_vers) $(LIB_heimbase) diff --git a/appl/dbutils/NTMakefile b/appl/dbutils/NTMakefile new file mode 100644 index 000000000..73ea8168d --- /dev/null +++ b/appl/dbutils/NTMakefile @@ -0,0 +1,35 @@ +######################################################################## +# +# Copyright (c) 2009, Secure Endpoints Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# - Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# + +RELDIR=appl\dbutils + +!include ../../windows/NTMakefile.w32 + diff --git a/appl/dbutils/bsearch.1 b/appl/dbutils/bsearch.1 new file mode 100644 index 000000000..07c788b72 --- /dev/null +++ b/appl/dbutils/bsearch.1 @@ -0,0 +1,114 @@ +.\" +.\" Copyright (c) 2011, Secure Endpoints Inc. +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" +.\" - Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. 
+.\" +.\" - Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in +.\" the documentation and/or other materials provided with the +.\" distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +.\" "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +.\" LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +.\" FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +.\" COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +.\" INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +.\" (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +.\" SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +.\" STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +.\" OF THE POSSIBILITY OF SUCH DAMAGE. +.\" +.Dd November 30, 2011 +.Dt BSEARCH 1 +.Os KTH-KRB +.Sh NAME +.Nm bsearch +.Nd binary search a sorted flat text file +.Sh SYNOPSIS +.Nm bsearch +.Op Fl KVvh +.Op Fl b Ar block-size +.Op Fl m Ar max-cache-size +.Ar file +.Ar [key ...] +.Sh DESCRIPTION +The +.Nm +program performs binary searches of +.Ar file +which must be a sorted flat text file. +.Pp +Each line is a record. Each record starts with a key +that is optionally followed by whitespace and a value. +Whitespace may be quoted with a backslash, but newline +and carriage-return characters must be quoted in some +other manner (e.g., as backslash-n and backslash-r). +Escapes are not interpreted nor removed. +.Pp +If no key arguments are given on the command line, then +keys will be read from standard input. +.Pp +By default only values are printed to standard output. +Use the -K option to also print keys. The exit status +will be non-zero if any key lookups fail.
+.Pp +Options are: +.Bl -tag -width Ds +.It Fl K +Print keys. +.It Fl V +Don't print values. +.It Fl h +Print usage and exit. +.It Fl v +Print statistics and debugging information to standard +error. +.It Ar file +A sorted flat text file. NOTE: use the "C" locale for +sorting this file, as in "LC_ALL=C sort -u -o file +file". +.It Fl h +For getting a help message. +.It Fl m +Set +.Ar max-cache-size +as the maximum cache size. If the +.Ar file +is smaller than this size then the whole file will be +read into memory, else the program will read blocks. +Defaults to 1MB. +.It Fl b +Set +.Ar block-size +as the block size for block-wise I/O. This must be a +power of 2, must be no smaller than 512 and no larger +than 1MB. Defaults to the +.Ar file's +filesystem's preferred blocksize. +.El +.Sh EXAMPLES +.Bd -literal -offset indent +$ env LC_ALL=C sort -o /tmp/words /usr/share/dict/words +$ bsearch -Kv /tmp/words day +Using whole-file method +Key day found at offset 327695 in 12 loops and 0 reads +day +$ +.Sh NOTES +.Pp +Records must not be longer than one block's size. +.Pp +Flat text files must be sorted in the "C" locale. In +some systems the default locale may result in +case-insensitive sorting by the sort command. +.Sh SEE ALSO +.Xr sort 1 diff --git a/appl/dbutils/bsearch.c b/appl/dbutils/bsearch.c new file mode 100644 index 000000000..34aa6bae1 --- /dev/null +++ b/appl/dbutils/bsearch.c @@ -0,0 +1,225 @@ +/* + * Copyright (c) 2011, Secure Endpoints Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer.
+ * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int help_flag; +int version_flag; +int verbose_flag; +int print_keys_flag; +int no_values_flag; +int block_size_int; +int max_size_int; + +struct getargs args[] = { + { "print-keys", 'K', arg_flag, &print_keys_flag, + "print keys", NULL }, + { "no-values", 'V', arg_flag, &no_values_flag, + "don't print values", NULL }, + { "verbose", 'v', arg_flag, &verbose_flag, + "print statistics and informative messages", NULL }, + { "help", 'h', arg_flag, &help_flag, + "print usage message", NULL }, + { "block-size", 'b', arg_integer, &block_size_int, + "block size", "integer" }, + { "max-cache-size", 'm', arg_integer, &max_size_int, + "maximum cache size", "integer" }, + { "version", '\0', arg_flag, &version_flag, NULL, NULL } +}; + +static int num_args = sizeof(args) / sizeof(args[0]); + +static void +usage(const char *progname, int status) +{ + arg_printusage(args, num_args, progname, "\n" +"\tThis program does a binary search of the given file for the\n" +"\tgiven keys. 
Two binary search algorithms are implemented\n" +"\twhole-file and block-wise.\n\n" +"\tIf keys are not given as arguments keys are read from stdin.\n\n" +"\tExit status will be 1 for errors, 2 if any keys are not found,\n" +"\tand 0 if all keys are found.\n\n" +"\tOptions:\n" +"\t\t-K \tPrint keys\n" +"\t\t-V \tDon't print values\n" +"\t\t-b size\tUse block-wise search with give blocksize\n" +"\t\t-m size\tRead DB in if its size is less than given\n" +"\t\t-v \tVerbose (includes count of reads and comparisons)\n" +"\t\t-h \tPrint usage message and exit\n" +"\tIf blocksize is not given, empty, or zero then the\n" +"\tfilesystem's block size (st_blksize) will be used.\n" +"\tBlock sizes should be powers of two, and larger than 256.\n" +"\tIf the max file size is not given or empty then the max\n" +"\tfile size for non-block-wise search will be 1MB.\n" +"\tKeys from stdin must not be longer than 1023 bytes.\n\n" + ); + exit(status); +} + +#define MAX_BLOCK_SIZE (1024 * 1024) +#define DEFAULT_MAX_FILE_SIZE (1024 * 1024) + +int +main(int argc, char **argv) +{ + char keybuf[1024]; + char *progname = argv[0]; + char *fname; + char *key = keybuf; + char *value; + char *p; + bsearch_file_handle bfh = NULL; + size_t num; + size_t loc; /* index where record is located or to be inserted */ + size_t loops; /* number of loops/comparisons needed for lookup */ + size_t reads = 0; /* number of reads needed for a lookup */ + size_t failures = 0; /* number of lookup failures -- for exit status */ + size_t block_size = 0; + size_t max_size = 0; + int optidx = 0; + int blockwise; + int ret = 0; + + if (getarg(args, num_args, argc, argv, &optidx)) + usage(progname, 1); + + if (version_flag) { + print_version(NULL); + return 0; + } + + if (help_flag) + usage(progname, 0); + + if (block_size_int != 0 && block_size_int < 512) { + fprintf(stderr, "Invalid block size: too small\n"); + return 1; + } + if (block_size_int > 0) { + /* Check that block_size is a power of 2 */ + num = 
block_size_int; + while (num) { + if ((num % 2) && (num >> 1)) { + fprintf(stderr, "Invalid block size: must be power " + "of two\n"); + return 1; + } + num >>= 1; + } + if (block_size_int > MAX_BLOCK_SIZE) + fprintf(stderr, "Invalid block size: too large\n"); + block_size = block_size_int; + } + if (max_size_int < 0) + usage(progname, 1); + max_size = max_size_int; + + argc -= optind; + argv += optind; + + if (argc == 0) + usage(progname, 1); + + fname = argv[0]; + argc--; + argv++; + + ret = __bsearch_file_open(fname, max_size, block_size, &bfh, &reads); + if (ret != 0) { + perror("bsearch_file_open"); + return 1; + } + + __bsearch_file_info(bfh, &block_size, &max_size, &blockwise); + if (verbose_flag && blockwise) { + fprintf(stderr, "Using block-wise method with block size %lu and " + "cache size %lu\n", + (long unsigned)block_size, (long unsigned)max_size); + } else if (verbose_flag) { + fprintf(stderr, "Using whole-file method\n"); + } + + for (;;) { + loops = 0; /* reset stats */ + /* Eww */ + if (argc) { + key = *(argv++); + if (!key) + break; + } else { + if (!fgets(keybuf, sizeof (keybuf), stdin)) + break; + p = strchr(key, '\n'); + if (!p) + break; + *p = '\0'; + if (!*key) + continue; + } + ret = __bsearch_file(bfh, key, &value, &loc, &loops, &reads); + if (ret != 0) { + if (ret > 0) { + fprintf(stderr, "Error: %s\n", strerror(ret)); + __bsearch_file_close(&bfh); + return 1; + } + if (verbose_flag) + fprintf(stderr, "Key %s not found in %lu loops and %lu reads; " + "insert at %lu\n", key, (long unsigned)loops, + (long unsigned)reads, (long unsigned)loc); + failures++; + continue; + } + if (verbose_flag) + fprintf(stderr, "Key %s found at offset %lu in %lu loops and " + "%lu reads\n", key, (long unsigned)loc, + (long unsigned)loops, (long unsigned)reads); + if (print_keys_flag && !no_values_flag && value) + printf("%s %s\n", key, value); + else if (print_keys_flag) + printf("%s\n", key); + else if (no_values_flag && value) + printf("%s\n", value); + 
free(value); + } + if (failures) + return 2; + __bsearch_file_close(&bfh); + return 0; +} diff --git a/base/Makefile.am b/base/Makefile.am index f4960c3dc..7d22ed325 100644 --- a/base/Makefile.am +++ b/base/Makefile.am @@ -17,6 +17,7 @@ include_HEADERS = heimbase.h dist_libheimbase_la_SOURCES = \ array.c \ baselocl.h \ + bsearch.c \ bool.c \ data.c \ dict.c \ diff --git a/base/bsearch.c b/base/bsearch.c new file mode 100644 index 000000000..9fbd9026d --- /dev/null +++ b/base/bsearch.c @@ -0,0 +1,786 @@ +/* + * Copyright (c) 2011, Secure Endpoints Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include "baselocl.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This file contains functions for binary searching flat text in memory + * and in text files where each line is a [variable length] record. + * Each record has a key and an optional value separated from the key by + * unquoted whitespace. Whitespace in the key, and leading whitespace + * for the value, can be quoted with backslashes (but CR and LF must be + * quoted in such a way that they don't appear in the quoted result). + * + * Binary searching a tree is normally a dead simple algorithm. It + * turns out that binary searching flat text with *variable* length + * records is... tricky. There are no indexes to record beginning bytes, + * thus any index selected during the search is likely to fall in the + * middle of a record. When deciding to search a left sub-tree one + * might fail to find the last record in that sub-tree on account of the + * right boundary falling in the middle of it -- the chosen solution to + * this makes left sub-tree searches slightly less efficient than right + * sub-tree searches. + * + * If binary searching flat text in memory is tricky, using block-wise + * I/O instead is trickier! But it's necessary in order to support + * large files (which we either can't or wouldn't want to read or map + * into memory). Each block we read has to be large enough that the + * largest record can fit in it. And each block might start and/or end + * in the middle of a record. Here it is the right sub-tree searches + * that are less efficient than left sub-tree searches. + * + * bsearch_common() contains the common text block binary search code. + * + * __bsearch_text() is the interface for searching in-core text. + * __bsearch_file() is the interface for block-wise searching files.
/*
 * State kept for one file opened for binary searching; the functions in
 * this file reach it through a bsearch_file_handle.
 */
struct bsearch_file_handle {
    int fd;          /* file descriptor */
    char *cache;     /* cache bytes */
    char *page;      /* one double-size page worth of bytes */
    size_t file_sz;  /* file size */
    size_t cache_sz; /* cache size */
    size_t page_sz;  /* page size */
};

/*
 * Locate the start of a line in buf[0..right).
 *
 * Offset 0 is taken to be a line start and is returned as-is.  For any
 * other @i the result is the byte following the first '\n' found at or
 * after @i.  NULL is returned when there is no such '\n' before @right,
 * or when the '\n' is the last byte before @right (nothing follows it).
 */
static const char *
find_line(const char *buf, size_t i, size_t right)
{
    const char *eol;
    size_t next;

    if (i == 0)
        return buf;
    if (i >= right)
        return NULL;

    eol = memchr(buf + i, '\n', right - i);
    if (eol == NULL)
        return NULL;

    next = (size_t)(eol - buf) + 1;
    return next < right ? buf + next : NULL;
}

/**
 * Common routine for binary searching text in core.
 *
 * Perform a binary search of a char array containing a block from a
 * text file where each line is a record (LF and CRLF supported).  Each
 * record consists of a key followed by an optional value separated from
 * the key by whitespace.  Whitespace can be quoted with backslashes.
 * It's the caller's responsibility to encode/decode keys/values if
 * quoting is desired; newlines should be encoded such that a newline
 * does not appear in the result.
 *
 * All output arguments are optional.
 *
 * Returns 0 if key is found, -1 if not found, or an error code such as
 * ENOMEM in case of error.
 *
 * Inputs:
 *
 * @buf          String to search
 * @sz           Size of string to search
 * @key          Key string to search for
 * @buf_is_start True if the buffer starts with a record, false if it
 *               starts in the middle of a record or if the caller
 *               doesn't know.
 *
 * Outputs:
 *
 * @value        Location to store a copy of the value (caller must free)
 * @location     Record location if found else the location where the
 *               record should be inserted (index into @buf)
 * @cmp          Set to less than or greater than 0 to indicate that a
 *               key not found would have fit in an earlier or later
 *               part of a file.  Callers should use this to decide
 *               whether to read a block to the left or to the right and
 *               search that.
 */
+ * @loops Location to store a count of bisections required for + * search (useful for confirming logarithmic performance) + */ +static int +bsearch_common(const char *buf, size_t sz, const char *key, + int buf_is_start, char **value, size_t *location, + int *cmp, size_t *loops) +{ + const char *linep; + size_t key_start, key_len; /* key string in buf */ + size_t val_start, val_len; /* value string in buf */ + int key_cmp; + size_t k; + size_t l; /* left side of buffer for binary search */ + size_t r; /* right side of buffer for binary search */ + size_t rmax; /* right side of buffer for binary search */ + size_t i; /* index into buffer, typically in the middle of l and r */ + size_t loop_count = 0; + int ret = -1; + + if (value) + *value = NULL; + if (cmp) + *cmp = 0; + if (loops) + *loops = 0; + + /* Binary search; file should be sorted */ + for (l = 0, r = rmax = sz, i = sz >> 1; i >= l && i < rmax; loop_count++) { + heim_assert(i >= 0 && i < sz, "invalid aname2lname db index"); + + /* buf[i] is likely in the middle of a line; find the next line */ + linep = find_line(buf, i, rmax); + k = linep ? linep - buf : i; + if (linep == NULL || k >= rmax) { + /* + * No new line found to the right; search to the left then + * but don't change rmax (this isn't optimal, but it's + * simple). 
+ */ + if (i == l) + break; + r = i; + i = l + ((r - l) >> 1); + continue; + } + i = k; + heim_assert(i >= l && i < rmax, "invalid aname2lname db index"); + + /* Got a line; check it */ + + /* Search for and split on unquoted whitespace */ + for (key_start = i, key_len = 0, val_len = 0, k = i; k < rmax; k++) { + if (buf[k] == '\\') { + k++; + continue; + } + if (buf[k] == '\r' || buf[k] == '\n') { + /* We now know where the key ends, and there's no value */ + key_len = k - i; + break; + } + if (!isspace(buf[k])) + continue; + + while (k < rmax && isspace(buf[k])) { + key_len = k - i; + k++; + } + if (k < rmax) + val_start = k; + /* Find end of value */ + for (; k < rmax && buf[k] != '\0'; k++) { + if (buf[k] == '\r' || buf[k] == '\n') { + val_len = k - val_start; + break; + } + } + break; + } + + /* + * The following logic is for dealing with partial buffers, + * which we use for block-wise binary searches of large files + */ + if (key_start == 0 && !buf_is_start) { + /* + * We're at the beginning of a block that might have started + * in the middle of a record whose "key" might well compare + * as greater than the key we're looking for, so we don't + * bother comparing -- we know key_cmp must be -1 here. + */ + key_cmp = -1; + break; + } + if ((val_len && buf[val_start + val_len] != '\n') || + (!val_len && buf[key_start + key_len] != '\n')) { + /* + * We're at the end of a block that ends in the middle of a + * record whose "key" might well compare as less than the + * key we're looking for, so we don't bother comparing -- we + * know key_cmp must be >= 0 but we can't tell. Our caller + * will end up reading a double-size block to handle this. 
+ */ + key_cmp = 1; + break; + } + + key_cmp = strncmp(key, &buf[key_start], key_len); + if (key_cmp == 0 && strlen(key) != key_len) + key_cmp = 1; + if (key_cmp < 0) { + /* search left */ + r = rmax = (linep - buf); + i = l + ((r - l) >> 1); + if (location) + *location = key_start; + } else if (key_cmp > 0) { + /* search right */ + if (l == i) + break; /* not found */ + l = i; + i = l + ((r - l) >> 1); + if (location) + *location = val_start + val_len; + } else { + /* match! */ + if (location) + *location = key_start; + ret = 0; + if (val_len && value) { + *value = strndup(&buf[val_start], val_len); + if (!*value) + ret = errno; + } + break; + } + } + + if (cmp) + *cmp = key_cmp; + if (loops) + *loops = loop_count; + + return ret; +} + +/** + * Binary search a char array containing sorted text records separated + * by new-lines (or CRLF). Each record consists of a key and an + * optional value following the key, separated from the key by unquoted + * whitespace. + * + * All output arguments are optional. + * + * Returns 0 if key is found, -1 if not found, or an error code such as + * ENOMEM in case of error. + * + * Inputs: + * + * @buf Char array pointer + * @buf_sz Size of buf + * @key Key to search for + * + * Outputs: + * + * @value Location where to put the value, if any (caller must free) + * @location Record location if found else the location where the record + * should be inserted (index into @buf) + * @loops Location where to put a number of loops (or comparisons) + * needed for the search (useful for benchmarking) + */ +int +__bsearch_text(const char *buf, size_t buf_sz, const char *key, + char **value, size_t *location, size_t *loops) +{ + return bsearch_common(buf, buf_sz, key, 1, value, location, NULL, loops); +} + +#define MAX_BLOCK_SIZE (1024 * 1024) +#define DEFAULT_MAX_FILE_SIZE (1024 * 1024) +/** + * Open a file for binary searching. 
The file will be read in entirely + * if it is smaller than @max_sz, else a cache of @max_sz bytes will be + * allocated. + * + * Returns 0 on success, else an error number or -1 if the file is empty. + * + * Inputs: + * + * @fname Name of file to open + * @max_sz Maximum size of cache to allocate, in bytes (if zero, default) + * @page_sz Page size (must be a power of two, larger than 256, smaller + * than 1MB; if zero use default) + * + * Outputs: + * + * @bfh Handle for use with __bsearch_file() and __bsearch_file_close() + * @reads Number of reads performed + */ +int +__bsearch_file_open(const char *fname, size_t max_sz, size_t page_sz, + bsearch_file_handle *bfh, size_t *reads) +{ + bsearch_file_handle new_bfh; + struct stat st; + size_t i; + int fd; + int ret; + + *bfh = NULL; + + if (reads) + *reads = 0; + + fd = open(fname, O_RDONLY); + if (fd == -1) + return errno; + + if (fstat(fd, &st) == -1) { + ret = errno; + goto err; + } + + if (st.st_size == 0) { + ret = -1; /* no data -> no binary search */ + goto err; + } + + /* Validate / default arguments */ + if (max_sz == 0) + max_sz = DEFAULT_MAX_FILE_SIZE; + for (i = page_sz; i; i >>= 1) { + /* Make sure page_sz is a power of two */ + if ((i % 2) && (i >> 1)) { + page_sz = 0; + break; + } + } + if (page_sz == 0) + page_sz = st.st_blksize; + for (i = page_sz; i; i >>= 1) { + /* Make sure page_sz is a power of two */ + if ((i % 2) && (i >> 1)) { + /* Can't happen! Filesystems always use powers of two! 
*/ + page_sz = 4096; + break; + } + } + if (page_sz > MAX_BLOCK_SIZE) + page_sz = MAX_BLOCK_SIZE; + + new_bfh = calloc(1, sizeof (*new_bfh)); + if (new_bfh == NULL) { + ret = ENOMEM; + goto err; + } + + new_bfh->fd = fd; + new_bfh->page_sz = page_sz; + new_bfh->file_sz = st.st_size; + + if (max_sz >= st.st_size) { + /* Whole-file method */ + new_bfh->cache = malloc(st.st_size + 1); + if (new_bfh->cache) { + new_bfh->cache[st.st_size] = '\0'; + new_bfh->cache_sz = st.st_size; + ret = read(fd, new_bfh->cache, st.st_size); + if (ret < 0) { + ret = errno; + goto err; + } + if (ret != st.st_size) { + ret = EIO; /* XXX ??? */ + goto err; + } + if (reads) + *reads = 1; + (void) close(fd); + new_bfh->fd = -1; + *bfh = new_bfh; + return 0; + } + } + + /* Block-size method, or above malloc() failed */ + new_bfh->page = malloc(new_bfh->page_sz << 1); + if (new_bfh->page == NULL) { + /* Can't even allocate a single double-size page! */ + ret = ENOMEM; + goto err; + } + + new_bfh->cache_sz = max_sz < st.st_size ? max_sz : st.st_size; + new_bfh->cache = malloc(new_bfh->cache_sz); + *bfh = new_bfh; + + /* + * malloc() may have failed because we were asking for a lot of + * memory, but we may still be able to operate without a cache, + * so let's not fail. + */ + if (new_bfh->cache == NULL) { + new_bfh->cache_sz = 0; + return 0; + } + + /* Initialize cache */ + for (i = 0; i < new_bfh->cache_sz; i += new_bfh->page_sz) + new_bfh->cache[i] = '\0'; + return 0; + +err: + (void) close(fd); + if (new_bfh) { + free(new_bfh->page); + free(new_bfh->cache); + free(new_bfh); + } + return ret; +} + +/** + * Indicate whether the given binary search file handle will be searched + * with block-wise method. 
+ */ +void +__bsearch_file_info(bsearch_file_handle bfh, + size_t *page_sz, size_t *max_sz, int *blockwise) +{ + if (page_sz) + *page_sz = bfh->page_sz; + if (max_sz) + *max_sz = bfh->cache_sz; + if (blockwise) + *blockwise = (bfh->file_sz != bfh->cache_sz); +} + +/** + * Close the given binary file search handle. + * + * Inputs: + * + * @bfh Pointer to variable containing handle to close. + */ +void +__bsearch_file_close(bsearch_file_handle *bfh) +{ + if (!*bfh) + return; + if ((*bfh)->fd >= 0) + (void) close((*bfh)->fd); + if ((*bfh)->page) + free((*bfh)->page); + if ((*bfh)->cache) + free((*bfh)->cache); + free(*bfh); + *bfh = NULL; +} + +/** + * Private function to get a page from a cache. The cache is a char + * array of 2^n - 1 double-size page worth of bytes, where n is the + * number of tree levels that the cache stores. The cache can be + * smaller than n implies. + * + * The page may or may not be valid. If the first byte of it is NUL + * then it's not valid, else it is. + * + * Returns 1 if page is in cache and valid, 0 if the cache is too small + * or the page is invalid. The page address is output in @buf if the + * cache is large enough to contain it regardless of whether the page is + * valid. + * + * Inputs: + * + * @bfh Binary search file handle + * @level Level in the tree that we want a page for + * @page_idx Page number in the given level (0..2^level - 1) + * + * Outputs: + * + * @buf Set to address of page if the cache is large enough + */ +static int +get_page_from_cache(bsearch_file_handle bfh, size_t level, size_t page_idx, + char **buf) +{ + size_t idx = 0; + size_t page_sz; + + page_sz = bfh->page_sz << 1; /* we use double-size pages in the cache */ + + *buf = NULL; + + /* + * Compute index into cache. The cache is basically an array of + * double-size pages. The first (zeroth) double-size page in the + * cache will be the middle page of the file -- the root of the + * tree. 
The next two double-size pages will be the left and right + * pages of the second level in the tree. The next four double-size + * pages will be the four pages at the next level. And so on for as + * many pages as fit in the cache. + * + * The page index is the number of the page at the given level. We + * then compute (2^level - 1 + page index) * 2page size, check that + * we have that in the cache, check that the page has been read (it + * doesn't start with NUL). + */ + if (level) + idx = (1 << level) - 1 + page_idx; + if (((idx + 1) * page_sz * 2) > bfh->cache_sz) + return 0; + + *buf = &bfh->cache[idx * page_sz * 2]; + if (bfh->cache[idx * page_sz * 2] == '\0') + return 0; /* cache[idx] == NUL -> page not loaded in cache */ + return 1; +} + +/** + * Private function to read a page of @page_sz from @fd at offset @off + * into @buf, outputing the number of bytes read, which will be the same + * as @page_sz unless the page being read is the last page, in which + * case the number of remaining bytes in the file will be output. + * + * Returns 0 on success or an errno value otherwise (EIO if reads are + * short). 
+ * + * Inputs: + * + * @bfh Binary search file handle + * @level Level in the binary search tree that we're at + * @page_idx Page "index" at the @level of the tree that we want + * @page Actual page number that we want + * want_double Whether we need a page or double page read + * + * Outputs: + * + * @buf Page read or cached + * @bytes Bytes read (may be less than page or double page size in + * the case of the last page, of course) + */ +static int +read_page(bsearch_file_handle bfh, size_t level, size_t page_idx, size_t page, + int want_double, const char **buf, size_t *bytes) +{ + int ret; + off_t off; + size_t expected; + size_t wanted; + char *page_buf; + + /* Figure out where we're reading and how much */ + off = page * bfh->page_sz; + if (off < 0) + return EOVERFLOW; + + wanted = bfh->page_sz << want_double; + expected = ((bfh->file_sz - off) > wanted) ? wanted : bfh->file_sz - off; + + if (get_page_from_cache(bfh, level, page_idx, &page_buf)) { + *buf = page_buf; + *bytes = expected; + return 0; /* found in cache */ + } + + + *bytes = 0; + *buf = NULL; + + /* OK, we have to read a page or double-size page */ + + if (page_buf) + want_double = 1; /* we'll be caching; we cache double-size pages */ + else + page_buf = bfh->page; /* we won't cache this page */ + + wanted = bfh->page_sz << want_double; + expected = ((bfh->file_sz - off) > wanted) ? wanted : bfh->file_sz - off; + +#ifdef HAVE_PREAD + ret = pread(bfh->fd, page_buf, expected, off); +#else + if (lseek(bfh->fd, off, SEEK_SET) == (off_t)-1) + return errno; + ret = read(bfh->fd, page_buf, expected); +#endif + if (ret < 0) + return errno; + + if (ret != expected) + return EIO; /* XXX ??? */ + + *buf = page_buf; + *bytes = expected; + return 0; +} + +/** + * Perform a binary search of a file where each line is a record (LF and + * CRLF supported). Each record consists of a key followed by an + * optional value separated from the key by whitespace. Whitespace can + * be quoted with backslashes. 
It's the caller's responsibility to
+ * encode/decode keys/values if quoting is desired; newlines should be
+ * encoded such that a newline does not appear in the result.
+ *
+ * If the handle's cache holds the whole file (file size equals cache
+ * size) the search is done in memory with __bsearch_text().  Otherwise
+ * the search is done with block-wise I/O (i.e., the whole file is not
+ * read into memory): pages are bisected, and each page fetched with
+ * read_page() is searched with bsearch_common().
+ *
+ * All output arguments are optional.
+ *
+ * Returns 0 if key is found, -1 if not found, or an error code such as
+ * ENOMEM in case of error.
+ *
+ * Inputs:
+ *
+ * @bfh          Binary search file handle (file to search)
+ * @key          Key string to search for
+ *
+ * Outputs:
+ *
+ * @value        Location to store a copy of the value (caller must free)
+ * @location     Record location if found else the location where the
+ *               record should be inserted (byte offset into the file
+ *               for the block-wise search)
+ * @loops        Location to store a count of bisections required for
+ *               search (useful for confirming logarithmic performance)
+ * @reads        Location to store a count of pages read during search
+ *               (useful for confirming logarithmic performance)
+ */
+int
+__bsearch_file(bsearch_file_handle bfh, const char *key,
+               char **value, size_t *location, size_t *loops, size_t *reads)
+{
+    int ret;
+    const char *buf;
+    size_t buf_sz;
+    size_t page, l, r;
+    size_t my_reads = 0;
+    size_t my_loops_total = 0;
+    size_t my_loops;
+    size_t level;        /* level in the tree */
+    size_t page_idx = 0; /* page number in the tree level */
+    size_t buf_location;
+    int cmp;
+    int buf_ends_in_eol = 0;
+    int buf_is_start = 0;
+
+    if (reads)
+        *reads = 0;
+
+    /* If whole file is in memory then search that and we're done */
+    if (bfh->file_sz == bfh->cache_sz)
+        return __bsearch_text(bfh->cache, bfh->cache_sz, key, value,
+                              location, loops);
+
+    /* Else block-wise binary search */
+
+    if (value)
+        *value = NULL;
+    if (loops)
+        *loops = 0;
+
+    l = 0;
+    r = (bfh->file_sz / bfh->page_sz) + 1;
+    for (level = 0, page = r >> 1; page >= l && page < r ; level++) {
+        ret = read_page(bfh, level, page_idx, page, 0, &buf, &buf_sz);
+        if (ret != 0)
+            return ret;
+        my_reads++;
+
+        if (buf_sz == 0) {
+            /*
+             * When the file size is an exact multiple of the page size
+             * the last page number starts at EOF and read_page() hands
+             * back zero bytes.  There is nothing to examine (and
+             * buf[buf_sz - 1] would be out of bounds), so note EOF as
+             * the insertion point and search left.
+             */
+            if (reads)
+                *reads = my_reads;
+            if (location)
+                *location = page * bfh->page_sz;
+            page_idx <<= 1;
+            r = page;
+            page = l + ((r - l) >> 1);
+            continue;
+        }
+
+        if (buf[buf_sz - 1] == '\r' || buf[buf_sz - 1] == '\n')
+            buf_ends_in_eol = 1;
+        else
+            buf_ends_in_eol = 0;
+
+        buf_is_start = page == 0 ? 1 : 0;
+        ret = bsearch_common(buf, buf_sz, key, buf_is_start,
+                             value, &buf_location, &cmp, &my_loops);
+        if (ret > 0)
+            return ret;
+        /* Found or no we update stats */
+        my_loops_total += my_loops;
+        if (loops)
+            *loops = my_loops_total;
+        if (reads)
+            *reads = my_reads;
+        if (location)
+            *location = page * bfh->page_sz + buf_location;
+        if (ret == 0)
+            return 0; /* found! */
+        /* Not found */
+        if (cmp < 0) {
+            /* Search left */
+            page_idx <<= 1;
+            r = page;
+            page = l + ((r - l) >> 1);
+            continue;
+        } else {
+            /*
+             * Search right, but first search the current and next
+             * blocks in case that the record we're looking for either
+             * straddles the boundary between this and the next record,
+             * or in case the record starts exactly at the next page.
+             */
+            heim_assert(cmp > 0, "cmp > 0");
+
+            if (!buf_ends_in_eol || page == l || page == (r - 1)) {
+                ret = read_page(bfh, level, page_idx, page, 1, &buf, &buf_sz);
+                if (ret != 0)
+                    return ret;
+                my_reads++;
+
+                /*
+                 * NOTE(review): the single-page pass above uses
+                 * page == 0 here while this pass uses page == l;
+                 * confirm against bsearch_common() which is intended.
+                 */
+                buf_is_start = page == l ? 1 : 0;
+
+                ret = bsearch_common(buf, buf_sz, key, buf_is_start,
+                                     value, &buf_location, &cmp, &my_loops);
+                if (ret > 0)
+                    return ret;
+                my_loops_total += my_loops;
+                if (loops)
+                    *loops = my_loops_total;
+                if (reads)
+                    *reads = my_reads;
+                if (location)
+                    *location = page * bfh->page_sz + buf_location;
+                if (ret == 0)
+                    return 0;
+            }
+
+            /* Oh well, search right */
+            if (l == page && r == (l + 1))
+                break;
+            page_idx = (page_idx << 1) + 1;
+            l = page;
+            page = l + ((r - l) >> 1);
+            continue;
+        }
+    }
+    return -1;
+}
+
diff --git a/base/heimbase.h b/base/heimbase.h index 685c5c57d..6c7bb2b5c 100644 --- a/base/heimbase.h +++ b/base/heimbase.h @@ -235,5 +235,20 @@ const void * heim_data_get_ptr(heim_data_t); size_t heim_data_get_length(heim_data_t); +/* + * Binary search. + * + * Note: these are private until integrated into the heimbase object system. + */ +typedef struct bsearch_file_handle *bsearch_file_handle; +int __bsearch_text(const char *buf, size_t buf_sz, const char *key, + char **value, size_t *location, size_t *loops); +int __bsearch_file_open(const char *fname, size_t max_sz, size_t page_sz, + bsearch_file_handle *bfh, size_t *reads); +int __bsearch_file(bsearch_file_handle bfh, const char *key, char **value, + size_t *location, size_t *loops, size_t *reads); +void __bsearch_file_info(bsearch_file_handle bfh, size_t *page_sz, + size_t *max_sz, int *blockwise); +void __bsearch_file_close(bsearch_file_handle *bfh); #endif /* HEIM_BASE_H */ diff --git a/base/version-script.map b/base/version-script.map index 964ac37cd..0dbff7340 100644 --- a/base/version-script.map +++ b/base/version-script.map @@ -39,6 +39,11 @@ HEIMDAL_BASE_1.0 { heim_string_create_with_bytes; heim_string_get_type_id; heim_string_get_utf8; + __bsearch_text; + __bsearch_file_open; + __bsearch_file; + __bsearch_file_info; + __bsearch_file_close; local: *; }; diff --git a/configure.ac b/configure.ac index 4c21e508f..f6d5c22d2 100644 --- a/configure.ac +++ b/configure.ac @@ -637,6
+637,7 @@ AC_CONFIG_FILES(Makefile \ kdc/Makefile \ appl/Makefile \ appl/afsutil/Makefile \ + appl/dbutils/Makefile \ appl/ftp/Makefile \ appl/ftp/common/Makefile \ appl/ftp/ftp/Makefile \