From 5716b51c184b10600955f3a75dd1b396d06cdace Mon Sep 17 00:00:00 2001 From: Nicolas Williams Date: Thu, 8 Oct 2020 20:58:17 -0500 Subject: [PATCH] hdb: Fix HDB corruption with LMDB backend --- lib/hdb/hdb-mdb.c | 520 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 397 insertions(+), 123 deletions(-) diff --git a/lib/hdb/hdb-mdb.c b/lib/hdb/hdb-mdb.c index 0450d2742..0342d8cdb 100644 --- a/lib/hdb/hdb-mdb.c +++ b/lib/hdb/hdb-mdb.c @@ -42,13 +42,289 @@ #define KILO 1024 +#define E(sym, kret) case sym: ret = kret; ename = #sym; break + +/* Note: calls krb5_set_error_message() */ +static krb5_error_code +mdb2krb5_code(krb5_context context, int code) +{ + krb5_error_code ret = 0; + const char *ename = "UNKNOWN"; + const char *estr = mdb_strerror(code); + + switch (code) { + case MDB_SUCCESS: return 0; + E(MDB_KEYEXIST, HDB_ERR_EXISTS); + E(MDB_NOTFOUND, HDB_ERR_NOENTRY); + E(MDB_PAGE_NOTFOUND, HDB_ERR_UK_SERROR); + E(MDB_CORRUPTED, HDB_ERR_UK_SERROR); + E(MDB_PANIC, HDB_ERR_UK_SERROR); + E(MDB_VERSION_MISMATCH, HDB_ERR_UK_SERROR); + E(MDB_INVALID, HDB_ERR_UK_SERROR); + E(MDB_MAP_FULL, HDB_ERR_UK_SERROR); + E(MDB_DBS_FULL, HDB_ERR_UK_SERROR); + E(MDB_READERS_FULL, HDB_ERR_UK_SERROR); + E(MDB_TLS_FULL, HDB_ERR_UK_SERROR); + E(MDB_TXN_FULL, HDB_ERR_UK_SERROR); + E(MDB_CURSOR_FULL, HDB_ERR_UK_SERROR); + E(MDB_PAGE_FULL, HDB_ERR_UK_SERROR); + E(MDB_MAP_RESIZED, HDB_ERR_UK_SERROR); + E(MDB_INCOMPATIBLE, HDB_ERR_UK_SERROR); + E(MDB_BAD_RSLOT, HDB_ERR_UK_SERROR); + E(MDB_BAD_TXN, HDB_ERR_UK_SERROR); + E(MDB_BAD_VALSIZE, HDB_ERR_UK_SERROR); + E(MDB_BAD_DBI, HDB_ERR_UK_SERROR); + default: + if (code > 0 && code < 100) + ret = code; + else + ret = HDB_ERR_UK_SERROR; + break; + } + if (ret) + krb5_set_error_message(context, ret, "MDB error %s (%d): %s", + ename, code, estr); + return ret; +} + typedef struct mdb_info { MDB_env *e; MDB_txn *t; MDB_dbi d; MDB_cursor *c; + int oflags; + mode_t mode; + size_t mapsize; + unsigned int in_tx:1; } mdb_info; +/* See below */ +struct keep_it_open { + char *path; + MDB_env *env; + MDB_dbi d; + unsigned int oflags; + size_t refs; + size_t mapsize; + unsigned int valid:1; + struct keep_it_open *next; +} *keep_them_open; +HEIMDAL_MUTEX keep_them_open_lock = HEIMDAL_MUTEX_INITIALIZER; + +/* + * On Unix LMDB uses fcntl() byte-range locks, and unlike SQLite3 (which also + * uses fcntl() byte-range locks) LMDB takes no precautions to avoid early + * first-close()s that cause other threads' locks to get dropped. No, LMDB + * requires the caller to take such precautions. For us that means opening one + * mdb env per-{HDB, mode} (where mode is read-write or read-only), never + * closing it, and sharing it with all threads. + * + * Sharing an MDB_env * across multiple threads is documented to be safe, and + * internally LMDB uses pread(2), pwrite(2), and mmap(2) for I/O, using + * read(2)/write(2) only in the DB copy routines that we don't use. + * + * On WIN32 we don't have to do any of this, however, to avoid ifdef spaghetti, + * we share this code on all platforms, even if it isn't strictly needed. + * + * Also, one must call mdb_open() (aka mdb_dbi_open()) only once per call to + * mdb_env_open() and per B-tree. We only use one B-tree in each LMDB: the + * main one. + * + * On success this outputs an `MDB_env *' (the handle for the LMDB) and an + * `MDB_dbi' (the handle for the main B-tree in the LMDB). + * + * ALSO, LMDB requires that we re-open the `MDB_env' when the database grows + * larger than the mmap size. We handle this by finding in `keep_them_open' + * the env we already have, marking it unusable, and the finding some other + * better one or opening a new one and adding it to the list. + */ +static krb5_error_code +my_mdb_env_create_and_open(krb5_context context, + mdb_info *mi, + const char *path, + int mapfull) +{ + struct keep_it_open *p, *n; + MDB_txn *txn = NULL; + unsigned int flags = MDB_NOSUBDIR; + struct stat st; + size_t mapsize = 0; + int max_readers; + int locked = 0; + int code = 0; + + mi->oflags &= O_ACCMODE; + flags |= (mi->oflags == O_RDONLY) ? MDB_RDONLY : 0; + + mi->e = NULL; + + /* + * Allocate a new object, in case we don't already have one in + * `keep_them_open'; if we don't need it, we'll free it. This way we do + * some of the work of creating one while not holding a lock. + */ + if ((n = calloc(1, sizeof(*n))) == NULL || + (n->path = strdup(path)) == NULL) { + free(n); + return krb5_enomem(context); + } + n->oflags = mi->oflags; + + max_readers = krb5_config_get_int_default(context, NULL, 0, "kdc", + "hdb-mdb-maxreaders", NULL); + mapsize = krb5_config_get_int_default(context, NULL, 0, "kdc", "hdb-mdb-mapsize", + NULL); + if (mapsize > INT_MAX) + mapsize = 0; + + memset(&st, 0, sizeof(st)); + if (stat(path, &st) == 0 && st.st_size > mapsize * KILO) + mapsize += (st.st_size + (st.st_size >> 2)) / KILO; + if (mapsize < 100 * 1024) + mapsize = 100 * 1024; /* 100MB */ + if (mapsize < mi->mapsize) + mapsize = mi->mapsize; + if (mapfull) + mapsize += 10 * 1024; + if ((code = mdb_env_create(&n->env)) || + (max_readers && (code = mdb_env_set_maxreaders(n->env, max_readers)))) + goto out; + + /* Look for an existing env */ + HEIMDAL_MUTEX_lock(&keep_them_open_lock); + locked = 1; + for (p = keep_them_open; p; p = p->next) { + if (strcmp(p->path, path)) + continue; + if (p->mapsize > mapsize) + /* Always increase mapsize */ + mapsize = p->mapsize + (p->mapsize >> 1); + if (!p->valid || p->oflags != mi->oflags) + continue; + /* Found one; output it and get out */ + mi->e = p->env; + mi->d = p->d; + p->refs++; + goto out; + } + + /* Did not find one, so open and add this one to the list */ + + /* Open the LMDB itself */ + n->refs = 1; + n->valid = 1; + krb5_debug(context, 5, "Opening HDB LMDB %s with mapsize %llu", + path, (unsigned long long)mapsize * KILO); + code = mdb_env_set_mapsize(n->env, mapsize * KILO); + if (code == 0) + code = mdb_env_open(n->env, path, flags, mi->mode); + if (code == 0) + /* Open a transaction so we can resolve the main B-tree */ + code = mdb_txn_begin(n->env, NULL, MDB_RDONLY, &txn); + if (code == 0) + /* Resolve the main B-tree */ + code = mdb_open(txn, NULL, 0, &n->d); + if (code) + goto out; + + /* Successfully opened the LMDB; output the two handles */ + mi->mapsize = n->mapsize = mapsize; + mi->e = n->env; + mi->d = n->d; + + /* Add this keep_it_open to the front of the list */ + n->next = keep_them_open; + keep_them_open = n; + n = NULL; + +out: + if (locked) + HEIMDAL_MUTEX_unlock(&keep_them_open_lock); + if (n) { + if (n->env) + mdb_env_close(n->env); + free(n->path); + free(n); + } + (void) mdb_txn_commit(txn); /* Safe when `txn == NULL' */ + return mdb2krb5_code(context, code); +} + +static void +my_mdb_env_close(krb5_context context, + const char *db_name, + MDB_env **envp) +{ + struct keep_it_open **prev; + struct keep_it_open *p, *old; + size_t refs_seen = 0; + size_t slen = strlen(db_name); + MDB_env *env = *envp; + + if (env == NULL) + return; + + HEIMDAL_MUTEX_lock(&keep_them_open_lock); + for (p = keep_them_open; p; p = p->next) { + /* + * We can have multiple open ones and we need to know if this is the + * last one, so we can't break out early. + */ + if (p->env == env) + refs_seen += (--(p->refs)); + else if (strncmp(db_name, p->path, slen) == 0 && + strcmp(p->path + slen, ".mdb") == 0) + refs_seen += p->refs; + } + krb5_debug(context, 6, "Closing HDB LMDB %s / %p; refs %llu", db_name, env, + (unsigned long long)refs_seen); + prev = &keep_them_open; + for (p = keep_them_open; !refs_seen && p; ) { + /* We're the last close */ + if (p->refs || + strncmp(db_name, p->path, slen) || + strcmp(p->path + slen, ".mdb")) { + + /* Not us; this keep_it_open stays */ + prev = &p->next; + p = p->next; + continue; + } + + /* Close and remove this one */ + krb5_debug(context, 6, "Closing HDB LMDB %s (mapsize was %llu)", + db_name, (unsigned long long)p->mapsize * KILO); + old = p; + *prev = (p = p->next); /* prev stays */ + mdb_env_close(old->env); + free(old->path); + free(old); + } + HEIMDAL_MUTEX_unlock(&keep_them_open_lock); +} + +/* + * This is a wrapper around my_mdb_env_create_and_open(). It may close an + * existing MDB_env in mi->e if it's there. If we need to reopen because the + * MDB grew too much, then we call this. + */ +static krb5_error_code +my_reopen_mdb(krb5_context context, HDB *db, int mapfull) +{ + mdb_info *mi = (mdb_info *)db->hdb_db; + char *fn; + krb5_error_code ret = 0; + + /* No-op if we don't have an open one */ + my_mdb_env_close(context, db->hdb_name, &mi->e); + if (asprintf(&fn, "%s.mdb", db->hdb_name) == -1) + ret = krb5_enomem(context); + if (ret == 0) + ret = my_mdb_env_create_and_open(context, mi, fn, mapfull); + free(fn); + return ret; +} + static krb5_error_code DB_close(krb5_context context, HDB *db) { @@ -56,7 +332,7 @@ DB_close(krb5_context context, HDB *db) mdb_cursor_close(mi->c); mdb_txn_abort(mi->t); - mdb_env_close(mi->e); + my_mdb_env_close(context, db->hdb_name, &mi->e); mi->c = 0; mi->t = 0; mi->e = 0; @@ -114,13 +390,16 @@ DB_seq(krb5_context context, HDB *db, krb5_data key_data, data; int code; + /* + * No need to worry about MDB_MAP_FULL when we're scanning the DB since we + * have snapshot semantics, and any DB growth from other transactions + * should not affect us. + */ key.mv_size = 0; value.mv_size = 0; code = mdb_cursor_get(mi->c, &key, &value, flag); - if (code == MDB_NOTFOUND) - return HDB_ERR_NOENTRY; if (code) - return code; + return mdb2krb5_code(context, code); key_data.data = key.mv_data; key_data.length = key.mv_size; @@ -151,22 +430,34 @@ DB_seq(krb5_context context, HDB *db, static krb5_error_code DB_firstkey(krb5_context context, HDB *db, unsigned flags, hdb_entry_ex *entry) { + krb5_error_code ret = 0; mdb_info *mi = db->hdb_db; - int code; + int tries = 3; + int code = 0; /* Always start with a fresh cursor to pick up latest DB state */ - if (mi->t) - mdb_txn_abort(mi->t); - code = mdb_txn_begin(mi->e, NULL, MDB_RDONLY, &mi->t); - if (code) - return code; + do { + if (mi->t) + mdb_txn_abort(mi->t); + mi->t = NULL; + if (code) + code = my_reopen_mdb(context, db, 1); + if (code == 0) + code = mdb_txn_begin(mi->e, NULL, MDB_RDONLY, &mi->t); + if (code == 0) + code = mdb_cursor_open(mi->t, mi->d, &mi->c); + if (code == 0) { + ret = DB_seq(context, db, flags, entry, MDB_FIRST); + break; + } + } while (code == MDB_MAP_FULL && --tries > 0); - code = mdb_cursor_open(mi->t, mi->d, &mi->c); - if (code) - return code; - - return DB_seq(context, db, flags, entry, MDB_FIRST); + if (code || ret) { + mdb_txn_abort(mi->t); + mi->t = NULL; + } + return ret ? ret : mdb2krb5_code(context, code); } @@ -207,24 +498,34 @@ static krb5_error_code DB__get(krb5_context context, HDB *db, krb5_data key, krb5_data *reply) { mdb_info *mi = (mdb_info*)db->hdb_db; - MDB_txn *txn; + MDB_txn *txn = NULL; MDB_val k, v; - int code; + int tries = 3; + int code = 0; k.mv_data = key.data; k.mv_size = key.length; - code = mdb_txn_begin(mi->e, NULL, MDB_RDONLY, &txn); - if (code) - return code; + do { + if (txn) { + mdb_txn_abort(txn); + txn = NULL; + } + if (code) + code = my_reopen_mdb(context, db, 1); + if (code == 0) + code = mdb_txn_begin(mi->e, NULL, MDB_RDONLY, &txn); + if (code == 0) + code = mdb_get(txn, mi->d, &k, &v); + if (code == 0) + krb5_data_copy(reply, v.mv_data, v.mv_size); + } while (code == MDB_MAP_FULL && --tries > 0); - code = mdb_get(txn, mi->d, &k, &v); - if (code == 0) - krb5_data_copy(reply, v.mv_data, v.mv_size); - mdb_txn_abort(txn); - if(code == MDB_NOTFOUND) - return HDB_ERR_NOENTRY; - return code; + if (code) + mdb_txn_abort(txn); + else + (void) mdb_txn_commit(txn); /* Empty transaction? -> commit */ + return mdb2krb5_code(context, code); } static krb5_error_code @@ -232,137 +533,110 @@ DB__put(krb5_context context, HDB *db, int replace, krb5_data key, krb5_data value) { mdb_info *mi = (mdb_info*)db->hdb_db; - MDB_txn *txn; + MDB_txn *txn = NULL; MDB_val k, v; - int code; + int tries = 3; + int code = 0; k.mv_data = key.data; k.mv_size = key.length; v.mv_data = value.data; v.mv_size = value.length; - code = mdb_txn_begin(mi->e, NULL, 0, &txn); - if (code) - return code; - - code = mdb_put(txn, mi->d, &k, &v, replace ? 0 : MDB_NOOVERWRITE); - if (code) - mdb_txn_abort(txn); - else - code = mdb_txn_commit(txn); - /* - * No need to call mdb_env_sync(); it's done automatically if MDB_NOSYNC is - * not set. - */ - if(code == MDB_KEYEXIST) - return HDB_ERR_EXISTS; - return code; + do { + if (txn) { + mdb_txn_abort(txn); + txn = NULL; + } + if (code) + code = my_reopen_mdb(context, db, 1); + if (code == 0) + code = mdb_txn_begin(mi->e, NULL, 0, &txn); + if (code == 0) + code = mdb_put(txn, mi->d, &k, &v, replace ? 0 : MDB_NOOVERWRITE); + if (code == 0) { + /* + * No need to call mdb_env_sync(); it's done automatically if + * MDB_NOSYNC is not set. + */ + code = mdb_txn_commit(txn); + txn = NULL; + } + } while (code == MDB_MAP_FULL && --tries > 0); + if (txn) + mdb_txn_abort(txn); + return mdb2krb5_code(context, code); } static krb5_error_code DB__del(krb5_context context, HDB *db, krb5_data key) { mdb_info *mi = (mdb_info*)db->hdb_db; - MDB_txn *txn; + MDB_txn *txn = NULL; MDB_val k; - krb5_error_code code; + int tries = 3; + int code = 0; k.mv_data = key.data; k.mv_size = key.length; - code = mdb_txn_begin(mi->e, NULL, 0, &txn); - if (code) - return code; + do { + if (txn) { + mdb_txn_abort(txn); + txn = NULL; + } + if (code) + code = my_reopen_mdb(context, db, 1); + if (code == 0) + code = mdb_txn_begin(mi->e, NULL, 0, &txn); + if (code == 0) + code = mdb_del(txn, mi->d, &k, NULL); + if (code == 0) { + /* + * No need to call mdb_env_sync(); it's done automatically if + * MDB_NOSYNC is not set. + */ + code = mdb_txn_commit(txn); + txn = NULL; + } + } while (code == MDB_MAP_FULL && --tries > 0); - code = mdb_del(txn, mi->d, &k, NULL); - if (code) - mdb_txn_abort(txn); - else - code = mdb_txn_commit(txn); - /* - * No need to call mdb_env_sync(); it's done automatically if MDB_NOSYNC is - * not set. - */ - if(code == MDB_NOTFOUND) - return HDB_ERR_NOENTRY; - return code; + if (txn) + mdb_txn_abort(txn); + return mdb2krb5_code(context, code); } static krb5_error_code -DB_open(krb5_context context, HDB *db, int flags, mode_t mode) +DB_open(krb5_context context, HDB *db, int oflags, mode_t mode) { mdb_info *mi = (mdb_info *)db->hdb_db; - MDB_txn *txn; - char *fn; krb5_error_code ret; - int myflags = MDB_NOSUBDIR, tmp; - if((flags & O_ACCMODE) == O_RDONLY) - myflags |= MDB_RDONLY; - - if (asprintf(&fn, "%s.mdb", db->hdb_name) == -1) - return krb5_enomem(context); - if (mdb_env_create(&mi->e)) { - free(fn); - return krb5_enomem(context); - } - - tmp = krb5_config_get_int_default(context, NULL, 0, "kdc", - "hdb-mdb-maxreaders", NULL); - if (tmp) { - ret = mdb_env_set_maxreaders(mi->e, tmp); - if (ret) { - free(fn); - krb5_set_error_message(context, ret, "setting maxreaders on %s: %s", - db->hdb_name, mdb_strerror(ret)); - return ret; - } - } - - tmp = krb5_config_get_int_default(context, NULL, 0, "kdc", - "hdb-mdb-mapsize", NULL); - if (tmp) { - size_t maps = tmp; - maps *= KILO; - ret = mdb_env_set_mapsize(mi->e, maps); - if (ret) { - free(fn); - krb5_set_error_message(context, ret, "setting mapsize on %s: %s", - db->hdb_name, mdb_strerror(ret)); - return ret; - } - } - - ret = mdb_env_open(mi->e, fn, myflags, mode); - free(fn); + mi->e = NULL; + mi->mode = mode; + mi->oflags = oflags & O_ACCMODE; + ret = my_reopen_mdb(context, db, 0); if (ret) { -fail: - mdb_env_close(mi->e); - mi->e = 0; - krb5_set_error_message(context, ret, "opening %s: %s", - db->hdb_name, mdb_strerror(ret)); + krb5_prepend_error_message(context, ret, "opening %s:", db->hdb_name); return ret; } - ret = mdb_txn_begin(mi->e, NULL, MDB_RDONLY, &txn); - if (ret) - goto fail; - - ret = mdb_open(txn, NULL, 0, &mi->d); - mdb_txn_abort(txn); - if (ret) - goto fail; - - if((flags & O_ACCMODE) == O_RDONLY) + if ((oflags & O_ACCMODE) == O_RDONLY) { ret = hdb_check_db_format(context, db); - else + /* + * Dubious: if the DB is not initialized, shouldn't we tell the + * caller?? + */ + if (ret == HDB_ERR_NOENTRY) + return 0; + } else { + /* hdb_init_db() calls hdb_check_db_format() */ ret = hdb_init_db(context, db); - if(ret == HDB_ERR_NOENTRY) - return 0; + } if (ret) { DB_close(context, db); krb5_set_error_message(context, ret, "hdb_open: failed %s database %s", - (flags & O_ACCMODE) == O_RDONLY ? + (oflags & O_ACCMODE) == O_RDONLY ? "checking format of" : "initialize", db->hdb_name); }