hdb: Fix HDB corruption with LMDB backend

This commit is contained in:
Nicolas Williams
2020-10-08 20:58:17 -05:00
parent ea8f3991fb
commit 5716b51c18

View File

@@ -42,13 +42,289 @@
#define KILO 1024
#define E(sym, kret) case sym: ret = kret; ename = #sym; break
/* Note: calls krb5_set_error_message() */
static krb5_error_code
mdb2krb5_code(krb5_context context, int code)
{
krb5_error_code ret = 0;
const char *ename = "UNKNOWN";
const char *estr = mdb_strerror(code);
switch (code) {
case MDB_SUCCESS: return 0;
E(MDB_KEYEXIST, HDB_ERR_EXISTS);
E(MDB_NOTFOUND, HDB_ERR_NOENTRY);
E(MDB_PAGE_NOTFOUND, HDB_ERR_UK_SERROR);
E(MDB_CORRUPTED, HDB_ERR_UK_SERROR);
E(MDB_PANIC, HDB_ERR_UK_SERROR);
E(MDB_VERSION_MISMATCH, HDB_ERR_UK_SERROR);
E(MDB_INVALID, HDB_ERR_UK_SERROR);
E(MDB_MAP_FULL, HDB_ERR_UK_SERROR);
E(MDB_DBS_FULL, HDB_ERR_UK_SERROR);
E(MDB_READERS_FULL, HDB_ERR_UK_SERROR);
E(MDB_TLS_FULL, HDB_ERR_UK_SERROR);
E(MDB_TXN_FULL, HDB_ERR_UK_SERROR);
E(MDB_CURSOR_FULL, HDB_ERR_UK_SERROR);
E(MDB_PAGE_FULL, HDB_ERR_UK_SERROR);
E(MDB_MAP_RESIZED, HDB_ERR_UK_SERROR);
E(MDB_INCOMPATIBLE, HDB_ERR_UK_SERROR);
E(MDB_BAD_RSLOT, HDB_ERR_UK_SERROR);
E(MDB_BAD_TXN, HDB_ERR_UK_SERROR);
E(MDB_BAD_VALSIZE, HDB_ERR_UK_SERROR);
E(MDB_BAD_DBI, HDB_ERR_UK_SERROR);
default:
if (code > 0 && code < 100)
ret = code;
else
ret = HDB_ERR_UK_SERROR;
break;
}
if (ret)
krb5_set_error_message(context, ret, "MDB error %s (%d): %s",
ename, code, estr);
return ret;
}
typedef struct mdb_info {
MDB_env *e;
MDB_txn *t;
MDB_dbi d;
MDB_cursor *c;
int oflags;
mode_t mode;
size_t mapsize;
unsigned int in_tx:1;
} mdb_info;
/* See below */
struct keep_it_open {
char *path;
MDB_env *env;
MDB_dbi d;
unsigned int oflags;
size_t refs;
size_t mapsize;
unsigned int valid:1;
struct keep_it_open *next;
} *keep_them_open;
HEIMDAL_MUTEX keep_them_open_lock = HEIMDAL_MUTEX_INITIALIZER;
/*
* On Unix LMDB uses fcntl() byte-range locks, and unlike SQLite3 (which also
* uses fcntl() byte-range locks) LMDB takes no precautions to avoid early
* first-close()s that cause other threads' locks to get dropped. No, LMDB
* requires the caller to take such precautions. For us that means opening one
* mdb env per-{HDB, mode} (where mode is read-write or read-only), never
* closing it, and sharing it with all threads.
*
* Sharing an MDB_env * across multiple threads is documented to be safe, and
* internally LMDB uses pread(2), pwrite(2), and mmap(2) for I/O, using
* read(2)/write(2) only in the DB copy routines that we don't use.
*
* On WIN32 we don't have to do any of this, however, to avoid ifdef spaghetti,
* we share this code on all platforms, even if it isn't strictly needed.
*
* Also, one must call mdb_open() (aka mdb_dbi_open()) only once per call to
* mdb_env_open() and per B-tree. We only use one B-tree in each LMDB: the
* main one.
*
* On success this outputs an `MDB_env *' (the handle for the LMDB) and an
* `MDB_dbi' (the handle for the main B-tree in the LMDB).
*
* ALSO, LMDB requires that we re-open the `MDB_env' when the database grows
* larger than the mmap size. We handle this by finding in `keep_them_open'
* the env we already have, marking it unusable, and the finding some other
* better one or opening a new one and adding it to the list.
*/
static krb5_error_code
my_mdb_env_create_and_open(krb5_context context,
mdb_info *mi,
const char *path,
int mapfull)
{
struct keep_it_open *p, *n;
MDB_txn *txn = NULL;
unsigned int flags = MDB_NOSUBDIR;
struct stat st;
size_t mapsize = 0;
int max_readers;
int locked = 0;
int code = 0;
mi->oflags &= O_ACCMODE;
flags |= (mi->oflags == O_RDONLY) ? MDB_RDONLY : 0;
mi->e = NULL;
/*
* Allocate a new object, in case we don't already have one in
* `keep_them_open'; if we don't need it, we'll free it. This way we do
* some of the work of creating one while not holding a lock.
*/
if ((n = calloc(1, sizeof(*n))) == NULL ||
(n->path = strdup(path)) == NULL) {
free(n);
return krb5_enomem(context);
}
n->oflags = mi->oflags;
max_readers = krb5_config_get_int_default(context, NULL, 0, "kdc",
"hdb-mdb-maxreaders", NULL);
mapsize = krb5_config_get_int_default(context, NULL, 0, "kdc", "hdb-mdb-mapsize",
NULL);
if (mapsize > INT_MAX)
mapsize = 0;
memset(&st, 0, sizeof(st));
if (stat(path, &st) == 0 && st.st_size > mapsize * KILO)
mapsize += (st.st_size + (st.st_size >> 2)) / KILO;
if (mapsize < 100 * 1024)
mapsize = 100 * 1024; /* 100MB */
if (mapsize < mi->mapsize)
mapsize = mi->mapsize;
if (mapfull)
mapsize += 10 * 1024;
if ((code = mdb_env_create(&n->env)) ||
(max_readers && (code = mdb_env_set_maxreaders(n->env, max_readers))))
goto out;
/* Look for an existing env */
HEIMDAL_MUTEX_lock(&keep_them_open_lock);
locked = 1;
for (p = keep_them_open; p; p = p->next) {
if (strcmp(p->path, path))
continue;
if (p->mapsize > mapsize)
/* Always increase mapsize */
mapsize = p->mapsize + (p->mapsize >> 1);
if (!p->valid || p->oflags != mi->oflags)
continue;
/* Found one; output it and get out */
mi->e = p->env;
mi->d = p->d;
p->refs++;
goto out;
}
/* Did not find one, so open and add this one to the list */
/* Open the LMDB itself */
n->refs = 1;
n->valid = 1;
krb5_debug(context, 5, "Opening HDB LMDB %s with mapsize %llu",
path, (unsigned long long)mapsize * KILO);
code = mdb_env_set_mapsize(n->env, mapsize * KILO);
if (code == 0)
code = mdb_env_open(n->env, path, flags, mi->mode);
if (code == 0)
/* Open a transaction so we can resolve the main B-tree */
code = mdb_txn_begin(n->env, NULL, MDB_RDONLY, &txn);
if (code == 0)
/* Resolve the main B-tree */
code = mdb_open(txn, NULL, 0, &n->d);
if (code)
goto out;
/* Successfully opened the LMDB; output the two handles */
mi->mapsize = n->mapsize = mapsize;
mi->e = n->env;
mi->d = n->d;
/* Add this keep_it_open to the front of the list */
n->next = keep_them_open;
keep_them_open = n;
n = NULL;
out:
if (locked)
HEIMDAL_MUTEX_unlock(&keep_them_open_lock);
if (n) {
if (n->env)
mdb_env_close(n->env);
free(n->path);
free(n);
}
(void) mdb_txn_commit(txn); /* Safe when `txn == NULL' */
return mdb2krb5_code(context, code);
}
static void
my_mdb_env_close(krb5_context context,
const char *db_name,
MDB_env **envp)
{
struct keep_it_open **prev;
struct keep_it_open *p, *old;
size_t refs_seen = 0;
size_t slen = strlen(db_name);
MDB_env *env = *envp;
if (env == NULL)
return;
HEIMDAL_MUTEX_lock(&keep_them_open_lock);
for (p = keep_them_open; p; p = p->next) {
/*
* We can have multiple open ones and we need to know if this is the
* last one, so we can't break out early.
*/
if (p->env == env)
refs_seen += (--(p->refs));
else if (strncmp(db_name, p->path, slen) == 0 &&
strcmp(p->path + slen, ".mdb") == 0)
refs_seen += p->refs;
}
krb5_debug(context, 6, "Closing HDB LMDB %s / %p; refs %llu", db_name, env,
(unsigned long long)refs_seen);
prev = &keep_them_open;
for (p = keep_them_open; !refs_seen && p; ) {
/* We're the last close */
if (p->refs ||
strncmp(db_name, p->path, slen) ||
strcmp(p->path + slen, ".mdb")) {
/* Not us; this keep_it_open stays */
prev = &p->next;
p = p->next;
continue;
}
/* Close and remove this one */
krb5_debug(context, 6, "Closing HDB LMDB %s (mapsize was %llu)",
db_name, (unsigned long long)p->mapsize * KILO);
old = p;
*prev = (p = p->next); /* prev stays */
mdb_env_close(old->env);
free(old->path);
free(old);
}
HEIMDAL_MUTEX_unlock(&keep_them_open_lock);
}
/*
* This is a wrapper around my_mdb_env_create_and_open(). It may close an
* existing MDB_env in mi->e if it's there. If we need to reopen because the
* MDB grew too much, then we call this.
*/
static krb5_error_code
my_reopen_mdb(krb5_context context, HDB *db, int mapfull)
{
mdb_info *mi = (mdb_info *)db->hdb_db;
char *fn;
krb5_error_code ret = 0;
/* No-op if we don't have an open one */
my_mdb_env_close(context, db->hdb_name, &mi->e);
if (asprintf(&fn, "%s.mdb", db->hdb_name) == -1)
ret = krb5_enomem(context);
if (ret == 0)
ret = my_mdb_env_create_and_open(context, mi, fn, mapfull);
free(fn);
return ret;
}
static krb5_error_code
DB_close(krb5_context context, HDB *db)
{
@@ -56,7 +332,7 @@ DB_close(krb5_context context, HDB *db)
mdb_cursor_close(mi->c);
mdb_txn_abort(mi->t);
mdb_env_close(mi->e);
my_mdb_env_close(context, db->hdb_name, &mi->e);
mi->c = 0;
mi->t = 0;
mi->e = 0;
@@ -114,13 +390,16 @@ DB_seq(krb5_context context, HDB *db,
krb5_data key_data, data;
int code;
/*
* No need to worry about MDB_MAP_FULL when we're scanning the DB since we
* have snapshot semantics, and any DB growth from other transactions
* should not affect us.
*/
key.mv_size = 0;
value.mv_size = 0;
code = mdb_cursor_get(mi->c, &key, &value, flag);
if (code == MDB_NOTFOUND)
return HDB_ERR_NOENTRY;
if (code)
return code;
return mdb2krb5_code(context, code);
key_data.data = key.mv_data;
key_data.length = key.mv_size;
@@ -151,22 +430,34 @@ DB_seq(krb5_context context, HDB *db,
static krb5_error_code
DB_firstkey(krb5_context context, HDB *db, unsigned flags, hdb_entry_ex *entry)
{
krb5_error_code ret = 0;
mdb_info *mi = db->hdb_db;
int code;
int tries = 3;
int code = 0;
/* Always start with a fresh cursor to pick up latest DB state */
do {
if (mi->t)
mdb_txn_abort(mi->t);
mi->t = NULL;
if (code)
code = my_reopen_mdb(context, db, 1);
if (code == 0)
code = mdb_txn_begin(mi->e, NULL, MDB_RDONLY, &mi->t);
if (code)
return code;
if (code == 0)
code = mdb_cursor_open(mi->t, mi->d, &mi->c);
if (code)
return code;
if (code == 0) {
ret = DB_seq(context, db, flags, entry, MDB_FIRST);
break;
}
} while (code == MDB_MAP_FULL && --tries > 0);
return DB_seq(context, db, flags, entry, MDB_FIRST);
if (code || ret) {
mdb_txn_abort(mi->t);
mi->t = NULL;
}
return ret ? ret : mdb2krb5_code(context, code);
}
@@ -207,24 +498,34 @@ static krb5_error_code
DB__get(krb5_context context, HDB *db, krb5_data key, krb5_data *reply)
{
mdb_info *mi = (mdb_info*)db->hdb_db;
MDB_txn *txn;
MDB_txn *txn = NULL;
MDB_val k, v;
int code;
int tries = 3;
int code = 0;
k.mv_data = key.data;
k.mv_size = key.length;
code = mdb_txn_begin(mi->e, NULL, MDB_RDONLY, &txn);
do {
if (txn) {
mdb_txn_abort(txn);
txn = NULL;
}
if (code)
return code;
code = my_reopen_mdb(context, db, 1);
if (code == 0)
code = mdb_txn_begin(mi->e, NULL, MDB_RDONLY, &txn);
if (code == 0)
code = mdb_get(txn, mi->d, &k, &v);
if (code == 0)
krb5_data_copy(reply, v.mv_data, v.mv_size);
} while (code == MDB_MAP_FULL && --tries > 0);
if (code)
mdb_txn_abort(txn);
if(code == MDB_NOTFOUND)
return HDB_ERR_NOENTRY;
return code;
else
(void) mdb_txn_commit(txn); /* Empty transaction? -> commit */
return mdb2krb5_code(context, code);
}
static krb5_error_code
@@ -232,137 +533,110 @@ DB__put(krb5_context context, HDB *db, int replace,
krb5_data key, krb5_data value)
{
mdb_info *mi = (mdb_info*)db->hdb_db;
MDB_txn *txn;
MDB_txn *txn = NULL;
MDB_val k, v;
int code;
int tries = 3;
int code = 0;
k.mv_data = key.data;
k.mv_size = key.length;
v.mv_data = value.data;
v.mv_size = value.length;
code = mdb_txn_begin(mi->e, NULL, 0, &txn);
if (code)
return code;
code = mdb_put(txn, mi->d, &k, &v, replace ? 0 : MDB_NOOVERWRITE);
if (code)
do {
if (txn) {
mdb_txn_abort(txn);
else
code = mdb_txn_commit(txn);
txn = NULL;
}
if (code)
code = my_reopen_mdb(context, db, 1);
if (code == 0)
code = mdb_txn_begin(mi->e, NULL, 0, &txn);
if (code == 0)
code = mdb_put(txn, mi->d, &k, &v, replace ? 0 : MDB_NOOVERWRITE);
if (code == 0) {
/*
* No need to call mdb_env_sync(); it's done automatically if MDB_NOSYNC is
* not set.
* No need to call mdb_env_sync(); it's done automatically if
* MDB_NOSYNC is not set.
*/
if(code == MDB_KEYEXIST)
return HDB_ERR_EXISTS;
return code;
code = mdb_txn_commit(txn);
txn = NULL;
}
} while (code == MDB_MAP_FULL && --tries > 0);
if (txn)
mdb_txn_abort(txn);
return mdb2krb5_code(context, code);
}
static krb5_error_code
DB__del(krb5_context context, HDB *db, krb5_data key)
{
mdb_info *mi = (mdb_info*)db->hdb_db;
MDB_txn *txn;
MDB_txn *txn = NULL;
MDB_val k;
krb5_error_code code;
int tries = 3;
int code = 0;
k.mv_data = key.data;
k.mv_size = key.length;
code = mdb_txn_begin(mi->e, NULL, 0, &txn);
if (code)
return code;
code = mdb_del(txn, mi->d, &k, NULL);
if (code)
do {
if (txn) {
mdb_txn_abort(txn);
else
code = mdb_txn_commit(txn);
txn = NULL;
}
if (code)
code = my_reopen_mdb(context, db, 1);
if (code == 0)
code = mdb_txn_begin(mi->e, NULL, 0, &txn);
if (code == 0)
code = mdb_del(txn, mi->d, &k, NULL);
if (code == 0) {
/*
* No need to call mdb_env_sync(); it's done automatically if MDB_NOSYNC is
* not set.
* No need to call mdb_env_sync(); it's done automatically if
* MDB_NOSYNC is not set.
*/
if(code == MDB_NOTFOUND)
return HDB_ERR_NOENTRY;
return code;
code = mdb_txn_commit(txn);
txn = NULL;
}
} while (code == MDB_MAP_FULL && --tries > 0);
if (txn)
mdb_txn_abort(txn);
return mdb2krb5_code(context, code);
}
static krb5_error_code
DB_open(krb5_context context, HDB *db, int flags, mode_t mode)
DB_open(krb5_context context, HDB *db, int oflags, mode_t mode)
{
mdb_info *mi = (mdb_info *)db->hdb_db;
MDB_txn *txn;
char *fn;
krb5_error_code ret;
int myflags = MDB_NOSUBDIR, tmp;
if((flags & O_ACCMODE) == O_RDONLY)
myflags |= MDB_RDONLY;
if (asprintf(&fn, "%s.mdb", db->hdb_name) == -1)
return krb5_enomem(context);
if (mdb_env_create(&mi->e)) {
free(fn);
return krb5_enomem(context);
}
tmp = krb5_config_get_int_default(context, NULL, 0, "kdc",
"hdb-mdb-maxreaders", NULL);
if (tmp) {
ret = mdb_env_set_maxreaders(mi->e, tmp);
mi->e = NULL;
mi->mode = mode;
mi->oflags = oflags & O_ACCMODE;
ret = my_reopen_mdb(context, db, 0);
if (ret) {
free(fn);
krb5_set_error_message(context, ret, "setting maxreaders on %s: %s",
db->hdb_name, mdb_strerror(ret));
return ret;
}
}
tmp = krb5_config_get_int_default(context, NULL, 0, "kdc",
"hdb-mdb-mapsize", NULL);
if (tmp) {
size_t maps = tmp;
maps *= KILO;
ret = mdb_env_set_mapsize(mi->e, maps);
if (ret) {
free(fn);
krb5_set_error_message(context, ret, "setting mapsize on %s: %s",
db->hdb_name, mdb_strerror(ret));
return ret;
}
}
ret = mdb_env_open(mi->e, fn, myflags, mode);
free(fn);
if (ret) {
fail:
mdb_env_close(mi->e);
mi->e = 0;
krb5_set_error_message(context, ret, "opening %s: %s",
db->hdb_name, mdb_strerror(ret));
krb5_prepend_error_message(context, ret, "opening %s:", db->hdb_name);
return ret;
}
ret = mdb_txn_begin(mi->e, NULL, MDB_RDONLY, &txn);
if (ret)
goto fail;
ret = mdb_open(txn, NULL, 0, &mi->d);
mdb_txn_abort(txn);
if (ret)
goto fail;
if((flags & O_ACCMODE) == O_RDONLY)
if ((oflags & O_ACCMODE) == O_RDONLY) {
ret = hdb_check_db_format(context, db);
else
ret = hdb_init_db(context, db);
if(ret == HDB_ERR_NOENTRY)
/*
* Dubious: if the DB is not initialized, shouldn't we tell the
* caller??
*/
if (ret == HDB_ERR_NOENTRY)
return 0;
} else {
/* hdb_init_db() calls hdb_check_db_format() */
ret = hdb_init_db(context, db);
}
if (ret) {
DB_close(context, db);
krb5_set_error_message(context, ret, "hdb_open: failed %s database %s",
(flags & O_ACCMODE) == O_RDONLY ?
(oflags & O_ACCMODE) == O_RDONLY ?
"checking format of" : "initialize",
db->hdb_name);
}