ipropd-master/slave: enhancements and bug fixes

- fix int/uint confusion and use unsigned integral types for time
 - improve messages
 - add --verbose option
 - attempt transaction recovery in ipropd-master during idle times
 - begin hardening daemons against dying at the slightest provocation
 - better recovery from various errors
 - daemons now restart automatically in most of the many error cases
   where the daemons still die
This commit is contained in:
Nicolas Williams
2016-02-03 17:35:16 -06:00
parent 20df2c8706
commit ebc1ad34ba
5 changed files with 531 additions and 254 deletions

View File

@@ -68,4 +68,13 @@ enum iprop_cmd { I_HAVE = 1,
extern sig_atomic_t exit_flag;
void setup_signal(void);
/*
 * Exit status values for the ipropd service process.  restarter()
 * inspects the child's exit status and acts on these values; any other
 * exit status is handled like IPROPD_RESTART.
 */
enum ipropd_exit_code {
/* Service finished; restarter() exits with status 0 without restarting */
IPROPD_DONE = 0,
/* restarter() waits 30 seconds, then restarts the service */
IPROPD_RESTART = 1,
/* restarter() waits 2 minutes, then restarts the service */
IPROPD_RESTART_SLOW = 2,
/* restarter() stops restarting the service and exits with this status */
IPROPD_FATAL = 3,
};
int restarter(krb5_context, size_t *);
#endif /* __IPROP_H__ */

View File

@@ -32,7 +32,11 @@
*/
#include "iprop.h"
RCSID("$Id$");
#if defined(HAVE_FORK) && defined(HAVE_WAITPID)
#include <sys/types.h>
#include <sys/wait.h>
#endif
sig_atomic_t exit_flag;
@@ -71,3 +75,191 @@ setup_signal(void)
#endif
#endif
}
/*
 * Fork a child to run the service, and restart it if it dies.
 *
 * Returns -1 if not supported, else a file descriptor that the service
 * should select() for.  Any events on that file descriptor should cause
 * the caller to exit immediately, as that means that the restarter
 * exited.
 *
 * Only the child (the service) ever returns from this function; the
 * parent (the restarter) loops here and eventually calls exit().
 *
 * The service's normal exit status values should be taken from
 * enum ipropd_exit_code.  IPROPD_FATAL causes the restarter to stop
 * restarting the service and to exit.
 *
 * A count of restarts is output via the `countp' argument, if it is
 * non-NULL.  This is useful for testing this function (e.g., kill the
 * restarter after N restarts and check that the child gets the signal
 * sent to it).
 *
 * This requires fork() and waitpid() (otherwise returns -1 with errno
 * set to ENOTSUP).  Ignoring SIGCHLD, of course, would be bad.
 *
 * We could support this on Windows by spawning a child with mostly the
 * same arguments as the restarter process.
 */
int
restarter(krb5_context context, size_t *countp)
{
#if defined(HAVE_FORK) && defined(HAVE_WAITPID)
    struct timeval tmout;
    pid_t pid = -1;     /* -1: no live child; also covers exit_flag set on entry */
    pid_t wpid = -1;
    int status = 0;
    int fds[2];         /* child reads fds[0]; parent holds fds[1] */
    int fds2[2];        /* parent reads fds2[0]; child holds fds2[1] */
    size_t count = 0;
    fd_set readset;

    fds[0] = -1;
    fds[1] = -1;
    fds2[0] = -1;
    fds2[1] = -1;

    signal(SIGCHLD, SIG_DFL);

    while (!exit_flag) {
        /*
         * Close the pipe ends the parent kept open for the previous
         * child: the write end of fds and the read end of fds2.  (The
         * other two ends were already closed right after fork().)
         */
        if (fds[1] != -1)
            (void) close(fds[1]);
        if (fds2[0] != -1)
            (void) close(fds2[0]);

        /* A pipe so the child can detect the parent's death */
        if (pipe(fds) == -1) {
            krb5_err(context, 1, errno,
                     "Could not setup pipes in service restarter");
        }

        /* A pipe so the parent can detect the child's death */
        if (pipe(fds2) == -1) {
            krb5_err(context, 1, errno,
                     "Could not setup pipes in service restarter");
        }

        /* Don't let buffered output be emitted twice (parent and child) */
        fflush(stdout);
        fflush(stderr);

        pid = fork();
        if (pid == -1)
            krb5_err(context, 1, errno, "Could not fork in service restarter");
        if (pid == 0) {
            /* Child: this is the service; hand back the fd to watch */
            if (countp != NULL)
                *countp = count;
            (void) close(fds[1]);
            (void) close(fds2[0]);
            return fds[0];
        }

        /* Parent: the restarter */
        count++;

        (void) close(fds[0]);
        (void) close(fds2[1]);

        do {
            wpid = waitpid(pid, &status, 0);
        } while (wpid == -1 && errno == EINTR && !exit_flag);
        if (wpid == -1 && errno == EINTR)
            break; /* We were signaled; gotta kill the child and exit */
        if (wpid == -1) {
            /* warn() and kill() may change errno; report waitpid()'s */
            int save_errno = errno;

            if (save_errno != ECHILD) {
                warn("waitpid() failed; killing restarter's child process");
                kill(pid, SIGTERM);
            }
            krb5_err(context, 1, save_errno,
                     "restarter failed waiting for child");
        }

        assert(wpid == pid);
        wpid = -1;
        pid = -1;
        if (WIFEXITED(status)) {
            switch (WEXITSTATUS(status)) {
            case IPROPD_DONE:
                exit(0);
            case IPROPD_RESTART_SLOW:
                if (exit_flag)
                    exit(1);
                krb5_warnx(context, "Waiting 2 minutes to restart");
                sleep(120);
                continue;
            case IPROPD_FATAL:
                krb5_errx(context, WEXITSTATUS(status),
                          "Sockets and pipes not supported for "
                          "iprop log files");
            case IPROPD_RESTART:
            default:
                if (exit_flag)
                    exit(1);
                /* Add exponential backoff (with max backoff)? */
                krb5_warnx(context, "Waiting 30 seconds to restart");
                sleep(30);
                continue;
            }
        }
        /* else */
        krb5_warnx(context, "Child was killed; waiting 30 seconds to restart");
        sleep(30);
    }

    if (pid == -1)
        exit(0); /* No dead child to reap; done */

    assert(pid > 0);
    if (wpid != pid) {
        warnx("Interrupted; killing child (pid %ld) with %d",
              (long)pid, (int)exit_flag);
        krb5_warnx(context, "Interrupted; killing child (pid %ld) with %d",
                   (long)pid, (int)exit_flag);
        kill(pid, exit_flag);

        /* Wait up to one second for the child */
        tmout.tv_sec = 1;
        tmout.tv_usec = 0;
        FD_ZERO(&readset);
        FD_SET(fds2[0], &readset);
        /* We don't care why select() returns */
        (void) select(fds2[0] + 1, &readset, NULL, NULL, &tmout);
        /*
         * We haven't reaped the child yet; if it's a zombie, then
         * SIGKILLing it won't hurt.  If it's not a zombie yet, well,
         * we're out of patience.
         */
        kill(pid, SIGKILL);
        do {
            wpid = waitpid(pid, &status, 0);
        } while (wpid != pid && errno == EINTR);
        if (wpid == -1)
            krb5_err(context, 1, errno, "restarter failed waiting for child");
    }

    /* Finally, the child is dead and reaped */
    if (WIFEXITED(status))
        exit(WEXITSTATUS(status));
    if (WIFSIGNALED(status)) {
        switch (WTERMSIG(status)) {
        case SIGTERM:
        case SIGXCPU:
        case SIGINT:
            exit(0);
        default:
            /*
             * Attempt to set the same exit status for the parent as for
             * the child.
             */
            kill(getpid(), WTERMSIG(status));
            /*
             * We can get past the self-kill if we inherited a SIG_IGN
             * disposition that the child reset to SIG_DFL.
             */
        }
    }
    exit(1);
#else
    if (countp != NULL)
        *countp = 0;
    errno = ENOTSUP;
    return -1;
#endif
}

View File

@@ -36,6 +36,8 @@
static krb5_log_facility *log_facility;
static int verbose;
const char *slave_stats_file;
const char *slave_time_missing = "2 min";
const char *slave_time_gone = "5 min";
@@ -125,7 +127,7 @@ struct slave {
char *name;
krb5_auth_context ac;
uint32_t version;
int32_t version_tstamp;
uint32_t version_tstamp;
time_t seen;
unsigned long flags;
#define SLAVE_F_DEAD 0x1
@@ -331,7 +333,7 @@ dump_one (krb5_context context, HDB *db, hdb_entry_ex *entry, void *v)
ret = ENOMEM;
goto done;
}
krb5_store_int32(sp, ONE_PRINC);
krb5_store_uint32(sp, ONE_PRINC);
krb5_storage_free(sp);
ret = krb5_store_data(dump, data);
@@ -372,15 +374,15 @@ write_dump (krb5_context context, krb5_storage *dump,
ret = hdb_create (context, &db, database);
if (ret)
krb5_err (context, 1, ret, "hdb_create: %s", database);
krb5_err (context, IPROPD_RESTART, ret, "hdb_create: %s", database);
ret = db->hdb_open (context, db, O_RDONLY, 0);
if (ret)
krb5_err (context, 1, ret, "db->open");
krb5_err (context, IPROPD_RESTART, ret, "db->open");
sp = krb5_storage_from_mem (buf, 4);
if (sp == NULL)
krb5_errx (context, 1, "krb5_storage_from_mem");
krb5_store_int32 (sp, TELL_YOU_EVERYTHING);
krb5_errx (context, IPROPD_RESTART, "krb5_storage_from_mem");
krb5_store_uint32 (sp, TELL_YOU_EVERYTHING);
krb5_storage_free (sp);
data.data = buf;
@@ -403,9 +405,9 @@ write_dump (krb5_context context, krb5_storage *dump,
sp = krb5_storage_from_mem (buf, 8);
if (sp == NULL)
krb5_errx (context, 1, "krb5_storage_from_mem");
krb5_store_int32 (sp, NOW_YOU_HAVE);
krb5_store_int32 (sp, current_version);
krb5_errx (context, IPROPD_RESTART, "krb5_storage_from_mem");
krb5_store_uint32 (sp, NOW_YOU_HAVE);
krb5_store_uint32 (sp, current_version);
krb5_storage_free (sp);
data.length = 8;
@@ -459,7 +461,7 @@ write_dump (krb5_context context, krb5_storage *dump,
static int
send_complete (krb5_context context, slave *s, const char *database,
uint32_t current_version, uint32_t oldest_version,
int32_t initial_log_tstamp)
uint32_t initial_log_tstamp)
{
krb5_error_code ret;
krb5_storage *dump = NULL;
@@ -527,6 +529,9 @@ send_complete (krb5_context context, slave *s, const char *database,
vno >= oldest_version && vno <= current_version)
break;
if (verbose)
krb5_warnx(context, "send_complete: dumping HDB");
/*
* Otherwise, we may need to write a new dump file. We
* obtain an exclusive lock on the fd. Because this is
@@ -561,7 +566,7 @@ send_complete (krb5_context context, slave *s, const char *database,
if (fstat(fd, &st) == -1) {
ret = errno;
krb5_warn(context, ret, "write_dump: could not stat dump file");
krb5_warn(context, ret, "send_complete: could not stat dump file");
goto done;
}
@@ -648,7 +653,7 @@ send_are_you_there (krb5_context context, slave *s)
slave_dead(context, s);
return 1;
}
krb5_store_int32 (sp, ARE_YOU_THERE);
krb5_store_uint32 (sp, ARE_YOU_THERE);
krb5_storage_free (sp);
ret = krb5_write_priv_message(context, s->ac, &s->fd, &data);
@@ -665,12 +670,12 @@ send_are_you_there (krb5_context context, slave *s)
static int
send_diffs (kadm5_server_context *server_context, slave *s, int log_fd,
const char *database, uint32_t current_version,
int32_t current_tstamp)
uint32_t current_tstamp)
{
krb5_context context = server_context->context;
krb5_storage *sp;
uint32_t ver, initial_version, initial_version2;
int32_t initial_tstamp, initial_tstamp2;
uint32_t initial_tstamp, initial_tstamp2;
enum kadm_ops op;
uint32_t len;
off_t right, left;
@@ -679,7 +684,7 @@ send_diffs (kadm5_server_context *server_context, slave *s, int log_fd,
int ret = 0;
if (s->flags & SLAVE_F_DEAD) {
krb5_warnx(context, "not sending diffs to a dead slave");
krb5_warnx(context, "not sending diffs to dead slave %s", s->name);
return 0;
}
@@ -688,8 +693,8 @@ send_diffs (kadm5_server_context *server_context, slave *s, int log_fd,
sp = krb5_storage_from_mem(buf, 4);
if (sp == NULL)
krb5_errx(context, 1, "krb5_storage_from_mem");
krb5_store_int32(sp, YOU_HAVE_LAST_VERSION);
krb5_errx(context, IPROPD_RESTART, "krb5_storage_from_mem");
krb5_store_uint32(sp, YOU_HAVE_LAST_VERSION);
krb5_storage_free(sp);
data.data = buf;
data.length = 4;
@@ -703,7 +708,8 @@ send_diffs (kadm5_server_context *server_context, slave *s, int log_fd,
return ret;
}
krb5_warnx(context, "sending diffs to a live-seeming slave");
if (verbose)
krb5_warnx(context, "sending diffs to live-seeming slave %s", s->name);
/*
* XXX The code that makes the diffs should be made a separate function,
@@ -748,7 +754,7 @@ send_diffs (kadm5_server_context *server_context, slave *s, int log_fd,
for (;;) {
ret = kadm5_log_previous (context, sp, &ver, NULL, &op, &len);
if (ret)
krb5_err(context, 1, ret,
krb5_err(context, IPROPD_RESTART, ret,
"send_diffs: failed to find previous entry");
left = krb5_storage_seek(sp, -16, SEEK_CUR);
if (left == (off_t)-1) {
@@ -836,7 +842,7 @@ send_diffs (kadm5_server_context *server_context, slave *s, int log_fd,
send_are_you_there(context, s);
return 1;
}
krb5_store_int32 (sp, FOR_YOU);
krb5_store_uint32 (sp, FOR_YOU);
krb5_storage_free(sp);
ret = krb5_write_priv_message(context, s->ac, &s->fd, &data);
@@ -851,7 +857,7 @@ send_diffs (kadm5_server_context *server_context, slave *s, int log_fd,
s->version = current_version;
krb5_warnx(context, "slave is now up to date");
krb5_warnx(context, "slave %s is now up to date (%u)", s->name, s->version);
return 0;
}
@@ -859,13 +865,13 @@ send_diffs (kadm5_server_context *server_context, slave *s, int log_fd,
static int
process_msg (kadm5_server_context *server_context, slave *s, int log_fd,
const char *database, uint32_t current_version,
int32_t current_tstamp)
uint32_t current_tstamp)
{
krb5_context context = server_context->context;
int ret = 0;
krb5_data out;
krb5_storage *sp;
int32_t tmp;
uint32_t tmp;
ret = krb5_read_priv_message(context, s->ac, &s->fd, &out);
if(ret) {
@@ -879,37 +885,41 @@ process_msg (kadm5_server_context *server_context, slave *s, int log_fd,
krb5_data_free(&out);
return 1;
}
if (krb5_ret_int32(sp, &tmp) != 0) {
if (krb5_ret_uint32(sp, &tmp) != 0) {
krb5_warnx(context, "process_msg: client send too short command");
krb5_data_free(&out);
return 1;
}
switch (tmp) {
case I_HAVE :
ret = krb5_ret_int32(sp, &tmp);
ret = krb5_ret_uint32(sp, &tmp);
if (ret != 0) {
krb5_warnx(context, "process_msg: client send too I_HAVE data");
krb5_warnx(context, "process_msg: client send too little I_HAVE data");
break;
}
/* new started slave that have old log */
if (s->version == 0 && tmp != 0) {
if (current_version < (uint32_t)tmp) {
krb5_warnx(context, "Slave %s (version %lu) have later version "
"the master (version %lu) OUT OF SYNC",
s->name, (unsigned long)tmp,
(unsigned long)current_version);
if (current_version < tmp) {
krb5_warnx(context, "Slave %s (version %u) have later version "
"the master (version %u) OUT OF SYNC",
s->name, tmp, current_version);
}
if (verbose)
krb5_warnx(context, "slave %s updated from %u to %u",
s->name, s->version, tmp);
s->version = tmp;
}
if ((uint32_t)tmp < s->version) {
krb5_warnx(context, "Slave claims to not have "
"version we already sent to it");
if (tmp < s->version) {
krb5_warnx(context, "Slave %s claims to not have "
"version we already sent to it", s->name);
s->version = tmp;
}
ret = send_diffs(server_context, s, log_fd, database, current_version,
current_tstamp);
break;
case I_AM_HERE :
if (verbose)
krb5_warnx(context, "slave %s is there", s->name);
break;
case ARE_YOU_THERE:
case FOR_YOU :
@@ -1073,6 +1083,7 @@ static struct getargs args[] = {
"private argument, do not use", NULL },
{ "hostname", 0, arg_string, rk_UNCONST(&master_hostname),
"hostname of master (if not same as hostname)", "hostname" },
{ "verbose", 0, arg_flag, &verbose, NULL, NULL },
{ "version", 0, arg_flag, &version_flag, NULL, NULL },
{ "help", 0, arg_flag, &help_flag, NULL, NULL }
};
@@ -1090,11 +1101,13 @@ main(int argc, char **argv)
int log_fd;
slave *slaves = NULL;
uint32_t current_version = 0, old_version = 0;
int32_t current_tstamp = 0;
uint32_t current_tstamp = 0;
krb5_keytab keytab;
char **files;
int aret;
int optidx = 0;
int restarter_fd = -1;
struct stat st;
setprogname(argv[0]);
@@ -1173,8 +1186,9 @@ main(int argc, char **argv)
krb5_err (context, 1, errno, "open %s",
server_context->log_context.log_file);
signal_fd = make_signal_socket (context);
listen_fd = make_listen_socket (context, port_str);
if (fstat(log_fd, &st) == -1)
krb5_err(context, 1, errno, "stat %s",
server_context->log_context.log_file);
if (flock(log_fd, LOCK_SH) == -1)
krb5_err(context, 1, errno, "shared flock %s",
@@ -1183,10 +1197,14 @@ main(int argc, char **argv)
&current_version, &current_tstamp);
flock(log_fd, LOCK_UN);
signal_fd = make_signal_socket (context);
listen_fd = make_listen_socket (context, port_str);
krb5_warnx(context, "ipropd-master started at version: %lu",
(unsigned long)current_version);
roken_detach_finish(NULL, daemon_child);
restarter_fd = restarter(context, NULL);
while (exit_flag == 0){
slave *p;
@@ -1194,10 +1212,12 @@ main(int argc, char **argv)
int max_fd = 0;
struct timeval to = {30, 0};
uint32_t vers;
struct stat st2;;
#ifndef NO_LIMIT_FD_SETSIZE
if (signal_fd >= FD_SETSIZE || listen_fd >= FD_SETSIZE)
krb5_errx (context, 1, "fd too large");
if (signal_fd >= FD_SETSIZE || listen_fd >= FD_SETSIZE ||
restarter_fd >= FD_SETSIZE)
krb5_errx (context, IPROPD_RESTART, "fd too large");
#endif
FD_ZERO(&readset);
@@ -1205,6 +1225,10 @@ main(int argc, char **argv)
max_fd = max(max_fd, signal_fd);
FD_SET(listen_fd, &readset);
max_fd = max(max_fd, listen_fd);
if (restarter_fd > -1) {
FD_SET(restarter_fd, &readset);
max_fd = max(max_fd, restarter_fd);
}
for (p = slaves; p != NULL; p = p->next) {
if (p->flags & SLAVE_F_DEAD)
@@ -1219,7 +1243,7 @@ main(int argc, char **argv)
if (errno == EINTR)
continue;
else
krb5_err (context, 1, errno, "select");
krb5_err (context, IPROPD_RESTART, errno, "select");
}
if (stat(server_context->log_context.log_file, &st2) == -1) {
@@ -1232,15 +1256,15 @@ main(int argc, char **argv)
log_fd = open(server_context->log_context.log_file, O_RDONLY, 0);
if (log_fd < 0)
krb5_err(context, 1, 1, "open %s",
krb5_err(context, 1, IPROPD_RESTART_SLOW, "open %s",
server_context->log_context.log_file);
if (fstat(log_fd, &st) == -1)
krb5_err(context, 1, errno, "stat %s",
krb5_err(context, IPROPD_RESTART_SLOW, errno, "stat %s",
server_context->log_context.log_file);
if (flock(log_fd, LOCK_SH) == -1)
krb5_err(context, 1, errno, "shared flock %s",
krb5_err(context, IPROPD_RESTART, errno, "shared flock %s",
server_context->log_context.log_file);
kadm5_log_get_version_fd(server_context, log_fd, LOG_VERSION_LAST,
&current_version, &current_tstamp);
@@ -1252,10 +1276,9 @@ main(int argc, char **argv)
if (kadm5_log_init_nb(server_context) == 0)
kadm5_log_end(server_context);
if (flock(log_fd, LOCK_SH) == -1) {
krb5_err(context, 1, errno,
if (flock(log_fd, LOCK_SH) == -1)
krb5_err(context, IPROPD_RESTART, errno,
"could not lock log file");
}
kadm5_log_get_version_fd(server_context, log_fd, LOG_VERSION_LAST,
&current_version, &current_tstamp);
flock(log_fd, LOCK_UN);
@@ -1274,6 +1297,11 @@ main(int argc, char **argv)
}
}
/*
 * restarter_fd stays -1 when restarter() is unsupported, and is only
 * added to readset when it is > -1; don't FD_ISSET() a negative fd.
 */
if (ret && restarter_fd > -1 && FD_ISSET(restarter_fd, &readset)) {
exit_flag = SIGTERM;
break;
}
if (ret && FD_ISSET(signal_fd, &readset)) {
#ifndef NO_UNIX_SOCKETS
struct sockaddr_un peer_addr;
@@ -1291,7 +1319,7 @@ main(int argc, char **argv)
assert(ret >= 0);
old_version = current_version;
if (flock(log_fd, LOCK_SH) == -1)
krb5_err(context, 1, errno, "shared flock %s",
krb5_err(context, IPROPD_RESTART, errno, "shared flock %s",
server_context->log_context.log_file);
kadm5_log_get_version_fd(server_context, log_fd, LOG_VERSION_LAST,
&current_version, &current_tstamp);

View File

@@ -37,6 +37,8 @@ RCSID("$Id$");
static const char *config_name = "ipropd-slave";
static int verbose;
static krb5_log_facility *log_facility;
static char five_min[] = "5 min";
static char *server_time_lost = five_min;
@@ -164,18 +166,123 @@ ihave(krb5_context context, krb5_auth_context auth_context,
krb5_data data;
sp = krb5_storage_from_mem(buf, 8);
krb5_store_int32(sp, I_HAVE);
krb5_store_int32(sp, version);
krb5_store_uint32(sp, I_HAVE);
krb5_store_uint32(sp, version);
krb5_storage_free(sp);
data.length = 8;
data.data = buf;
if (verbose)
krb5_warnx(context, "telling master we are at %u", version);
ret = krb5_write_priv_message(context, auth_context, &fd, &data);
if (ret)
krb5_warn(context, ret, "krb5_write_message");
return ret;
}
/*
 * Append `slen' bytes of the storage `sp', starting at offset `start',
 * to the on-disk iprop log (server_context->log_context.log_fd) and
 * fsync() it.
 *
 * Returns 0 on success (or when slen is 0), EINVAL for a negative
 * length, EOVERFLOW if the length does not fit in a size_t, ENOMEM on
 * allocation failure, or an errno value if reading from `sp' fails.
 *
 * On a failed write or fsync the function tries to undo the partial
 * write; for out-of-space style errors (ENOSPC, EDQUOT, EFBIG) the log
 * file is unlinked and re-initialized and the errno value is returned.
 * Any other write error ends in krb5_err() with IPROPD_FATAL.
 */
static int
append_to_log_file(krb5_context context,
                   kadm5_server_context *server_context,
                   krb5_storage *sp, off_t start, ssize_t slen)
{
    size_t len;
    ssize_t sret;
    off_t log_off;
    int ret, ret2;
    void *buf;

    if (verbose)
        krb5_warnx(context, "appending diffs to log");

    if (slen == 0)
        return 0;
    if (slen < 0)
        return EINVAL;
    len = slen;
    if ((ssize_t)len != slen)
        return EOVERFLOW;

    buf = malloc(len);
    if (buf == NULL) {
        krb5_warn(context, errno, "malloc: no memory");
        return ENOMEM;
    }

    if (krb5_storage_seek(sp, start, SEEK_SET) != start) {
        krb5_errx(context, IPROPD_RESTART,
                  "krb5_storage_seek() failed"); /* can't happen */
    }
    sret = krb5_storage_read(sp, buf, len);
    if (sret < 0) {
        /* Save errno before free() and don't leak the buffer */
        ret = errno;
        free(buf);
        return ret;
    }
    if (len != (size_t)sret) {
        /* Can't happen */
        krb5_errx(context, IPROPD_RESTART,
                  "short krb5_storage_read() from memory buffer");
    }

    /* Remember where this append starts so a failed write can be undone */
    log_off = lseek(server_context->log_context.log_fd, 0, SEEK_CUR);

    /*
     * Use net_write() so we get an errno if less than len bytes were
     * written.
     */
    sret = net_write(server_context->log_context.log_fd, buf, len);
    if (sret != slen)
        ret = errno;
    else if (fsync(server_context->log_context.log_fd) == -1)
        ret = errno;    /* fsync() returns -1, the cause is in errno */
    else
        ret = 0;
    free(buf);          /* only after errno has been captured */
    if (ret == 0)
        return 0;

    /*
     * Attempt to recover from this.  First, truncate the log file
     * and reset the fd offset.  Failure to do this -> unlink the
     * log file and re-create it.  Since we're the slave, we ought to be
     * able to recover from the log being unlinked...
     */
    if (ftruncate(server_context->log_context.log_fd, log_off) == -1 ||
        lseek(server_context->log_context.log_fd, log_off, SEEK_SET) == -1) {
        (void) kadm5_log_end(server_context);
        if (unlink(server_context->log_context.log_file) == -1) {
            krb5_err(context, IPROPD_FATAL, errno,
                     "Failed to recover from failure to write log "
                     "entries from master to disk");
        }
        ret2 = kadm5_log_init(server_context);
        if (ret2) {
            krb5_err(context, IPROPD_RESTART_SLOW, ret2,
                     "Failed to initialize log to recover from "
                     "failure to write log entries from master to disk");
        }
    }
    if (ret == ENOSPC
#ifdef EDQUOT
        || ret == EDQUOT
#endif
        || ret == EFBIG) {
        /* Unlink the file in these cases. */
        krb5_warn(context, ret,
                  "Failed to write log entries from master to disk");
        (void) kadm5_log_end(server_context);
        if (unlink(server_context->log_context.log_file) == -1) {
            krb5_err(context, IPROPD_FATAL, errno,
                     "Failed to recover from failure to write log "
                     "entries from master to disk");
        }
        ret2 = kadm5_log_init(server_context);
        if (ret2) {
            krb5_err(context, IPROPD_RESTART_SLOW, ret2,
                     "Failed to initialize log to recover from "
                     "failure to write log entries from master to disk");
        }
        return ret;
    }
    /*
     * All other errors we treat as fatal here.  This includes, for
     * example, EIO and EPIPE (sorry, can't log to pipes nor sockets).
     */
    krb5_err(context, IPROPD_FATAL, ret,
             "Failed to write log entries from master to disk");
    /* krb5_err() is expected not to return; keep a defined return value */
    return ret;
}
static int
receive_loop (krb5_context context,
krb5_storage *sp,
@@ -183,22 +290,31 @@ receive_loop (krb5_context context,
{
int ret;
off_t left, right, off;
size_t mlen;
void *buf;
int32_t len, vers, vers2;
ssize_t sret, smlen;
uint32_t len, vers;
if (verbose)
krb5_warnx(context, "receiving diffs");
/*
* Seek to the first entry in the message from the master that is
* past the current version of the local database.
*/
do {
int32_t timestamp, tmp;
uint32_t timestamp;
uint32_t op;
if (krb5_ret_int32(sp, &vers) != 0 ||
krb5_ret_int32(sp, &timestamp) != 0 ||
krb5_ret_int32(sp, &tmp) != 0 ||
krb5_ret_int32(sp, &len) != 0) {
/*
* TODO We could do more to validate the entries from the master
* here. And we could use/reuse more kadm5_log_*() code here.
*
* Alternatively we should trust that the master sent us exactly
* what we needed and just write this to the log file and let
* kadm5_log_recover() do the rest.
*/
if (krb5_ret_uint32(sp, &vers) != 0 ||
krb5_ret_uint32(sp, &timestamp) != 0 ||
krb5_ret_uint32(sp, &op) != 0 ||
krb5_ret_uint32(sp, &len) != 0) {
/*
* This shouldn't happen. Reconnecting probably won't help
@@ -208,19 +324,20 @@ receive_loop (krb5_context context,
krb5_warnx(context, "iprop entries from master were truncated");
return EINVAL;
}
if (len < 0) {
krb5_warnx(context, "master sent entry with negative length for"
"version %ld", (long)vers);
return EINVAL;
}
if ((uint32_t)vers > server_context->log_context.version)
if (vers > server_context->log_context.version) {
break;
}
off = krb5_storage_seek(sp, 0, SEEK_CUR);
if (krb5_storage_seek(sp, len + 8, SEEK_CUR) != off + len + 8) {
krb5_warnx(context, "iprop entries from master were truncated");
return 0;
}
} while((uint32_t)vers <= server_context->log_context.version);
if (verbose) {
krb5_warnx(context, "diff contains old log record version "
"%u %lld %u length %u",
vers, (long long)timestamp, op, len);
}
} while(vers <= server_context->log_context.version);
/*
* Read the remaining entries into memory...
@@ -233,161 +350,38 @@ receive_loop (krb5_context context,
return EINVAL;
}
mlen = (size_t)(right - left);
smlen = right - left;
buf = malloc (mlen);
if (buf == NULL && mlen != 0) {
krb5_warn(context, errno, "malloc: no memory");
return ENOMEM;
}
/*
* ...and then write them out to the on-disk log.
*/
/* NOTE: We haven't validated the entries yet */
if (krb5_storage_seek(sp, left, SEEK_SET) != left)
krb5_errx(context, 1, "krb5_storage_seek() failed");
sret = krb5_storage_read(sp, buf, mlen);
if (sret < 0)
return errno;
if (mlen != (size_t)sret)
krb5_errx(context, 1, "short krb5_storage_read() from memory buffer");
sret = write(server_context->log_context.log_fd, buf, mlen);
if (sret != smlen) {
/* This is probably ENOSPC. We can't recover. */
krb5_err(context, 1, errno, "Failed to write log to disk");
}
ret = fsync(server_context->log_context.log_fd);
if (ret) {
/* This is also probably ENOSPC. We can't recover. */
krb5_err(context, 1, errno, "Failed to sync log to disk");
}
free(buf);
ret = append_to_log_file(context, server_context, sp, left, right - left);
if (ret)
return ret;
/*
* Go back to the startpoint and commit the entries to the HDB.
* Replay the new entries.
*/
krb5_storage_seek(sp, left, SEEK_SET);
if (verbose)
krb5_warnx(context, "replaying entries from master");
ret = kadm5_log_recover(server_context, kadm_recover_replay);
if (ret) {
krb5_warn(context, ret, "replay of entries from master failed");
krb5_warn(context, ret, "replay failed");
return ret;
}
for (;;) {
int32_t len2, timestamp, tmp;
off_t cur, cur2;
enum kadm_ops op;
if (krb5_ret_int32(sp, &vers) != 0)
break;
ret = krb5_ret_int32(sp, &timestamp);
ret = kadm5_log_get_version(server_context, &vers);
if (ret) {
krb5_warnx(context, "entry %ld: too short", (long)vers);
return EINVAL;
krb5_warn(context, ret,
"could not get log version after applying diffs!");
return ret;
}
ret = krb5_ret_int32(sp, &tmp);
if (ret) {
krb5_warnx(context, "entry %ld: too short", (long)vers);
return EINVAL;
}
op = tmp;
ret = krb5_ret_int32(sp, &len);
if (ret) {
krb5_warnx(context, "entry %ld: too short", (long)vers);
return EINVAL;
}
if (len < 0) {
krb5_warnx(context, "entry %ld: negative length (%ld); "
"master is confused", (long)vers, (long)len);
return EINVAL;
}
cur = krb5_storage_seek(sp, 0, SEEK_CUR);
if (verbose)
krb5_warnx(context, "slave at version %u", vers);
krb5_warnx(context, "replaying entry %d", (int)vers);
/*
* kadm5_log_replay() returns errors from among others, the HDB
* layer, which can return errors from the actual DBs, some of
* which return -1 and set errno, and some of which return
* system error codes.
*/
ret = kadm5_log_replay(server_context,
op, vers, len, sp);
if (ret == -1 && errno != 0)
ret = errno;
if (ret) {
const char *s = krb5_get_error_message(server_context->context, ret);
/*
* XXX We don't really know here whether the error is
* recoverable or not. Some HDB errors might be safe to
* ignore, and others will not be (e.g., any resulting from
* ENOSPC), but we can't tell which is which, particularly
* as errors from the databases are not mapped to HDB_ERR_*.
*
* We do our best to die if the error is not recoverable.
*/
switch (ret) {
#ifdef EDQUOT
case EDQUOT:
#endif
case ENOSPC:
case EPIPE:
case EINTR:
case EFBIG:
case EIO:
krb5_err(context, 1, ret, "kadm5_log_replay: %ld. Fatal write "
"error: %s (%d)", (long)vers,
s ? s : "unknown error", ret);
}
krb5_warnx(context,
"kadm5_log_replay: %ld. Replay failed. "
"Database out of sync?: %s (%d)",
(long)vers, s ? s : "unknown error", ret);
krb5_free_error_message(context, s);
}
{
/*
* Make sure that kadm5_log_replay() read the whole entry
* from sp and left the sp offset at the start of the
* trailer.
*/
cur2 = krb5_storage_seek(sp, 0, SEEK_CUR);
if (cur + len != cur2)
krb5_errx(context, 1,
"kadm5_log_reply version: %ld didn't read the whole entry",
(long)vers);
}
if (krb5_ret_int32(sp, &len2) != 0) {
krb5_warnx(context, "entry %ld: postamble too short; "
"master is confused", (long)vers);
return EINVAL;
}
if(krb5_ret_int32(sp, &vers2) != 0) {
krb5_warnx(context, "entry %ld: postamble too short; "
"master is confused", (long)vers);
return EINVAL;
}
if (len != len2) {
krb5_warnx(context, "entry %ld: len != len2; master is "
"confused", (long)vers);
return EINVAL;
}
if (vers != vers2) {
krb5_warnx(context, "entry %ld: vers != vers2; master is "
"confused", (long)vers);
return EINVAL;
}
/*
* Update version after each replay.
*/
server_context->log_context.version = vers;
kadm5_log_update_uber(server_context);
if (vers != server_context->log_context.version) {
krb5_warnx(context, "slave's log_context version (%u) is "
"inconsistent with log's version (%u)",
server_context->log_context.version, vers);
}
return 0;
@@ -404,7 +398,7 @@ receive(krb5_context context,
server_context->db,
O_RDWR | O_CREAT, 0600);
if (ret)
krb5_err(context, 1, ret, "db->open");
krb5_err(context, IPROPD_RESTART_SLOW, ret, "db->open");
ret2 = receive_loop(context, sp, server_context);
if (ret2)
@@ -412,7 +406,7 @@ receive(krb5_context context,
ret = server_context->db->hdb_close(context, server_context->db);
if (ret)
krb5_err(context, 1, ret, "db->close");
krb5_err(context, IPROPD_RESTART_SLOW, ret, "db->close");
return ret2;
}
@@ -427,19 +421,22 @@ send_im_here(krb5_context context, int fd,
ret = krb5_data_alloc(&data, 4);
if (ret)
krb5_err(context, 1, ret, "send_im_here");
krb5_err(context, IPROPD_RESTART, ret, "send_im_here");
sp = krb5_storage_from_data (&data);
if (sp == NULL)
krb5_errx(context, 1, "krb5_storage_from_data");
krb5_store_int32(sp, I_AM_HERE);
krb5_errx(context, IPROPD_RESTART, "krb5_storage_from_data");
krb5_store_uint32(sp, I_AM_HERE);
krb5_storage_free(sp);
ret = krb5_write_priv_message(context, auth_context, &fd, &data);
krb5_data_free(&data);
if (ret)
krb5_err(context, 1, ret, "krb5_write_priv_message");
krb5_err(context, IPROPD_RESTART, ret, "krb5_write_priv_message");
if (verbose)
krb5_warnx(context, "pinged master");
return;
}
@@ -447,13 +444,16 @@ send_im_here(krb5_context context, int fd,
static void
reinit_log(krb5_context context,
kadm5_server_context *server_context,
int32_t vno)
uint32_t vno)
{
krb5_error_code ret;
if (verbose)
krb5_warnx(context, "truncating log on slave");
ret = kadm5_log_reinit(server_context);
if (ret)
krb5_err(context, 1, ret, "kadm5_log_reinit");
krb5_err(context, IPROPD_RESTART_SLOW, ret, "kadm5_log_reinit");
}
@@ -464,8 +464,8 @@ receive_everything(krb5_context context, int fd,
{
int ret;
krb5_data data;
int32_t vno = 0;
int32_t opcode;
uint32_t vno = 0;
uint32_t opcode;
krb5_storage *sp;
char *dbname;
@@ -475,22 +475,22 @@ receive_everything(krb5_context context, int fd,
ret = asprintf(&dbname, "%s-NEW", server_context->db->hdb_name);
if (ret == -1)
krb5_err(context, 1, ENOMEM, "asprintf");
krb5_err(context, IPROPD_RESTART, ENOMEM, "asprintf");
ret = hdb_create(context, &mydb, dbname);
if(ret)
krb5_err(context,1, ret, "hdb_create");
krb5_err(context, IPROPD_RESTART, ret, "hdb_create");
free(dbname);
ret = hdb_set_master_keyfile(context,
mydb, server_context->config.stash_file);
if(ret)
krb5_err(context,1, ret, "hdb_set_master_keyfile");
krb5_err(context, IPROPD_RESTART, ret, "hdb_set_master_keyfile");
/* I really want to use O_EXCL here, but given that I can't easily clean
up on error, I won't */
ret = mydb->hdb_open(context, mydb, O_RDWR | O_CREAT | O_TRUNC, 0600);
if (ret)
krb5_err(context, 1, ret, "db->open");
krb5_err(context, IPROPD_RESTART, ret, "db->open");
sp = NULL;
krb5_data_zero(&data);
@@ -504,8 +504,8 @@ receive_everything(krb5_context context, int fd,
sp = krb5_storage_from_data(&data);
if (sp == NULL)
krb5_errx(context, 1, "krb5_storage_from_data");
krb5_ret_int32(sp, &opcode);
krb5_errx(context, IPROPD_RESTART, "krb5_storage_from_data");
krb5_ret_uint32(sp, &opcode);
if (opcode == ONE_PRINC) {
krb5_data fake_data;
hdb_entry_ex entry;
@@ -519,12 +519,12 @@ receive_everything(krb5_context context, int fd,
ret = hdb_value2entry(context, &fake_data, &entry.entry);
if (ret)
krb5_err(context, 1, ret, "hdb_value2entry");
krb5_err(context, IPROPD_RESTART, ret, "hdb_value2entry");
ret = mydb->hdb_store(server_context->context,
mydb,
0, &entry);
if (ret)
krb5_err(context, 1, ret, "hdb_store");
krb5_err(context, IPROPD_RESTART_SLOW, ret, "hdb_store");
hdb_free_entry(context, &entry);
krb5_data_free(&data);
@@ -535,20 +535,21 @@ receive_everything(krb5_context context, int fd,
} while (opcode == ONE_PRINC);
if (opcode != NOW_YOU_HAVE)
krb5_errx(context, 1, "receive_everything: strange %d", opcode);
krb5_errx(context, IPROPD_RESTART_SLOW,
"receive_everything: strange %d", opcode);
krb5_ret_int32(sp, &vno);
krb5_ret_uint32(sp, &vno);
krb5_storage_free(sp);
reinit_log(context, server_context, vno);
ret = mydb->hdb_close(context, mydb);
if (ret)
krb5_err(context, 1, ret, "db->close");
krb5_err(context, IPROPD_RESTART_SLOW, ret, "db->close");
ret = mydb->hdb_rename(context, mydb, server_context->db->hdb_name);
if (ret)
krb5_err(context, 1, ret, "db->rename");
krb5_err(context, IPROPD_RESTART_SLOW, ret, "db->rename");
server_context->log_context.version = vno;
@@ -558,11 +559,11 @@ receive_everything(krb5_context context, int fd,
krb5_data_free(&data);
if (ret)
krb5_err(context, 1, ret, "db->close");
krb5_err(context, IPROPD_RESTART_SLOW, ret, "db->close");
ret = mydb->hdb_destroy(context, mydb);
if (ret)
krb5_err(context, 1, ret, "db->destroy");
krb5_err(context, IPROPD_RESTART, ret, "db->destroy");
krb5_warnx(context, "receive complete database, version %ld", (long)vno);
return ret;
@@ -639,6 +640,7 @@ static struct getargs args[] = {
"private argument, do not use", NULL },
{ "hostname", 0, arg_string, rk_UNCONST(&slave_str),
"hostname of slave (if not same as hostname)", "hostname" },
{ "verbose", 0, arg_flag, &verbose, NULL, NULL },
{ "version", 0, arg_flag, &version_flag, NULL, NULL },
{ "help", 0, arg_flag, &help_flag, NULL, NULL }
};
@@ -671,6 +673,7 @@ main(int argc, char **argv)
time_t reconnect_max;
time_t reconnect;
time_t before = 0;
int restarter_fd = -1;
const char *master;
@@ -783,11 +786,23 @@ main(int argc, char **argv)
slave_status(context, status_file, "ipropd-slave started");
roken_detach_finish(NULL, daemon_child);
restarter_fd = restarter(context, NULL);
while (!exit_flag) {
struct timeval to;
time_t now, elapsed;
fd_set readset;
int connected = FALSE;
#ifndef NO_LIMIT_FD_SETSIZE
if (restarter_fd >= FD_SETSIZE)
krb5_errx(context, IPROPD_RESTART, "fd too large");
#endif
FD_ZERO(&readset);
if (restarter_fd > -1)
FD_SET(restarter_fd, &readset);
now = time(NULL);
elapsed = now - before;
@@ -795,7 +810,12 @@ main(int argc, char **argv)
time_t left = reconnect - elapsed;
krb5_warnx(context, "sleeping %d seconds before "
"retrying to connect", (int)left);
sleep(left);
to.tv_sec = left;
to.tv_usec = 0;
if (select(restarter_fd + 1, &readset, NULL, NULL, &to) == 1) {
exit_flag = SIGTERM;
continue;
}
}
before = now;
@@ -813,6 +833,8 @@ main(int argc, char **argv)
krb5_cc_destroy(context, ccache);
get_creds(context, keytab_str, &ccache, master);
}
if (verbose)
krb5_warnx(context, "authenticating to master");
ret = krb5_sendauth (context, &auth_context, &master_fd,
IPROP_VERSION, NULL, server,
AP_OPTS_MUTUAL_REQUIRED, NULL, NULL,
@@ -832,27 +854,34 @@ main(int argc, char **argv)
connected = TRUE;
if (verbose)
krb5_warnx(context, "connected to master");
slave_status(context, status_file, "connected to master, waiting instructions");
while (connected && !exit_flag) {
krb5_data out;
krb5_storage *sp;
int32_t tmp;
fd_set readset;
struct timeval to;
uint32_t tmp;
int max_fd;
#ifndef NO_LIMIT_FD_SETSIZE
if (master_fd >= FD_SETSIZE)
krb5_errx (context, 1, "fd too large");
krb5_errx(context, IPROPD_RESTART, "fd too large");
if (restarter_fd >= FD_SETSIZE)
krb5_errx(context, IPROPD_RESTART, "fd too large");
max_fd = max(restarter_fd, master_fd);
#endif
FD_ZERO(&readset);
FD_SET(master_fd, &readset);
if (restarter_fd != -1)
FD_SET(restarter_fd, &readset);
to.tv_sec = time_before_lost;
to.tv_usec = 0;
ret = select (master_fd + 1,
ret = select (max_fd + 1,
&readset, NULL, NULL, &to);
if (ret < 0) {
if (errno == EINTR)
@@ -867,6 +896,18 @@ main(int argc, char **argv)
continue;
}
if (FD_ISSET(restarter_fd, &readset)) {
if (verbose)
krb5_warnx(context, "slave restarter exited");
exit_flag = SIGTERM;
}
if (!FD_ISSET(master_fd, &readset))
continue;
if (verbose)
krb5_warnx(context, "message from master");
ret = krb5_read_priv_message(context, auth_context, &master_fd, &out);
if (ret) {
krb5_warn(context, ret, "krb5_read_priv_message");
@@ -876,8 +917,8 @@ main(int argc, char **argv)
sp = krb5_storage_from_mem (out.data, out.length);
if (sp == NULL)
krb5_err(context, 1, errno, "krb5_storage_from_mem");
ret = krb5_ret_int32(sp, &tmp);
krb5_err(context, IPROPD_RESTART, errno, "krb5_storage_from_mem");
ret = krb5_ret_uint32(sp, &tmp);
if (ret == HEIM_ERR_EOF) {
krb5_warn(context, ret, "master sent zero-length message");
connected = FALSE;
@@ -891,11 +932,13 @@ main(int argc, char **argv)
ret = kadm5_log_init(server_context);
if (ret) {
krb5_err(context, 1, ret, "kadm5_log_init while handling a "
"message from the master");
krb5_err(context, IPROPD_RESTART, ret, "kadm5_log_init while "
"handling a message from the master");
}
switch (tmp) {
case FOR_YOU :
if (verbose)
krb5_warnx(context, "master sent us diffs");
ret2 = receive(context, sp, server_context);
if (ret2)
krb5_warn(context, ret,
@@ -913,6 +956,8 @@ main(int argc, char **argv)
is_up_to_date(context, status_file, server_context);
break;
case TELL_YOU_EVERYTHING :
if (verbose)
krb5_warnx(context, "master sent us a full dump");
ret = receive_everything(context, master_fd, server_context,
auth_context);
if (ret == 0) {
@@ -925,6 +970,8 @@ main(int argc, char **argv)
is_up_to_date(context, status_file, server_context);
break;
case ARE_YOU_THERE :
if (verbose)
krb5_warnx(context, "master sent us a ping");
is_up_to_date(context, status_file, server_context);
ret = ihave(context, auth_context, master_fd,
server_context->log_context.version);
@@ -934,6 +981,8 @@ main(int argc, char **argv)
send_im_here(context, master_fd, auth_context);
break;
case YOU_HAVE_LAST_VERSION:
if (verbose)
krb5_warnx(context, "master tells us we are up to date");
is_up_to_date(context, status_file, server_context);
break;
case NOW_YOU_HAVE :

View File

@@ -186,12 +186,11 @@ RCSID("$Id$");
* Preserves sp's offset on failure where possible.
*/
static kadm5_ret_t
get_header(krb5_storage *sp, int peek, uint32_t *verp, int32_t *tstampp,
get_header(krb5_storage *sp, int peek, uint32_t *verp, uint32_t *tstampp,
enum kadm_ops *opp, uint32_t *lenp)
{
krb5_error_code ret;
uint32_t op, len;
int32_t tstamp;
uint32_t tstamp, op, len;
off_t off, new_off;
if (tstampp == NULL)
@@ -212,7 +211,7 @@ get_header(krb5_storage *sp, int peek, uint32_t *verp, int32_t *tstampp,
}
if (ret)
goto log_corrupt;
ret = krb5_ret_int32(sp, tstampp);
ret = krb5_ret_uint32(sp, tstampp);
if (ret)
goto log_corrupt;
@@ -330,7 +329,7 @@ seek_next(krb5_storage *sp)
krb5_error_code ret;
uint32_t ver, ver2, len, len2;
enum kadm_ops op;
int32_t tstamp;
uint32_t tstamp;
off_t off, off_len, new_off;
off = krb5_storage_seek(sp, 0, SEEK_CUR);
@@ -457,11 +456,11 @@ static krb5_storage *log_goto_first(kadm5_server_context *, int);
*/
kadm5_ret_t
kadm5_log_get_version_fd(kadm5_server_context *server_context, int fd,
int which, uint32_t *ver, int32_t *tstamp)
int which, uint32_t *ver, uint32_t *tstamp)
{
kadm5_ret_t ret;
krb5_storage *sp;
int32_t tmp;
uint32_t tmp;
if (fd == -1)
return 0; /* /dev/null */
@@ -1522,7 +1521,7 @@ log_update_uber(kadm5_server_context *context, off_t off)
ret = krb5_store_uint64(mem_sp, off);
if (ret)
goto out;
ret = krb5_store_int32(mem_sp, log_context->last_time);
ret = krb5_store_uint32(mem_sp, log_context->last_time);
if (ret)
goto out;
ret = krb5_store_uint32(mem_sp, log_context->version);
@@ -1809,7 +1808,7 @@ kadm5_log_foreach(kadm5_server_context *context,
for (;;) {
uint32_t ver, ver2, len, len2;
int32_t tstamp;
uint32_t tstamp;
time_t timestamp;
enum kadm_ops op;
@@ -1977,7 +1976,7 @@ kadm5_log_goto_end(kadm5_server_context *server_context, int fd)
krb5_storage *sp;
enum kadm_ops op;
uint32_t ver, len;
int32_t tstamp;
uint32_t tstamp;
uint64_t off;
if (fd == -1) {
@@ -2075,7 +2074,7 @@ kadm5_log_previous(krb5_context context,
krb5_error_code ret;
off_t oldoff;
uint32_t ver2, len2;
int32_t tstamp;
uint32_t tstamp;
oldoff = krb5_storage_seek(sp, 0, SEEK_CUR);
if (oldoff == -1)