diff --git a/lib/kadm5/iprop.h b/lib/kadm5/iprop.h index 87499fac9..945fb90b1 100644 --- a/lib/kadm5/iprop.h +++ b/lib/kadm5/iprop.h @@ -68,4 +68,13 @@ enum iprop_cmd { I_HAVE = 1, extern sig_atomic_t exit_flag; void setup_signal(void); +enum ipropd_exit_code { + IPROPD_DONE = 0, + IPROPD_RESTART = 1, + IPROPD_RESTART_SLOW = 2, + IPROPD_FATAL = 3, +}; + +int restarter(krb5_context, size_t *); + #endif /* __IPROP_H__ */ diff --git a/lib/kadm5/ipropd_common.c b/lib/kadm5/ipropd_common.c index 2e3d94107..e2332f833 100644 --- a/lib/kadm5/ipropd_common.c +++ b/lib/kadm5/ipropd_common.c @@ -32,7 +32,11 @@ */ #include "iprop.h" -RCSID("$Id$"); + +#if defined(HAVE_FORK) && defined(HAVE_WAITPID) +#include <sys/types.h> +#include <sys/wait.h> +#endif sig_atomic_t exit_flag; @@ -71,3 +75,191 @@ setup_signal(void) #endif #endif } + +/* + * Fork a child to run the service, and restart it if it dies. + * + * Returns -1 if not supported, else a file descriptor that the service + * should select() for. Any events on that file descriptor should cause + * the caller to exit immediately, as that means that the restarter + * exited. + * + * The service's normal exit status values should be taken + * from enum ipropd_exit_code. IPROPD_FATAL causes the restarter to + * stop restarting the service and to exit. + * + * A count of restarts is output via the `countp' argument, if it is + * non-NULL. This is useful for testing this function (e.g., kill the + * restarter after N restarts and check that the child gets the signal + * sent to it). + * + * This requires fork() and waitpid() (otherwise returns -1). Ignoring + * SIGCHLD, of course, would be bad. + * + * We could support this on Windows by spawning a child with mostly the + * same arguments as the restarter process.
+ */ +int +restarter(krb5_context context, size_t *countp) +{ +#if defined(HAVE_FORK) && defined(HAVE_WAITPID) + struct timeval tmout; + pid_t pid; + pid_t wpid = -1; + int status; + int fds[2]; + int fds2[2]; + size_t count = 0; + fd_set readset; + + fds[0] = -1; + fds[1] = -1; + fds2[0] = -1; + fds2[1] = -1; + + signal(SIGCHLD, SIG_DFL); + + while (!exit_flag) { + /* Close the pipe ends the parent kept open from the previous iteration */ + if (fds[1] != -1) + (void) close(fds[1]); + if (fds2[0] != -1) + (void) close(fds2[0]); + + /* A pipe so the child can detect the parent's death */ + if (pipe(fds) == -1) { + krb5_err(context, 1, errno, + "Could not setup pipes in service restarter"); + } + + /* A pipe so the parent can detect the child's death */ + if (pipe(fds2) == -1) { + krb5_err(context, 1, errno, + "Could not setup pipes in service restarter"); + } + + fflush(stdout); + fflush(stderr); + + pid = fork(); + if (pid == -1) + krb5_err(context, 1, errno, "Could not fork in service restarter"); + if (pid == 0) { + if (countp != NULL) + *countp = count; + (void) close(fds[1]); + (void) close(fds2[0]); + return fds[0]; + } + + count++; + + (void) close(fds[0]); + (void) close(fds2[1]); + + do { + wpid = waitpid(pid, &status, 0); + } while (wpid == -1 && errno == EINTR && !exit_flag); + if (wpid == -1 && errno == EINTR) + break; /* We were signaled; gotta kill the child and exit */ + if (wpid == -1) { + if (errno != ECHILD) { + warn("waitpid() failed; killing restarter's child process"); + kill(pid, SIGTERM); + } + krb5_err(context, 1, errno, "restarter failed waiting for child"); + } + + assert(wpid == pid); + wpid = -1; + pid = -1; + if (WIFEXITED(status)) { + switch (WEXITSTATUS(status)) { + case IPROPD_DONE: + exit(0); + case IPROPD_RESTART_SLOW: + if (exit_flag) + exit(1); + krb5_warnx(context, "Waiting 2 minutes to restart"); + sleep(120); + continue; + case IPROPD_FATAL: + krb5_errx(context, WEXITSTATUS(status), + "Sockets and pipes not supported for " + "iprop log files"); + case
IPROPD_RESTART: + default: + if (exit_flag) + exit(1); + /* Add exponential backoff (with max backoff)? */ + krb5_warnx(context, "Waiting 30 seconds to restart"); + sleep(30); + continue; + } + } + /* else */ + krb5_warnx(context, "Child was killed; waiting 30 seconds to restart"); + sleep(30); + } + + if (pid == -1) + exit(0); /* No dead child to reap; done */ + + assert(pid > 0); + if (wpid != pid) { + warnx("Interrupted; killing child (pid %ld) with %d", + (long)pid, exit_flag); + krb5_warnx(context, "Interrupted; killing child (pid %ld) with %d", + (long)pid, exit_flag); + kill(pid, exit_flag); + + /* Wait up to one second for the child */ + tmout.tv_sec = 1; + tmout.tv_usec = 0; + FD_ZERO(&readset); + FD_SET(fds2[0], &readset); + /* We don't care why select() returns */ + (void) select(fds2[0] + 1, &readset, NULL, NULL, &tmout); + /* + * We haven't reaped the child yet; if it's a zombie, then + * SIGKILLing it won't hurt. If it's not a zombie yet, well, + * we're out of patience. + */ + kill(pid, SIGKILL); + do { + wpid = waitpid(pid, &status, 0); + } while (wpid != pid && errno == EINTR); + if (wpid == -1) + krb5_err(context, 1, errno, "restarter failed waiting for child"); + } + + /* Finally, the child is dead and reaped */ + if (WIFEXITED(status)) + exit(WEXITSTATUS(status)); + if (WIFSIGNALED(status)) { + switch (WTERMSIG(status)) { + case SIGTERM: + case SIGXCPU: + case SIGINT: + exit(0); + default: + /* + * Attempt to set the same exit status for the parent as for + * the child. + */ + kill(getpid(), WTERMSIG(status)); + /* + * We can get past the self-kill if we inherited a SIG_IGN + * disposition that the child reset to SIG_DFL. 
+ */ + } + } + exit(1); +#else + if (countp != NULL) + *countp = 0; + errno = ENOTSUP; + return -1; +#endif +} + diff --git a/lib/kadm5/ipropd_master.c b/lib/kadm5/ipropd_master.c index 64d5e975d..10fd37e34 100644 --- a/lib/kadm5/ipropd_master.c +++ b/lib/kadm5/ipropd_master.c @@ -36,6 +36,8 @@ static krb5_log_facility *log_facility; +static int verbose; + const char *slave_stats_file; const char *slave_time_missing = "2 min"; const char *slave_time_gone = "5 min"; @@ -125,7 +127,7 @@ struct slave { char *name; krb5_auth_context ac; uint32_t version; - int32_t version_tstamp; + uint32_t version_tstamp; time_t seen; unsigned long flags; #define SLAVE_F_DEAD 0x1 @@ -331,7 +333,7 @@ dump_one (krb5_context context, HDB *db, hdb_entry_ex *entry, void *v) ret = ENOMEM; goto done; } - krb5_store_int32(sp, ONE_PRINC); + krb5_store_uint32(sp, ONE_PRINC); krb5_storage_free(sp); ret = krb5_store_data(dump, data); @@ -372,15 +374,15 @@ write_dump (krb5_context context, krb5_storage *dump, ret = hdb_create (context, &db, database); if (ret) - krb5_err (context, 1, ret, "hdb_create: %s", database); + krb5_err (context, IPROPD_RESTART, ret, "hdb_create: %s", database); ret = db->hdb_open (context, db, O_RDONLY, 0); if (ret) - krb5_err (context, 1, ret, "db->open"); + krb5_err (context, IPROPD_RESTART, ret, "db->open"); sp = krb5_storage_from_mem (buf, 4); if (sp == NULL) - krb5_errx (context, 1, "krb5_storage_from_mem"); - krb5_store_int32 (sp, TELL_YOU_EVERYTHING); + krb5_errx (context, IPROPD_RESTART, "krb5_storage_from_mem"); + krb5_store_uint32 (sp, TELL_YOU_EVERYTHING); krb5_storage_free (sp); data.data = buf; @@ -403,9 +405,9 @@ write_dump (krb5_context context, krb5_storage *dump, sp = krb5_storage_from_mem (buf, 8); if (sp == NULL) - krb5_errx (context, 1, "krb5_storage_from_mem"); - krb5_store_int32 (sp, NOW_YOU_HAVE); - krb5_store_int32 (sp, current_version); + krb5_errx (context, IPROPD_RESTART, "krb5_storage_from_mem"); + krb5_store_uint32 (sp, NOW_YOU_HAVE); + 
krb5_store_uint32 (sp, current_version); krb5_storage_free (sp); data.length = 8; @@ -459,7 +461,7 @@ write_dump (krb5_context context, krb5_storage *dump, static int send_complete (krb5_context context, slave *s, const char *database, uint32_t current_version, uint32_t oldest_version, - int32_t initial_log_tstamp) + uint32_t initial_log_tstamp) { krb5_error_code ret; krb5_storage *dump = NULL; @@ -527,6 +529,9 @@ send_complete (krb5_context context, slave *s, const char *database, vno >= oldest_version && vno <= current_version) break; + if (verbose) + krb5_warnx(context, "send_complete: dumping HDB"); + /* * Otherwise, we may need to write a new dump file. We * obtain an exclusive lock on the fd. Because this is @@ -561,7 +566,7 @@ send_complete (krb5_context context, slave *s, const char *database, if (fstat(fd, &st) == -1) { ret = errno; - krb5_warn(context, ret, "write_dump: could not stat dump file"); + krb5_warn(context, ret, "send_complete: could not stat dump file"); goto done; } @@ -648,7 +653,7 @@ send_are_you_there (krb5_context context, slave *s) slave_dead(context, s); return 1; } - krb5_store_int32 (sp, ARE_YOU_THERE); + krb5_store_uint32 (sp, ARE_YOU_THERE); krb5_storage_free (sp); ret = krb5_write_priv_message(context, s->ac, &s->fd, &data); @@ -665,12 +670,12 @@ send_are_you_there (krb5_context context, slave *s) static int send_diffs (kadm5_server_context *server_context, slave *s, int log_fd, const char *database, uint32_t current_version, - int32_t current_tstamp) + uint32_t current_tstamp) { krb5_context context = server_context->context; krb5_storage *sp; uint32_t ver, initial_version, initial_version2; - int32_t initial_tstamp, initial_tstamp2; + uint32_t initial_tstamp, initial_tstamp2; enum kadm_ops op; uint32_t len; off_t right, left; @@ -679,7 +684,7 @@ send_diffs (kadm5_server_context *server_context, slave *s, int log_fd, int ret = 0; if (s->flags & SLAVE_F_DEAD) { - krb5_warnx(context, "not sending diffs to a dead slave"); + 
krb5_warnx(context, "not sending diffs to dead slave %s", s->name); return 0; } @@ -688,8 +693,8 @@ send_diffs (kadm5_server_context *server_context, slave *s, int log_fd, sp = krb5_storage_from_mem(buf, 4); if (sp == NULL) - krb5_errx(context, 1, "krb5_storage_from_mem"); - krb5_store_int32(sp, YOU_HAVE_LAST_VERSION); + krb5_errx(context, IPROPD_RESTART, "krb5_storage_from_mem"); + krb5_store_uint32(sp, YOU_HAVE_LAST_VERSION); krb5_storage_free(sp); data.data = buf; data.length = 4; @@ -703,7 +708,8 @@ send_diffs (kadm5_server_context *server_context, slave *s, int log_fd, return ret; } - krb5_warnx(context, "sending diffs to a live-seeming slave"); + if (verbose) + krb5_warnx(context, "sending diffs to live-seeming slave %s", s->name); /* * XXX The code that makes the diffs should be made a separate function, @@ -748,7 +754,7 @@ send_diffs (kadm5_server_context *server_context, slave *s, int log_fd, for (;;) { ret = kadm5_log_previous (context, sp, &ver, NULL, &op, &len); if (ret) - krb5_err(context, 1, ret, + krb5_err(context, IPROPD_RESTART, ret, "send_diffs: failed to find previous entry"); left = krb5_storage_seek(sp, -16, SEEK_CUR); if (left == (off_t)-1) { @@ -836,7 +842,7 @@ send_diffs (kadm5_server_context *server_context, slave *s, int log_fd, send_are_you_there(context, s); return 1; } - krb5_store_int32 (sp, FOR_YOU); + krb5_store_uint32 (sp, FOR_YOU); krb5_storage_free(sp); ret = krb5_write_priv_message(context, s->ac, &s->fd, &data); @@ -851,7 +857,7 @@ send_diffs (kadm5_server_context *server_context, slave *s, int log_fd, s->version = current_version; - krb5_warnx(context, "slave is now up to date"); + krb5_warnx(context, "slave %s is now up to date (%u)", s->name, s->version); return 0; } @@ -859,13 +865,13 @@ send_diffs (kadm5_server_context *server_context, slave *s, int log_fd, static int process_msg (kadm5_server_context *server_context, slave *s, int log_fd, const char *database, uint32_t current_version, - int32_t current_tstamp) + uint32_t 
current_tstamp) { krb5_context context = server_context->context; int ret = 0; krb5_data out; krb5_storage *sp; - int32_t tmp; + uint32_t tmp; ret = krb5_read_priv_message(context, s->ac, &s->fd, &out); if(ret) { @@ -879,37 +885,41 @@ process_msg (kadm5_server_context *server_context, slave *s, int log_fd, krb5_data_free(&out); return 1; } - if (krb5_ret_int32(sp, &tmp) != 0) { + if (krb5_ret_uint32(sp, &tmp) != 0) { krb5_warnx(context, "process_msg: client send too short command"); krb5_data_free(&out); return 1; } switch (tmp) { case I_HAVE : - ret = krb5_ret_int32(sp, &tmp); + ret = krb5_ret_uint32(sp, &tmp); if (ret != 0) { - krb5_warnx(context, "process_msg: client send too I_HAVE data"); + krb5_warnx(context, "process_msg: client send too little I_HAVE data"); break; } /* new started slave that have old log */ if (s->version == 0 && tmp != 0) { - if (current_version < (uint32_t)tmp) { - krb5_warnx(context, "Slave %s (version %lu) have later version " - "the master (version %lu) OUT OF SYNC", - s->name, (unsigned long)tmp, - (unsigned long)current_version); + if (current_version < tmp) { + krb5_warnx(context, "Slave %s (version %u) have later version " + "the master (version %u) OUT OF SYNC", + s->name, tmp, current_version); } + if (verbose) + krb5_warnx(context, "slave %s updated from %u to %u", + s->name, s->version, tmp); s->version = tmp; } - if ((uint32_t)tmp < s->version) { - krb5_warnx(context, "Slave claims to not have " - "version we already sent to it"); + if (tmp < s->version) { + krb5_warnx(context, "Slave %s claims to not have " + "version we already sent to it", s->name); s->version = tmp; } ret = send_diffs(server_context, s, log_fd, database, current_version, current_tstamp); break; case I_AM_HERE : + if (verbose) + krb5_warnx(context, "slave %s is there", s->name); break; case ARE_YOU_THERE: case FOR_YOU : @@ -1073,6 +1083,7 @@ static struct getargs args[] = { "private argument, do not use", NULL }, { "hostname", 0, arg_string, 
rk_UNCONST(&master_hostname), "hostname of master (if not same as hostname)", "hostname" }, + { "verbose", 0, arg_flag, &verbose, NULL, NULL }, { "version", 0, arg_flag, &version_flag, NULL, NULL }, { "help", 0, arg_flag, &help_flag, NULL, NULL } }; @@ -1090,11 +1101,13 @@ main(int argc, char **argv) int log_fd; slave *slaves = NULL; uint32_t current_version = 0, old_version = 0; - int32_t current_tstamp = 0; + uint32_t current_tstamp = 0; krb5_keytab keytab; char **files; int aret; int optidx = 0; + int restarter_fd = -1; + struct stat st; setprogname(argv[0]); @@ -1173,8 +1186,9 @@ main(int argc, char **argv) krb5_err (context, 1, errno, "open %s", server_context->log_context.log_file); - signal_fd = make_signal_socket (context); - listen_fd = make_listen_socket (context, port_str); + if (fstat(log_fd, &st) == -1) + krb5_err(context, 1, errno, "stat %s", + server_context->log_context.log_file); if (flock(log_fd, LOCK_SH) == -1) krb5_err(context, 1, errno, "shared flock %s", @@ -1183,10 +1197,14 @@ main(int argc, char **argv) &current_version, &current_tstamp); flock(log_fd, LOCK_UN); + signal_fd = make_signal_socket (context); + listen_fd = make_listen_socket (context, port_str); + krb5_warnx(context, "ipropd-master started at version: %lu", (unsigned long)current_version); roken_detach_finish(NULL, daemon_child); + restarter_fd = restarter(context, NULL); while (exit_flag == 0){ slave *p; @@ -1194,10 +1212,12 @@ main(int argc, char **argv) int max_fd = 0; struct timeval to = {30, 0}; uint32_t vers; + struct stat st2; #ifndef NO_LIMIT_FD_SETSIZE - if (signal_fd >= FD_SETSIZE || listen_fd >= FD_SETSIZE) - krb5_errx (context, 1, "fd too large"); + if (signal_fd >= FD_SETSIZE || listen_fd >= FD_SETSIZE || + restarter_fd >= FD_SETSIZE) + krb5_errx (context, IPROPD_RESTART, "fd too large"); #endif FD_ZERO(&readset); @@ -1205,6 +1225,10 @@ main(int argc, char **argv) max_fd = max(max_fd, signal_fd); FD_SET(listen_fd, &readset); max_fd = max(max_fd, listen_fd); + if (restarter_fd
> -1) { + FD_SET(restarter_fd, &readset); + max_fd = max(max_fd, restarter_fd); + } for (p = slaves; p != NULL; p = p->next) { if (p->flags & SLAVE_F_DEAD) @@ -1219,7 +1243,7 @@ main(int argc, char **argv) if (errno == EINTR) continue; else - krb5_err (context, 1, errno, "select"); + krb5_err (context, IPROPD_RESTART, errno, "select"); } if (stat(server_context->log_context.log_file, &st2) == -1) { @@ -1232,15 +1256,15 @@ main(int argc, char **argv) log_fd = open(server_context->log_context.log_file, O_RDONLY, 0); if (log_fd < 0) - krb5_err(context, 1, 1, "open %s", + krb5_err(context, IPROPD_RESTART_SLOW, errno, "open %s", server_context->log_context.log_file); if (fstat(log_fd, &st) == -1) - krb5_err(context, 1, errno, "stat %s", + krb5_err(context, IPROPD_RESTART_SLOW, errno, "stat %s", server_context->log_context.log_file); if (flock(log_fd, LOCK_SH) == -1) - krb5_err(context, 1, errno, "shared flock %s", + krb5_err(context, IPROPD_RESTART, errno, "shared flock %s", server_context->log_context.log_file); kadm5_log_get_version_fd(server_context, log_fd, LOG_VERSION_LAST, &current_version, &current_tstamp); @@ -1252,10 +1276,9 @@ main(int argc, char **argv) if (kadm5_log_init_nb(server_context) == 0) kadm5_log_end(server_context); - if (flock(log_fd, LOCK_SH) == -1) { - krb5_err(context, 1, errno, + if (flock(log_fd, LOCK_SH) == -1) + krb5_err(context, IPROPD_RESTART, errno, "could not lock log file"); - } kadm5_log_get_version_fd(server_context, log_fd, LOG_VERSION_LAST, &current_version, &current_tstamp); flock(log_fd, LOCK_UN); @@ -1274,6 +1297,11 @@ main(int argc, char **argv) } } + if (ret && restarter_fd > -1 && FD_ISSET(restarter_fd, &readset)) { + exit_flag = SIGTERM; + break; + } + if (ret && FD_ISSET(signal_fd, &readset)) { #ifndef NO_UNIX_SOCKETS struct sockaddr_un peer_addr; @@ -1291,7 +1319,7 @@ main(int argc, char **argv) assert(ret >= 0); old_version = current_version; if (flock(log_fd, LOCK_SH) == -1) - krb5_err(context, 1, errno, "shared flock %s", + krb5_err(context, IPROPD_RESTART, errno,
"shared flock %s", server_context->log_context.log_file); kadm5_log_get_version_fd(server_context, log_fd, LOG_VERSION_LAST, &current_version, &current_tstamp); diff --git a/lib/kadm5/ipropd_slave.c b/lib/kadm5/ipropd_slave.c index 5c158faa9..ce8ee605d 100644 --- a/lib/kadm5/ipropd_slave.c +++ b/lib/kadm5/ipropd_slave.c @@ -37,6 +37,8 @@ RCSID("$Id$"); static const char *config_name = "ipropd-slave"; +static int verbose; + static krb5_log_facility *log_facility; static char five_min[] = "5 min"; static char *server_time_lost = five_min; @@ -164,18 +166,123 @@ ihave(krb5_context context, krb5_auth_context auth_context, krb5_data data; sp = krb5_storage_from_mem(buf, 8); - krb5_store_int32(sp, I_HAVE); - krb5_store_int32(sp, version); + krb5_store_uint32(sp, I_HAVE); + krb5_store_uint32(sp, version); krb5_storage_free(sp); data.length = 8; data.data = buf; + if (verbose) + krb5_warnx(context, "telling master we are at %u", version); + ret = krb5_write_priv_message(context, auth_context, &fd, &data); if (ret) krb5_warn(context, ret, "krb5_write_message"); return ret; } +static int +append_to_log_file(krb5_context context, + kadm5_server_context *server_context, + krb5_storage *sp, off_t start, ssize_t slen) +{ + size_t len; + ssize_t sret; + off_t log_off; + int ret, ret2; + void *buf; + + if (verbose) + krb5_warnx(context, "appending diffs to log"); + + if (slen == 0) + return 0; + if (slen < 0) + return EINVAL; + len = slen; + if (len != slen) + return EOVERFLOW; + + buf = malloc(len); + if (buf == NULL && len != 0) { + krb5_warn(context, errno, "malloc: no memory"); + return ENOMEM; + } + + if (krb5_storage_seek(sp, start, SEEK_SET) != start) { + krb5_errx(context, IPROPD_RESTART, + "krb5_storage_seek() failed"); /* can't happen */ + } + sret = krb5_storage_read(sp, buf, len); + /* Release buf before bailing out so a failed read does not leak it */ + if (sret < 0) { ret = errno; free(buf); return ret; } + if (len != (size_t)sret) { + /* Can't happen */ + krb5_errx(context, IPROPD_RESTART, + "short krb5_storage_read() from memory buffer"); + } + log_off =
lseek(server_context->log_context.log_fd, 0, SEEK_CUR); + + /* + * Use net_write() so we get an errno if fewer than len bytes were + * written. + */ + sret = net_write(server_context->log_context.log_fd, buf, len); + ret = (sret != slen) ? errno : 0; /* capture errno before free() */ + free(buf); + /* fsync() returns -1 and sets errno; keep the errno value in ret */ + if (ret == 0 && fsync(server_context->log_context.log_fd) == -1) + ret = errno; + if (ret == 0) + return 0; + + /* + * Attempt to recover from this. First, truncate the log file + * and reset the fd offset. Failure to do this -> unlink the + * log file and re-create it. Since we're the slave, we ought to be + * able to recover from the log being unlinked... + */ + if (ftruncate(server_context->log_context.log_fd, log_off) == -1 || + lseek(server_context->log_context.log_fd, log_off, SEEK_SET) == -1) { + (void) kadm5_log_end(server_context); + if (unlink(server_context->log_context.log_file) == -1) { + krb5_err(context, IPROPD_FATAL, errno, + "Failed to recover from failure to write log " + "entries from master to disk"); + } + ret2 = kadm5_log_init(server_context); + if (ret2) { + krb5_err(context, IPROPD_RESTART_SLOW, ret2, + "Failed to initialize log to recover from " + "failure to write log entries from master to disk"); + } + } + if (ret == ENOSPC || ret == EDQUOT || ret == EFBIG) { + /* Unlink the file in these cases. */ + krb5_warn(context, ret, + "Failed to write log entries from master to disk"); + (void) kadm5_log_end(server_context); + if (unlink(server_context->log_context.log_file) == -1) { + krb5_err(context, IPROPD_FATAL, errno, + "Failed to recover from failure to write log " + "entries from master to disk"); + } + ret2 = kadm5_log_init(server_context); + if (ret2) { + krb5_err(context, IPROPD_RESTART_SLOW, ret2, + "Failed to initialize log to recover from " + "failure to write log entries from master to disk"); + } + return ret; + } + /* + * All other errors we treat as fatal here. This includes, for + * example, EIO and EPIPE (sorry, can't log to pipes nor sockets).
+ */ + krb5_err(context, IPROPD_FATAL, ret, + "Failed to write log entries from master to disk"); +} + static int receive_loop (krb5_context context, krb5_storage *sp, @@ -183,22 +290,31 @@ receive_loop (krb5_context context, { int ret; off_t left, right, off; - size_t mlen; - void *buf; - int32_t len, vers, vers2; - ssize_t sret, smlen; + uint32_t len, vers; + + if (verbose) + krb5_warnx(context, "receiving diffs"); /* * Seek to the first entry in the message from the master that is * past the current version of the local database. */ do { - int32_t timestamp, tmp; + uint32_t timestamp; + uint32_t op; - if (krb5_ret_int32(sp, &vers) != 0 || - krb5_ret_int32(sp, &timestamp) != 0 || - krb5_ret_int32(sp, &tmp) != 0 || - krb5_ret_int32(sp, &len) != 0) { + /* + * TODO We could do more to validate the entries from the master + * here. And we could use/reuse more kadm5_log_*() code here. + * + * Alternatively we should trust that the master sent us exactly + * what we needed and just write this to the log file and let + * kadm5_log_recover() do the rest. + */ + if (krb5_ret_uint32(sp, &vers) != 0 || + krb5_ret_uint32(sp, &timestamp) != 0 || + krb5_ret_uint32(sp, &op) != 0 || + krb5_ret_uint32(sp, &len) != 0) { /* * This shouldn't happen.
Reconnecting probably won't help @@ -208,19 +324,20 @@ receive_loop (krb5_context context, krb5_warnx(context, "iprop entries from master were truncated"); return EINVAL; } - if (len < 0) { - krb5_warnx(context, "master sent entry with negative length for" - "version %ld", (long)vers); - return EINVAL; - } - if ((uint32_t)vers > server_context->log_context.version) + if (vers > server_context->log_context.version) { break; + } off = krb5_storage_seek(sp, 0, SEEK_CUR); if (krb5_storage_seek(sp, len + 8, SEEK_CUR) != off + len + 8) { krb5_warnx(context, "iprop entries from master were truncated"); return 0; } - } while((uint32_t)vers <= server_context->log_context.version); + if (verbose) { + krb5_warnx(context, "diff contains old log record version " + "%u %lld %u length %u", + vers, (long long)timestamp, op, len); + } + } while(vers <= server_context->log_context.version); /* * Read the remaining entries into memory... @@ -233,161 +350,38 @@ receive_loop (krb5_context context, return EINVAL; } - mlen = (size_t)(right - left); - smlen = right - left; - buf = malloc (mlen); - if (buf == NULL && mlen != 0) { - krb5_warn(context, errno, "malloc: no memory"); - return ENOMEM; - } - /* * ...and then write them out to the on-disk log. */ - /* NOTE: We haven't validated the entries yet */ - if (krb5_storage_seek(sp, left, SEEK_SET) != left) - krb5_errx(context, 1, "krb5_storage_seek() failed"); - sret = krb5_storage_read(sp, buf, mlen); - if (sret < 0) - return errno; - if (mlen != (size_t)sret) - krb5_errx(context, 1, "short krb5_storage_read() from memory buffer"); - sret = write(server_context->log_context.log_fd, buf, mlen); - if (sret != smlen) { - /* This is probably ENOSPC. We can't recover. */ - krb5_err(context, 1, errno, "Failed to write log to disk"); - } - ret = fsync(server_context->log_context.log_fd); - if (ret) { - /* This is also probably ENOSPC. We can't recover. 
*/ - krb5_err(context, 1, errno, "Failed to sync log to disk"); - } - free(buf); + + ret = append_to_log_file(context, server_context, sp, left, right - left); + if (ret) + return ret; /* - * Go back to the startpoint and commit the entries to the HDB. + * Replay the new entries. */ - krb5_storage_seek(sp, left, SEEK_SET); + if (verbose) + krb5_warnx(context, "replaying entries from master"); ret = kadm5_log_recover(server_context, kadm_recover_replay); if (ret) { - krb5_warn(context, ret, "replay of entries from master failed"); + krb5_warn(context, ret, "replay failed"); return ret; } - for (;;) { - int32_t len2, timestamp, tmp; - off_t cur, cur2; - enum kadm_ops op; + ret = kadm5_log_get_version(server_context, &vers); + if (ret) { + krb5_warn(context, ret, + "could not get log version after applying diffs!"); + return ret; + } + if (verbose) + krb5_warnx(context, "slave at version %u", vers); - if (krb5_ret_int32(sp, &vers) != 0) - break; - ret = krb5_ret_int32(sp, &timestamp); - if (ret) { - krb5_warnx(context, "entry %ld: too short", (long)vers); - return EINVAL; - } - ret = krb5_ret_int32(sp, &tmp); - if (ret) { - krb5_warnx(context, "entry %ld: too short", (long)vers); - return EINVAL; - } - op = tmp; - ret = krb5_ret_int32(sp, &len); - if (ret) { - krb5_warnx(context, "entry %ld: too short", (long)vers); - return EINVAL; - } - if (len < 0) { - krb5_warnx(context, "entry %ld: negative length (%ld); " - "master is confused", (long)vers, (long)len); - return EINVAL; - } - cur = krb5_storage_seek(sp, 0, SEEK_CUR); - - krb5_warnx(context, "replaying entry %d", (int)vers); - - /* - * kadm5_log_replay() returns errors from among others, the HDB - * layer, which can return errors from the actual DBs, some of - * which return -1 and set errno, and some of which return - * system error codes.
- */ - ret = kadm5_log_replay(server_context, - op, vers, len, sp); - if (ret == -1 && errno != 0) - ret = errno; - if (ret) { - const char *s = krb5_get_error_message(server_context->context, ret); - - /* - * XXX We don't really know here whether the error is - * recoverable or not. Some HDB errors might be safe to - * ignore, and others will not be (e.g., any resulting from - * ENOSPC), but we can't tell which is which, particularly - * as errors from the databases are not mapped to HDB_ERR_*. - * - * We do our best to die if the error is not recoverable. - */ - switch (ret) { -#ifdef EDQUOT - case EDQUOT: -#endif - case ENOSPC: - case EPIPE: - case EINTR: - case EFBIG: - case EIO: - krb5_err(context, 1, ret, "kadm5_log_replay: %ld. Fatal write " - "error: %s (%d)", (long)vers, - s ? s : "unknown error", ret); - } - - krb5_warnx(context, - "kadm5_log_replay: %ld. Replay failed. " - "Database out of sync?: %s (%d)", - (long)vers, s ? s : "unknown error", ret); - krb5_free_error_message(context, s); - } - - { - /* - * Make sure that kadm5_log_replay() read the whole entry - * from sp and left the sp offset at the start of the - * trailer. - */ - cur2 = krb5_storage_seek(sp, 0, SEEK_CUR); - if (cur + len != cur2) - krb5_errx(context, 1, - "kadm5_log_reply version: %ld didn't read the whole entry", - (long)vers); - } - - if (krb5_ret_int32(sp, &len2) != 0) { - krb5_warnx(context, "entry %ld: postamble too short; " - "master is confused", (long)vers); - return EINVAL; - } - if(krb5_ret_int32(sp, &vers2) != 0) { - krb5_warnx(context, "entry %ld: postamble too short; " - "master is confused", (long)vers); - return EINVAL; - } - if (len != len2) { - krb5_warnx(context, "entry %ld: len != len2; master is " - "confused", (long)vers); - return EINVAL; - } - if (vers != vers2) { - krb5_warnx(context, "entry %ld: vers != vers2; master is " - "confused", (long)vers); - return EINVAL; - } - - /* - * Update version after each replay. 
- */ - server_context->log_context.version = vers; - kadm5_log_update_uber(server_context); + if (vers != server_context->log_context.version) { + krb5_warnx(context, "slave's log_context version (%u) is " + "inconsistent with log's version (%u)", + server_context->log_context.version, vers); } return 0; @@ -404,7 +398,7 @@ receive(krb5_context context, server_context->db, O_RDWR | O_CREAT, 0600); if (ret) - krb5_err(context, 1, ret, "db->open"); + krb5_err(context, IPROPD_RESTART_SLOW, ret, "db->open"); ret2 = receive_loop(context, sp, server_context); if (ret2) @@ -412,7 +406,7 @@ receive(krb5_context context, ret = server_context->db->hdb_close(context, server_context->db); if (ret) - krb5_err(context, 1, ret, "db->close"); + krb5_err(context, IPROPD_RESTART_SLOW, ret, "db->close"); return ret2; } @@ -427,19 +421,22 @@ send_im_here(krb5_context context, int fd, ret = krb5_data_alloc(&data, 4); if (ret) - krb5_err(context, 1, ret, "send_im_here"); + krb5_err(context, IPROPD_RESTART, ret, "send_im_here"); sp = krb5_storage_from_data (&data); if (sp == NULL) - krb5_errx(context, 1, "krb5_storage_from_data"); - krb5_store_int32(sp, I_AM_HERE); + krb5_errx(context, IPROPD_RESTART, "krb5_storage_from_data"); + krb5_store_uint32(sp, I_AM_HERE); krb5_storage_free(sp); ret = krb5_write_priv_message(context, auth_context, &fd, &data); krb5_data_free(&data); if (ret) - krb5_err(context, 1, ret, "krb5_write_priv_message"); + krb5_err(context, IPROPD_RESTART, ret, "krb5_write_priv_message"); + + if (verbose) + krb5_warnx(context, "pinged master"); return; } @@ -447,13 +444,16 @@ send_im_here(krb5_context context, int fd, static void reinit_log(krb5_context context, kadm5_server_context *server_context, - int32_t vno) + uint32_t vno) { krb5_error_code ret; + if (verbose) + krb5_warnx(context, "truncating log on slave"); + ret = kadm5_log_reinit(server_context); if (ret) - krb5_err(context, 1, ret, "kadm5_log_reinit"); + krb5_err(context, IPROPD_RESTART_SLOW, ret, 
"kadm5_log_reinit"); } @@ -464,8 +464,8 @@ receive_everything(krb5_context context, int fd, { int ret; krb5_data data; - int32_t vno = 0; - int32_t opcode; + uint32_t vno = 0; + uint32_t opcode; krb5_storage *sp; char *dbname; @@ -475,22 +475,22 @@ receive_everything(krb5_context context, int fd, ret = asprintf(&dbname, "%s-NEW", server_context->db->hdb_name); if (ret == -1) - krb5_err(context, 1, ENOMEM, "asprintf"); + krb5_err(context, IPROPD_RESTART, ENOMEM, "asprintf"); ret = hdb_create(context, &mydb, dbname); if(ret) - krb5_err(context,1, ret, "hdb_create"); + krb5_err(context, IPROPD_RESTART, ret, "hdb_create"); free(dbname); ret = hdb_set_master_keyfile(context, mydb, server_context->config.stash_file); if(ret) - krb5_err(context,1, ret, "hdb_set_master_keyfile"); + krb5_err(context, IPROPD_RESTART, ret, "hdb_set_master_keyfile"); /* I really want to use O_EXCL here, but given that I can't easily clean up on error, I won't */ ret = mydb->hdb_open(context, mydb, O_RDWR | O_CREAT | O_TRUNC, 0600); if (ret) - krb5_err(context, 1, ret, "db->open"); + krb5_err(context, IPROPD_RESTART, ret, "db->open"); sp = NULL; krb5_data_zero(&data); @@ -504,8 +504,8 @@ receive_everything(krb5_context context, int fd, sp = krb5_storage_from_data(&data); if (sp == NULL) - krb5_errx(context, 1, "krb5_storage_from_data"); - krb5_ret_int32(sp, &opcode); + krb5_errx(context, IPROPD_RESTART, "krb5_storage_from_data"); + krb5_ret_uint32(sp, &opcode); if (opcode == ONE_PRINC) { krb5_data fake_data; hdb_entry_ex entry; @@ -519,12 +519,12 @@ receive_everything(krb5_context context, int fd, ret = hdb_value2entry(context, &fake_data, &entry.entry); if (ret) - krb5_err(context, 1, ret, "hdb_value2entry"); + krb5_err(context, IPROPD_RESTART, ret, "hdb_value2entry"); ret = mydb->hdb_store(server_context->context, mydb, 0, &entry); if (ret) - krb5_err(context, 1, ret, "hdb_store"); + krb5_err(context, IPROPD_RESTART_SLOW, ret, "hdb_store"); hdb_free_entry(context, &entry); 
krb5_data_free(&data); @@ -535,20 +535,21 @@ receive_everything(krb5_context context, int fd, } while (opcode == ONE_PRINC); if (opcode != NOW_YOU_HAVE) - krb5_errx(context, 1, "receive_everything: strange %d", opcode); + krb5_errx(context, IPROPD_RESTART_SLOW, + "receive_everything: strange %u", opcode); - krb5_ret_int32(sp, &vno); + krb5_ret_uint32(sp, &vno); krb5_storage_free(sp); reinit_log(context, server_context, vno); ret = mydb->hdb_close(context, mydb); if (ret) - krb5_err(context, 1, ret, "db->close"); + krb5_err(context, IPROPD_RESTART_SLOW, ret, "db->close"); ret = mydb->hdb_rename(context, mydb, server_context->db->hdb_name); if (ret) - krb5_err(context, 1, ret, "db->rename"); + krb5_err(context, IPROPD_RESTART_SLOW, ret, "db->rename"); server_context->log_context.version = vno; @@ -558,11 +559,11 @@ receive_everything(krb5_context context, int fd, krb5_data_free(&data); if (ret) - krb5_err(context, 1, ret, "db->close"); + krb5_err(context, IPROPD_RESTART_SLOW, ret, "db->close"); ret = mydb->hdb_destroy(context, mydb); if (ret) - krb5_err(context, 1, ret, "db->destroy"); + krb5_err(context, IPROPD_RESTART, ret, "db->destroy"); krb5_warnx(context, "receive complete database, version %ld", (long)vno); return ret; @@ -639,6 +640,7 @@ static struct getargs args[] = { "private argument, do not use", NULL }, { "hostname", 0, arg_string, rk_UNCONST(&slave_str), "hostname of slave (if not same as hostname)", "hostname" }, + { "verbose", 0, arg_flag, &verbose, NULL, NULL }, { "version", 0, arg_flag, &version_flag, NULL, NULL }, { "help", 0, arg_flag, &help_flag, NULL, NULL } }; @@ -671,6 +673,7 @@ main(int argc, char **argv) time_t reconnect_max; time_t reconnect; time_t before = 0; + int restarter_fd = -1; const char *master; @@ -783,11 +786,23 @@ main(int argc, char **argv) slave_status(context, status_file, "ipropd-slave started"); roken_detach_finish(NULL, daemon_child); + restarter_fd = restarter(context, NULL); while (!exit_flag) { + struct timeval to; 
time_t now, elapsed; + fd_set readset; int connected = FALSE; +#ifndef NO_LIMIT_FD_SETSIZE + if (restarter_fd >= FD_SETSIZE) + krb5_errx(context, IPROPD_RESTART, "fd too large"); +#endif + + FD_ZERO(&readset); + if (restarter_fd > -1) + FD_SET(restarter_fd, &readset); + now = time(NULL); elapsed = now - before; @@ -795,7 +810,12 @@ main(int argc, char **argv) time_t left = reconnect - elapsed; krb5_warnx(context, "sleeping %d seconds before " "retrying to connect", (int)left); - sleep(left); + to.tv_sec = left; + to.tv_usec = 0; + if (select(restarter_fd + 1, &readset, NULL, NULL, &to) == 1) { + exit_flag = SIGTERM; + continue; + } } before = now; @@ -813,6 +833,8 @@ main(int argc, char **argv) krb5_cc_destroy(context, ccache); get_creds(context, keytab_str, &ccache, master); } + if (verbose) + krb5_warnx(context, "authenticating to master"); ret = krb5_sendauth (context, &auth_context, &master_fd, IPROP_VERSION, NULL, server, AP_OPTS_MUTUAL_REQUIRED, NULL, NULL, @@ -832,27 +854,34 @@ main(int argc, char **argv) connected = TRUE; + if (verbose) + krb5_warnx(context, "connected to master"); + slave_status(context, status_file, "connected to master, waiting instructions"); while (connected && !exit_flag) { krb5_data out; krb5_storage *sp; - int32_t tmp; - fd_set readset; - struct timeval to; + uint32_t tmp; + int max_fd; #ifndef NO_LIMIT_FD_SETSIZE if (master_fd >= FD_SETSIZE) - krb5_errx (context, 1, "fd too large"); + krb5_errx(context, IPROPD_RESTART, "fd too large"); + if (restarter_fd >= FD_SETSIZE) + krb5_errx(context, IPROPD_RESTART, "fd too large"); #endif + max_fd = max(restarter_fd, master_fd); /* set unconditionally: select() below always reads it */ FD_ZERO(&readset); FD_SET(master_fd, &readset); + if (restarter_fd != -1) + FD_SET(restarter_fd, &readset); to.tv_sec = time_before_lost; to.tv_usec = 0; - ret = select (master_fd + 1, + ret = select (max_fd + 1, &readset, NULL, NULL, &to); if (ret < 0) { if (errno == EINTR) @@ -867,6 +896,18 @@ main(int argc, char **argv) continue; } + if (restarter_fd != -1 && FD_ISSET(restarter_fd, 
&readset)) { + if (verbose) + krb5_warnx(context, "slave restarter exited"); + exit_flag = SIGTERM; + } + + if (!FD_ISSET(master_fd, &readset)) + continue; + + if (verbose) + krb5_warnx(context, "message from master"); + ret = krb5_read_priv_message(context, auth_context, &master_fd, &out); if (ret) { krb5_warn(context, ret, "krb5_read_priv_message"); @@ -876,8 +917,8 @@ main(int argc, char **argv) sp = krb5_storage_from_mem (out.data, out.length); if (sp == NULL) - krb5_err(context, 1, errno, "krb5_storage_from_mem"); - ret = krb5_ret_int32(sp, &tmp); + krb5_err(context, IPROPD_RESTART, errno, "krb5_storage_from_mem"); + ret = krb5_ret_uint32(sp, &tmp); if (ret == HEIM_ERR_EOF) { krb5_warn(context, ret, "master sent zero-length message"); connected = FALSE; @@ -891,11 +932,13 @@ main(int argc, char **argv) ret = kadm5_log_init(server_context); if (ret) { - krb5_err(context, 1, ret, "kadm5_log_init while handling a " - "message from the master"); + krb5_err(context, IPROPD_RESTART, ret, "kadm5_log_init while " + "handling a message from the master"); } switch (tmp) { case FOR_YOU : + if (verbose) + krb5_warnx(context, "master sent us diffs"); ret2 = receive(context, sp, server_context); if (ret2) krb5_warn(context, ret, @@ -913,6 +956,8 @@ main(int argc, char **argv) is_up_to_date(context, status_file, server_context); break; case TELL_YOU_EVERYTHING : + if (verbose) + krb5_warnx(context, "master sent us a full dump"); ret = receive_everything(context, master_fd, server_context, auth_context); if (ret == 0) { @@ -925,6 +970,8 @@ main(int argc, char **argv) is_up_to_date(context, status_file, server_context); break; case ARE_YOU_THERE : + if (verbose) + krb5_warnx(context, "master sent us a ping"); is_up_to_date(context, status_file, server_context); ret = ihave(context, auth_context, master_fd, server_context->log_context.version); @@ -934,6 +981,8 @@ main(int argc, char **argv) send_im_here(context, master_fd, auth_context); break; case YOU_HAVE_LAST_VERSION: + if 
(verbose) + krb5_warnx(context, "master tells us we are up to date"); is_up_to_date(context, status_file, server_context); break; case NOW_YOU_HAVE : diff --git a/lib/kadm5/log.c b/lib/kadm5/log.c index 34658ff6f..d7cdd6891 100644 --- a/lib/kadm5/log.c +++ b/lib/kadm5/log.c @@ -186,12 +186,11 @@ RCSID("$Id$"); * Preserves sp's offset on failure where possible. */ static kadm5_ret_t -get_header(krb5_storage *sp, int peek, uint32_t *verp, int32_t *tstampp, +get_header(krb5_storage *sp, int peek, uint32_t *verp, uint32_t *tstampp, enum kadm_ops *opp, uint32_t *lenp) { krb5_error_code ret; - uint32_t op, len; - int32_t tstamp; + uint32_t tstamp, op, len; off_t off, new_off; if (tstampp == NULL) @@ -212,7 +211,7 @@ get_header(krb5_storage *sp, int peek, uint32_t *verp, int32_t *tstampp, } if (ret) goto log_corrupt; - ret = krb5_ret_int32(sp, tstampp); + ret = krb5_ret_uint32(sp, tstampp); if (ret) goto log_corrupt; @@ -330,7 +329,7 @@ seek_next(krb5_storage *sp) krb5_error_code ret; uint32_t ver, ver2, len, len2; enum kadm_ops op; - int32_t tstamp; + uint32_t tstamp; off_t off, off_len, new_off; off = krb5_storage_seek(sp, 0, SEEK_CUR); @@ -457,11 +456,11 @@ static krb5_storage *log_goto_first(kadm5_server_context *, int); */ kadm5_ret_t kadm5_log_get_version_fd(kadm5_server_context *server_context, int fd, - int which, uint32_t *ver, int32_t *tstamp) + int which, uint32_t *ver, uint32_t *tstamp) { kadm5_ret_t ret; krb5_storage *sp; - int32_t tmp; + uint32_t tmp; if (fd == -1) return 0; /* /dev/null */ @@ -1522,7 +1521,7 @@ log_update_uber(kadm5_server_context *context, off_t off) ret = krb5_store_uint64(mem_sp, off); if (ret) goto out; - ret = krb5_store_int32(mem_sp, log_context->last_time); + ret = krb5_store_uint32(mem_sp, log_context->last_time); if (ret) goto out; ret = krb5_store_uint32(mem_sp, log_context->version); @@ -1809,7 +1808,7 @@ kadm5_log_foreach(kadm5_server_context *context, for (;;) { uint32_t ver, ver2, len, len2; - int32_t tstamp; + uint32_t 
tstamp; time_t timestamp; enum kadm_ops op; @@ -1977,7 +1976,7 @@ kadm5_log_goto_end(kadm5_server_context *server_context, int fd) krb5_storage *sp; enum kadm_ops op; uint32_t ver, len; - int32_t tstamp; + uint32_t tstamp; uint64_t off; if (fd == -1) { @@ -2075,7 +2074,7 @@ kadm5_log_previous(krb5_context context, krb5_error_code ret; off_t oldoff; uint32_t ver2, len2; - int32_t tstamp; + uint32_t tstamp; oldoff = krb5_storage_seek(sp, 0, SEEK_CUR); if (oldoff == -1)