From 49e70a4c41320318a6550ced8474fe82ca4aa8f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Love=20H=C3=B6rnquist=20=C3=85strand?= Date: Sat, 18 Oct 2008 23:55:57 +0000 Subject: [PATCH] Make slaves more resilient to masters that go down, make them retry now and then. The client tries to connect every "[libdefault] reconnect-min" seconds, and for every time it fails, it backs off by "[libdefault] reconnect-backoff" seconds until it reaches "[libdefault] reconnect-max". On successful connect, the start value is reset to [libdefault] reconnect-min. There are default values that make sense. This patch was created by Buck Huppmann in 2003, and has been nursed along by Alf Wachsmann until I merged it now. While here, add IPv6 support. git-svn-id: svn://svn.h5l.se/heimdal/trunk/heimdal@23936 ec53bebd-3082-4978-b11e-865c3cabbd6b --- lib/kadm5/ipropd_slave.c | 314 +++++++++++++++++++++++++-------------- 1 file changed, 199 insertions(+), 115 deletions(-) diff --git a/lib/kadm5/ipropd_slave.c b/lib/kadm5/ipropd_slave.c index b59948de0..9af9bdcaf 100644 --- a/lib/kadm5/ipropd_slave.c +++ b/lib/kadm5/ipropd_slave.c @@ -35,8 +35,10 @@ RCSID("$Id$"); +static const char *config_name = "ipropd-slave"; + static krb5_log_facility *log_facility; -static char *master_time_lost = "5 min"; +static char *server_time_lost = "5 min"; static int time_before_lost; const char *slave_str = NULL; @@ -44,39 +46,53 @@ static int connect_to_master (krb5_context context, const char *master, const char *port_str) { - int fd; - struct sockaddr_in addr; - struct hostent *he; + char port[NI_MAXSERV]; + struct addrinfo *ai, *a; + struct addrinfo hints; + int error; + int s = -1; - fd = socket (AF_INET, SOCK_STREAM, 0); - if (fd < 0) - krb5_err (context, 1, errno, "socket AF_INET"); - memset (&addr, 0, sizeof(addr)); - addr.sin_family = AF_INET; - if (port_str) { - addr.sin_port = krb5_getportbyname (context, - port_str, "tcp", - 0); - if (addr.sin_port == 0) { - char *ptr; - long port; - - port = strtol (port_str, &ptr, 
10); - if (port == 0 && ptr == port_str) - krb5_errx (context, 1, "bad port `%s'", port_str); - addr.sin_port = htons(port); - } - } else { - addr.sin_port = krb5_getportbyname (context, IPROP_SERVICE, - "tcp", IPROP_PORT); + memset (&hints, 0, sizeof(hints)); + hints.ai_socktype = SOCK_STREAM; + + if (port_str == NULL) { + snprintf(port, sizeof(port), "%u", IPROP_PORT); + port_str = port; } - he = roken_gethostbyname (master); - if (he == NULL) - krb5_errx (context, 1, "gethostbyname: %s", hstrerror(h_errno)); - memcpy (&addr.sin_addr, he->h_addr, sizeof(addr.sin_addr)); - if(connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) - krb5_err (context, 1, errno, "connect"); - return fd; + + error = getaddrinfo (master, port_str, &hints, &ai); + if (error) { + krb5_warnx(context, "Failed to get address of to %s: %s", + master, gai_strerror(error)); + return -1; + } + + for (a = ai; a != NULL; a = a->ai_next) { + char node[NI_MAXHOST]; + error = getnameinfo(a->ai_addr, a->ai_addrlen, + node, sizeof(node), NULL, 0, NI_NUMERICHOST); + if (error) + strlcpy(node, "[unknown-addr]", sizeof(node)); + + s = socket (a->ai_family, a->ai_socktype, a->ai_protocol); + if (s < 0) + continue; + if (connect (s, a->ai_addr, a->ai_addrlen) < 0) { + krb5_warn(context, errno, "connection failed to %s[%s]", + master, node); + close (s); + continue; + } + krb5_warnx(context, "connection successful " + "to master: %s[%s]", master, node); + break; + } + freeaddrinfo (ai); + + if (a == NULL) + return -1; + + return s; } static void @@ -133,7 +149,7 @@ get_creds(krb5_context context, const char *keytab_str, if(ret) krb5_err(context, 1, ret, "krb5_cc_store_cred"); } -static void +static krb5_error_code ihave (krb5_context context, krb5_auth_context auth_context, int fd, uint32_t version) { @@ -151,7 +167,8 @@ ihave (krb5_context context, krb5_auth_context auth_context, ret = krb5_write_priv_message(context, auth_context, &fd, &data); if (ret) - krb5_err (context, 1, ret, 
"krb5_write_priv_message"); + krb5_warn (context, ret, "krb5_write_message"); + return ret; } static void @@ -226,8 +243,8 @@ receive_loop (krb5_context context, if (ret) krb5_errx(context, 1, "entry %ld: too short", (long)vers); if (len < 0) krb5_errx(context, 1, "log is corrupted, " - "negative length of entry version %ld: %ld", - (long)vers, (long)len); + "negative length of entry version %ld: %ld", + (long)vers, (long)len); cur = krb5_storage_seek(sp, 0, SEEK_CUR); krb5_warnx (context, "replaying entry %d", (int)vers); @@ -237,8 +254,8 @@ receive_loop (krb5_context context, if (ret) { const char *s = krb5_get_error_message(server_context->context, ret); krb5_warnx (context, - "kadm5_log_replay: %ld. Lost entry entry, " - "Database out of sync ?: %s (%d)", + "kadm5_log_replay: %ld. Lost entry entry, " + "Database out of sync ?: %s (%d)", (long)vers, s ? s : "unknown error", ret); krb5_free_error_message(context, s); } @@ -318,7 +335,7 @@ send_im_here (krb5_context context, int fd, krb5_err (context, 1, ret, "krb5_write_priv_message"); } -static void +static krb5_error_code receive_everything (krb5_context context, int fd, kadm5_server_context *server_context, krb5_auth_context auth_context) @@ -355,8 +372,10 @@ receive_everything (krb5_context context, int fd, do { ret = krb5_read_priv_message(context, auth_context, &fd, &data); - if (ret) - krb5_err (context, 1, ret, "krb5_read_priv_message"); + if (ret) { + krb5_warn (context, ret, "krb5_read_priv_message"); + goto cleanup; + } sp = krb5_storage_from_data (&data); if (sp == NULL) @@ -408,12 +427,13 @@ receive_everything (krb5_context context, int fd, if (ret) krb5_err (context, 1, ret, "kadm5_log_nop"); - krb5_data_free (&data); - ret = mydb->hdb_rename (context, mydb, server_context->db->hdb_name); if (ret) krb5_err (context, 1, ret, "db->rename"); + cleanup: + krb5_data_free (&data); + ret = mydb->hdb_close (context, mydb); if (ret) krb5_err (context, 1, ret, "db->close"); @@ -423,6 +443,7 @@ 
receive_everything (krb5_context context, int fd, krb5_err (context, 1, ret, "db->destroy"); krb5_warnx(context, "receive complete database, version %ld", (long)vno); + return ret; } static char *config_file; @@ -438,8 +459,8 @@ static struct getargs args[] = { { "realm", 'r', arg_string, &realm }, { "keytab", 'k', arg_string, &keytab_str, "keytab to get authentication from", "kspec" }, - { "time-lost", 0, arg_string, &master_time_lost, - "time before master is considered lost", "time" }, + { "time-lost", 0, arg_string, &server_time_lost, + "time before server is considered lost", "time" }, { "port", 0, arg_string, &port_str, "port ipropd-slave will connect to", "port"}, { "detach", 0, arg_flag, &detach_from_console, @@ -473,6 +494,11 @@ main(int argc, char **argv) krb5_principal server; char **files; int optidx; + time_t reconnect_min; + time_t backoff; + time_t reconnect_max; + time_t reconnect; + time_t before = 0; const char *master; @@ -527,9 +553,9 @@ main(int argc, char **argv) if(ret) krb5_err(context, 1, ret, "krb5_kt_register"); - time_before_lost = parse_time (master_time_lost, "s"); + time_before_lost = parse_time (server_time_lost, "s"); if (time_before_lost < 0) - krb5_errx (context, 1, "couldn't parse time: %s", master_time_lost); + krb5_errx (context, 1, "couldn't parse time: %s", server_time_lost); memset(&conf, 0, sizeof(conf)); if(realm) { @@ -548,92 +574,150 @@ main(int argc, char **argv) server_context = (kadm5_server_context *)kadm_handle; ret = kadm5_log_init (server_context); - if (ret) { - krb5_clear_error_message(context); + if (ret) krb5_err (context, 1, ret, "kadm5_log_init"); - } get_creds(context, keytab_str, &ccache, master); - master_fd = connect_to_master (context, master, port_str); - ret = krb5_sname_to_principal (context, master, IPROP_NAME, KRB5_NT_SRV_HST, &server); if (ret) krb5_err (context, 1, ret, "krb5_sname_to_principal"); auth_context = NULL; - ret = krb5_sendauth (context, &auth_context, &master_fd, - IPROP_VERSION, 
NULL, server, - AP_OPTS_MUTUAL_REQUIRED, NULL, NULL, - ccache, NULL, NULL, NULL); - if (ret) - krb5_err (context, 1, ret, "krb5_sendauth"); + master_fd = -1; - krb5_warnx(context, "ipropd-slave started at version: %ld", - (long)server_context->log_context.version); + krb5_appdefault_time(context, config_name, NULL, "reconnect-min", + 10, &reconnect_min); + krb5_appdefault_time(context, config_name, NULL, "reconnect-max", + 300, &reconnect_max); + krb5_appdefault_time(context, config_name, NULL, "reconnect-backoff", + 10, &backoff); + reconnect = reconnect_min; - ihave (context, auth_context, master_fd, - server_context->log_context.version); + while (!exit_flag) { + time_t now, elapsed; + int connected = FALSE; - while (exit_flag == 0) { - krb5_data out; - krb5_storage *sp; - int32_t tmp; - fd_set readset; - struct timeval to; + now = time(NULL); + elapsed = now - before; - if (master_fd >= FD_SETSIZE) - krb5_errx (context, 1, "fd too large"); - - FD_ZERO(&readset); - FD_SET(master_fd, &readset); - - to.tv_sec = time_before_lost; - to.tv_usec = 0; - - ret = select (master_fd + 1, - &readset, NULL, NULL, &to); - if (ret < 0) { - if (errno == EINTR) - continue; - else - krb5_err (context, 1, errno, "select"); + if (elapsed < reconnect) { + time_t left = reconnect - elapsed; + krb5_warnx(context, "sleeping %d seconds before " + "retrying to connect", (int)left); + sleep(left); } - if (ret == 0) - krb5_errx (context, 1, "server didn't send a message " - "in %d seconds", time_before_lost); + before = now; - ret = krb5_read_priv_message(context, auth_context, &master_fd, &out); + master_fd = connect_to_master (context, master, port_str); + if (master_fd < 0) + goto retry; + reconnect = reconnect_min; + + if (auth_context) { + krb5_auth_con_free(context, auth_context); + auth_context = NULL; + get_creds(context, keytab_str, &ccache, master); + } + ret = krb5_sendauth (context, &auth_context, &master_fd, + IPROP_VERSION, NULL, server, + AP_OPTS_MUTUAL_REQUIRED, NULL, NULL, 
+ ccache, NULL, NULL, NULL); + if (ret) { + krb5_warn (context, ret, "krb5_sendauth"); + goto retry; + } + + krb5_warnx(context, "ipropd-slave started at version: %ld", + (long)server_context->log_context.version); + + ret = ihave (context, auth_context, master_fd, + server_context->log_context.version); if (ret) - krb5_err (context, 1, ret, "krb5_read_priv_message"); + goto retry; + + connected = TRUE; + + while (connected && !exit_flag) { + krb5_data out; + krb5_storage *sp; + int32_t tmp; + fd_set readset; + struct timeval to; + + if (master_fd >= FD_SETSIZE) + krb5_errx (context, 1, "fd too large"); + + FD_ZERO(&readset); + FD_SET(master_fd, &readset); + + to.tv_sec = time_before_lost; + to.tv_usec = 0; + + ret = select (master_fd + 1, + &readset, NULL, NULL, &to); + if (ret < 0) { + if (errno == EINTR) + continue; + else + krb5_err (context, 1, errno, "select"); + } + if (ret == 0) + krb5_errx (context, 1, "server didn't send a message " + "in %d seconds", time_before_lost); + + ret = krb5_read_priv_message(context, auth_context, &master_fd, &out); + if (ret) { + krb5_warn (context, ret, "krb5_read_priv_message"); + connected = FALSE; + continue; + } + + sp = krb5_storage_from_mem (out.data, out.length); + krb5_ret_int32 (sp, &tmp); + switch (tmp) { + case FOR_YOU : + receive (context, sp, server_context); + ret = ihave (context, auth_context, master_fd, + server_context->log_context.version); + if (ret) + connected = FALSE; + break; + case TELL_YOU_EVERYTHING : + ret = receive_everything (context, master_fd, server_context, + auth_context); + if (ret) + connected = FALSE; + break; + case ARE_YOU_THERE : + send_im_here (context, master_fd, auth_context); + break; + case NOW_YOU_HAVE : + case I_HAVE : + case ONE_PRINC : + case I_AM_HERE : + default : + krb5_warnx (context, "Ignoring command %d", tmp); + break; + } + krb5_storage_free (sp); + krb5_data_free (&out); - sp = krb5_storage_from_mem (out.data, out.length); - krb5_ret_int32 (sp, &tmp); - switch (tmp) { 
- case FOR_YOU : - receive (context, sp, server_context); - ihave (context, auth_context, master_fd, - server_context->log_context.version); - break; - case TELL_YOU_EVERYTHING : - receive_everything (context, master_fd, server_context, - auth_context); - break; - case ARE_YOU_THERE : - send_im_here (context, master_fd, auth_context); - break; - case NOW_YOU_HAVE : - case I_HAVE : - case ONE_PRINC : - case I_AM_HERE : - default : - krb5_warnx (context, "Ignoring command %d", tmp); - break; } - krb5_storage_free (sp); - krb5_data_free (&out); + retry: + if (connected == FALSE) + krb5_warnx (context, "disconnected for server"); + if (exit_flag) + krb5_warnx (context, "got an exit signal"); + + if (master_fd >= 0) + close(master_fd); + + reconnect += backoff; + if (reconnect > reconnect_max) + reconnect = reconnect_max; } if(exit_flag == SIGXCPU) @@ -642,7 +726,7 @@ main(int argc, char **argv) krb5_warnx(context, "%s terminated", getprogname()); else krb5_warnx(context, "%s unexpected exit reason: %ld", - getprogname(), (long)exit_flag); - + getprogname(), (long)exit_flag); + return 0; }