From fb9b76fe1435c83d84a3144f1c06ddb1378899b8 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 26 Sep 2013 11:13:17 +0200 Subject: [PATCH] Cluster: slave node now uses the new protocol to get elected. --- src/cluster.c | 52 ++++++++++++++++++++++++++++++++++++++------------ src/redis.h | 11 ++++++++--- src/sentinel.c | 2 -- 3 files changed, 48 insertions(+), 17 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index ede00f79..9ce2905e 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -257,6 +257,7 @@ void clusterInit(void) { server.cluster->nodes = dictCreate(&clusterNodesDictType,NULL); server.cluster->failover_auth_time = 0; server.cluster->failover_auth_count = 0; + server.cluster->failover_auth_epoch = 0; memset(server.cluster->migrating_slots_to,0, sizeof(server.cluster->migrating_slots_to)); memset(server.cluster->importing_slots_from,0, @@ -1581,16 +1582,22 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { * * The gaol of this function is: * 1) To check if we are able to perform a failover, is our data updated? - * 2) Ask reachable masters the authorization to perform the failover. + * 2) Try to get elected by masters. * 3) Check if there is the majority of masters agreeing we should failover. * 4) Perform the failover informing all the other nodes. */ void clusterHandleSlaveFailover(void) { time_t data_age = server.unixtime - server.repl_down_since; - time_t auth_age = server.unixtime - server.cluster->failover_auth_time; + mstime_t auth_age = mstime() - server.cluster->failover_auth_time; int needed_quorum = (server.cluster->size / 2) + 1; int j; + /* Remove the node timeout from the data age as it is fine that we are + * disconnected from our master at least for the time it was down to be + * flagged as FAIL, that's the baseline. */ + if (data_age > server.cluster_node_timeout) + data_age -= server.cluster_node_timeout; + /* Check if our data is recent enough. 
For now we just use a fixed * constant of ten times the node timeout since the cluster should * react much faster to a master down. */ @@ -1598,19 +1605,37 @@ void clusterHandleSlaveFailover(void) { server.cluster_node_timeout * REDIS_CLUSTER_SLAVE_VALIDITY_MULT) return; - /* TODO: check if we are the first slave as well? Or just rely on the - * master authorization? */ - - /* Ask masters if we are authorized to perform the failover. If there - * is a pending auth request that's too old, reset it. */ + /* Compute the time at which we can start an election. */ if (server.cluster->failover_auth_time == 0 || auth_age > - server.cluster_node_timeout * REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT) + server.cluster_node_timeout * 1000 * REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT) { - redisLog(REDIS_WARNING,"Asking masters if I can failover..."); - server.cluster->failover_auth_time = time(NULL); + server.cluster->failover_auth_time = mstime() + + 500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */ + data_age * 100 + /* Add 100 milliseconds for every second of age. */ + random() % 500; /* Random delay between 0 and 500 milliseconds. */ server.cluster->failover_auth_count = 0; + server.cluster->failover_auth_sent = 0; + redisLog(REDIS_WARNING,"Start of election delayed for %lld milliseconds.", + server.cluster->failover_auth_time - mstime()); + return; + } + + /* Return ASAP if we can't start the election yet. */ + if (mstime() < server.cluster->failover_auth_time) return; + + /* Return ASAP if the election is too old to be valid. The node timeout is in seconds while election times are in milliseconds. */ + if (mstime() - server.cluster->failover_auth_time > server.cluster_node_timeout * 1000) + return; + + /* Ask for votes if needed. 
*/ + if (server.cluster->failover_auth_sent == 0) { + server.cluster->currentEpoch++; + server.cluster->failover_auth_epoch = server.cluster->currentEpoch; + redisLog(REDIS_WARNING,"Starting a failover election for epoch %llu.", + (unsigned long long) server.cluster->currentEpoch); clusterRequestFailoverAuth(); + server.cluster->failover_auth_sent = 1; return; /* Wait for replies. */ } @@ -1619,7 +1644,7 @@ void clusterHandleSlaveFailover(void) { clusterNode *oldmaster = server.cluster->myself->slaveof; redisLog(REDIS_WARNING, - "Masters quorum reached: failing over my (failing) master."); + "Failover election won: failing over my (failing) master."); /* We have the quorum, perform all the steps to correctly promote * this slave to a master. * @@ -1644,7 +1669,10 @@ void clusterHandleSlaveFailover(void) { * accordingly and detect that we switched to master role. */ clusterBroadcastPong(); - /* 4) Update state and save config. */ + /* 4) Update my configEpoch to the epoch of the election. */ + server.cluster->myself->configEpoch = server.cluster->failover_auth_epoch; + + /* 5) Update state and save config. */ clusterUpdateState(); clusterSaveConfigOrDie(); } diff --git a/src/redis.h b/src/redis.h index cd2495fd..2b7ca7a0 100644 --- a/src/redis.h +++ b/src/redis.h @@ -368,6 +368,8 @@ * Data types *----------------------------------------------------------------------------*/ +typedef long long mstime_t; /* millisecond time type. */ + /* A redis object, that is a type able to hold a string / list / set */ /* The actual Redis Object */ @@ -581,7 +583,7 @@ typedef struct redisOpArray { #define REDIS_CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ #define REDIS_CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */ #define REDIS_CLUSTER_SLAVE_VALIDITY_MULT 10 /* Slave data validity. */ -#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT 1 /* Auth request retry time. */ +#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT 4 /* Auth request retry time. 
*/ #define REDIS_CLUSTER_FAILOVER_DELAY 5 /* Seconds */ struct clusterNode; @@ -643,8 +645,11 @@ typedef struct { clusterNode *importing_slots_from[REDIS_CLUSTER_SLOTS]; clusterNode *slots[REDIS_CLUSTER_SLOTS]; zskiplist *slots_to_keys; - int failover_auth_time; /* Time at which we sent the AUTH request. */ - int failover_auth_count; /* Number of authorizations received. */ + /* The following fields are used to track the slave state during elections. */ + mstime_t failover_auth_time;/* Time at which we'll try to get elected in ms. */ + int failover_auth_count; /* Number of votes received so far. */ + int failover_auth_sent; /* True if we already asked for votes. */ + uint64_t failover_auth_epoch; /* Epoch of the current election. */ } clusterState; /* Redis cluster messages header */ diff --git a/src/sentinel.c b/src/sentinel.c index b257ad68..4bea156d 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -43,8 +43,6 @@ extern char **environ; /* ======================== Sentinel global state =========================== */ -typedef long long mstime_t; /* millisecond time type. */ - /* Address object, used to describe an ip:port pair. */ typedef struct sentinelAddr { char *ip;