Cluster: slave node now uses the new protocol to get elected.

This commit is contained in:
antirez 2013-09-26 11:13:17 +02:00
parent 656c3ffe4a
commit fb9b76fe14
3 changed files with 48 additions and 17 deletions

View File

@ -257,6 +257,7 @@ void clusterInit(void) {
server.cluster->nodes = dictCreate(&clusterNodesDictType,NULL); server.cluster->nodes = dictCreate(&clusterNodesDictType,NULL);
server.cluster->failover_auth_time = 0; server.cluster->failover_auth_time = 0;
server.cluster->failover_auth_count = 0; server.cluster->failover_auth_count = 0;
server.cluster->failover_auth_epoch = 0;
memset(server.cluster->migrating_slots_to,0, memset(server.cluster->migrating_slots_to,0,
sizeof(server.cluster->migrating_slots_to)); sizeof(server.cluster->migrating_slots_to));
memset(server.cluster->importing_slots_from,0, memset(server.cluster->importing_slots_from,0,
@ -1581,16 +1582,22 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
* *
* The goal of this function is: * The goal of this function is:
* 1) To check if we are able to perform a failover, is our data updated? * 1) To check if we are able to perform a failover, is our data updated?
* 2) Ask reachable masters the authorization to perform the failover. * 2) Try to get elected by masters.
* 3) Check if there is the majority of masters agreeing we should failover. * 3) Check if there is the majority of masters agreeing we should failover.
* 4) Perform the failover informing all the other nodes. * 4) Perform the failover informing all the other nodes.
*/ */
void clusterHandleSlaveFailover(void) { void clusterHandleSlaveFailover(void) {
time_t data_age = server.unixtime - server.repl_down_since; time_t data_age = server.unixtime - server.repl_down_since;
time_t auth_age = server.unixtime - server.cluster->failover_auth_time; mstime_t auth_age = mstime() - server.cluster->failover_auth_time;
int needed_quorum = (server.cluster->size / 2) + 1; int needed_quorum = (server.cluster->size / 2) + 1;
int j; int j;
/* Remove the node timeout from the data age as it is fine that we are
* disconnected from our master at least for the time it was down to be
* flagged as FAIL, that's the baseline. */
if (data_age > server.cluster_node_timeout)
data_age -= server.cluster_node_timeout;
/* Check if our data is recent enough. For now we just use a fixed /* Check if our data is recent enough. For now we just use a fixed
* constant of ten times the node timeout since the cluster should * constant of ten times the node timeout since the cluster should
* react much faster to a master down. */ * react much faster to a master down. */
@ -1598,19 +1605,37 @@ void clusterHandleSlaveFailover(void) {
server.cluster_node_timeout * REDIS_CLUSTER_SLAVE_VALIDITY_MULT) server.cluster_node_timeout * REDIS_CLUSTER_SLAVE_VALIDITY_MULT)
return; return;
/* TODO: check if we are the first slave as well? Or just rely on the /* Compute the time at which we can start an election. */
* master authorization? */
/* Ask masters if we are authorized to perform the failover. If there
* is a pending auth request that's too old, reset it. */
if (server.cluster->failover_auth_time == 0 || if (server.cluster->failover_auth_time == 0 ||
auth_age > auth_age >
server.cluster_node_timeout * REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT) server.cluster_node_timeout * 1000 * REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT)
{ {
redisLog(REDIS_WARNING,"Asking masters if I can failover..."); server.cluster->failover_auth_time = mstime() +
server.cluster->failover_auth_time = time(NULL); 500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */
data_age * 100 + /* Add 100 milliseconds for every second of age. */
random() % 500; /* Random delay between 0 and 500 milliseconds. */
server.cluster->failover_auth_count = 0; server.cluster->failover_auth_count = 0;
server.cluster->failover_auth_sent = 0;
redisLog(REDIS_WARNING,"Start of election delayed for %lld milliseconds.",
server.cluster->failover_auth_time - mstime());
return;
}
/* Return ASAP if we still can't start the election. */
if (mstime() < server.cluster->failover_auth_time) return;
/* Return ASAP if the election is too old to be valid. */
if (mstime() - server.cluster->failover_auth_time > server.cluster_node_timeout)
return;
/* Ask for votes if needed. */
if (server.cluster->failover_auth_sent == 0) {
server.cluster->currentEpoch++;
server.cluster->failover_auth_epoch = server.cluster->currentEpoch;
redisLog(REDIS_WARNING,"Starting a failover election for epoch %llu.",
server.cluster->currentEpoch);
clusterRequestFailoverAuth(); clusterRequestFailoverAuth();
server.cluster->failover_auth_sent = 1;
return; /* Wait for replies. */ return; /* Wait for replies. */
} }
@ -1619,7 +1644,7 @@ void clusterHandleSlaveFailover(void) {
clusterNode *oldmaster = server.cluster->myself->slaveof; clusterNode *oldmaster = server.cluster->myself->slaveof;
redisLog(REDIS_WARNING, redisLog(REDIS_WARNING,
"Masters quorum reached: failing over my (failing) master."); "Failover election won: failing over my (failing) master.");
/* We have the quorum, perform all the steps to correctly promote /* We have the quorum, perform all the steps to correctly promote
* this slave to a master. * this slave to a master.
* *
@ -1644,7 +1669,10 @@ void clusterHandleSlaveFailover(void) {
* accordingly and detect that we switched to master role. */ * accordingly and detect that we switched to master role. */
clusterBroadcastPong(); clusterBroadcastPong();
/* 4) Update state and save config. */ /* 4) Update my configEpoch to the epoch of the election. */
server.cluster->myself->configEpoch = server.cluster->failover_auth_epoch;
/* 5) Update state and save config. */
clusterUpdateState(); clusterUpdateState();
clusterSaveConfigOrDie(); clusterSaveConfigOrDie();
} }

View File

@ -368,6 +368,8 @@
* Data types * Data types
*----------------------------------------------------------------------------*/ *----------------------------------------------------------------------------*/
typedef long long mstime_t; /* millisecond time type. */
/* A redis object, that is a type able to hold a string / list / set */ /* A redis object, that is a type able to hold a string / list / set */
/* The actual Redis Object */ /* The actual Redis Object */
@ -581,7 +583,7 @@ typedef struct redisOpArray {
#define REDIS_CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ #define REDIS_CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */
#define REDIS_CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */ #define REDIS_CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */
#define REDIS_CLUSTER_SLAVE_VALIDITY_MULT 10 /* Slave data validity. */ #define REDIS_CLUSTER_SLAVE_VALIDITY_MULT 10 /* Slave data validity. */
#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT 1 /* Auth request retry time. */ #define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT 4 /* Auth request retry time. */
#define REDIS_CLUSTER_FAILOVER_DELAY 5 /* Seconds */ #define REDIS_CLUSTER_FAILOVER_DELAY 5 /* Seconds */
struct clusterNode; struct clusterNode;
@ -643,8 +645,11 @@ typedef struct {
clusterNode *importing_slots_from[REDIS_CLUSTER_SLOTS]; clusterNode *importing_slots_from[REDIS_CLUSTER_SLOTS];
clusterNode *slots[REDIS_CLUSTER_SLOTS]; clusterNode *slots[REDIS_CLUSTER_SLOTS];
zskiplist *slots_to_keys; zskiplist *slots_to_keys;
int failover_auth_time; /* Time at which we sent the AUTH request. */ /* The following fields are used to track the slave state during elections. */
int failover_auth_count; /* Number of authorizations received. */ mstime_t failover_auth_time;/* Time at which we'll try to get elected in ms. */
int failover_auth_count; /* Number of votes received so far. */
int failover_auth_sent; /* True if we already asked for votes. */
uint64_t failover_auth_epoch; /* Epoch of the current election. */
} clusterState; } clusterState;
/* Redis cluster messages header */ /* Redis cluster messages header */

View File

@ -43,8 +43,6 @@ extern char **environ;
/* ======================== Sentinel global state =========================== */ /* ======================== Sentinel global state =========================== */
typedef long long mstime_t; /* millisecond time type. */
/* Address object, used to describe an ip:port pair. */ /* Address object, used to describe an ip:port pair. */
typedef struct sentinelAddr { typedef struct sentinelAddr {
char *ip; char *ip;