mirror of
https://github.com/fluencelabs/redis
synced 2025-03-17 16:10:50 +00:00
Sentinel: epoch introduced in leader vote.
This commit is contained in:
parent
8c1bf9a2bd
commit
90ab62fd5e
213
src/sentinel.c
213
src/sentinel.c
@ -65,13 +65,12 @@ typedef struct sentinelAddr {
|
||||
#define SRI_CAN_FAILOVER (1<<7)
|
||||
#define SRI_FAILOVER_IN_PROGRESS (1<<8) /* Failover is in progress for
|
||||
this master. */
|
||||
#define SRI_I_AM_THE_LEADER (1<<9) /* We are the leader for this master. */
|
||||
#define SRI_PROMOTED (1<<10) /* Slave selected for promotion. */
|
||||
#define SRI_RECONF_SENT (1<<11) /* SLAVEOF <newmaster> sent. */
|
||||
#define SRI_RECONF_INPROG (1<<12) /* Slave synchronization in progress. */
|
||||
#define SRI_RECONF_DONE (1<<13) /* Slave synchronized with new master. */
|
||||
#define SRI_FORCE_FAILOVER (1<<14) /* Force failover with master up. */
|
||||
#define SRI_SCRIPT_KILL_SENT (1<<15) /* SCRIPT KILL already sent on -BUSY */
|
||||
#define SRI_PROMOTED (1<<9) /* Slave selected for promotion. */
|
||||
#define SRI_RECONF_SENT (1<<10) /* SLAVEOF <newmaster> sent. */
|
||||
#define SRI_RECONF_INPROG (1<<11) /* Slave synchronization in progress. */
|
||||
#define SRI_RECONF_DONE (1<<12) /* Slave synchronized with new master. */
|
||||
#define SRI_FORCE_FAILOVER (1<<13) /* Force failover with master up. */
|
||||
#define SRI_SCRIPT_KILL_SENT (1<<14) /* SCRIPT KILL already sent on -BUSY */
|
||||
|
||||
#define SENTINEL_INFO_PERIOD 10000
|
||||
#define SENTINEL_PING_PERIOD 1000
|
||||
@ -175,6 +174,7 @@ typedef struct sentinelRedisInstance {
|
||||
this is a Sentinel, this is the runid of the Sentinel
|
||||
that this Sentinel voted as leader. */
|
||||
uint64_t leader_epoch; /* Epoch of the 'leader' field. */
|
||||
uint64_t failover_epoch; /* Epoch of the currently started failover. */
|
||||
int failover_state; /* See SENTINEL_FAILOVER_STATE_* defines. */
|
||||
mstime_t failover_state_change_time;
|
||||
mstime_t failover_start_time; /* When to start to failover if leader. */
|
||||
@ -323,7 +323,7 @@ void sentinelAbortFailover(sentinelRedisInstance *ri);
|
||||
void sentinelEvent(int level, char *type, sentinelRedisInstance *ri, const char *fmt, ...);
|
||||
sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master);
|
||||
void sentinelScheduleScriptExecution(char *path, ...);
|
||||
void sentinelStartFailover(sentinelRedisInstance *master, int state);
|
||||
void sentinelStartFailover(sentinelRedisInstance *master);
|
||||
void sentinelDiscardReplyCallback(redisAsyncContext *c, void *reply, void *privdata);
|
||||
int sentinelSendSlaveOf(sentinelRedisInstance *ri, char *host, int port);
|
||||
char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch);
|
||||
@ -895,6 +895,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char *
|
||||
/* Failover state. */
|
||||
ri->leader = NULL;
|
||||
ri->leader_epoch = 0;
|
||||
ri->failover_epoch = 0;
|
||||
ri->failover_state = SENTINEL_FAILOVER_STATE_NONE;
|
||||
ri->failover_state_change_time = 0;
|
||||
ri->failover_start_time = 0;
|
||||
@ -1522,7 +1523,6 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
|
||||
* failover state machine. */
|
||||
if (!sentinel.tilt &&
|
||||
(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
|
||||
(ri->master->flags & SRI_I_AM_THE_LEADER) &&
|
||||
(ri->master->failover_state ==
|
||||
SENTINEL_FAILOVER_STATE_WAIT_PROMOTION))
|
||||
{
|
||||
@ -1898,8 +1898,6 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) {
|
||||
if (ri->flags & SRI_MASTER_DOWN) flags = sdscat(flags,"master_down,");
|
||||
if (ri->flags & SRI_FAILOVER_IN_PROGRESS)
|
||||
flags = sdscat(flags,"failover_in_progress,");
|
||||
if (ri->flags & SRI_I_AM_THE_LEADER)
|
||||
flags = sdscat(flags,"i_am_the_leader,");
|
||||
if (ri->flags & SRI_PROMOTED) flags = sdscat(flags,"promoted,");
|
||||
if (ri->flags & SRI_RECONF_SENT) flags = sdscat(flags,"reconf_sent,");
|
||||
if (ri->flags & SRI_RECONF_INPROG) flags = sdscat(flags,"reconf_inprog,");
|
||||
@ -2147,7 +2145,7 @@ void sentinelCommand(redisClient *c) {
|
||||
addReplySds(c,sdsnew("-NOGOODSLAVE No suitable slave to promote\r\n"));
|
||||
return;
|
||||
}
|
||||
sentinelStartFailover(ri,SENTINEL_FAILOVER_STATE_WAIT_START);
|
||||
sentinelStartFailover(ri);
|
||||
ri->flags |= SRI_FORCE_FAILOVER;
|
||||
addReply(c,shared.ok);
|
||||
} else if (!strcasecmp(c->argv[1]->ptr,"pending-scripts")) {
|
||||
@ -2347,6 +2345,14 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) {
|
||||
dictIterator *di;
|
||||
dictEntry *de;
|
||||
|
||||
/* Vote for myself if I see the master is already in ODOWN state. */
|
||||
if (master->flags & SRI_O_DOWN) {
|
||||
uint64_t leader_epoch;
|
||||
|
||||
sentinelVoteLeader(master,sentinel.current_epoch,server.runid,
|
||||
&leader_epoch);
|
||||
}
|
||||
|
||||
di = dictGetIterator(master->sentinels);
|
||||
while((de = dictNext(di)) != NULL) {
|
||||
sentinelRedisInstance *ri = dictGetVal(de);
|
||||
@ -2366,8 +2372,7 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) {
|
||||
* 1) We believe it is down, or there is a failover in progress.
|
||||
* 2) Sentinel is connected.
|
||||
* 3) We did not received the info within SENTINEL_ASK_PERIOD ms. */
|
||||
if ((master->flags & (SRI_S_DOWN|SRI_FAILOVER_IN_PROGRESS)) == 0)
|
||||
continue;
|
||||
if ((master->flags & SRI_S_DOWN) == 0) continue;
|
||||
if (ri->flags & SRI_DISCONNECTED) continue;
|
||||
if (mstime() - ri->last_master_down_reply_time < SENTINEL_ASK_PERIOD)
|
||||
continue;
|
||||
@ -2377,7 +2382,9 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) {
|
||||
retval = redisAsyncCommand(ri->cc,
|
||||
sentinelReceiveIsMasterDownReply, NULL,
|
||||
"SENTINEL is-master-down-by-addr %s %s %llu %s",
|
||||
master->addr->ip, port, sentinel.current_epoch, server.runid);
|
||||
master->addr->ip, port,
|
||||
sentinel.current_epoch,
|
||||
server.runid);
|
||||
if (retval == REDIS_OK) ri->pending_commands++;
|
||||
}
|
||||
dictReleaseIterator(di);
|
||||
@ -2415,7 +2422,9 @@ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char
|
||||
if (req_epoch > sentinel.current_epoch)
|
||||
sentinel.current_epoch = req_epoch;
|
||||
|
||||
if (si && master->leader_epoch < req_epoch) {
|
||||
if (si && master->leader_epoch < req_epoch &&
|
||||
sentinel.current_epoch <= req_epoch)
|
||||
{
|
||||
sdsfree(master->leader);
|
||||
master->leader = sdsnew(req_runid);
|
||||
master->leader_epoch = sentinel.current_epoch;
|
||||
@ -2447,25 +2456,27 @@ void sentinelLeaderIncr(dict *counters, char *runid) {
|
||||
}
|
||||
|
||||
/* Scan all the Sentinels attached to this master to check if there
|
||||
* is a leader for a given term, and return it if any.
|
||||
* is a leader for the specified epoch.
|
||||
*
|
||||
* To be a leader for a given epoch, we should have the majorify of
|
||||
* the Sentinels we know about that reported the same instance as
|
||||
* the Sentinels we know that reported the same instance as
|
||||
* leader for the same epoch. */
|
||||
char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t term) {
|
||||
char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t epoch) {
|
||||
dict *counters;
|
||||
dictIterator *di;
|
||||
dictEntry *de;
|
||||
unsigned int voters = 0, voters_quorum;
|
||||
char *myvote;
|
||||
char *winner = NULL;
|
||||
uint64_t leader_epoch;
|
||||
|
||||
redisAssert(master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS));
|
||||
counters = dictCreate(&leaderVotesDictType,NULL);
|
||||
|
||||
/* Count my vote. */
|
||||
myvote = sentinelGetSubjectiveLeader(master);
|
||||
if (myvote) {
|
||||
/* Count my vote (and vote for myself if I still did not voted for
|
||||
* the currnet epoch). */
|
||||
myvote = sentinelVoteLeader(master,epoch,server.runid,&leader_epoch);
|
||||
if (myvote && leader_epoch == epoch) {
|
||||
sentinelLeaderIncr(counters,myvote);
|
||||
voters++;
|
||||
}
|
||||
@ -2474,13 +2485,8 @@ char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t term) {
|
||||
di = dictGetIterator(master->sentinels);
|
||||
while((de = dictNext(di)) != NULL) {
|
||||
sentinelRedisInstance *ri = dictGetVal(de);
|
||||
if (ri->leader == NULL) continue;
|
||||
/* If the failover is not already in progress we are only interested
|
||||
* in Sentinels that believe the master is down. Otherwise the leader
|
||||
* selection is useful for the "failover-takedown" when the original
|
||||
* leader fails. In that case we consider all the voters. */
|
||||
if (!(master->flags & SRI_FAILOVER_IN_PROGRESS) &&
|
||||
!(ri->flags & SRI_MASTER_DOWN)) continue;
|
||||
if (ri->leader == NULL || ri->leader_epoch != sentinel.current_epoch)
|
||||
continue;
|
||||
sentinelLeaderIncr(counters,ri->leader);
|
||||
voters++;
|
||||
}
|
||||
@ -2546,32 +2552,14 @@ int sentinelSendSlaveOf(sentinelRedisInstance *ri, char *host, int port) {
|
||||
return REDIS_OK;
|
||||
}
|
||||
|
||||
/* Setup the master state to start a failover as a leader.
|
||||
*
|
||||
* State can be either:
|
||||
*
|
||||
* SENTINEL_FAILOVER_STATE_WAIT_START: starts a failover from scratch.
|
||||
* SENTINEL_FAILOVER_STATE_RECONF_SLAVES: takedown a failed failover.
|
||||
*/
|
||||
void sentinelStartFailover(sentinelRedisInstance *master, int state) {
|
||||
/* Setup the master state to start a failover. */
|
||||
void sentinelStartFailover(sentinelRedisInstance *master) {
|
||||
redisAssert(master->flags & SRI_MASTER);
|
||||
redisAssert(state == SENTINEL_FAILOVER_STATE_WAIT_START ||
|
||||
state == SENTINEL_FAILOVER_STATE_RECONF_SLAVES);
|
||||
|
||||
master->failover_state = state;
|
||||
master->flags |= SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER;
|
||||
master->failover_state = SENTINEL_FAILOVER_STATE_WAIT_START;
|
||||
master->flags |= SRI_FAILOVER_IN_PROGRESS;
|
||||
master->failover_epoch = ++sentinel.current_epoch;
|
||||
sentinelEvent(REDIS_WARNING,"+failover-triggered",master,"%@");
|
||||
|
||||
/* Pick a random delay if it's a fresh failover (WAIT_START), and not
|
||||
* a recovery of a failover started by another sentinel. */
|
||||
if (master->failover_state == SENTINEL_FAILOVER_STATE_WAIT_START) {
|
||||
master->failover_start_time = mstime() +
|
||||
SENTINEL_FAILOVER_FIXED_DELAY +
|
||||
(rand() % SENTINEL_FAILOVER_MAX_RANDOM_DELAY);
|
||||
sentinelEvent(REDIS_WARNING,"+failover-state-wait-start",master,
|
||||
"%@ #starting in %lld milliseconds",
|
||||
master->failover_start_time-mstime());
|
||||
}
|
||||
master->failover_state_change_time = mstime();
|
||||
}
|
||||
|
||||
@ -2580,66 +2568,18 @@ void sentinelStartFailover(sentinelRedisInstance *master, int state) {
|
||||
*
|
||||
* 1) Enough time has passed since O_DOWN.
|
||||
* 2) The master is marked as SRI_CAN_FAILOVER, so we can failover it.
|
||||
* 3) We are the objectively leader for this master.
|
||||
*
|
||||
* If the conditions are met we flag the master as SRI_FAILOVER_IN_PROGRESS
|
||||
* and SRI_I_AM_THE_LEADER.
|
||||
*/
|
||||
*
|
||||
* We still don't know if we'll win the election so it is possible that we
|
||||
* start the failover but that we'll not be able to act. */
|
||||
void sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) {
|
||||
char *leader;
|
||||
int isleader;
|
||||
|
||||
/* We can't failover if the master is not in O_DOWN state or if
|
||||
* there is not already a failover in progress (to perform the
|
||||
* takedown if the leader died) or if this Sentinel is not allowed
|
||||
* to start a failover. */
|
||||
/* We can't failover if the master is not in O_DOWN state. */
|
||||
if (!(master->flags & SRI_CAN_FAILOVER) ||
|
||||
!(master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS))) return;
|
||||
!(master->flags & SRI_O_DOWN)) return;
|
||||
|
||||
leader = sentinelGetObjectiveLeader(master);
|
||||
isleader = leader && strcasecmp(leader,server.runid) == 0;
|
||||
sdsfree(leader);
|
||||
/* Failover already in progress? */
|
||||
if (master->flags & SRI_FAILOVER_IN_PROGRESS) return;
|
||||
|
||||
/* If I'm not the leader, I can't failover for sure. */
|
||||
if (!isleader) return;
|
||||
|
||||
/* If the failover is already in progress there are two options... */
|
||||
if (master->flags & SRI_FAILOVER_IN_PROGRESS) {
|
||||
if (master->flags & SRI_I_AM_THE_LEADER) {
|
||||
/* 1) I'm flagged as leader so I already started the failover.
|
||||
* Just return. */
|
||||
return;
|
||||
} else {
|
||||
mstime_t elapsed = mstime() - master->failover_state_change_time;
|
||||
|
||||
/* 2) I'm the new leader, but I'm not flagged as leader in the
|
||||
* master: I did not started the failover, but the original
|
||||
* leader has no longer the leadership.
|
||||
*
|
||||
* In this case if the failover appears to be lagging
|
||||
* for at least 25% of the configured failover timeout,
|
||||
* I can assume I can take control. Otherwise
|
||||
* it's better to return and wait more. */
|
||||
if (elapsed < (master->failover_timeout/4)) return;
|
||||
sentinelEvent(REDIS_WARNING,"+failover-takedown",master,"%@");
|
||||
/* We have already an elected slave if we are in
|
||||
* FAILOVER_IN_PROGRESS state, that is, the slave that we
|
||||
* observed turning into a master. */
|
||||
sentinelStartFailover(master,SENTINEL_FAILOVER_STATE_RECONF_SLAVES);
|
||||
/* As an observer we flagged all the slaves as RECONF_SENT but
|
||||
* now we are in charge of actually sending the reconfiguration
|
||||
* command so let's clear this flag for all the instances. */
|
||||
sentinelDelFlagsToDictOfRedisInstances(master->slaves,
|
||||
SRI_RECONF_SENT);
|
||||
}
|
||||
} else {
|
||||
/* Brand new failover as SRI_FAILOVER_IN_PROGRESS was not set.
|
||||
*
|
||||
* Do we have a slave to promote? Otherwise don't start a failover
|
||||
* at all. */
|
||||
if (sentinelSelectSlave(master) == NULL) return;
|
||||
sentinelStartFailover(master,SENTINEL_FAILOVER_STATE_WAIT_START);
|
||||
}
|
||||
sentinelStartFailover(master);
|
||||
}
|
||||
|
||||
/* Select a suitable slave to promote. The current algorithm only uses
|
||||
@ -2723,29 +2663,22 @@ sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) {
|
||||
|
||||
/* ---------------- Failover state machine implementation ------------------- */
|
||||
void sentinelFailoverWaitStart(sentinelRedisInstance *ri) {
|
||||
/* If we in "wait start" but the master is no longer in ODOWN nor in
|
||||
* SDOWN condition we abort the failover. This is important as it
|
||||
* prevents a useless failover in a a notable case of netsplit, where
|
||||
* the sentinels are split from the redis instances. In this case
|
||||
* the failover will not start while there is the split because no
|
||||
* good slave can be reached. However when the split is resolved, we
|
||||
* can go to waitstart if the slave is back reachable a few milliseconds
|
||||
* before the master is. In that case when the master is back online
|
||||
* we cancel the failover. */
|
||||
if ((ri->flags & (SRI_S_DOWN|SRI_O_DOWN|SRI_FORCE_FAILOVER)) == 0) {
|
||||
sentinelEvent(REDIS_WARNING,"-failover-abort-master-is-back",
|
||||
ri,"%@");
|
||||
sentinelAbortFailover(ri);
|
||||
return;
|
||||
}
|
||||
char *leader;
|
||||
int isleader;
|
||||
|
||||
/* Check if we are the leader for the failover epoch. */
|
||||
leader = sentinelGetLeader(ri, ri->failover_epoch);
|
||||
isleader = leader && strcasecmp(leader,server.runid) == 0;
|
||||
sdsfree(leader);
|
||||
|
||||
/* If I'm not the leader, I can't continue with the failover. */
|
||||
if (!isleader) return;
|
||||
|
||||
/* Start the failover going to the next state if enough time has
|
||||
* elapsed. */
|
||||
if (mstime() >= ri->failover_start_time) {
|
||||
ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE;
|
||||
ri->failover_state_change_time = mstime();
|
||||
sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@");
|
||||
}
|
||||
ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE;
|
||||
ri->failover_state_change_time = mstime();
|
||||
sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@");
|
||||
}
|
||||
|
||||
void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) {
|
||||
@ -2829,8 +2762,7 @@ void sentinelFailoverDetectEnd(sentinelRedisInstance *master) {
|
||||
}
|
||||
|
||||
if (not_reconfigured == 0) {
|
||||
int role = (master->flags & SRI_I_AM_THE_LEADER) ? SENTINEL_LEADER :
|
||||
SENTINEL_OBSERVER;
|
||||
int role = SENTINEL_LEADER;
|
||||
|
||||
sentinelEvent(REDIS_WARNING,"+failover-end",master,"%@");
|
||||
master->failover_state = SENTINEL_FAILOVER_STATE_UPDATE_CONFIG;
|
||||
@ -2842,7 +2774,7 @@ void sentinelFailoverDetectEnd(sentinelRedisInstance *master) {
|
||||
/* If I'm the leader it is a good idea to send a best effort SLAVEOF
|
||||
* command to all the slaves still not reconfigured to replicate with
|
||||
* the new master. */
|
||||
if (timeout && (master->flags & SRI_I_AM_THE_LEADER)) {
|
||||
if (timeout) {
|
||||
dictIterator *di;
|
||||
dictEntry *de;
|
||||
|
||||
@ -2999,8 +2931,7 @@ void sentinelAbortFailover(sentinelRedisInstance *ri) {
|
||||
di = dictGetIterator(ri->slaves);
|
||||
while((de = dictNext(di)) != NULL) {
|
||||
sentinelRedisInstance *slave = dictGetVal(de);
|
||||
if ((ri->flags & SRI_I_AM_THE_LEADER) &&
|
||||
!(slave->flags & SRI_DISCONNECTED) &&
|
||||
if (!(slave->flags & SRI_DISCONNECTED) &&
|
||||
(slave->flags & (SRI_PROMOTED|SRI_RECONF_SENT|SRI_RECONF_INPROG|
|
||||
SRI_RECONF_DONE)))
|
||||
{
|
||||
@ -3014,9 +2945,8 @@ void sentinelAbortFailover(sentinelRedisInstance *ri) {
|
||||
}
|
||||
dictReleaseIterator(di);
|
||||
|
||||
sentinel_role = (ri->flags & SRI_I_AM_THE_LEADER) ? SENTINEL_LEADER :
|
||||
SENTINEL_OBSERVER;
|
||||
ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER|SRI_FORCE_FAILOVER);
|
||||
sentinel_role = SENTINEL_LEADER;
|
||||
ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_FORCE_FAILOVER);
|
||||
ri->failover_state = SENTINEL_FAILOVER_STATE_NONE;
|
||||
ri->failover_state_change_time = mstime();
|
||||
if (ri->promoted_slave) {
|
||||
@ -3039,16 +2969,6 @@ void sentinelHandleRedisInstance(sentinelRedisInstance *ri) {
|
||||
sentinelReconnectInstance(ri);
|
||||
sentinelPingInstance(ri);
|
||||
|
||||
/* Masters and slaves */
|
||||
if (ri->flags & (SRI_MASTER|SRI_SLAVE)) {
|
||||
/* Nothing so far. */
|
||||
}
|
||||
|
||||
/* Only masters */
|
||||
if (ri->flags & SRI_MASTER) {
|
||||
sentinelAskMasterStateToOtherSentinels(ri);
|
||||
}
|
||||
|
||||
/* ============== ACTING HALF ============= */
|
||||
/* We don't proceed with the acting half if we are in TILT mode.
|
||||
* TILT happens when we find something odd with the time, like a
|
||||
@ -3072,6 +2992,7 @@ void sentinelHandleRedisInstance(sentinelRedisInstance *ri) {
|
||||
sentinelCheckObjectivelyDown(ri);
|
||||
sentinelStartFailoverIfNeeded(ri);
|
||||
sentinelFailoverStateMachine(ri);
|
||||
sentinelAskMasterStateToOtherSentinels(ri);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user