From b93b0adc89fff22349623b9fec9d1b9aeb0d1f12 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 11 Nov 2013 11:05:58 +0100 Subject: [PATCH 01/53] Sentinel: epoch introduced. Sentinel state now includes the idea of current epoch and config epoch. In the Hello message, that is now published both on masters and slaves, a Sentinel no longer just advertises itself but also broadcasts its current view of the configuration: the master name / ip / port and its current epoch. Sentinels receiving such information switch to the new master if the configuration epoch received is newer and the ip / port of the master are indeed different compared to the previous ones. --- src/sentinel.c | 87 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 25 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 09052e81..76ae9d31 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -133,6 +133,7 @@ typedef struct sentinelRedisInstance { int flags; /* See SRI_... defines */ char *name; /* Master name from the point of view of this sentinel. */ char *runid; /* run ID of this instance. */ + uint64_t config_epoch; /* Configuration epoch. */ sentinelAddr *addr; /* Master host. */ redisAsyncContext *cc; /* Hiredis context for commands. */ redisAsyncContext *pc; /* Hiredis context for Pub / Sub. */ @@ -191,13 +192,14 @@ typedef struct sentinelRedisInstance { /* Main state. */ struct sentinelState { + uint64_t current_epoch; /* Current epoch. */ dict *masters; /* Dictionary of master sentinelRedisInstances. Key is the instance name, value is the sentinelRedisInstance structure pointer. */ int tilt; /* Are we in TILT mode? */ int running_scripts; /* Number of scripts in execution right now. */ mstime_t tilt_start_time; /* When TITL started. */ - mstime_t previous_time; /* Time last time we ran the time handler. */ + mstime_t previous_time; /* Last time we ran the time handler. */ list *scripts_queue; /* Queue of user scripts to execute. 
*/ } sentinel; @@ -402,6 +404,7 @@ void initSentinel(void) { } /* Initialize various data structures. */ + sentinel.current_epoch = 0; sentinel.masters = dictCreate(&instancesDictType,NULL); sentinel.tilt = 0; sentinel.tilt_start_time = 0; @@ -861,6 +864,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * ri->flags = flags | SRI_DISCONNECTED; ri->name = sdsname; ri->runid = NULL; + ri->config_epoch = 0; ri->addr = addr; ri->cc = NULL; ri->pc = NULL; @@ -1745,24 +1749,28 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd if (strstr(r->element[2]->str,server.runid) != NULL) return; { - int numtokens, port, removed, canfailover; - /* Separator changed from ":" to "," in recent versions in order to - * play well with IPv6 addresses. For now we make sure to parse both - * correctly detecting if there is "," inside the string. */ - char *sep = strchr(r->element[2]->str,',') ? "," : ":"; + /* Format is composed of 9 tokens: + * 0=ip,1=port,2=runid,3=can_failover,4=current_epoch, + * 5=master_name,6=master_ip,7=master_port,8=master_config_epoch. */ + int numtokens, port, removed, canfailover, master_port; + uint64_t current_epoch, master_config_epoch; char **token = sdssplitlen(r->element[2]->str, r->element[2]->len, - sep,1,&numtokens); - sentinelRedisInstance *sentinel; + ",",1,&numtokens); + sentinelRedisInstance *si; - if (numtokens == 4) { + if (numtokens == 9) { /* First, try to see if we already have this sentinel. 
*/ port = atoi(token[1]); + master_port = atoi(token[7]); canfailover = atoi(token[3]); - sentinel = getSentinelRedisInstanceByAddrAndRunID( + si = getSentinelRedisInstanceByAddrAndRunID( ri->sentinels,token[0],port,token[2]); + current_epoch = strtoull(token[4],NULL,10); + master_config_epoch = strtoull(token[8],NULL,10); + sentinelRedisInstance *master; - if (!sentinel) { + if (!si) { /* If not, remove all the sentinels that have the same runid * OR the same ip/port, because it's either a restart or a * network topology change. */ @@ -1775,24 +1783,45 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd } /* Add the new sentinel. */ - sentinel = createSentinelRedisInstance(NULL,SRI_SENTINEL, + si = createSentinelRedisInstance(NULL,SRI_SENTINEL, token[0],port,ri->quorum,ri); - if (sentinel) { - sentinelEvent(REDIS_NOTICE,"+sentinel",sentinel,"%@"); + if (si) { + sentinelEvent(REDIS_NOTICE,"+sentinel",si,"%@"); /* The runid is NULL after a new instance creation and * for Sentinels we don't have a later chance to fill it, * so do it now. */ - sentinel->runid = sdsnew(token[2]); + si->runid = sdsnew(token[2]); + } + } + + /* Update local current_epoch if received current_epoch is greater. */ + if (current_epoch > sentinel.current_epoch) + sentinel.current_epoch = current_epoch; + + /* Update master info if received configuration is newer. */ + if ((master = sentinelGetMasterByName(token[5])) != NULL) { + if (master->config_epoch < master_config_epoch) { + master->config_epoch = master_config_epoch; + if (master_port != master->addr->port || + !strcmp(master->addr->ip, token[6])) + { + sentinelEvent(REDIS_WARNING,"+switch-master", + master,"%s %s %d %s %d", + master->name, master->addr->ip, master->addr->port, + token[6], master_port); + sentinelResetMasterAndChangeAddress(ri, + token[6], master_port); + } } } /* Update the state of the Sentinel. 
*/ - if (sentinel) { - sentinel->last_hello_time = mstime(); + if (si) { + si->last_hello_time = mstime(); if (canfailover) - sentinel->flags |= SRI_CAN_FAILOVER; + si->flags |= SRI_CAN_FAILOVER; else - sentinel->flags &= ~SRI_CAN_FAILOVER; + si->flags &= ~SRI_CAN_FAILOVER; } } sdsfreesplitres(token,numtokens); @@ -1842,20 +1871,28 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { sentinelPingReplyCallback, NULL, "PING"); if (retval != REDIS_OK) return; ri->pending_commands++; - } else if ((ri->flags & SRI_MASTER) && + } else if ((ri->flags & SRI_SENTINEL) == 0 && (now - ri->last_pub_time) > SENTINEL_PUBLISH_PERIOD) { - /* PUBLISH hello messages only to masters. */ + /* PUBLISH hello messages to masters and slaves. */ char ip[REDIS_IP_STR_LEN]; if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) != -1) { - char myaddr[REDIS_IP_STR_LEN+128]; + char payload[REDIS_IP_STR_LEN+1024]; + sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ? + NULL : ri->master; - snprintf(myaddr,sizeof(myaddr),"%s,%d,%s,%d", + snprintf(payload,sizeof(payload), + "%s,%d,%s,%d,%llu," /* Info about this sentinel. */ + "%s,%s,%d,%lld", /* Info about current master. */ ip, server.port, server.runid, - (ri->flags & SRI_CAN_FAILOVER) != 0); + (ri->flags & SRI_CAN_FAILOVER) != 0, + (unsigned long long) sentinel.current_epoch, + /* --- */ + master->name,master->addr->ip,master->addr->port, + master->config_epoch); retval = redisAsyncCommand(ri->cc, sentinelPublishReplyCallback, NULL, "PUBLISH %s %s", - SENTINEL_HELLO_CHANNEL,myaddr); + SENTINEL_HELLO_CHANNEL,payload); if (retval != REDIS_OK) return; ri->pending_commands++; } From 9e1b27d49ea362bf97fca51734c9628289b4fb50 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 11 Nov 2013 12:05:16 +0100 Subject: [PATCH 02/53] Sentinel: remove code not useful in the new design. 
--- src/sentinel.c | 171 ++++++++++--------------------------------------- 1 file changed, 33 insertions(+), 138 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 76ae9d31..9c73a279 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -72,8 +72,6 @@ typedef struct sentinelAddr { #define SRI_RECONF_DONE (1<<13) /* Slave synchronized with new master. */ #define SRI_FORCE_FAILOVER (1<<14) /* Force failover with master up. */ #define SRI_SCRIPT_KILL_SENT (1<<15) /* SCRIPT KILL already sent on -BUSY */ -#define SRI_DEMOTE (1<<16) /* If the instance claims to be a master, demote - it into a slave sending SLAVEOF. */ #define SENTINEL_INFO_PERIOD 10000 #define SENTINEL_PING_PERIOD 1000 @@ -90,7 +88,6 @@ typedef struct sentinelAddr { #define SENTINEL_MIN_LINK_RECONNECT_PERIOD 15000 #define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*15*1000) #define SENTINEL_MAX_PENDING_COMMANDS 100 -#define SENTINEL_EXTENDED_SDOWN_MULTIPLIER 10 /* How many milliseconds is an information valid? This applies for instance * to the reply to SENTINEL IS-MASTER-DOWN-BY-ADDR replies. */ @@ -1502,108 +1499,44 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* Handle slave -> master role switch. */ if ((ri->flags & SRI_SLAVE) && role == SRI_MASTER) { - if (!sentinel.tilt && ri->flags & SRI_DEMOTE) { - /* If this sentinel was partitioned from the slave's master, - * or tilted recently, wait some time before to act, - * so that DOWN and roles INFO will be refreshed. */ - mstime_t wait_time = SENTINEL_INFO_PERIOD*2 + - ri->master->down_after_period*2; + /* If this is a promoted slave we can change state to the + * failover state machine. 
*/ + if (!sentinel.tilt && + (ri->master->flags & SRI_FAILOVER_IN_PROGRESS) && + (ri->master->flags & SRI_I_AM_THE_LEADER) && + (ri->master->failover_state == + SENTINEL_FAILOVER_STATE_WAIT_PROMOTION)) + { + ri->master->failover_state = SENTINEL_FAILOVER_STATE_RECONF_SLAVES; + ri->master->failover_state_change_time = mstime(); + sentinelEvent(REDIS_WARNING,"+promoted-slave",ri,"%@"); + sentinelEvent(REDIS_WARNING,"+failover-state-reconf-slaves", + ri->master,"%@"); + sentinelCallClientReconfScript(ri->master,SENTINEL_LEADER, + "start",ri->master->addr,ri->addr); + } else if (!sentinel.tilt) { + /* A slave turned into a master. We want to force our view and + * reconfigure as slave, but make sure to wait some time before + * doing this in order to make sure to receive an updated + * configuratio via Pub/Sub if any. */ + mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; if (!sentinelRedisInstanceNoDownFor(ri->master,wait_time) || (mstime()-sentinel.tilt_start_time) < wait_time) return; - /* Old master returned back? Turn it into a slave ASAP if - * we can reach what we believe is the new master now, and - * have a recent role information for it. - * - * Note: we'll clear the DEMOTE flag only when we have the - * acknowledge that it's a slave again. */ + /* Make sure the master is sane before reconfiguring this instance + * into a slave. */ if (ri->master->flags & SRI_MASTER && (ri->master->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0 && (mstime() - ri->master->info_refresh) < SENTINEL_INFO_PERIOD*2) { - int retval; - retval = sentinelSendSlaveOf(ri, + int retval = sentinelSendSlaveOf(ri, ri->master->addr->ip, ri->master->addr->port); if (retval == REDIS_OK) sentinelEvent(REDIS_NOTICE,"+demote-old-slave",ri,"%@"); - } else { - /* Otherwise if there are not the conditions to demote, we - * no longer trust the DEMOTE flag and remove it. 
*/ - ri->flags &= ~SRI_DEMOTE; - sentinelEvent(REDIS_NOTICE,"-demote-flag-cleared",ri,"%@"); } - } else if (!(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) && - (runid_changed || first_runid)) - { - /* If a slave turned into master but: - * - * 1) Failover not in progress. - * 2) RunID has changed or its the first time we see an INFO output. - * - * We assume this is a reboot with a wrong configuration. - * Log the event and remove the slave. Note that this is processed - * in tilt mode as well, otherwise we lose the information that the - * runid changed (reboot?) and when the tilt mode ends a fake - * failover will be detected. */ - int retval; - - sentinelEvent(REDIS_WARNING,"-slave-restart-as-master",ri,"%@ #removing it from the attached slaves"); - retval = dictDelete(ri->master->slaves,ri->name); - redisAssert(retval == REDIS_OK); - return; - } else if (!sentinel.tilt && ri->flags & SRI_PROMOTED) { - /* If this is a promoted slave we can change state to the - * failover state machine. */ - if ((ri->master->flags & SRI_FAILOVER_IN_PROGRESS) && - (ri->master->flags & SRI_I_AM_THE_LEADER) && - (ri->master->failover_state == - SENTINEL_FAILOVER_STATE_WAIT_PROMOTION)) - { - ri->master->failover_state = SENTINEL_FAILOVER_STATE_RECONF_SLAVES; - ri->master->failover_state_change_time = mstime(); - sentinelEvent(REDIS_WARNING,"+promoted-slave",ri,"%@"); - sentinelEvent(REDIS_WARNING,"+failover-state-reconf-slaves", - ri->master,"%@"); - sentinelCallClientReconfScript(ri->master,SENTINEL_LEADER, - "start",ri->master->addr,ri->addr); - } - } else if (!sentinel.tilt && ( - !(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) || - ((ri->master->flags & SRI_FAILOVER_IN_PROGRESS) && - (ri->master->flags & SRI_I_AM_THE_LEADER) && - ri->master->failover_state == - SENTINEL_FAILOVER_STATE_WAIT_START))) - { - /* No failover in progress? Then it is the start of a failover - * and we are an observer. 
- * - * We also do that if we are a leader doing a failover, in wait - * start, but well, somebody else started before us. */ - - if (ri->master->flags & SRI_FAILOVER_IN_PROGRESS) { - sentinelEvent(REDIS_WARNING,"-failover-abort-race", - ri->master, "%@"); - sentinelAbortFailover(ri->master); - } - - ri->master->flags |= SRI_FAILOVER_IN_PROGRESS; - sentinelEvent(REDIS_WARNING,"+failover-detected",ri->master,"%@"); - ri->master->failover_state = SENTINEL_FAILOVER_STATE_DETECT_END; - ri->master->failover_state_change_time = mstime(); - ri->master->promoted_slave = ri; - ri->flags |= SRI_PROMOTED; - ri->flags &= ~SRI_DEMOTE; - sentinelCallClientReconfScript(ri->master,SENTINEL_OBSERVER, - "start", ri->master->addr,ri->addr); - /* We are an observer, so we can only assume that the leader - * is reconfiguring the slave instances. For this reason we - * set all the instances as RECONF_SENT waiting for progresses - * on this side. */ - sentinelAddFlagsToDictOfRedisInstances(ri->master->slaves, - SRI_RECONF_SENT); } } @@ -1641,13 +1574,6 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { ri->failover_state_change_time = mstime(); } } - - /* Detect if the old master was demoted as slave and generate the - * +slave event. 
*/ - if (role == SRI_SLAVE && ri->flags & SRI_DEMOTE) { - sentinelEvent(REDIS_NOTICE,"+slave",ri,"%@"); - ri->flags &= ~SRI_DEMOTE; - } } void sentinelInfoReplyCallback(redisAsyncContext *c, void *reply, void *privdata) { @@ -1956,7 +1882,6 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { if (ri->flags & SRI_RECONF_SENT) flags = sdscat(flags,"reconf_sent,"); if (ri->flags & SRI_RECONF_INPROG) flags = sdscat(flags,"reconf_inprog,"); if (ri->flags & SRI_RECONF_DONE) flags = sdscat(flags,"reconf_done,"); - if (ri->flags & SRI_DEMOTE) flags = sdscat(flags,"demote,"); if (sdslen(flags) != 0) sdsrange(flags,0,-2); /* remove last "," */ addReplyBulkCString(c,flags); @@ -2748,7 +2673,6 @@ sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) { mstime_t info_validity_time = mstime()-SENTINEL_INFO_VALIDITY_TIME; if (slave->flags & (SRI_S_DOWN|SRI_O_DOWN|SRI_DISCONNECTED)) continue; - if (slave->flags & SRI_DEMOTE) continue; /* Old master not yet ready. */ if (slave->last_avail_time < info_validity_time) continue; if (slave->slave_priority == 0) continue; @@ -2992,16 +2916,14 @@ void sentinelFailoverSwitchToPromotedSlave(sentinelRedisInstance *master) { old_master_ip = sdsdup(master->addr->ip); old_master_port = master->addr->port; sentinelResetMasterAndChangeAddress(master,ref->addr->ip,ref->addr->port); - /* If this is a real switch, that is, we have master->promoted_slave not - * NULL, then we want to add the old master as a slave of the new master, - * but flagging it with SRI_DEMOTE so that we know we'll need to send - * SLAVEOF once the old master is reachable again. */ + /* If this is a real switch and not just a user requested reset, we want + * to add all the known instances as slaves, and also all the sentinels + * back to this master. 
*/ if (master != ref) { - /* Add the new slave, but don't generate a Sentinel event as it will - * happen later when finally the instance will claim to be a slave - * in the INFO output. */ - createSentinelRedisInstance(NULL,SRI_SLAVE|SRI_DEMOTE, + /* TODO: + createSentinelRedisInstance(NULL,SRI_SLAVE old_master_ip, old_master_port, master->quorum, master); + */ } sdsfree(old_master_ip); } @@ -3034,15 +2956,9 @@ void sentinelFailoverStateMachine(sentinelRedisInstance *ri) { } /* Abort a failover in progress with the following steps: - * 1) If this instance is the leaer send a SLAVEOF command to all the already - * reconfigured slaves if any to configure them to replicate with the - * original master. - * 2) For both leaders and observers: clear the failover flags and state in - * the master instance. - * 3) If there is already a promoted slave and we are the leader, and this - * slave is not DISCONNECTED, try to reconfigure it to replicate - * back to the master as well, sending a best effort SLAVEOF command. - */ + * 1) Set the master back to the original one, increment the config epoch. + * 2) Reconfig slaves to replicate to the old master. + * 3) Reconfig the promoted slave as a slave as well. */ void sentinelAbortFailover(sentinelRedisInstance *ri) { dictIterator *di; dictEntry *de; @@ -3085,26 +3001,6 @@ void sentinelAbortFailover(sentinelRedisInstance *ri) { } } -/* The following is called only for master instances and will abort the - * failover process if: - * - * 1) The failover is in progress. - * 2) We already promoted a slave. - * 3) The promoted slave is in extended SDOWN condition. - */ -void sentinelAbortFailoverIfNeeded(sentinelRedisInstance *ri) { - /* Failover is in progress? Do we have a promoted slave? */ - if (!(ri->flags & SRI_FAILOVER_IN_PROGRESS) || !ri->promoted_slave) return; - - /* Is the promoted slave into an extended SDOWN state? 
*/ - if (!(ri->promoted_slave->flags & SRI_S_DOWN) || - (mstime() - ri->promoted_slave->s_down_since_time) < - (ri->down_after_period * SENTINEL_EXTENDED_SDOWN_MULTIPLIER)) return; - - sentinelEvent(REDIS_WARNING,"-failover-abort-x-sdown",ri->promoted_slave,"%@"); - sentinelAbortFailover(ri); -} - /* ======================== SENTINEL timer handler ========================== * This is the "main" our Sentinel, being sentinel completely non blocking * in design. The function is called every second. @@ -3150,7 +3046,6 @@ void sentinelHandleRedisInstance(sentinelRedisInstance *ri) { sentinelCheckObjectivelyDown(ri); sentinelStartFailoverIfNeeded(ri); sentinelFailoverStateMachine(ri); - sentinelAbortFailoverIfNeeded(ri); } } From 0bac36d0a1832453df156e7e7493aa0ad17baf0d Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 11 Nov 2013 17:10:28 +0100 Subject: [PATCH 03/53] Sentinel: handle Hello messages received via slaves correctly. Even when messages are received via the slave, we should perform operations (like adding a new Sentinel) in the context of the master. --- src/sentinel.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 9c73a279..af5363a7 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1651,12 +1651,14 @@ void sentinelPublishReplyCallback(redisAsyncContext *c, void *reply, void *privd /* This is our Pub/Sub callback for the Hello channel. It's useful in order * to discover other sentinels attached at the same master. */ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privdata) { - sentinelRedisInstance *ri = c->data; + sentinelRedisInstance *ri = c->data, *master; redisReply *r; if (!reply || !ri) return; r = reply; + master = (ri->flags & SRI_MASTER) ? ri : ri->master; + /* Update the last activity in the pubsub channel. 
Note that since we * receive our messages as well this timestamp can be used to detect * if the link is probably disconnected even if it seems otherwise. */ @@ -1691,26 +1693,26 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd master_port = atoi(token[7]); canfailover = atoi(token[3]); si = getSentinelRedisInstanceByAddrAndRunID( - ri->sentinels,token[0],port,token[2]); + master->sentinels,token[0],port,token[2]); current_epoch = strtoull(token[4],NULL,10); master_config_epoch = strtoull(token[8],NULL,10); - sentinelRedisInstance *master; + sentinelRedisInstance *msgmaster; if (!si) { /* If not, remove all the sentinels that have the same runid * OR the same ip/port, because it's either a restart or a * network topology change. */ - removed = removeMatchingSentinelsFromMaster(ri,token[0],port, + removed = removeMatchingSentinelsFromMaster(master,token[0],port, token[2]); if (removed) { - sentinelEvent(REDIS_NOTICE,"-dup-sentinel",ri, + sentinelEvent(REDIS_NOTICE,"-dup-sentinel",master, "%@ #duplicate of %s:%d or %s", token[0],port,token[2]); } /* Add the new sentinel. */ si = createSentinelRedisInstance(NULL,SRI_SENTINEL, - token[0],port,ri->quorum,ri); + token[0],port,master->quorum,master); if (si) { sentinelEvent(REDIS_NOTICE,"+sentinel",si,"%@"); /* The runid is NULL after a new instance creation and @@ -1725,17 +1727,18 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd sentinel.current_epoch = current_epoch; /* Update master info if received configuration is newer. 
*/ - if ((master = sentinelGetMasterByName(token[5])) != NULL) { - if (master->config_epoch < master_config_epoch) { - master->config_epoch = master_config_epoch; - if (master_port != master->addr->port || - !strcmp(master->addr->ip, token[6])) + if ((msgmaster = sentinelGetMasterByName(token[5])) != NULL) { + if (msgmaster->config_epoch < master_config_epoch) { + msgmaster->config_epoch = master_config_epoch; + if (master_port != msgmaster->addr->port || + !strcmp(msgmaster->addr->ip, token[6])) { sentinelEvent(REDIS_WARNING,"+switch-master", - master,"%s %s %d %s %d", - master->name, master->addr->ip, master->addr->port, + msgmaster,"%s %s %d %s %d", + msgmaster->name, + msgmaster->addr->ip, msgmaster->addr->port, token[6], master_port); - sentinelResetMasterAndChangeAddress(ri, + sentinelResetMasterAndChangeAddress(msgmaster, token[6], master_port); } } From 8c1bf9a2bdf2059a13f614a924ff226c86aa4b79 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 11 Nov 2013 18:30:11 +0100 Subject: [PATCH 04/53] Sentinel: leadership handling changes WIP. Changes to leadership handling. Now the leader gets selected by every Sentinel, for a specified epoch, when the SENTINEL is-master-down-by-addr is sent. This command now includes the runid and the currentEpoch of the instance seeking for a vote. The Sentinel only votes a single time in a given epoch. Still a work in progress, does not even compile at this stage. --- src/sentinel.c | 139 ++++++++++++++++++++++++++++--------------------- 1 file changed, 81 insertions(+), 58 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index af5363a7..7bb924f0 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -173,9 +173,8 @@ typedef struct sentinelRedisInstance { char *leader; /* If this is a master instance, this is the runid of the Sentinel that should perform the failover. If this is a Sentinel, this is the runid of the Sentinel - that this other Sentinel is voting as leader. 
- This field is valid only if SRI_MASTER_DOWN is - set on the Sentinel instance. */ + that this Sentinel voted as leader. */ + uint64_t leader_epoch; /* Epoch of the 'leader' field. */ int failover_state; /* See SENTINEL_FAILOVER_STATE_* defines. */ mstime_t failover_state_change_time; mstime_t failover_start_time; /* When to start to failover if leader. */ @@ -327,6 +326,7 @@ void sentinelScheduleScriptExecution(char *path, ...); void sentinelStartFailover(sentinelRedisInstance *master, int state); void sentinelDiscardReplyCallback(redisAsyncContext *c, void *reply, void *privdata); int sentinelSendSlaveOf(sentinelRedisInstance *ri, char *host, int port); +char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch); /* ========================= Dictionary types =============================== */ @@ -894,6 +894,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * /* Failover state. */ ri->leader = NULL; + ri->leader_epoch = 0; ri->failover_state = SENTINEL_FAILOVER_STATE_NONE; ri->failover_state_change_time = 0; ri->failover_start_time = 0; @@ -1031,7 +1032,7 @@ sentinelRedisInstance *getSentinelRedisInstanceByAddrAndRunID(dict *instances, c return instance; } -/* Simple master lookup by name */ +/* Master lookup by name */ sentinelRedisInstance *sentinelGetMasterByName(char *name) { sentinelRedisInstance *ri; sds sdsname = sdsnew(name); @@ -1041,6 +1042,24 @@ sentinelRedisInstance *sentinelGetMasterByName(char *name) { return ri; } +/* Senitnel lookup by runid */ +sentinelRedisInstance *sentinelGetSentinelByRunid(sentinelRedisInstance *master, char *runid) { + sentinelRedisInstance *retval = NULL; + dictIterator *di; + dictEntry *de; + + di = dictGetIterator(master->sentinels); + while((de = dictNext(di)) != NULL) { + sentinelRedisInstance *ri = dictGetVal(de); + if (!strcmp(ri->runid,runid)) { + retval = ri; + break; + } + } + dictReleaseIterator(di); + return retval; +} + 
/* Add the specified flags to all the instances in the specified dictionary. */ void sentinelAddFlagsToDictOfRedisInstances(dict *instances, int flags) { dictIterator *di; @@ -1979,11 +1998,13 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { addReplyBulkLongLong(c,(ri->flags & SRI_CAN_FAILOVER) != 0); fields++; - if (ri->flags & SRI_MASTER_DOWN) { - addReplyBulkCString(c,"subjective-leader"); - addReplyBulkCString(c,ri->leader ? ri->leader : "?"); - fields++; - } + addReplyBulkCString(c,"voted-leader"); + addReplyBulkCString(c,ri->leader ? ri->leader : "?"); + fields++; + + addReplyBulkCString(c,"voted-leader-epoch"); + addReplyBulkLongLong(c,ri->leader_epoch); + fields++; } setDeferredMultiBulkLength(c,mbl,fields*2); @@ -2044,14 +2065,18 @@ void sentinelCommand(redisClient *c) { return; addReplyDictOfRedisInstances(c,ri->sentinels); } else if (!strcasecmp(c->argv[1]->ptr,"is-master-down-by-addr")) { - /* SENTINEL IS-MASTER-DOWN-BY-ADDR */ + /* SENTINEL IS-MASTER-DOWN-BY-ADDR */ sentinelRedisInstance *ri; + long long req_epoch; + uint64_t leader_epoch = 0; char *leader = NULL; long port; int isdown = 0; - if (c->argc != 4) goto numargserr; - if (getLongFromObjectOrReply(c,c->argv[3],&port,NULL) != REDIS_OK) + if (c->argc != 6) goto numargserr; + if (getLongFromObjectOrReply(c,c->argv[3],&port,NULL) != REDIS_OK || + getLongLongFromObjectOrReply(c,c->argv[4],&req_epoch,NULL) + != REDIS_OK) return; ri = getSentinelRedisInstanceByAddrAndRunID(sentinel.masters, c->argv[2]->ptr,port,NULL); @@ -2061,12 +2086,20 @@ void sentinelCommand(redisClient *c) { if (!sentinel.tilt && ri && (ri->flags & SRI_S_DOWN) && (ri->flags & SRI_MASTER)) isdown = 1; - if (ri) leader = sentinelGetSubjectiveLeader(ri); - /* Reply with a two-elements multi-bulk reply: down state, leader. 
*/ - addReplyMultiBulkLen(c,2); + /* Vote for the master (or fetch the previous vote) */ + if (ri && ri->flags & SRI_MASTER) { + leader = sentinelVoteLeader(ri,(uint64_t)req_epoch, + c->argv[5]->ptr, + &leader_epoch); + } + + /* Reply with a three-elements multi-bulk reply: + * down state, leader, vote epoch. */ + addReplyMultiBulkLen(c,3); addReply(c, isdown ? shared.cone : shared.czero); addReplyBulkCString(c, leader ? leader : "?"); + addReplyLongLong(c, (long long)leader_epoch); if (leader) sdsfree(leader); } else if (!strcasecmp(c->argv[1]->ptr,"reset")) { /* SENTINEL RESET */ @@ -2289,9 +2322,10 @@ void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *p /* Ignore every error or unexpected reply. * Note that if the command returns an error for any reason we'll * end clearing the SRI_MASTER_DOWN flag for timeout anyway. */ - if (r->type == REDIS_REPLY_ARRAY && r->elements == 2 && + if (r->type == REDIS_REPLY_ARRAY && r->elements == 3 && r->element[0]->type == REDIS_REPLY_INTEGER && - r->element[1]->type == REDIS_REPLY_STRING) + r->element[1]->type == REDIS_REPLY_STRING && + r->element[2]->type == REDIS_REPLY_INTEGER) { ri->last_master_down_reply_time = mstime(); if (r->element[0]->integer == 1) { @@ -2301,6 +2335,7 @@ void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *p } sdsfree(ri->leader); ri->leader = sdsnew(r->element[1]->str); + ri->leader_epoch = r->element[2]->integer; } } @@ -2341,8 +2376,8 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) { ll2string(port,sizeof(port),master->addr->port); retval = redisAsyncCommand(ri->cc, sentinelReceiveIsMasterDownReply, NULL, - "SENTINEL is-master-down-by-addr %s %s", - master->addr->ip, port); + "SENTINEL is-master-down-by-addr %s %s %llu %s", + master->addr->ip, port, sentinel.current_epoch, server.runid); if (retval == REDIS_OK) ri->pending_commands++; } dictReleaseIterator(di); @@ -2369,41 +2404,25 @@ int compareRunID(const void 
*a, const void *b) { return strcasecmp(*aptrptr, *bptrptr); } -char *sentinelGetSubjectiveLeader(sentinelRedisInstance *master) { - dictIterator *di; - dictEntry *de; - char **instance = - zmalloc(sizeof(char*)*(dictSize(master->sentinels)+1)); - int instances = 0; - char *leader = NULL; +/* Vote for the sentinel with 'req_runid' or return the old vote if already + * voted for the specifed 'req_epoch' or one greater. + * + * If a vote is not available returns NULL, otherwise return the Sentinel + * runid and populate the leader_epoch with the epoch of the last vote. */ +char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch) { + sentinelRedisInstance *si = sentinelGetSentinelByRunid(master,req_runid); - if (master->flags & SRI_CAN_FAILOVER) { - /* Add myself if I'm a Sentinel that can failover this master. */ - instance[instances++] = server.runid; + if (req_epoch > sentinel.current_epoch) + sentinel.current_epoch = req_epoch; + + if (si && master->leader_epoch < req_epoch) { + sdsfree(master->leader); + master->leader = sdsnew(req_runid); + master->leader_epoch = sentinel.current_epoch; } - di = dictGetIterator(master->sentinels); - while((de = dictNext(di)) != NULL) { - sentinelRedisInstance *ri = dictGetVal(de); - mstime_t lag = mstime() - ri->last_avail_time; - - if (lag > SENTINEL_INFO_VALIDITY_TIME || - !(ri->flags & SRI_CAN_FAILOVER) || - (ri->flags & SRI_DISCONNECTED) || - ri->runid == NULL) - continue; - instance[instances++] = ri->runid; - } - dictReleaseIterator(di); - - /* If we have at least one instance passing our checks, order the array - * by runid. 
*/ - if (instances) { - qsort(instance,instances,sizeof(char*),compareRunID); - leader = sdsnew(instance[0]); - } - zfree(instance); - return leader; + *leader_epoch = master->leader_epoch; + return master->leader; } struct sentinelLeader { @@ -2411,9 +2430,9 @@ struct sentinelLeader { unsigned long votes; }; -/* Helper function for sentinelGetObjectiveLeader, increment the counter +/* Helper function for sentinelGetLeader, increment the counter * relative to the specified runid. */ -void sentinelObjectiveLeaderIncr(dict *counters, char *runid) { +void sentinelLeaderIncr(dict *counters, char *runid) { dictEntry *de = dictFind(counters,runid); uint64_t oldval; @@ -2427,9 +2446,13 @@ void sentinelObjectiveLeaderIncr(dict *counters, char *runid) { } } -/* Scan all the Sentinels attached to this master to check what is the - * most voted leader among Sentinels. */ -char *sentinelGetObjectiveLeader(sentinelRedisInstance *master) { +/* Scan all the Sentinels attached to this master to check if there + * is a leader for a given term, and return it if any. + * + * To be a leader for a given epoch, we should have the majorify of + * the Sentinels we know about that reported the same instance as + * leader for the same epoch. */ +char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t term) { dict *counters; dictIterator *di; dictEntry *de; @@ -2443,7 +2466,7 @@ char *sentinelGetObjectiveLeader(sentinelRedisInstance *master) { /* Count my vote. */ myvote = sentinelGetSubjectiveLeader(master); if (myvote) { - sentinelObjectiveLeaderIncr(counters,myvote); + sentinelLeaderIncr(counters,myvote); voters++; } @@ -2458,7 +2481,7 @@ char *sentinelGetObjectiveLeader(sentinelRedisInstance *master) { * leader fails. In that case we consider all the voters. 
*/ if (!(master->flags & SRI_FAILOVER_IN_PROGRESS) && !(ri->flags & SRI_MASTER_DOWN)) continue; - sentinelObjectiveLeaderIncr(counters,ri->leader); + sentinelLeaderIncr(counters,ri->leader); voters++; } dictReleaseIterator(di); From 90ab62fd5e3a7cfe82919d47376dda1a7366e519 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 11:09:35 +0100 Subject: [PATCH 05/53] Sentinel: epoch introduced in leader vote. --- src/sentinel.c | 213 ++++++++++++++++--------------------------------- 1 file changed, 67 insertions(+), 146 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 7bb924f0..6530b2b6 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -65,13 +65,12 @@ typedef struct sentinelAddr { #define SRI_CAN_FAILOVER (1<<7) #define SRI_FAILOVER_IN_PROGRESS (1<<8) /* Failover is in progress for this master. */ -#define SRI_I_AM_THE_LEADER (1<<9) /* We are the leader for this master. */ -#define SRI_PROMOTED (1<<10) /* Slave selected for promotion. */ -#define SRI_RECONF_SENT (1<<11) /* SLAVEOF sent. */ -#define SRI_RECONF_INPROG (1<<12) /* Slave synchronization in progress. */ -#define SRI_RECONF_DONE (1<<13) /* Slave synchronized with new master. */ -#define SRI_FORCE_FAILOVER (1<<14) /* Force failover with master up. */ -#define SRI_SCRIPT_KILL_SENT (1<<15) /* SCRIPT KILL already sent on -BUSY */ +#define SRI_PROMOTED (1<<9) /* Slave selected for promotion. */ +#define SRI_RECONF_SENT (1<<10) /* SLAVEOF sent. */ +#define SRI_RECONF_INPROG (1<<11) /* Slave synchronization in progress. */ +#define SRI_RECONF_DONE (1<<12) /* Slave synchronized with new master. */ +#define SRI_FORCE_FAILOVER (1<<13) /* Force failover with master up. */ +#define SRI_SCRIPT_KILL_SENT (1<<14) /* SCRIPT KILL already sent on -BUSY */ #define SENTINEL_INFO_PERIOD 10000 #define SENTINEL_PING_PERIOD 1000 @@ -175,6 +174,7 @@ typedef struct sentinelRedisInstance { this is a Sentinel, this is the runid of the Sentinel that this Sentinel voted as leader. 
*/ uint64_t leader_epoch; /* Epoch of the 'leader' field. */ + uint64_t failover_epoch; /* Epoch of the currently started failover. */ int failover_state; /* See SENTINEL_FAILOVER_STATE_* defines. */ mstime_t failover_state_change_time; mstime_t failover_start_time; /* When to start to failover if leader. */ @@ -323,7 +323,7 @@ void sentinelAbortFailover(sentinelRedisInstance *ri); void sentinelEvent(int level, char *type, sentinelRedisInstance *ri, const char *fmt, ...); sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master); void sentinelScheduleScriptExecution(char *path, ...); -void sentinelStartFailover(sentinelRedisInstance *master, int state); +void sentinelStartFailover(sentinelRedisInstance *master); void sentinelDiscardReplyCallback(redisAsyncContext *c, void *reply, void *privdata); int sentinelSendSlaveOf(sentinelRedisInstance *ri, char *host, int port); char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch); @@ -895,6 +895,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * /* Failover state. */ ri->leader = NULL; ri->leader_epoch = 0; + ri->failover_epoch = 0; ri->failover_state = SENTINEL_FAILOVER_STATE_NONE; ri->failover_state_change_time = 0; ri->failover_start_time = 0; @@ -1522,7 +1523,6 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { * failover state machine. 
*/ if (!sentinel.tilt && (ri->master->flags & SRI_FAILOVER_IN_PROGRESS) && - (ri->master->flags & SRI_I_AM_THE_LEADER) && (ri->master->failover_state == SENTINEL_FAILOVER_STATE_WAIT_PROMOTION)) { @@ -1898,8 +1898,6 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { if (ri->flags & SRI_MASTER_DOWN) flags = sdscat(flags,"master_down,"); if (ri->flags & SRI_FAILOVER_IN_PROGRESS) flags = sdscat(flags,"failover_in_progress,"); - if (ri->flags & SRI_I_AM_THE_LEADER) - flags = sdscat(flags,"i_am_the_leader,"); if (ri->flags & SRI_PROMOTED) flags = sdscat(flags,"promoted,"); if (ri->flags & SRI_RECONF_SENT) flags = sdscat(flags,"reconf_sent,"); if (ri->flags & SRI_RECONF_INPROG) flags = sdscat(flags,"reconf_inprog,"); @@ -2147,7 +2145,7 @@ void sentinelCommand(redisClient *c) { addReplySds(c,sdsnew("-NOGOODSLAVE No suitable slave to promote\r\n")); return; } - sentinelStartFailover(ri,SENTINEL_FAILOVER_STATE_WAIT_START); + sentinelStartFailover(ri); ri->flags |= SRI_FORCE_FAILOVER; addReply(c,shared.ok); } else if (!strcasecmp(c->argv[1]->ptr,"pending-scripts")) { @@ -2347,6 +2345,14 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) { dictIterator *di; dictEntry *de; + /* Vote for myself if I see the master is already in ODOWN state. */ + if (master->flags & SRI_O_DOWN) { + uint64_t leader_epoch; + + sentinelVoteLeader(master,sentinel.current_epoch,server.runid, + &leader_epoch); + } + di = dictGetIterator(master->sentinels); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *ri = dictGetVal(de); @@ -2366,8 +2372,7 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) { * 1) We believe it is down, or there is a failover in progress. * 2) Sentinel is connected. * 3) We did not received the info within SENTINEL_ASK_PERIOD ms. 
*/ - if ((master->flags & (SRI_S_DOWN|SRI_FAILOVER_IN_PROGRESS)) == 0) - continue; + if ((master->flags & SRI_S_DOWN) == 0) continue; if (ri->flags & SRI_DISCONNECTED) continue; if (mstime() - ri->last_master_down_reply_time < SENTINEL_ASK_PERIOD) continue; @@ -2377,7 +2382,9 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) { retval = redisAsyncCommand(ri->cc, sentinelReceiveIsMasterDownReply, NULL, "SENTINEL is-master-down-by-addr %s %s %llu %s", - master->addr->ip, port, sentinel.current_epoch, server.runid); + master->addr->ip, port, + sentinel.current_epoch, + server.runid); if (retval == REDIS_OK) ri->pending_commands++; } dictReleaseIterator(di); @@ -2415,7 +2422,9 @@ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char if (req_epoch > sentinel.current_epoch) sentinel.current_epoch = req_epoch; - if (si && master->leader_epoch < req_epoch) { + if (si && master->leader_epoch < req_epoch && + sentinel.current_epoch <= req_epoch) + { sdsfree(master->leader); master->leader = sdsnew(req_runid); master->leader_epoch = sentinel.current_epoch; @@ -2447,25 +2456,27 @@ void sentinelLeaderIncr(dict *counters, char *runid) { } /* Scan all the Sentinels attached to this master to check if there - * is a leader for a given term, and return it if any. + * is a leader for the specified epoch. * * To be a leader for a given epoch, we should have the majorify of - * the Sentinels we know about that reported the same instance as + * the Sentinels we know that reported the same instance as * leader for the same epoch. 
*/ -char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t term) { +char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t epoch) { dict *counters; dictIterator *di; dictEntry *de; unsigned int voters = 0, voters_quorum; char *myvote; char *winner = NULL; + uint64_t leader_epoch; redisAssert(master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS)); counters = dictCreate(&leaderVotesDictType,NULL); - /* Count my vote. */ - myvote = sentinelGetSubjectiveLeader(master); - if (myvote) { + /* Count my vote (and vote for myself if I still did not voted for + * the currnet epoch). */ + myvote = sentinelVoteLeader(master,epoch,server.runid,&leader_epoch); + if (myvote && leader_epoch == epoch) { sentinelLeaderIncr(counters,myvote); voters++; } @@ -2474,13 +2485,8 @@ char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t term) { di = dictGetIterator(master->sentinels); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *ri = dictGetVal(de); - if (ri->leader == NULL) continue; - /* If the failover is not already in progress we are only interested - * in Sentinels that believe the master is down. Otherwise the leader - * selection is useful for the "failover-takedown" when the original - * leader fails. In that case we consider all the voters. */ - if (!(master->flags & SRI_FAILOVER_IN_PROGRESS) && - !(ri->flags & SRI_MASTER_DOWN)) continue; + if (ri->leader == NULL || ri->leader_epoch != sentinel.current_epoch) + continue; sentinelLeaderIncr(counters,ri->leader); voters++; } @@ -2546,32 +2552,14 @@ int sentinelSendSlaveOf(sentinelRedisInstance *ri, char *host, int port) { return REDIS_OK; } -/* Setup the master state to start a failover as a leader. - * - * State can be either: - * - * SENTINEL_FAILOVER_STATE_WAIT_START: starts a failover from scratch. - * SENTINEL_FAILOVER_STATE_RECONF_SLAVES: takedown a failed failover. 
- */ -void sentinelStartFailover(sentinelRedisInstance *master, int state) { +/* Setup the master state to start a failover. */ +void sentinelStartFailover(sentinelRedisInstance *master) { redisAssert(master->flags & SRI_MASTER); - redisAssert(state == SENTINEL_FAILOVER_STATE_WAIT_START || - state == SENTINEL_FAILOVER_STATE_RECONF_SLAVES); - master->failover_state = state; - master->flags |= SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER; + master->failover_state = SENTINEL_FAILOVER_STATE_WAIT_START; + master->flags |= SRI_FAILOVER_IN_PROGRESS; + master->failover_epoch = ++sentinel.current_epoch; sentinelEvent(REDIS_WARNING,"+failover-triggered",master,"%@"); - - /* Pick a random delay if it's a fresh failover (WAIT_START), and not - * a recovery of a failover started by another sentinel. */ - if (master->failover_state == SENTINEL_FAILOVER_STATE_WAIT_START) { - master->failover_start_time = mstime() + - SENTINEL_FAILOVER_FIXED_DELAY + - (rand() % SENTINEL_FAILOVER_MAX_RANDOM_DELAY); - sentinelEvent(REDIS_WARNING,"+failover-state-wait-start",master, - "%@ #starting in %lld milliseconds", - master->failover_start_time-mstime()); - } master->failover_state_change_time = mstime(); } @@ -2580,66 +2568,18 @@ void sentinelStartFailover(sentinelRedisInstance *master, int state) { * * 1) Enough time has passed since O_DOWN. * 2) The master is marked as SRI_CAN_FAILOVER, so we can failover it. - * 3) We are the objectively leader for this master. - * - * If the conditions are met we flag the master as SRI_FAILOVER_IN_PROGRESS - * and SRI_I_AM_THE_LEADER. - */ + * + * We still don't know if we'll win the election so it is possible that we + * start the failover but that we'll not be able to act. 
*/ void sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { - char *leader; - int isleader; - - /* We can't failover if the master is not in O_DOWN state or if - * there is not already a failover in progress (to perform the - * takedown if the leader died) or if this Sentinel is not allowed - * to start a failover. */ + /* We can't failover if the master is not in O_DOWN state. */ if (!(master->flags & SRI_CAN_FAILOVER) || - !(master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS))) return; + !(master->flags & SRI_O_DOWN)) return; - leader = sentinelGetObjectiveLeader(master); - isleader = leader && strcasecmp(leader,server.runid) == 0; - sdsfree(leader); + /* Failover already in progress? */ + if (master->flags & SRI_FAILOVER_IN_PROGRESS) return; - /* If I'm not the leader, I can't failover for sure. */ - if (!isleader) return; - - /* If the failover is already in progress there are two options... */ - if (master->flags & SRI_FAILOVER_IN_PROGRESS) { - if (master->flags & SRI_I_AM_THE_LEADER) { - /* 1) I'm flagged as leader so I already started the failover. - * Just return. */ - return; - } else { - mstime_t elapsed = mstime() - master->failover_state_change_time; - - /* 2) I'm the new leader, but I'm not flagged as leader in the - * master: I did not started the failover, but the original - * leader has no longer the leadership. - * - * In this case if the failover appears to be lagging - * for at least 25% of the configured failover timeout, - * I can assume I can take control. Otherwise - * it's better to return and wait more. */ - if (elapsed < (master->failover_timeout/4)) return; - sentinelEvent(REDIS_WARNING,"+failover-takedown",master,"%@"); - /* We have already an elected slave if we are in - * FAILOVER_IN_PROGRESS state, that is, the slave that we - * observed turning into a master. 
*/ - sentinelStartFailover(master,SENTINEL_FAILOVER_STATE_RECONF_SLAVES); - /* As an observer we flagged all the slaves as RECONF_SENT but - * now we are in charge of actually sending the reconfiguration - * command so let's clear this flag for all the instances. */ - sentinelDelFlagsToDictOfRedisInstances(master->slaves, - SRI_RECONF_SENT); - } - } else { - /* Brand new failover as SRI_FAILOVER_IN_PROGRESS was not set. - * - * Do we have a slave to promote? Otherwise don't start a failover - * at all. */ - if (sentinelSelectSlave(master) == NULL) return; - sentinelStartFailover(master,SENTINEL_FAILOVER_STATE_WAIT_START); - } + sentinelStartFailover(master); } /* Select a suitable slave to promote. The current algorithm only uses @@ -2723,29 +2663,22 @@ sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) { /* ---------------- Failover state machine implementation ------------------- */ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) { - /* If we in "wait start" but the master is no longer in ODOWN nor in - * SDOWN condition we abort the failover. This is important as it - * prevents a useless failover in a a notable case of netsplit, where - * the sentinels are split from the redis instances. In this case - * the failover will not start while there is the split because no - * good slave can be reached. However when the split is resolved, we - * can go to waitstart if the slave is back reachable a few milliseconds - * before the master is. In that case when the master is back online - * we cancel the failover. */ - if ((ri->flags & (SRI_S_DOWN|SRI_O_DOWN|SRI_FORCE_FAILOVER)) == 0) { - sentinelEvent(REDIS_WARNING,"-failover-abort-master-is-back", - ri,"%@"); - sentinelAbortFailover(ri); - return; - } + char *leader; + int isleader; + + /* Check if we are the leader for the failover epoch. 
*/ + leader = sentinelGetLeader(ri, ri->failover_epoch); + isleader = leader && strcasecmp(leader,server.runid) == 0; + sdsfree(leader); + + /* If I'm not the leader, I can't continue with the failover. */ + if (!isleader) return; /* Start the failover going to the next state if enough time has * elapsed. */ - if (mstime() >= ri->failover_start_time) { - ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE; - ri->failover_state_change_time = mstime(); - sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@"); - } + ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE; + ri->failover_state_change_time = mstime(); + sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@"); } void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) { @@ -2829,8 +2762,7 @@ void sentinelFailoverDetectEnd(sentinelRedisInstance *master) { } if (not_reconfigured == 0) { - int role = (master->flags & SRI_I_AM_THE_LEADER) ? SENTINEL_LEADER : - SENTINEL_OBSERVER; + int role = SENTINEL_LEADER; sentinelEvent(REDIS_WARNING,"+failover-end",master,"%@"); master->failover_state = SENTINEL_FAILOVER_STATE_UPDATE_CONFIG; @@ -2842,7 +2774,7 @@ void sentinelFailoverDetectEnd(sentinelRedisInstance *master) { /* If I'm the leader it is a good idea to send a best effort SLAVEOF * command to all the slaves still not reconfigured to replicate with * the new master. 
*/ - if (timeout && (master->flags & SRI_I_AM_THE_LEADER)) { + if (timeout) { dictIterator *di; dictEntry *de; @@ -2999,8 +2931,7 @@ void sentinelAbortFailover(sentinelRedisInstance *ri) { di = dictGetIterator(ri->slaves); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *slave = dictGetVal(de); - if ((ri->flags & SRI_I_AM_THE_LEADER) && - !(slave->flags & SRI_DISCONNECTED) && + if (!(slave->flags & SRI_DISCONNECTED) && (slave->flags & (SRI_PROMOTED|SRI_RECONF_SENT|SRI_RECONF_INPROG| SRI_RECONF_DONE))) { @@ -3014,9 +2945,8 @@ void sentinelAbortFailover(sentinelRedisInstance *ri) { } dictReleaseIterator(di); - sentinel_role = (ri->flags & SRI_I_AM_THE_LEADER) ? SENTINEL_LEADER : - SENTINEL_OBSERVER; - ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_I_AM_THE_LEADER|SRI_FORCE_FAILOVER); + sentinel_role = SENTINEL_LEADER; + ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_FORCE_FAILOVER); ri->failover_state = SENTINEL_FAILOVER_STATE_NONE; ri->failover_state_change_time = mstime(); if (ri->promoted_slave) { @@ -3039,16 +2969,6 @@ void sentinelHandleRedisInstance(sentinelRedisInstance *ri) { sentinelReconnectInstance(ri); sentinelPingInstance(ri); - /* Masters and slaves */ - if (ri->flags & (SRI_MASTER|SRI_SLAVE)) { - /* Nothing so far. */ - } - - /* Only masters */ - if (ri->flags & SRI_MASTER) { - sentinelAskMasterStateToOtherSentinels(ri); - } - /* ============== ACTING HALF ============= */ /* We don't proceed with the acting half if we are in TILT mode. * TILT happens when we find something odd with the time, like a @@ -3072,6 +2992,7 @@ void sentinelHandleRedisInstance(sentinelRedisInstance *ri) { sentinelCheckObjectivelyDown(ri); sentinelStartFailoverIfNeeded(ri); sentinelFailoverStateMachine(ri); + sentinelAskMasterStateToOtherSentinels(ri); } } From b6b65b29c0f5021f2f7845e53ebc1641f2de1615 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 11:10:10 +0100 Subject: [PATCH 06/53] Sentinel: fix PUBLISH to masters and slaves. 
--- src/sentinel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 6530b2b6..be99147d 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1827,13 +1827,13 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) != -1) { char payload[REDIS_IP_STR_LEN+1024]; sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ? - NULL : ri->master; + ri : ri->master; snprintf(payload,sizeof(payload), "%s,%d,%s,%d,%llu," /* Info about this sentinel. */ "%s,%s,%d,%lld", /* Info about current master. */ ip, server.port, server.runid, - (ri->flags & SRI_CAN_FAILOVER) != 0, + (master->flags & SRI_CAN_FAILOVER) != 0, (unsigned long long) sentinel.current_epoch, /* --- */ master->name,master->addr->ip,master->addr->port, From ab4b2ec88f192ea894bcf9ce3a7ba2c1a5765aff Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 11:32:40 +0100 Subject: [PATCH 07/53] Sentinel: allow to vote for myself. --- src/sentinel.c | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index be99147d..0a69107e 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1043,24 +1043,6 @@ sentinelRedisInstance *sentinelGetMasterByName(char *name) { return ri; } -/* Senitnel lookup by runid */ -sentinelRedisInstance *sentinelGetSentinelByRunid(sentinelRedisInstance *master, char *runid) { - sentinelRedisInstance *retval = NULL; - dictIterator *di; - dictEntry *de; - - di = dictGetIterator(master->sentinels); - while((de = dictNext(di)) != NULL) { - sentinelRedisInstance *ri = dictGetVal(de); - if (!strcmp(ri->runid,runid)) { - retval = ri; - break; - } - } - dictReleaseIterator(di); - return retval; -} - /* Add the specified flags to all the instances in the specified dictionary. 
*/ void sentinelAddFlagsToDictOfRedisInstances(dict *instances, int flags) { dictIterator *di; @@ -2417,17 +2399,15 @@ int compareRunID(const void *a, const void *b) { * If a vote is not available returns NULL, otherwise return the Sentinel * runid and populate the leader_epoch with the epoch of the last vote. */ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch) { - sentinelRedisInstance *si = sentinelGetSentinelByRunid(master,req_runid); - if (req_epoch > sentinel.current_epoch) sentinel.current_epoch = req_epoch; - if (si && master->leader_epoch < req_epoch && - sentinel.current_epoch <= req_epoch) - { + if (master->leader_epoch < req_epoch && sentinel.current_epoch <= req_epoch) { sdsfree(master->leader); master->leader = sdsnew(req_runid); master->leader_epoch = sentinel.current_epoch; + printf("Selected leader %s for epoch %llu\n", master->leader, + (unsigned long long) master->leader_epoch); } *leader_epoch = master->leader_epoch; From 54c447be52d89389f31c0794690f51db27d88cd5 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 13:30:31 +0100 Subject: [PATCH 08/53] Sentinel: wait some time between failover attempts. --- src/sentinel.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 0a69107e..6d7dbc52 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -177,7 +177,7 @@ typedef struct sentinelRedisInstance { uint64_t failover_epoch; /* Epoch of the currently started failover. */ int failover_state; /* See SENTINEL_FAILOVER_STATE_* defines. */ mstime_t failover_state_change_time; - mstime_t failover_start_time; /* When to start to failover if leader. */ + mstime_t failover_start_time; /* Last failover attempt start time. */ mstime_t failover_timeout; /* Max time to refresh failover state. */ struct sentinelRedisInstance *promoted_slave; /* Promoted slave instance. 
*/ /* Scripts executed to notify admin or reconfigure clients: when they @@ -2411,7 +2411,7 @@ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char } *leader_epoch = master->leader_epoch; - return master->leader; + return master->leader ? sdsnew(master->leader) : NULL; } struct sentinelLeader { @@ -2540,6 +2540,7 @@ void sentinelStartFailover(sentinelRedisInstance *master) { master->flags |= SRI_FAILOVER_IN_PROGRESS; master->failover_epoch = ++sentinel.current_epoch; sentinelEvent(REDIS_WARNING,"+failover-triggered",master,"%@"); + master->failover_start_time = mstime(); master->failover_state_change_time = mstime(); } @@ -2559,6 +2560,10 @@ void sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { /* Failover already in progress? */ if (master->flags & SRI_FAILOVER_IN_PROGRESS) return; + /* Last failover attempt started too little time ago? */ + if (mstime() - master->failover_start_time < + SENTINEL_PUBLISH_PERIOD*4) return; + sentinelStartFailover(master); } From e6b9d5e97ead58ce819d33edfbfce55ffdd63568 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 13:35:25 +0100 Subject: [PATCH 09/53] Sentinel: +new-epoch events. --- src/sentinel.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 6d7dbc52..c0bce345 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1724,8 +1724,11 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd } /* Update local current_epoch if received current_epoch is greater. */ - if (current_epoch > sentinel.current_epoch) + if (current_epoch > sentinel.current_epoch) { sentinel.current_epoch = current_epoch; + sentinelEvent(REDIS_WARNING,"+new-epoch",ri,"%llu", + (unsigned long long) sentinel.current_epoch); + } /* Update master info if received configuration is newer. 
*/ if ((msgmaster = sentinelGetMasterByName(token[5])) != NULL) { @@ -2399,15 +2402,18 @@ int compareRunID(const void *a, const void *b) { * If a vote is not available returns NULL, otherwise return the Sentinel * runid and populate the leader_epoch with the epoch of the last vote. */ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch) { - if (req_epoch > sentinel.current_epoch) + if (req_epoch > sentinel.current_epoch) { sentinel.current_epoch = req_epoch; + sentinelEvent(REDIS_WARNING,"+new-epoch",master,"%llu", + (unsigned long long) sentinel.current_epoch); + } if (master->leader_epoch < req_epoch && sentinel.current_epoch <= req_epoch) { sdsfree(master->leader); master->leader = sdsnew(req_runid); master->leader_epoch = sentinel.current_epoch; - printf("Selected leader %s for epoch %llu\n", master->leader, - (unsigned long long) master->leader_epoch); + sentinelEvent(REDIS_WARNING,"+vote-for-leader",master,"%s %llu", + master->leader, (unsigned long long) master->leader_epoch); } *leader_epoch = master->leader_epoch; @@ -2539,7 +2545,9 @@ void sentinelStartFailover(sentinelRedisInstance *master) { master->failover_state = SENTINEL_FAILOVER_STATE_WAIT_START; master->flags |= SRI_FAILOVER_IN_PROGRESS; master->failover_epoch = ++sentinel.current_epoch; - sentinelEvent(REDIS_WARNING,"+failover-triggered",master,"%@"); + sentinelEvent(REDIS_WARNING,"+new-epoch",master,"%llu", + (unsigned long long) sentinel.current_epoch); + sentinelEvent(REDIS_WARNING,"+try-failover",master,"%@"); master->failover_start_time = mstime(); master->failover_state_change_time = mstime(); } @@ -2658,6 +2666,7 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) { /* If I'm not the leader, I can't continue with the failover. */ if (!isleader) return; + sentinelEvent(REDIS_WARNING,"+elected-leader",ri,"%@"); /* Start the failover going to the next state if enough time has * elapsed. 
*/ From 4a128b949df68d24517e3692b456b460377b7790 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 16:38:02 +0100 Subject: [PATCH 10/53] Sentinel: when starting failover seek for votes ASAP. --- src/sentinel.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index c0bce345..7cf32d7e 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -72,6 +72,7 @@ typedef struct sentinelAddr { #define SRI_FORCE_FAILOVER (1<<13) /* Force failover with master up. */ #define SRI_SCRIPT_KILL_SENT (1<<14) /* SCRIPT KILL already sent on -BUSY */ +#define SENTINEL_NO_FLAGS 0 /* Generic no flags define. */ #define SENTINEL_INFO_PERIOD 10000 #define SENTINEL_PING_PERIOD 1000 #define SENTINEL_ASK_PERIOD 1000 @@ -2326,7 +2327,8 @@ void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *p * SENTINEL IS-MASTER-DOWN-BY-ADDR requests to other sentinels * in order to get the replies that allow to reach the quorum and * possibly also mark the master as objectively down. */ -void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) { +#define SENTINEL_ASK_FORCED (1<<0) +void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int flags) { dictIterator *di; dictEntry *de; @@ -2359,7 +2361,8 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master) { * 3) We did not received the info within SENTINEL_ASK_PERIOD ms. */ if ((master->flags & SRI_S_DOWN) == 0) continue; if (ri->flags & SRI_DISCONNECTED) continue; - if (mstime() - ri->last_master_down_reply_time < SENTINEL_ASK_PERIOD) + if (!(flags & SENTINEL_ASK_FORCED) && + mstime() - ri->last_master_down_reply_time < SENTINEL_ASK_PERIOD) continue; /* Ask */ @@ -2559,20 +2562,23 @@ void sentinelStartFailover(sentinelRedisInstance *master) { * 2) The master is marked as SRI_CAN_FAILOVER, so we can failover it. 
* * We still don't know if we'll win the election so it is possible that we - * start the failover but that we'll not be able to act. */ -void sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { + * start the failover but that we'll not be able to act. + * + * Return non-zero if a failover was started. */ +int sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { /* We can't failover if the master is not in O_DOWN state. */ if (!(master->flags & SRI_CAN_FAILOVER) || - !(master->flags & SRI_O_DOWN)) return; + !(master->flags & SRI_O_DOWN)) return 0; /* Failover already in progress? */ - if (master->flags & SRI_FAILOVER_IN_PROGRESS) return; + if (master->flags & SRI_FAILOVER_IN_PROGRESS) return 0; /* Last failover attempt started too little time ago? */ if (mstime() - master->failover_start_time < - SENTINEL_PUBLISH_PERIOD*4) return; + SENTINEL_PUBLISH_PERIOD*4) return 0; sentinelStartFailover(master); + return 1; } /* Select a suitable slave to promote. The current algorithm only uses @@ -2984,9 +2990,10 @@ void sentinelHandleRedisInstance(sentinelRedisInstance *ri) { /* Only masters */ if (ri->flags & SRI_MASTER) { sentinelCheckObjectivelyDown(ri); - sentinelStartFailoverIfNeeded(ri); + if (sentinelStartFailoverIfNeeded(ri)) + sentinelAskMasterStateToOtherSentinels(ri,SENTINEL_ASK_FORCED); sentinelFailoverStateMachine(ri); - sentinelAskMasterStateToOtherSentinels(ri); + sentinelAskMasterStateToOtherSentinels(ri,SENTINEL_NO_FLAGS); } } From d2bc6dc39a50a412234ed05147b1e92e68fbe8aa Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 17:07:31 +0100 Subject: [PATCH 11/53] Sentinel: new failover algo, desync slaves and update config epoch. 
--- src/sentinel.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index 7cf32d7e..5ad7bee3 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1509,6 +1509,12 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { (ri->master->failover_state == SENTINEL_FAILOVER_STATE_WAIT_PROMOTION)) { + /* Now that we are sure the slave was reconfigured as a master + * set the master configuration epoch to the epoch we won the + * election to perform this failover. This will force the other + * Sentinels to update their config (assuming there is not + * a newer one already available). */ + ri->master->config_epoch = ri->master->failover_epoch; ri->master->failover_state = SENTINEL_FAILOVER_STATE_RECONF_SLAVES; ri->master->failover_state_change_time = mstime(); sentinelEvent(REDIS_WARNING,"+promoted-slave",ri,"%@"); @@ -2417,6 +2423,13 @@ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char master->leader_epoch = sentinel.current_epoch; sentinelEvent(REDIS_WARNING,"+vote-for-leader",master,"%s %llu", master->leader, (unsigned long long) master->leader_epoch); + /* If we did not voted for ourselves, set the master failover start + * time to now, in order to force a delay before we can start a + * failover for the same master. + * + * The random addition is useful to desynchronize a bit the slaves + * and reduce the chance that no slave gets majority. */ + master->failover_start_time = mstime() + rand() % 2000; } *leader_epoch = master->leader_epoch; @@ -2671,7 +2684,14 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) { sdsfree(leader); /* If I'm not the leader, I can't continue with the failover. */ - if (!isleader) return; + if (!isleader) { + /* Abort the failover if I'm not the leader after some time. 
*/ + if (mstime() - ri->failover_start_time > 10000) { + sentinelEvent(REDIS_WARNING,"-failover-abort-not-elected",ri,"%@"); + sentinelAbortFailover(ri); + } + return; + } sentinelEvent(REDIS_WARNING,"+elected-leader",ri,"%@"); /* Start the failover going to the next state if enough time has From 24158d1488654cd5bfd2a66ab49ff7c8d0f4ad96 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 17:21:48 +0100 Subject: [PATCH 12/53] Sentinel: added config-epoch to SENTINEL masters output. --- src/sentinel.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/sentinel.c b/src/sentinel.c index 5ad7bee3..479df17e 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1939,6 +1939,10 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { /* Only masters */ if (ri->flags & SRI_MASTER) { + addReplyBulkCString(c,"config-epoch"); + addReplyBulkLongLong(c,ri->config_epoch); + fields++; + addReplyBulkCString(c,"num-slaves"); addReplyBulkLongLong(c,dictSize(ri->slaves)); fields++; From dfa5f8b777cefdfcdad3665dd1eade734022636b Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 23:00:14 +0100 Subject: [PATCH 13/53] Sentinel: change event name when converting master to slave. --- src/sentinel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index 479df17e..af3c264d 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1543,7 +1543,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { ri->master->addr->ip, ri->master->addr->port); if (retval == REDIS_OK) - sentinelEvent(REDIS_NOTICE,"+demote-old-slave",ri,"%@"); + sentinelEvent(REDIS_NOTICE,"+convert-to-slave",ri,"%@"); } } } From 1569af1f2390fa746e8ad363949cfc23a3407199 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Nov 2013 23:07:33 +0100 Subject: [PATCH 14/53] Sentinel: receive Pub/Sub messages from slaves. 
--- src/sentinel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index af3c264d..e66b8b84 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1334,7 +1334,7 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) { } } /* Pub / Sub */ - if ((ri->flags & SRI_MASTER) && ri->pc == NULL) { + if ((ri->flags & (SRI_MASTER|SRI_SLAVE)) && ri->pc == NULL) { ri->pc = redisAsyncConnect(ri->addr->ip,ri->addr->port); if (ri->pc->err) { sentinelEvent(REDIS_DEBUG,"-pubsub-link-reconnection",ri,"%@ #%s", From 6bd4f6bffe96878cee454ade8595aae3e2913477 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 10:30:45 +0100 Subject: [PATCH 15/53] Sentinel: sentinelResetMaster() new flag to avoid removing set of sentinels. This commit also removes some dead code and cleanup generic flags. --- src/sentinel.c | 36 ++++++++++++++---------------------- 1 file changed, 14 insertions(+), 22 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index e66b8b84..648e1ac0 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -72,7 +72,6 @@ typedef struct sentinelAddr { #define SRI_FORCE_FAILOVER (1<<13) /* Force failover with master up. */ #define SRI_SCRIPT_KILL_SENT (1<<14) /* SCRIPT KILL already sent on -BUSY */ -#define SENTINEL_NO_FLAGS 0 /* Generic no flags define. */ #define SENTINEL_INFO_PERIOD 10000 #define SENTINEL_PING_PERIOD 1000 #define SENTINEL_ASK_PERIOD 1000 @@ -111,11 +110,13 @@ typedef struct sentinelAddr { #define SENTINEL_MASTER_LINK_STATUS_UP 0 #define SENTINEL_MASTER_LINK_STATUS_DOWN 1 -/* Generic flags that can be used with different functions. */ +/* Generic flags that can be used with different functions. + * They use higher bits to avoid colliding with the function specific + * flags. 
*/ #define SENTINEL_NO_FLAGS 0 -#define SENTINEL_GENERATE_EVENT 1 -#define SENTINEL_LEADER 2 -#define SENTINEL_OBSERVER 4 +#define SENTINEL_GENERATE_EVENT (1<<16) +#define SENTINEL_LEADER (1<<17) +#define SENTINEL_OBSERVER (1<<18) /* Script execution flags and limits. */ #define SENTINEL_SCRIPT_NONE 0 @@ -1079,12 +1080,16 @@ void sentinelDelFlagsToDictOfRedisInstances(dict *instances, int flags) { * 5) In the process of doing this undo the failover if in progress. * 6) Disconnect the connections with the master (will reconnect automatically). */ + +#define SENTINEL_RESET_NO_SENTINELS (1<<0) void sentinelResetMaster(sentinelRedisInstance *ri, int flags) { redisAssert(ri->flags & SRI_MASTER); dictRelease(ri->slaves); - dictRelease(ri->sentinels); ri->slaves = dictCreate(&instancesDictType,NULL); - ri->sentinels = dictCreate(&instancesDictType,NULL); + if (!(flags & SENTINEL_RESET_NO_SENTINELS)) { + dictRelease(ri->sentinels); + ri->sentinels = dictCreate(&instancesDictType,NULL); + } if (ri->cc) sentinelKillLink(ri,ri->cc); if (ri->pc) sentinelKillLink(ri,ri->pc); ri->flags &= SRI_MASTER|SRI_CAN_FAILOVER|SRI_DISCONNECTED; @@ -1134,17 +1139,13 @@ int sentinelResetMastersByPattern(char *pattern, int flags) { * This is used to handle the +switch-master and +redirect-to-master events. * * The function returns REDIS_ERR if the address can't be resolved for some - * reason. Otherwise REDIS_OK is returned. - * - * TODO: make this reset so that original sentinels are re-added with - * same ip / port / runid. - */ + * reason. Otherwise REDIS_OK is returned. 
*/ int sentinelResetMasterAndChangeAddress(sentinelRedisInstance *master, char *ip, int port) { sentinelAddr *oldaddr, *newaddr; newaddr = createSentinelAddr(ip,port); if (newaddr == NULL) return REDIS_ERR; - sentinelResetMaster(master,SENTINEL_NO_FLAGS); + sentinelResetMaster(master,SENTINEL_RESET_NO_SENTINELS); oldaddr = master->addr; master->addr = newaddr; master->o_down_since_time = 0; @@ -2898,15 +2899,6 @@ void sentinelFailoverSwitchToPromotedSlave(sentinelRedisInstance *master) { old_master_ip = sdsdup(master->addr->ip); old_master_port = master->addr->port; sentinelResetMasterAndChangeAddress(master,ref->addr->ip,ref->addr->port); - /* If this is a real switch and not just a user requested reset, we want - * to add all the known instances as slaves, and also all the sentinels - * back to this master. */ - if (master != ref) { - /* TODO: - createSentinelRedisInstance(NULL,SRI_SLAVE - old_master_ip, old_master_port, master->quorum, master); - */ - } sdsfree(old_master_ip); } From ae35b7e2405c8e68298d658f6793d545e3dc03ad Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 13:01:11 +0100 Subject: [PATCH 16/53] Sentinel: readd slaves back after a master reset. --- src/sentinel.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/sentinel.c b/src/sentinel.c index 648e1ac0..72ef3587 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -443,6 +443,11 @@ void releaseSentinelAddr(sentinelAddr *sa) { zfree(sa); } +/* Return non-zero if two addresses are equal. */ +int sentinelAddrIsEqual(sentinelAddr *a, sentinelAddr *b) { + return a->port == b->port && !strcasecmp(a->ip,b->ip); +} + /* =========================== Events notification ========================== */ /* Send an event to log, pub/sub, user notification script. @@ -1142,15 +1147,54 @@ int sentinelResetMastersByPattern(char *pattern, int flags) { * reason. Otherwise REDIS_OK is returned. 
*/ int sentinelResetMasterAndChangeAddress(sentinelRedisInstance *master, char *ip, int port) { sentinelAddr *oldaddr, *newaddr; + sentinelAddr **slaves = NULL; + int numslaves = 0, j; + dictIterator *di; + dictEntry *de; newaddr = createSentinelAddr(ip,port); if (newaddr == NULL) return REDIS_ERR; + + /* Make a list of slaves to add back after the reset. + * Don't include the one having the address we are switching to. */ + di = dictGetIterator(master->slaves); + while((de = dictNext(di)) != NULL) { + sentinelRedisInstance *slave = dictGetVal(de); + + if (sentinelAddrIsEqual(slave->addr,newaddr)) continue; + slaves = zrealloc(slaves,sizeof(sentinelAddr*)*(numslaves+1)); + slaves[numslaves++] = createSentinelAddr(slave->addr->ip, + slave->addr->port); + } + dictReleaseIterator(di); + + /* If we are switching to a different address, include the old address + * as a slave as well, so that we'll be able to sense / reconfigure + * the old master. */ + if (!sentinelAddrIsEqual(newaddr,master->addr)) { + slaves = zrealloc(slaves,sizeof(sentinelAddr*)*(numslaves+1)); + slaves[numslaves++] = createSentinelAddr(master->addr->ip, + master->addr->port); + } + + /* Reset and switch address. */ sentinelResetMaster(master,SENTINEL_RESET_NO_SENTINELS); oldaddr = master->addr; master->addr = newaddr; master->o_down_since_time = 0; master->s_down_since_time = 0; + /* Add slaves back. */ + for (j = 0; j < numslaves; j++) { + sentinelRedisInstance *slave; + + slave = createSentinelRedisInstance(NULL,SRI_SLAVE,slaves[j]->ip, + slaves[j]->port, master->quorum, master); + releaseSentinelAddr(slaves[j]); + if (slave) sentinelEvent(REDIS_NOTICE,"+slave",slave,"%@"); + } + zfree(slaves); + /* Release the old address at the end so we are safe even if the function * gets the master->addr->ip and master->addr->port as arguments. 
*/ releaseSentinelAddr(oldaddr); From 9e40c46f5e2e4799f072cb91ab7e5af46cccdc18 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 13:43:59 +0100 Subject: [PATCH 17/53] Sentinel: fix no-down check in master->slave conversion code. --- src/sentinel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index 72ef3587..fb5896eb 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1574,7 +1574,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { * configuratio via Pub/Sub if any. */ mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; - if (!sentinelRedisInstanceNoDownFor(ri->master,wait_time) || + if (!sentinelRedisInstanceNoDownFor(ri,wait_time) || (mstime()-sentinel.tilt_start_time) < wait_time) return; From 46a053d34bedeccba7523fb4756aab7245e86da3 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 16:18:23 +0100 Subject: [PATCH 18/53] Sentinel: track role change time. Wait before reconfigurations. --- src/sentinel.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index fb5896eb..442eebc3 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -155,6 +155,14 @@ typedef struct sentinelRedisInstance { mstime_t down_after_period; /* Consider it down after that period. */ mstime_t info_refresh; /* Time at which we received INFO output from it. */ + /* Role and the first time we observed it. + * This is useful in order to delay replacing what the instance reports + * with our own configuration. We need to always wait some time in order + * to give a chance to the leader to report the new configuration before + * we do silly things. */ + int role_reported; + mstime_t role_reported_time; + /* Master specific. */ dict *sentinels; /* Other sentinels monitoring the same master. */ dict *slaves; /* Slaves for this master instance. 
*/ @@ -911,6 +919,10 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * ri->notification_script = NULL; ri->client_reconfig_script = NULL; + /* Role */ + ri->role_reported = ri->flags & (SRI_MASTER|SRI_SLAVE); + ri->role_reported_time = mstime(); + /* Add into the right table. */ dictAdd(table, ri->name, ri); return ri; @@ -1536,6 +1548,11 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { * master, always. */ if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE && ri->slave_master_host) { + if (ri->role_reported != SRI_MASTER) { + ri->role_reported_time = mstime(); + ri->role_reported = SRI_MASTER; + } + sentinelEvent(REDIS_WARNING,"+redirect-to-master",ri, "%s %s %d %s %d", ri->name, ri->addr->ip, ri->addr->port, @@ -1547,6 +1564,11 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* Handle slave -> master role switch. */ if ((ri->flags & SRI_SLAVE) && role == SRI_MASTER) { + if (ri->role_reported != SRI_SLAVE) { + ri->role_reported_time = mstime(); + ri->role_reported = SRI_SLAVE; + } + /* If this is a promoted slave we can change state to the * failover state machine. */ if (!sentinel.tilt && @@ -1575,7 +1597,8 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; if (!sentinelRedisInstanceNoDownFor(ri,wait_time) || - (mstime()-sentinel.tilt_start_time) < wait_time) + mstime() - ri->role_reported_time < wait_time || + mstime() - sentinel.tilt_start_time < wait_time) return; /* Make sure the master is sane before reconfiguring this instance From 17718fdcbae91bb47a370962fd87e660f5452b06 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 16:21:58 +0100 Subject: [PATCH 19/53] Sentinel: make sure role_reported is always updated. 
--- src/sentinel.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 442eebc3..a62e6495 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1546,20 +1546,21 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* When what we believe is our master, turned into a slave, the wiser * thing we can do is to follow the events and redirect to the new * master, always. */ - if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE && ri->slave_master_host) - { + if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE) { if (ri->role_reported != SRI_MASTER) { ri->role_reported_time = mstime(); ri->role_reported = SRI_MASTER; } - sentinelEvent(REDIS_WARNING,"+redirect-to-master",ri, - "%s %s %d %s %d", - ri->name, ri->addr->ip, ri->addr->port, - ri->slave_master_host, ri->slave_master_port); - sentinelResetMasterAndChangeAddress(ri,ri->slave_master_host, - ri->slave_master_port); - return; /* Don't process anything after this event. */ + if (ri->slave_master_host) { + sentinelEvent(REDIS_WARNING,"+redirect-to-master",ri, + "%s %s %d %s %d", + ri->name, ri->addr->ip, ri->addr->port, + ri->slave_master_host, ri->slave_master_port); + sentinelResetMasterAndChangeAddress(ri,ri->slave_master_host, + ri->slave_master_port); + return; /* Don't process anything after this event. */ + } } /* Handle slave -> master role switch. */ From a0afa66f4b3353072879cf87bcc8cc1129a812ac Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 16:28:52 +0100 Subject: [PATCH 20/53] Sentinel: being a master and reporting as slave is considered SDOWN. 
--- src/sentinel.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index a62e6495..b115de87 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2319,8 +2319,18 @@ void sentinelCheckSubjectivelyDown(sentinelRedisInstance *ri) { sentinelKillLink(ri,ri->pc); } - /* Update the subjectively down flag. */ - if (elapsed > ri->down_after_period) { + /* Update the subjectively down flag. We believe the instance is in SDOWN + * state if: + * 1) It is not replying. + * 2) We believe it is a master, it reports to be a slave for enough time + * to meet the down_after_period, plus enough time to get two times + * INFO report from the instance. */ + if (elapsed > ri->down_after_period || + (ri->flags & SRI_MASTER && + ri->role_reported == SRI_SLAVE && + mstime() - ri->role_reported_time > + (ri->down_after_period+SENTINEL_INFO_PERIOD*2))) + { /* Is subjectively down */ if ((ri->flags & SRI_S_DOWN) == 0) { sentinelEvent(REDIS_WARNING,"+sdown",ri,"%@"); From ddaad9fe2d9100f9ccbd6aa3515157ecbf7e413e Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 16:36:40 +0100 Subject: [PATCH 21/53] Sentinel: role reporting fixed and added in SENTINEL output. --- src/sentinel.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index b115de87..b60c375d 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1547,9 +1547,9 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { * thing we can do is to follow the events and redirect to the new * master, always. 
*/ if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE) { - if (ri->role_reported != SRI_MASTER) { + if (ri->role_reported != SRI_SLAVE) { ri->role_reported_time = mstime(); - ri->role_reported = SRI_MASTER; + ri->role_reported = SRI_SLAVE; } if (ri->slave_master_host) { @@ -1565,9 +1565,9 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* Handle slave -> master role switch. */ if ((ri->flags & SRI_SLAVE) && role == SRI_MASTER) { - if (ri->role_reported != SRI_SLAVE) { + if (ri->role_reported != SRI_MASTER) { ri->role_reported_time = mstime(); - ri->role_reported = SRI_SLAVE; + ri->role_reported = SRI_MASTER; } /* If this is a promoted slave we can change state to the @@ -2004,6 +2004,15 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { addReplyBulkCString(c,"info-refresh"); addReplyBulkLongLong(c,mstime() - ri->info_refresh); fields++; + + addReplyBulkCString(c,"role-reported"); + addReplyBulkCString(c, (ri->role_reported == SRI_MASTER) ? "master" : + "slave"); + fields++; + + addReplyBulkCString(c,"role-reported-time"); + addReplyBulkLongLong(c,mstime() - ri->role_reported_time); + fields++; } /* Only masters */ From 76a88f56e5799140d316aaaaabd51b23b2461826 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 17:02:09 +0100 Subject: [PATCH 22/53] Sentinel: safer slave reconfig, master reported role should match. --- src/sentinel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sentinel.c b/src/sentinel.c index b60c375d..d6b394be 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1605,6 +1605,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* Make sure the master is sane before reconfiguring this instance * into a slave. 
*/ if (ri->master->flags & SRI_MASTER && + ri->master->role_reported == SRI_MASTER && (ri->master->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0 && (mstime() - ri->master->info_refresh) < SENTINEL_INFO_PERIOD*2) { From 8297745fa6dab7a9a73964ab2ee2c8eca17bd41a Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Nov 2013 17:03:42 +0100 Subject: [PATCH 23/53] Sentinel: redirect-to-master is not ok with new algorithm. Now Sentinel believes the current configuration is always the winner and should be applied by Sentinels instead of trying to adapt our view of the cluster based on what we observe. So the only way to modify what a Sentinel believes to be the truth is to win an election and advertise the new configuration via Pub / Sub with a greater configuration epoch. --- src/sentinel.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index d6b394be..89cb631c 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1543,24 +1543,12 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { * Some things will not happen if sentinel.tilt is true, but some will * still be processed. */ - /* When what we believe is our master, turned into a slave, the wiser - * thing we can do is to follow the events and redirect to the new - * master, always. */ + /* Handle master -> slave role switch. */ if ((ri->flags & SRI_MASTER) && role == SRI_SLAVE) { if (ri->role_reported != SRI_SLAVE) { ri->role_reported_time = mstime(); ri->role_reported = SRI_SLAVE; } - - if (ri->slave_master_host) { - sentinelEvent(REDIS_WARNING,"+redirect-to-master",ri, - "%s %s %d %s %d", - ri->name, ri->addr->ip, ri->addr->port, - ri->slave_master_host, ri->slave_master_port); - sentinelResetMasterAndChangeAddress(ri,ri->slave_master_host, - ri->slave_master_port); - return; /* Don't process anything after this event. */ - } } /* Handle slave -> master role switch.
*/ From 3e27d678da8ea0e3aeeadf93f39e45d184bf778d Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 14 Nov 2013 00:08:13 +0100 Subject: [PATCH 24/53] Sentinel: remember last time slave changed master. --- src/sentinel.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 89cb631c..a6e0c309 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -162,6 +162,7 @@ typedef struct sentinelRedisInstance { * we do silly things. */ int role_reported; mstime_t role_reported_time; + mstime_t slave_conf_change_time; /* Last time slave master addr changed. */ /* Master specific. */ dict *sentinels; /* Other sentinels monitoring the same master. */ @@ -922,6 +923,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * /* Role */ ri->role_reported = ri->flags & (SRI_MASTER|SRI_SLAVE); ri->role_reported_time = mstime(); + ri->slave_conf_change_time = mstime(); /* Add into the right table. */ dictAdd(table, ri->name, ri); @@ -1515,13 +1517,24 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { if (role == SRI_SLAVE) { /* master_host: */ if (sdslen(l) >= 12 && !memcmp(l,"master_host:",12)) { - sdsfree(ri->slave_master_host); - ri->slave_master_host = sdsnew(l+12); + if (ri->slave_master_host == NULL || + strcasecmp(l+12,ri->slave_master_host)) + { + sdsfree(ri->slave_master_host); + ri->slave_master_host = sdsnew(l+12); + ri->slave_conf_change_time = mstime(); + } } /* master_port: */ - if (sdslen(l) >= 12 && !memcmp(l,"master_port:",12)) - ri->slave_master_port = atoi(l+12); + if (sdslen(l) >= 12 && !memcmp(l,"master_port:",12)) { + int slave_master_port = atoi(l+12); + + if (ri->slave_master_port != slave_master_port) { + ri->slave_master_port = slave_master_port; + ri->slave_conf_change_time = mstime(); + } + } /* master_link_status: */ if (sdslen(l) >= 19 && !memcmp(l,"master_link_status:",19)) { @@ -1548,6 +1561,7 @@ void 
sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { if (ri->role_reported != SRI_SLAVE) { ri->role_reported_time = mstime(); ri->role_reported = SRI_SLAVE; + ri->slave_conf_change_time = mstime(); } } From 64ad6648a8595bfd265b3eb3e80a45189ade25b7 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 14 Nov 2013 00:29:38 +0100 Subject: [PATCH 25/53] Sentinel: reconfigure slaves to right master. --- src/sentinel.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index a6e0c309..4e42f2b1 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1596,7 +1596,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* A slave turned into a master. We want to force our view and * reconfigure as slave, but make sure to wait some time before * doing this in order to make sure to receive an updated - * configuratio via Pub/Sub if any. */ + * configuration via Pub/Sub if any. */ mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; if (!sentinelRedisInstanceNoDownFor(ri,wait_time) || @@ -1620,6 +1620,32 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { } } + /* Handle slaves replicating to a different master address. */ + if ((ri->flags & SRI_SLAVE) && !sentinel.tilt && + (ri->slave_master_port != ri->master->addr->port || + strcasecmp(ri->slave_master_host,ri->master->addr->ip))) + { + mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; + + if (!sentinelRedisInstanceNoDownFor(ri,wait_time) || + mstime() - ri->slave_conf_change_time < wait_time) + return; + + /* Make sure the master is sane before reconfiguring this instance + * into a slave. 
*/ + if (ri->master->flags & SRI_MASTER && + ri->master->role_reported == SRI_MASTER && + (ri->master->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0 && + (mstime() - ri->master->info_refresh) < SENTINEL_INFO_PERIOD*2) + { + int retval = sentinelSendSlaveOf(ri, + ri->master->addr->ip, + ri->master->addr->port); + if (retval == REDIS_OK) + sentinelEvent(REDIS_NOTICE,"+fix-slave-config",ri,"%@"); + } + } + /* None of the following conditions are processed when in tilt mode, so * return asap. */ if (sentinel.tilt) return; From dfbd9c5aeb9f18cff20f450b6643ab476a3624a9 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 14 Nov 2013 00:36:43 +0100 Subject: [PATCH 26/53] Sentinel: simplify and refactor slave reconfig code. --- src/sentinel.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 4e42f2b1..a01ae8fd 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1431,6 +1431,19 @@ void sentinelReconnectInstance(sentinelRedisInstance *ri) { /* ======================== Redis instances pinging ======================== */ +/* Return true if master looks "sane", that is: + * 1) It is actually a master in the current configuration. + * 2) It reports itself as a master. + * 3) It is not SDOWN or ODOWN. + * 4) We obtained last INFO no more than two times the INFO period of time ago. */ +int sentinelMasterLooksSane(sentinelRedisInstance *master) { + return + master->flags & SRI_MASTER && + master->role_reported == SRI_MASTER && + (master->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0 && + (mstime() - master->info_refresh) < SENTINEL_INFO_PERIOD*2; +} + /* Process the INFO output from masters. */ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { sds *lines; @@ -1594,22 +1607,13 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { "start",ri->master->addr,ri->addr); } else if (!sentinel.tilt) { /* A slave turned into a master. 
We want to force our view and - * reconfigure as slave, but make sure to wait some time before - * doing this in order to make sure to receive an updated - * configuration via Pub/Sub if any. */ + * reconfigure as slave. Wait some time after the change before + * going forward, to receive new configs if any. */ mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; - if (!sentinelRedisInstanceNoDownFor(ri,wait_time) || - mstime() - ri->role_reported_time < wait_time || - mstime() - sentinel.tilt_start_time < wait_time) - return; - - /* Make sure the master is sane before reconfiguring this instance - * into a slave. */ - if (ri->master->flags & SRI_MASTER && - ri->master->role_reported == SRI_MASTER && - (ri->master->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0 && - (mstime() - ri->master->info_refresh) < SENTINEL_INFO_PERIOD*2) + if (sentinelMasterLooksSane(ri->master) && + sentinelRedisInstanceNoDownFor(ri,wait_time) && + mstime() - ri->role_reported_time > wait_time) { int retval = sentinelSendSlaveOf(ri, ri->master->addr->ip, @@ -1627,16 +1631,11 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { { mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; - if (!sentinelRedisInstanceNoDownFor(ri,wait_time) || - mstime() - ri->slave_conf_change_time < wait_time) - return; - /* Make sure the master is sane before reconfiguring this instance * into a slave. 
*/ - if (ri->master->flags & SRI_MASTER && - ri->master->role_reported == SRI_MASTER && - (ri->master->flags & (SRI_S_DOWN|SRI_O_DOWN)) == 0 && - (mstime() - ri->master->info_refresh) < SENTINEL_INFO_PERIOD*2) + if (sentinelMasterLooksSane(ri->master) && + sentinelRedisInstanceNoDownFor(ri,wait_time) && + mstime() - ri->slave_conf_change_time > wait_time) { int retval = sentinelSendSlaveOf(ri, ri->master->addr->ip, From c0d72293645eeef2da73080d1f77cf9944947b45 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 14 Nov 2013 10:23:05 +0100 Subject: [PATCH 27/53] Sentinel: fix conditional to only affect slaves with wrong master. --- src/sentinel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sentinel.c b/src/sentinel.c index a01ae8fd..31f58338 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1626,6 +1626,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* Handle slaves replicating to a different master address. */ if ((ri->flags & SRI_SLAVE) && !sentinel.tilt && + role == SRI_SLAVE && (ri->slave_master_port != ri->master->addr->port || strcasecmp(ri->slave_master_host,ri->master->addr->ip))) { From e4c65e72c60c4d25ed1521609cd3f55030a5aada Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 14 Nov 2013 10:23:54 +0100 Subject: [PATCH 28/53] Sentinel: master address selection in get-master-address refactored. --- src/sentinel.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 31f58338..b59f2dc8 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1226,6 +1226,24 @@ int sentinelRedisInstanceNoDownFor(sentinelRedisInstance *ri, mstime_t ms) { return most_recent == 0 || (mstime() - most_recent) > ms; } +/* Return the current master address, that is, its address or the address + * of the promoted slave if already operational. 
*/ +sentinelAddr *sentinelGetCurrentMasterAddress(sentinelRedisInstance *master) { + /* If we are failing over the master, and the state is already + * SENTINEL_FAILOVER_STATE_RECONF_SLAVES or greater, it means that we + * already have the new configuration epoch in the master, and the + * slave acknowledged the configuration switch. Advertise the new + * address. */ + if ((master->flags & SRI_FAILOVER_IN_PROGRESS) && + master->promoted_slave && + master->failover_state >= SENTINEL_FAILOVER_STATE_RECONF_SLAVES) + { + return master->promoted_slave->addr; + } else { + return master->addr; + } +} + /* ============================ Config handling ============================= */ char *sentinelHandleConfiguration(char **argv, int argc) { sentinelRedisInstance *ri; @@ -2217,18 +2235,8 @@ void sentinelCommand(redisClient *c) { } else if (ri->info_refresh == 0) { addReplySds(c,sdsnew("-IDONTKNOW I have not enough information to reply. Please ask another Sentinel.\r\n")); } else { - sentinelAddr *addr = ri->addr; + sentinelAddr *addr = sentinelGetCurrentMasterAddress(ri); - /* If we are in the middle of a failover, and the slave was - * already successfully switched to master role, we can advertise - * the new address as slave in order to allow clients to talk - * with the new master ASAP. */ - if ((ri->flags & SRI_FAILOVER_IN_PROGRESS) && - ri->promoted_slave && - ri->failover_state >= SENTINEL_FAILOVER_STATE_RECONF_SLAVES) - { - addr = ri->promoted_slave->addr; - } addReplyMultiBulkLen(c,2); addReplyBulkCString(c,addr->ip); addReplyBulkLongLong(c,addr->port); From 69d826a354f69918e3887bac1d11c208917f0b4e Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 14 Nov 2013 10:25:55 +0100 Subject: [PATCH 29/53] Sentinel: fix address of master in Hello messages. Once we switched configuration during a failover, we should advertise the new address. 
This was a serious race condition as the Sentinel performing the failover for a moment advertised the old address with the new configuration epoch: once transmitted to the other Sentinels the broken configuration would remain there forever, until the next failover (because a greater configuration epoch is required to overwrite an older one). --- src/sentinel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index b59f2dc8..e6c39dd5 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1936,6 +1936,7 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { char payload[REDIS_IP_STR_LEN+1024]; sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ? ri : ri->master; + sentinelAddr *master_addr = sentinelGetCurrentMasterAddress(master); snprintf(payload,sizeof(payload), "%s,%d,%s,%d,%llu," /* Info about this sentinel. */ @@ -1944,7 +1945,7 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { (master->flags & SRI_CAN_FAILOVER) != 0, (unsigned long long) sentinel.current_epoch, /* --- */ - master->name,master->addr->ip,master->addr->port, + master->name,master_addr->ip,master_addr->port, master->config_epoch); retval = redisAsyncCommand(ri->cc, sentinelPublishReplyCallback, NULL, "PUBLISH %s %s", From 4be53b1c5d9b566f2994e8445b6a5300a8eabdb6 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 18 Nov 2013 10:08:06 +0100 Subject: [PATCH 30/53] Sentinel: election timeout define. --- src/sentinel.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index e6c39dd5..88d057e1 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -87,6 +87,7 @@ typedef struct sentinelAddr { #define SENTINEL_MIN_LINK_RECONNECT_PERIOD 15000 #define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*15*1000) #define SENTINEL_MAX_PENDING_COMMANDS 100 +#define SENTINEL_ELECTION_TIMEOUT 10000 /* How many milliseconds is an information valid?
This applies for instance * to the reply to SENTINEL IS-MASTER-DOWN-BY-ADDR replies. */ @@ -2816,7 +2817,7 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) { /* If I'm not the leader, I can't continue with the failover. */ if (!isleader) { /* Abort the failover if I'm not the leader after some time. */ - if (mstime() - ri->failover_start_time > 10000) { + if (mstime() - ri->failover_start_time > SENTINEL_ELECTION_TIMEOUT) { sentinelEvent(REDIS_WARNING,"-failover-abort-not-elected",ri,"%@"); sentinelAbortFailover(ri); } From 3a56013acb7e07eb314cea6606418355d4a452d4 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 18 Nov 2013 11:12:58 +0100 Subject: [PATCH 31/53] Sentinel: state machine and timeouts simplified. --- src/sentinel.c | 95 +++++++++++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 88d057e1..f6b7c019 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -85,7 +85,7 @@ typedef struct sentinelAddr { #define SENTINEL_SLAVE_RECONF_RETRY_PERIOD 10000 #define SENTINEL_DEFAULT_PARALLEL_SYNCS 1 #define SENTINEL_MIN_LINK_RECONNECT_PERIOD 15000 -#define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*15*1000) +#define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*5*1000) #define SENTINEL_MAX_PENDING_COMMANDS 100 #define SENTINEL_ELECTION_TIMEOUT 10000 @@ -105,8 +105,7 @@ typedef struct sentinelAddr { #define SENTINEL_FAILOVER_STATE_WAIT_NEXT_SLAVE 6 /* wait replication */ #define SENTINEL_FAILOVER_STATE_ALERT_CLIENTS 7 /* Run user script. */ #define SENTINEL_FAILOVER_STATE_WAIT_ALERT_SCRIPT 8 /* Wait script exec. */ -#define SENTINEL_FAILOVER_STATE_DETECT_END 9 /* Check for failover end. */ -#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 10 /* Monitor promoted slave. */ +#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 9 /* Monitor promoted slave. 
*/ #define SENTINEL_MASTER_LINK_STATUS_UP 0 #define SENTINEL_MASTER_LINK_STATUS_DOWN 1 @@ -1693,10 +1692,6 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { ri->flags &= ~SRI_RECONF_INPROG; ri->flags |= SRI_RECONF_DONE; sentinelEvent(REDIS_NOTICE,"+slave-reconf-done",ri,"%@"); - /* If we are moving forward (a new slave is now configured) - * we update the change_time as we are conceptually passing - * to the next slave. */ - ri->failover_state_change_time = mstime(); } } } @@ -1968,7 +1963,6 @@ const char *sentinelFailoverStateStr(int state) { case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION: return "wait_promotion"; case SENTINEL_FAILOVER_STATE_RECONF_SLAVES: return "reconf_slaves"; case SENTINEL_FAILOVER_STATE_ALERT_CLIENTS: return "alert_clients"; - case SENTINEL_FAILOVER_STATE_DETECT_END: return "detect_end"; case SENTINEL_FAILOVER_STATE_UPDATE_CONFIG: return "update_config"; default: return "unknown"; } @@ -2816,17 +2810,20 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) { /* If I'm not the leader, I can't continue with the failover. */ if (!isleader) { + int election_timeout = SENTINEL_ELECTION_TIMEOUT; + + /* The election timeout is the MIN between SENTINEL_ELECTION_TIMEOUT + * and the configured failover timeout. */ + if (election_timeout > ri->failover_timeout) + election_timeout = ri->failover_timeout; /* Abort the failover if I'm not the leader after some time. */ - if (mstime() - ri->failover_start_time > SENTINEL_ELECTION_TIMEOUT) { + if (mstime() - ri->failover_start_time > election_timeout) { sentinelEvent(REDIS_WARNING,"-failover-abort-not-elected",ri,"%@"); sentinelAbortFailover(ri); } return; } sentinelEvent(REDIS_WARNING,"+elected-leader",ri,"%@"); - - /* Start the failover going to the next state if enough time has - * elapsed. 
*/ ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE; ri->failover_state_change_time = mstime(); sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@"); @@ -2835,6 +2832,8 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) { void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) { sentinelRedisInstance *slave = sentinelSelectSlave(ri); + /* We don't handle the timeout in this state as the function aborts + * the failover or go forward in the next state. */ if (slave == NULL) { sentinelEvent(REDIS_WARNING,"-failover-abort-no-good-slave",ri,"%@"); sentinelAbortFailover(ri); @@ -2852,7 +2851,16 @@ void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) { void sentinelFailoverSendSlaveOfNoOne(sentinelRedisInstance *ri) { int retval; - if (ri->promoted_slave->flags & SRI_DISCONNECTED) return; + /* We can't send the command to the promoted slave if it is now + * disconnected. Retry again and again with this state until the timeout + * is reached, then abort the failover. */ + if (ri->promoted_slave->flags & SRI_DISCONNECTED) { + if (mstime() - ri->failover_state_change_time > ri->failover_timeout) { + sentinelEvent(REDIS_WARNING,"-failover-abort-slave-timeout",ri,"%@"); + sentinelAbortFailover(ri); + } + return; + } /* Send SLAVEOF NO ONE command to turn the slave into a master. * We actually register a generic callback for this command as we don't @@ -2869,16 +2877,11 @@ void sentinelFailoverSendSlaveOfNoOne(sentinelRedisInstance *ri) { /* We actually wait for promotion indirectly checking with INFO when the * slave turns into a master. 
*/ void sentinelFailoverWaitPromotion(sentinelRedisInstance *ri) { - mstime_t elapsed = mstime() - ri->failover_state_change_time; - - if (elapsed >= SENTINEL_PROMOTION_RETRY_PERIOD) { - sentinelEvent(REDIS_WARNING,"-promotion-timeout",ri->promoted_slave, - "%@"); - sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@"); - ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE; - ri->failover_state_change_time = mstime(); - ri->promoted_slave->flags &= ~SRI_PROMOTED; - ri->promoted_slave = NULL; + /* Just handle the timeout. Switching to the next state is handled + * by the function parsing the INFO command of the promoted slave. */ + if (mstime() - ri->failover_state_change_time > ri->failover_timeout) { + sentinelEvent(REDIS_WARNING,"-failover-abort-slave-timeout",ri,"%@"); + sentinelAbortFailover(ri); } } @@ -3002,6 +3005,8 @@ void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) { } } dictReleaseIterator(di); + + /* Check if all the slaves are reconfigured and handle timeout. */ sentinelFailoverDetectEnd(master); } @@ -3049,50 +3054,46 @@ void sentinelFailoverStateMachine(sentinelRedisInstance *ri) { case SENTINEL_FAILOVER_STATE_RECONF_SLAVES: sentinelFailoverReconfNextSlave(ri); break; - case SENTINEL_FAILOVER_STATE_DETECT_END: - sentinelFailoverDetectEnd(ri); - break; } } -/* Abort a failover in progress with the following steps: - * 1) Set the master back to the original one, increment the config epoch. - * 2) Reconfig slaves to replicate to the old master. - * 3) Reconfig the promoted slave as a slave as well. */ +/* Abort a failover in progress: + * + * This function can only be called before the promoted slave acknowledged + * the slave -> master switch. Otherwise the failover can't be aborted and + * will reach its end. + * + * If there is a promoted slave and we already got acknowledge of the + * slave -> master switch, we clear our flags and redirect to the + * new master. 
Eventually the config will be propagated if it is the one + * with the greater config epoch for this master. + * + * Otherwise if we still did not received the acknowledgement from the + * promoted slave, or there is no promoted slave at all, we just clear the + * failover-in-progress state as there is nothing to do (if the promoted + * slave for some reason actually received our "SLAVEOF NO ONE" command + * even if we did not received the ACK, it will be reverted to slave again + * by one of the Sentinels). */ void sentinelAbortFailover(sentinelRedisInstance *ri) { dictIterator *di; dictEntry *de; - int sentinel_role; redisAssert(ri->flags & SRI_FAILOVER_IN_PROGRESS); + redisAssert(ri->failover_state <= SENTINEL_FAILOVER_STATE_WAIT_PROMOTION); - /* Clear failover related flags from slaves. - * Also if we are the leader make sure to send SLAVEOF commands to all the - * already reconfigured slaves in order to turn them back into slaves of - * the original master. */ + /* Clear failover related flags from slaves. 
*/ di = dictGetIterator(ri->slaves); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *slave = dictGetVal(de); - if (!(slave->flags & SRI_DISCONNECTED) && - (slave->flags & (SRI_PROMOTED|SRI_RECONF_SENT|SRI_RECONF_INPROG| - SRI_RECONF_DONE))) - { - int retval; - - retval = sentinelSendSlaveOf(slave,ri->addr->ip,ri->addr->port); - if (retval == REDIS_OK) - sentinelEvent(REDIS_NOTICE,"-slave-reconf-undo",slave,"%@"); - } slave->flags &= ~(SRI_RECONF_SENT|SRI_RECONF_INPROG|SRI_RECONF_DONE); } dictReleaseIterator(di); - sentinel_role = SENTINEL_LEADER; ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_FORCE_FAILOVER); ri->failover_state = SENTINEL_FAILOVER_STATE_NONE; ri->failover_state_change_time = mstime(); if (ri->promoted_slave) { - sentinelCallClientReconfScript(ri,sentinel_role,"abort", + sentinelCallClientReconfScript(ri,SENTINEL_LEADER,"abort", ri->promoted_slave->addr,ri->addr); ri->promoted_slave->flags &= ~SRI_PROMOTED; ri->promoted_slave = NULL; From 83316f515c94f3e4c13c82df1b25fa63435645b8 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 18 Nov 2013 11:30:08 +0100 Subject: [PATCH 32/53] Sentinel: failover restart time is now multiple of failover timeout. Also default failover timeout changed to 3 minutes as the failover is a fairly fast procedure most of the times, unless there are a very big number of slaves and the user picked to configure them sequentially (in that case the user should change the failover timeout accordingly). 
--- src/sentinel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index f6b7c019..b46ab306 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -85,7 +85,7 @@ typedef struct sentinelAddr { #define SENTINEL_SLAVE_RECONF_RETRY_PERIOD 10000 #define SENTINEL_DEFAULT_PARALLEL_SYNCS 1 #define SENTINEL_MIN_LINK_RECONNECT_PERIOD 15000 -#define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*5*1000) +#define SENTINEL_DEFAULT_FAILOVER_TIMEOUT (60*3*1000) #define SENTINEL_MAX_PENDING_COMMANDS 100 #define SENTINEL_ELECTION_TIMEOUT 10000 @@ -2713,7 +2713,7 @@ int sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { /* Last failover attempt started too little time ago? */ if (mstime() - master->failover_start_time < - SENTINEL_PUBLISH_PERIOD*4) return 0; + master->failover_timeout*2) return 0; sentinelStartFailover(master); return 1; From e0750acf11576588a8421b3f60fb4b8f813f772c Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 18 Nov 2013 11:37:24 +0100 Subject: [PATCH 33/53] Sentinel: slaves reconfig delay modified. The time Sentinel waits since the slave is detected to be configured to the wrong master, before reconfiguring it, is now the failover_timeout time as this makes more sense in order to give the Sentinel performing the failover enough time to reconfigure the slaves slowly (if required by the configuration). Also we now PUBLISH more frequently the new configuration as this allows to switch the reappearing master back to slave faster. 
--- src/sentinel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index b46ab306..2c4e8343 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -75,7 +75,7 @@ typedef struct sentinelAddr { #define SENTINEL_INFO_PERIOD 10000 #define SENTINEL_PING_PERIOD 1000 #define SENTINEL_ASK_PERIOD 1000 -#define SENTINEL_PUBLISH_PERIOD 5000 +#define SENTINEL_PUBLISH_PERIOD 2000 #define SENTINEL_DOWN_AFTER_PERIOD 30000 #define SENTINEL_HELLO_CHANNEL "__sentinel__:hello" #define SENTINEL_TILT_TRIGGER 2000 @@ -1648,7 +1648,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { (ri->slave_master_port != ri->master->addr->port || strcasecmp(ri->slave_master_host,ri->master->addr->ip))) { - mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4; + mstime_t wait_time = ri->master->failover_timeout; /* Make sure the master is sane before reconfiguring this instance * into a slave. */ From 3a374b05117250c19e9786a01ccc6ab9528181dd Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 18 Nov 2013 11:43:35 +0100 Subject: [PATCH 34/53] Sentinel: failover abort function simplified. --- src/sentinel.c | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 2c4e8343..18a7058e 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1155,7 +1155,7 @@ int sentinelResetMastersByPattern(char *pattern, int flags) { /* Reset the specified master with sentinelResetMaster(), and also change * the ip:port address, but take the name of the instance unmodified. * - * This is used to handle the +switch-master and +redirect-to-master events. + * This is used to handle the +switch-master event. * * The function returns REDIS_ERR if the address can't be resolved for some * reason. Otherwise REDIS_OK is returned. 
*/ @@ -3061,40 +3061,15 @@ void sentinelFailoverStateMachine(sentinelRedisInstance *ri) { * * This function can only be called before the promoted slave acknowledged * the slave -> master switch. Otherwise the failover can't be aborted and - * will reach its end. - * - * If there is a promoted slave and we already got acknowledge of the - * slave -> master switch, we clear our flags and redirect to the - * new master. Eventually the config will be propagated if it is the one - * with the greater config epoch for this master. - * - * Otherwise if we still did not received the acknowledgement from the - * promoted slave, or there is no promoted slave at all, we just clear the - * failover-in-progress state as there is nothing to do (if the promoted - * slave for some reason actually received our "SLAVEOF NO ONE" command - * even if we did not received the ACK, it will be reverted to slave again - * by one of the Sentinels). */ + * will reach its end (possibly by timeout). */ void sentinelAbortFailover(sentinelRedisInstance *ri) { - dictIterator *di; - dictEntry *de; - redisAssert(ri->flags & SRI_FAILOVER_IN_PROGRESS); redisAssert(ri->failover_state <= SENTINEL_FAILOVER_STATE_WAIT_PROMOTION); - /* Clear failover related flags from slaves. 
*/ - di = dictGetIterator(ri->slaves); - while((de = dictNext(di)) != NULL) { - sentinelRedisInstance *slave = dictGetVal(de); - slave->flags &= ~(SRI_RECONF_SENT|SRI_RECONF_INPROG|SRI_RECONF_DONE); - } - dictReleaseIterator(di); - ri->flags &= ~(SRI_FAILOVER_IN_PROGRESS|SRI_FORCE_FAILOVER); ri->failover_state = SENTINEL_FAILOVER_STATE_NONE; ri->failover_state_change_time = mstime(); if (ri->promoted_slave) { - sentinelCallClientReconfScript(ri,SENTINEL_LEADER,"abort", - ri->promoted_slave->addr,ri->addr); ri->promoted_slave->flags &= ~SRI_PROMOTED; ri->promoted_slave = NULL; } From 232cdb95ab9b5ea92c3351fcac4a414b35a3ec55 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 18 Nov 2013 16:02:58 +0100 Subject: [PATCH 35/53] Sentinel: added config options useful to take state on config rewrite. We'll use CONFIG REWRITE (internally) in order to store the new configuration of a Sentinel after the internal state changes. In order to do so, we need configuration options (that usually the user will not touch at all) about config epoch of the master, Sentinels and Slaves known for this master, and so forth. 
--- src/sentinel.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/sentinel.c b/src/sentinel.c index 18a7058e..49261470 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1312,6 +1312,35 @@ char *sentinelHandleConfiguration(char **argv, int argc) { ri = sentinelGetMasterByName(argv[1]); if (!ri) return "No such master with specified name."; ri->auth_pass = sdsnew(argv[2]); + } else if (!strcasecmp(argv[0],"config-epoch") && argc == 3) { + /* config-epoch */ + ri = sentinelGetMasterByName(argv[1]); + if (!ri) return "No such master with specified name."; + ri->config_epoch = strtoull(argv[2],NULL,10); + if (ri->config_epoch > sentinel.current_epoch) + sentinel.current_epoch = ri->config_epoch; + } else if (!strcasecmp(argv[0],"slave") && argc == 3) { + sentinelRedisInstance *slave; + + /* slave */ + ri = sentinelGetMasterByName(argv[1]); + if (!ri) return "No such master with specified name."; + if ((slave = createSentinelRedisInstance(NULL,SRI_SLAVE,argv[2], + atoi(argv[3]), ri->quorum, ri)) == NULL) + { + return "Wrong hostname or port for slave."; + } + } else if (!strcasecmp(argv[0],"sentinel") && argc == 3) { + sentinelRedisInstance *si; + + /* sentinel */ + ri = sentinelGetMasterByName(argv[1]); + if (!ri) return "No such master with specified name."; + if ((si = createSentinelRedisInstance(NULL,SRI_SENTINEL,argv[2], + atoi(argv[3]), ri->quorum, ri)) == NULL) + { + return "Wrong hostname or port for sentinel."; + } } else { return "Unrecognized sentinel configuration statement."; } From 47df12d5d9ab45a2603e1b16e862e8d0e29c0f2c Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 09:28:47 +0100 Subject: [PATCH 36/53] Sentinel: can-failover option removed, many comments fixed. 
--- src/sentinel.c | 105 ++++++++++++++----------------------------------- 1 file changed, 29 insertions(+), 76 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 49261470..05c066f5 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -58,25 +58,20 @@ typedef struct sentinelAddr { #define SRI_O_DOWN (1<<5) /* Objectively down (quorum reached). */ #define SRI_MASTER_DOWN (1<<6) /* A Sentinel with this flag set thinks that its master is down. */ -/* SRI_CAN_FAILOVER when set in an SRI_MASTER instance means that we are - * allowed to perform the failover for this master. - * When set in a SRI_SENTINEL instance means that sentinel is allowed to - * perform the failover on its master. */ -#define SRI_CAN_FAILOVER (1<<7) -#define SRI_FAILOVER_IN_PROGRESS (1<<8) /* Failover is in progress for +#define SRI_FAILOVER_IN_PROGRESS (1<<7) /* Failover is in progress for this master. */ -#define SRI_PROMOTED (1<<9) /* Slave selected for promotion. */ -#define SRI_RECONF_SENT (1<<10) /* SLAVEOF sent. */ -#define SRI_RECONF_INPROG (1<<11) /* Slave synchronization in progress. */ -#define SRI_RECONF_DONE (1<<12) /* Slave synchronized with new master. */ -#define SRI_FORCE_FAILOVER (1<<13) /* Force failover with master up. */ -#define SRI_SCRIPT_KILL_SENT (1<<14) /* SCRIPT KILL already sent on -BUSY */ +#define SRI_PROMOTED (1<<8) /* Slave selected for promotion. */ +#define SRI_RECONF_SENT (1<<9) /* SLAVEOF sent. */ +#define SRI_RECONF_INPROG (1<<10) /* Slave synchronization in progress. */ +#define SRI_RECONF_DONE (1<<11) /* Slave synchronized with new master. */ +#define SRI_FORCE_FAILOVER (1<<12) /* Force failover with master up. 
*/ +#define SRI_SCRIPT_KILL_SENT (1<<13) /* SCRIPT KILL already sent on -BUSY */ #define SENTINEL_INFO_PERIOD 10000 #define SENTINEL_PING_PERIOD 1000 #define SENTINEL_ASK_PERIOD 1000 #define SENTINEL_PUBLISH_PERIOD 2000 -#define SENTINEL_DOWN_AFTER_PERIOD 30000 +#define SENTINEL_DEFAULT_DOWN_AFTER 30000 #define SENTINEL_HELLO_CHANNEL "__sentinel__:hello" #define SENTINEL_TILT_TRIGGER 2000 #define SENTINEL_TILT_PERIOD (SENTINEL_PING_PERIOD*30) @@ -893,7 +888,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * ri->s_down_since_time = 0; ri->o_down_since_time = 0; ri->down_after_period = master ? master->down_after_period : - SENTINEL_DOWN_AFTER_PERIOD; + SENTINEL_DEFAULT_DOWN_AFTER; ri->master_link_down_time = 0; ri->auth_pass = NULL; ri->slave_priority = SENTINEL_DEFAULT_SLAVE_PRIORITY; @@ -1111,7 +1106,7 @@ void sentinelResetMaster(sentinelRedisInstance *ri, int flags) { } if (ri->cc) sentinelKillLink(ri,ri->cc); if (ri->pc) sentinelKillLink(ri,ri->pc); - ri->flags &= SRI_MASTER|SRI_CAN_FAILOVER|SRI_DISCONNECTED; + ri->flags &= SRI_MASTER|SRI_DISCONNECTED; if (ri->leader) { sdsfree(ri->leader); ri->leader = NULL; @@ -1276,17 +1271,6 @@ char *sentinelHandleConfiguration(char **argv, int argc) { ri->failover_timeout = atoi(argv[2]); if (ri->failover_timeout <= 0) return "negative or zero time parameter."; - } else if (!strcasecmp(argv[0],"can-failover") && argc == 3) { - /* can-failover */ - int yesno = yesnotoi(argv[2]); - - ri = sentinelGetMasterByName(argv[1]); - if (!ri) return "No such master with specified name."; - if (yesno == -1) return "Argument must be either yes or no."; - if (yesno) - ri->flags |= SRI_CAN_FAILOVER; - else - ri->flags &= ~SRI_CAN_FAILOVER; } else if (!strcasecmp(argv[0],"parallel-syncs") && argc == 3) { /* parallel-syncs */ ri = sentinelGetMasterByName(argv[1]); @@ -1826,25 +1810,24 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd if 
(strstr(r->element[2]->str,server.runid) != NULL) return; { - /* Format is composed of 9 tokens: - * 0=ip,1=port,2=runid,3=can_failover,4=current_epoch, - * 5=master_name,6=master_ip,7=master_port,8=master_config_epoch. */ - int numtokens, port, removed, canfailover, master_port; + /* Format is composed of 8 tokens: + * 0=ip,1=port,2=runid,3=current_epoch,4=master_name, + * 5=master_ip,6=master_port,7=master_config_epoch. */ + int numtokens, port, removed, master_port; uint64_t current_epoch, master_config_epoch; char **token = sdssplitlen(r->element[2]->str, r->element[2]->len, ",",1,&numtokens); sentinelRedisInstance *si; - if (numtokens == 9) { + if (numtokens == 8) { /* First, try to see if we already have this sentinel. */ port = atoi(token[1]); - master_port = atoi(token[7]); - canfailover = atoi(token[3]); + master_port = atoi(token[6]); si = getSentinelRedisInstanceByAddrAndRunID( master->sentinels,token[0],port,token[2]); - current_epoch = strtoull(token[4],NULL,10); - master_config_epoch = strtoull(token[8],NULL,10); + current_epoch = strtoull(token[3],NULL,10); + master_config_epoch = strtoull(token[7],NULL,10); sentinelRedisInstance *msgmaster; if (!si) { @@ -1871,7 +1854,7 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd } } - /* Update local current_epoch if received current_epoch is greater. */ + /* Update local current_epoch if received current_epoch is greater.*/ if (current_epoch > sentinel.current_epoch) { sentinel.current_epoch = current_epoch; sentinelEvent(REDIS_WARNING,"+new-epoch",ri,"%llu", @@ -1879,31 +1862,25 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd } /* Update master info if received configuration is newer. 
*/ - if ((msgmaster = sentinelGetMasterByName(token[5])) != NULL) { + if ((msgmaster = sentinelGetMasterByName(token[4])) != NULL) { if (msgmaster->config_epoch < master_config_epoch) { msgmaster->config_epoch = master_config_epoch; if (master_port != msgmaster->addr->port || - !strcmp(msgmaster->addr->ip, token[6])) + !strcmp(msgmaster->addr->ip, token[5])) { sentinelEvent(REDIS_WARNING,"+switch-master", msgmaster,"%s %s %d %s %d", msgmaster->name, msgmaster->addr->ip, msgmaster->addr->port, - token[6], master_port); + token[5], master_port); sentinelResetMasterAndChangeAddress(msgmaster, - token[6], master_port); + token[5], master_port); } } } /* Update the state of the Sentinel. */ - if (si) { - si->last_hello_time = mstime(); - if (canfailover) - si->flags |= SRI_CAN_FAILOVER; - else - si->flags &= ~SRI_CAN_FAILOVER; - } + if (si) si->last_hello_time = mstime(); } sdsfreesplitres(token,numtokens); } @@ -1964,10 +1941,9 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { sentinelAddr *master_addr = sentinelGetCurrentMasterAddress(master); snprintf(payload,sizeof(payload), - "%s,%d,%s,%d,%llu," /* Info about this sentinel. */ + "%s,%d,%s,%llu," /* Info about this sentinel. */ "%s,%s,%d,%lld", /* Info about current master. */ ip, server.port, server.runid, - (master->flags & SRI_CAN_FAILOVER) != 0, (unsigned long long) sentinel.current_epoch, /* --- */ master->name,master_addr->ip,master_addr->port, @@ -2138,10 +2114,6 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { addReplyBulkLongLong(c,mstime() - ri->last_hello_time); fields++; - addReplyBulkCString(c,"can-failover-its-master"); - addReplyBulkLongLong(c,(ri->flags & SRI_CAN_FAILOVER) != 0); - fields++; - addReplyBulkCString(c,"voted-leader"); addReplyBulkCString(c,ri->leader ? 
ri->leader : "?"); fields++; @@ -2540,25 +2512,6 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int f /* =============================== FAILOVER ================================= */ -/* Given a master get the "subjective leader", that is, among all the sentinels - * with given characteristics, the one with the lexicographically smaller - * runid. The characteristics required are: - * - * 1) Has SRI_CAN_FAILOVER flag. - * 2) Is not disconnected. - * 3) Recently answered to our ping (no longer than - * SENTINEL_INFO_VALIDITY_TIME milliseconds ago). - * - * The function returns a pointer to an sds string representing the runid of the - * leader sentinel instance (from our point of view). Otherwise NULL is - * returned if there are no suitable sentinels. - */ - -int compareRunID(const void *a, const void *b) { - char **aptrptr = (char**)a, **bptrptr = (char**)b; - return strcasecmp(*aptrptr, *bptrptr); -} - /* Vote for the sentinel with 'req_runid' or return the old vote if already * voted for the specifed 'req_epoch' or one greater. * @@ -2725,8 +2678,9 @@ void sentinelStartFailover(sentinelRedisInstance *master) { /* This function checks if there are the conditions to start the failover, * that is: * - * 1) Enough time has passed since O_DOWN. - * 2) The master is marked as SRI_CAN_FAILOVER, so we can failover it. + * 1) Master must be in ODOWN condition. + * 2) No failover already in progress. + * 3) No failover already attempted recently. * * We still don't know if we'll win the election so it is possible that we * start the failover but that we'll not be able to act. @@ -2734,8 +2688,7 @@ void sentinelStartFailover(sentinelRedisInstance *master) { * Return non-zero if a failover was started. */ int sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { /* We can't failover if the master is not in O_DOWN state. 
*/ - if (!(master->flags & SRI_CAN_FAILOVER) || - !(master->flags & SRI_O_DOWN)) return 0; + if (!(master->flags & SRI_O_DOWN)) return 0; /* Failover already in progress? */ if (master->flags & SRI_FAILOVER_IN_PROGRESS) return 0; From 5998769c2803ded68b56b8f5836c57c895e0e807 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 09:48:12 +0100 Subject: [PATCH 37/53] Sentinel: CONFIG REWRITE support for Sentinel config. --- src/config.c | 5 +++ src/redis.h | 2 + src/sentinel.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 112 insertions(+), 4 deletions(-) diff --git a/src/config.c b/src/config.c index 8bfb208f..d39546e1 100644 --- a/src/config.c +++ b/src/config.c @@ -1162,6 +1162,10 @@ int dictSdsKeyCompare(void *privdata, const void *key1, const void *key2); void dictSdsDestructor(void *privdata, void *val); void dictListDestructor(void *privdata, void *val); +/* Sentinel config rewriting is implemented inside sentinel.c by + * rewriteConfigSentinelOption(). 
*/ +void rewriteConfigSentinelOption(struct rewriteConfigState *state); + dictType optionToLineDictType = { dictSdsHash, /* hash function */ NULL, /* key dup */ @@ -1735,6 +1739,7 @@ int rewriteConfig(char *path) { rewriteConfigClientoutputbufferlimitOption(state); rewriteConfigNumericalOption(state,"hz",server.hz,REDIS_DEFAULT_HZ); rewriteConfigYesNoOption(state,"aof-rewrite-incremental-fsync",server.aof_rewrite_incremental_fsync,REDIS_DEFAULT_AOF_REWRITE_INCREMENTAL_FSYNC); + if (server.sentinel_mode) rewriteConfigSentinelOption(state); /* Step 3: remove all the orphaned lines in the old file, that is, lines * that were used by a config option and are no longer used, like in case diff --git a/src/redis.h b/src/redis.h index f0b5aa86..2361e03d 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1169,6 +1169,8 @@ sds keyspaceEventsFlagsToString(int flags); void loadServerConfig(char *filename, char *options); void appendServerSaveParams(time_t seconds, int changes); void resetServerSaveParams(); +struct rewriteConfigState; /* Forward declaration to export API. 
*/ +void rewriteConfigRewriteLine(struct rewriteConfigState *state, char *option, sds line, int force); /* db.c -- Keyspace access API */ int removeExpire(redisDb *db, robj *key); diff --git a/src/sentinel.c b/src/sentinel.c index 05c066f5..31005c9d 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1303,10 +1303,10 @@ char *sentinelHandleConfiguration(char **argv, int argc) { ri->config_epoch = strtoull(argv[2],NULL,10); if (ri->config_epoch > sentinel.current_epoch) sentinel.current_epoch = ri->config_epoch; - } else if (!strcasecmp(argv[0],"slave") && argc == 3) { + } else if (!strcasecmp(argv[0],"known-slave") && argc == 3) { sentinelRedisInstance *slave; - /* slave */ + /* known-slave */ ri = sentinelGetMasterByName(argv[1]); if (!ri) return "No such master with specified name."; if ((slave = createSentinelRedisInstance(NULL,SRI_SLAVE,argv[2], @@ -1314,10 +1314,10 @@ char *sentinelHandleConfiguration(char **argv, int argc) { { return "Wrong hostname or port for slave."; } - } else if (!strcasecmp(argv[0],"sentinel") && argc == 3) { + } else if (!strcasecmp(argv[0],"known-sentinel") && argc == 3) { sentinelRedisInstance *si; - /* sentinel */ + /* known-sentinel */ ri = sentinelGetMasterByName(argv[1]); if (!ri) return "No such master with specified name."; if ((si = createSentinelRedisInstance(NULL,SRI_SENTINEL,argv[2], @@ -1331,6 +1331,107 @@ char *sentinelHandleConfiguration(char **argv, int argc) { return NULL; } +/* Implements CONFIG REWRITE for "sentinel" option. + * This is used not just to rewrite the configuration given by the user + * (the configured masters) but also in order to retain the state of + * Sentinel across restarts: config epoch of masters, associated slaves + * and sentinel instances, and so forth. */ +void rewriteConfigSentinelOption(struct rewriteConfigState *state) { + dictIterator *di, *di2; + dictEntry *de; + + /* For every master emit a "sentinel monitor" config entry. 
*/ + di = dictGetIterator(sentinel.masters); + while((de = dictNext(di)) != NULL) { + sentinelRedisInstance *master, *ri; + sds line; + + /* sentinel monitor */ + master = dictGetVal(de); + line = sdscatprintf(sdsempty(),"sentinel monitor %s %s %d %d", + master->name, master->addr->ip, master->addr->port, + master->quorum); + rewriteConfigRewriteLine(state,"sentinel",line,1); + + /* sentinel down-after-milliseconds */ + if (master->down_after_period != SENTINEL_DEFAULT_DOWN_AFTER) { + line = sdscatprintf(sdsempty(), + "sentinel down-after-milliseconds %s %ld", + master->name, (long) master->down_after_period); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + + /* sentinel failover-timeout */ + if (master->failover_timeout != SENTINEL_DEFAULT_FAILOVER_TIMEOUT) { + line = sdscatprintf(sdsempty(), + "sentinel failover-timeout %s %ld", + master->name, (long) master->failover_timeout); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + + /* sentinel parallel-syncs */ + if (master->parallel_syncs != SENTINEL_DEFAULT_PARALLEL_SYNCS) { + line = sdscatprintf(sdsempty(), + "sentinel parallel-syncs %s %d", + master->name, master->parallel_syncs); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + + /* sentinel notification-script */ + if (master->notification_script) { + line = sdscatprintf(sdsempty(), + "sentinel notification-script %s %s", + master->name, master->notification_script); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + + /* sentinel client-reconfig-script */ + if (master->client_reconfig_script) { + line = sdscatprintf(sdsempty(), + "sentinel client-reconfig-script %s %s", + master->name, master->client_reconfig_script); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + + /* sentinel auth-pass */ + if (master->auth_pass) { + line = sdscatprintf(sdsempty(), + "sentinel auth-pass %s %s", + master->name, master->auth_pass); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + + /* sentinel config-epoch */ + line 
= sdscatprintf(sdsempty(), + "sentinel config-epoch %s %llu", + master->name, (unsigned long long) master->config_epoch); + rewriteConfigRewriteLine(state,"sentinel",line,1); + + /* sentinel known-slave */ + di2 = dictGetIterator(master->slaves); + while((de = dictNext(di)) != NULL) { + ri = dictGetVal(de); + line = sdscatprintf(sdsempty(), + "sentinel known-slave %s %s %d", + master->name, ri->addr->ip, ri->addr->port); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + dictReleaseIterator(di2); + + /* sentinel known-sentinel */ + di2 = dictGetIterator(master->sentinels); + while((de = dictNext(di)) != NULL) { + ri = dictGetVal(de); + line = sdscatprintf(sdsempty(), + "sentinel known-sentinel %s %s %d", + master->name, ri->addr->ip, ri->addr->port); + rewriteConfigRewriteLine(state,"sentinel",line,1); + } + dictReleaseIterator(di2); + } + dictReleaseIterator(di); +} + /* ====================== hiredis connection handling ======================= */ /* Completely disconnect an hiredis link from an instance. */ From e257ab2bfeabfec5b4af562a5f0bbc28983ce26d Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 10:13:04 +0100 Subject: [PATCH 38/53] Sentinel: sentinelFlushConfig() to CONFIG REWRITE + fsync. --- src/redis.h | 1 + src/sentinel.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/src/redis.h b/src/redis.h index 2361e03d..bdc69522 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1171,6 +1171,7 @@ void appendServerSaveParams(time_t seconds, int changes); void resetServerSaveParams(); struct rewriteConfigState; /* Forward declaration to export API. 
*/ void rewriteConfigRewriteLine(struct rewriteConfigState *state, char *option, sds line, int force); +int rewriteConfig(char *path); /* db.c -- Keyspace access API */ int removeExpire(redisDb *db, robj *key); diff --git a/src/sentinel.c b/src/sentinel.c index 31005c9d..bd951d8c 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -36,6 +36,7 @@ #include #include #include +#include extern char **environ; @@ -1432,6 +1433,27 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { dictReleaseIterator(di); } +/* This function uses the config rewriting Redis engine in order to persist + * the state of the Sentinel in the current configuration file. + * + * Before returning the function calls fsync() against the generated + * configuration file to make sure changes are committed to disk. + * + * On failure the function logs a warning on the Redis log. */ +void sentinelFlushConfig(void) { + int fd; + + if (rewriteConfig(server.configfile) == -1) { + redisLog(REDIS_WARNING,"WARNING: Senitnel was not able to save the new configuration on disk!!!: %s", strerror(errno)); + return; + } + if ((fd = open(server.configfile,O_RDONLY)) != -1) { + fsync(fd); + close(fd); + } + return; +} + /* ====================== hiredis connection handling ======================= */ /* Completely disconnect an hiredis link from an instance. */ From 16237d78c83ed7c7182b60b9e66d29db44465986 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 10:55:43 +0100 Subject: [PATCH 39/53] Sentinel: call sentinelFlushConfig() to persist state when needed. Also the sentinel configuration rewriting was modified in order to account for failover in progress, where we need to provide the promoted slave address as master address, and the old master address as one of the slaves address. 
--- src/sentinel.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index bd951d8c..99a5bb50 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -334,6 +334,7 @@ void sentinelStartFailover(sentinelRedisInstance *master); void sentinelDiscardReplyCallback(redisAsyncContext *c, void *reply, void *privdata); int sentinelSendSlaveOf(sentinelRedisInstance *ri, char *host, int port); char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch); +void sentinelFlushConfig(void); /* ========================= Dictionary types =============================== */ @@ -1201,13 +1202,17 @@ int sentinelResetMasterAndChangeAddress(sentinelRedisInstance *master, char *ip, slave = createSentinelRedisInstance(NULL,SRI_SLAVE,slaves[j]->ip, slaves[j]->port, master->quorum, master); releaseSentinelAddr(slaves[j]); - if (slave) sentinelEvent(REDIS_NOTICE,"+slave",slave,"%@"); + if (slave) { + sentinelEvent(REDIS_NOTICE,"+slave",slave,"%@"); + sentinelFlushConfig(); + } } zfree(slaves); /* Release the old address at the end so we are safe even if the function * gets the master->addr->ip and master->addr->port as arguments. 
*/ releaseSentinelAddr(oldaddr); + sentinelFlushConfig(); return REDIS_OK; } @@ -1345,12 +1350,14 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { di = dictGetIterator(sentinel.masters); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *master, *ri; + sentinelAddr *master_addr; sds line; /* sentinel monitor */ master = dictGetVal(de); + master_addr = sentinelGetCurrentMasterAddress(master); line = sdscatprintf(sdsempty(),"sentinel monitor %s %s %d %d", - master->name, master->addr->ip, master->addr->port, + master->name, master_addr->ip, master_addr->port, master->quorum); rewriteConfigRewriteLine(state,"sentinel",line,1); @@ -1411,7 +1418,18 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { /* sentinel known-slave */ di2 = dictGetIterator(master->slaves); while((de = dictNext(di)) != NULL) { + sentinelAddr *slave_addr; + ri = dictGetVal(de); + slave_addr = ri->addr; + + /* If master_addr (obtained using sentinelGetCurrentMasterAddress() + * so it may be the address of the promoted slave) is equal to this + * slave's address, a failover is in progress and the slave was + * already successfully promoted. So as the address of this slave + * we use the old master address instead. 
*/ + if (sentinelAddrIsEqual(slave_addr,master_addr)) + slave_addr = master->addr; line = sdscatprintf(sdsempty(), "sentinel known-slave %s %s %d", master->name, ri->addr->ip, ri->addr->port); @@ -1754,6 +1772,7 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { ri->master->config_epoch = ri->master->failover_epoch; ri->master->failover_state = SENTINEL_FAILOVER_STATE_RECONF_SLAVES; ri->master->failover_state_change_time = mstime(); + sentinelFlushConfig(); sentinelEvent(REDIS_WARNING,"+promoted-slave",ri,"%@"); sentinelEvent(REDIS_WARNING,"+failover-state-reconf-slaves", ri->master,"%@"); @@ -1974,6 +1993,7 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd * for Sentinels we don't have a later chance to fill it, * so do it now. */ si->runid = sdsnew(token[2]); + sentinelFlushConfig(); } } From b8a94463b751e55c9b3dfe08646738a13974c274 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 10:59:47 +0100 Subject: [PATCH 40/53] Sentinel: rewriteConfigSentinelOption() sub-iterators var typo fixed. 
--- src/sentinel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 99a5bb50..340dd646 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1417,7 +1417,7 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { /* sentinel known-slave */ di2 = dictGetIterator(master->slaves); - while((de = dictNext(di)) != NULL) { + while((de = dictNext(di2)) != NULL) { sentinelAddr *slave_addr; ri = dictGetVal(de); @@ -1439,7 +1439,7 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { /* sentinel known-sentinel */ di2 = dictGetIterator(master->sentinels); - while((de = dictNext(di)) != NULL) { + while((de = dictNext(di2)) != NULL) { ri = dictGetVal(de); line = sdscatprintf(sdsempty(), "sentinel known-sentinel %s %s %d", From 5450833d025bbe84a2da1bbb1df38d3fad70856f Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 11:03:47 +0100 Subject: [PATCH 41/53] Sentinel: arity of known-sentinel/slave is 4 not 3. 
--- src/sentinel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 340dd646..dd2b681f 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1309,7 +1309,7 @@ char *sentinelHandleConfiguration(char **argv, int argc) { ri->config_epoch = strtoull(argv[2],NULL,10); if (ri->config_epoch > sentinel.current_epoch) sentinel.current_epoch = ri->config_epoch; - } else if (!strcasecmp(argv[0],"known-slave") && argc == 3) { + } else if (!strcasecmp(argv[0],"known-slave") && argc == 4) { sentinelRedisInstance *slave; /* known-slave */ @@ -1320,7 +1320,7 @@ char *sentinelHandleConfiguration(char **argv, int argc) { { return "Wrong hostname or port for slave."; } - } else if (!strcasecmp(argv[0],"known-sentinel") && argc == 3) { + } else if (!strcasecmp(argv[0],"known-sentinel") && argc == 4) { sentinelRedisInstance *si; /* known-sentinel */ From 0a35f65301481a6165f34c1c3e38e2b805ba595e Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 11:11:43 +0100 Subject: [PATCH 42/53] Sentinel: when writing config on disk, remember sentinels runid. 
--- src/sentinel.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index dd2b681f..e383b58f 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -1320,10 +1320,11 @@ char *sentinelHandleConfiguration(char **argv, int argc) { { return "Wrong hostname or port for slave."; } - } else if (!strcasecmp(argv[0],"known-sentinel") && argc == 4) { + } else if (!strcasecmp(argv[0],"known-sentinel") && + (argc == 4 || argc == 5)) { sentinelRedisInstance *si; - /* known-sentinel */ + /* known-sentinel [runid] */ ri = sentinelGetMasterByName(argv[1]); if (!ri) return "No such master with specified name."; if ((si = createSentinelRedisInstance(NULL,SRI_SENTINEL,argv[2], @@ -1331,6 +1332,7 @@ char *sentinelHandleConfiguration(char **argv, int argc) { { return "Wrong hostname or port for sentinel."; } + if (argc == 5) si->runid = sdsnew(argv[4]); } else { return "Unrecognized sentinel configuration statement."; } @@ -1442,8 +1444,10 @@ void rewriteConfigSentinelOption(struct rewriteConfigState *state) { while((de = dictNext(di2)) != NULL) { ri = dictGetVal(de); line = sdscatprintf(sdsempty(), - "sentinel known-sentinel %s %s %d", - master->name, ri->addr->ip, ri->addr->port); + "sentinel known-sentinel %s %s %d%s%s", + master->name, ri->addr->ip, ri->addr->port, + ri->runid ? " " : "", + ri->runid ? ri->runid : ""); rewriteConfigRewriteLine(state,"sentinel",line,1); } dictReleaseIterator(di2); From 90635488ce6b6f48cc0fd767cdb34b1c8c9eab79 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 11:24:36 +0100 Subject: [PATCH 43/53] Sentinel: no longer used defines removed. 
--- src/sentinel.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index e383b58f..7319e731 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -77,7 +77,6 @@ typedef struct sentinelAddr { #define SENTINEL_TILT_TRIGGER 2000 #define SENTINEL_TILT_PERIOD (SENTINEL_PING_PERIOD*30) #define SENTINEL_DEFAULT_SLAVE_PRIORITY 100 -#define SENTINEL_PROMOTION_RETRY_PERIOD 30000 #define SENTINEL_SLAVE_RECONF_RETRY_PERIOD 10000 #define SENTINEL_DEFAULT_PARALLEL_SYNCS 1 #define SENTINEL_MIN_LINK_RECONNECT_PERIOD 15000 @@ -88,8 +87,6 @@ typedef struct sentinelAddr { /* How many milliseconds is an information valid? This applies for instance * to the reply to SENTINEL IS-MASTER-DOWN-BY-ADDR replies. */ #define SENTINEL_INFO_VALIDITY_TIME 5000 -#define SENTINEL_FAILOVER_FIXED_DELAY 5000 -#define SENTINEL_FAILOVER_MAX_RANDOM_DELAY 10000 /* Failover machine different states. */ #define SENTINEL_FAILOVER_STATE_NONE 0 /* No failover in progress. */ @@ -98,10 +95,7 @@ typedef struct sentinelAddr { #define SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE 3 /* Slave -> Master */ #define SENTINEL_FAILOVER_STATE_WAIT_PROMOTION 4 /* Wait slave to change role */ #define SENTINEL_FAILOVER_STATE_RECONF_SLAVES 5 /* SLAVEOF newmaster */ -#define SENTINEL_FAILOVER_STATE_WAIT_NEXT_SLAVE 6 /* wait replication */ -#define SENTINEL_FAILOVER_STATE_ALERT_CLIENTS 7 /* Run user script. */ -#define SENTINEL_FAILOVER_STATE_WAIT_ALERT_SCRIPT 8 /* Wait script exec. */ -#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 9 /* Monitor promoted slave. */ +#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 6 /* Monitor promoted slave. 
*/ #define SENTINEL_MASTER_LINK_STATUS_UP 0 #define SENTINEL_MASTER_LINK_STATUS_DOWN 1 @@ -2114,7 +2108,6 @@ const char *sentinelFailoverStateStr(int state) { case SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE: return "send_slaveof_noone"; case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION: return "wait_promotion"; case SENTINEL_FAILOVER_STATE_RECONF_SLAVES: return "reconf_slaves"; - case SENTINEL_FAILOVER_STATE_ALERT_CLIENTS: return "alert_clients"; case SENTINEL_FAILOVER_STATE_UPDATE_CONFIG: return "update_config"; default: return "unknown"; } From 1f9728cb20f7eaed3f613e7b8c49292edd1edc34 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 12:34:39 +0100 Subject: [PATCH 44/53] Sentinel: failover script execution fixed. --- src/sentinel.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 7319e731..4b40e7a2 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -437,6 +437,16 @@ sentinelAddr *createSentinelAddr(char *hostname, int port) { return sa; } +/* Return a duplicate of the source address. */ +sentinelAddr *dupSentinelAddr(sentinelAddr *src) { + sentinelAddr *sa; + + sa = zmalloc(sizeof(*sa)); + sa->ip = sdsnew(src->ip); + sa->port = src->port; + return sa; +} + /* Free a Sentinel address. Can't fail. */ void releaseSentinelAddr(sentinelAddr *sa) { sdsfree(sa->ip); @@ -783,15 +793,13 @@ void sentinelPendingScriptsCommand(redisClient *c) { * * * - * It is called every time a failover starts, ends, or is aborted. + * It is called every time a failover is performed. * - * is "start", "end" or "abort". + * is currently always "failover". * is either "leader" or "observer". * * from/to fields are respectively master -> promoted slave addresses for - * "start" and "end", or the reverse (promoted slave -> master) in case of - * "abort". - */ + * "start" and "end". 
*/ void sentinelCallClientReconfScript(sentinelRedisInstance *master, int role, char *state, sentinelAddr *from, sentinelAddr *to) { char fromport[32], toport[32]; @@ -2009,13 +2017,21 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd if (master_port != msgmaster->addr->port || !strcmp(msgmaster->addr->ip, token[5])) { + sentinelAddr *old_addr; + sentinelEvent(REDIS_WARNING,"+switch-master", msgmaster,"%s %s %d %s %d", msgmaster->name, msgmaster->addr->ip, msgmaster->addr->port, token[5], master_port); + + old_addr = dupSentinelAddr(msgmaster->addr); sentinelResetMasterAndChangeAddress(msgmaster, token[5], master_port); + sentinelCallClientReconfScript(msgmaster, + SENTINEL_OBSERVER,"start", + old_addr,msgmaster->addr); + releaseSentinelAddr(old_addr); } } } @@ -3038,13 +3054,9 @@ void sentinelFailoverDetectEnd(sentinelRedisInstance *master) { } if (not_reconfigured == 0) { - int role = SENTINEL_LEADER; - sentinelEvent(REDIS_WARNING,"+failover-end",master,"%@"); master->failover_state = SENTINEL_FAILOVER_STATE_UPDATE_CONFIG; master->failover_state_change_time = mstime(); - sentinelCallClientReconfScript(master,role,"end",master->addr, - master->promoted_slave->addr); } /* If I'm the leader it is a good idea to send a best effort SLAVEOF From b22d1beea07383eaaf19ed1ab304224c5829a83a Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 16:20:42 +0100 Subject: [PATCH 45/53] Sentinel: various fixes to leader election implementation. --- src/sentinel.c | 70 +++++++++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 4b40e7a2..7abd1cfb 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2672,7 +2672,7 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int f * voted for the specifed 'req_epoch' or one greater. 
* * If a vote is not available returns NULL, otherwise return the Sentinel - * runid and populate the leader_epoch with the epoch of the last vote. */ + * runid and populate the leader_epoch with the epoch of the vote. */ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch) { if (req_epoch > sentinel.current_epoch) { sentinel.current_epoch = req_epoch; @@ -2680,7 +2680,8 @@ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char (unsigned long long) sentinel.current_epoch); } - if (master->leader_epoch < req_epoch && sentinel.current_epoch <= req_epoch) { + if (master->leader_epoch < req_epoch && sentinel.current_epoch <= req_epoch) + { sdsfree(master->leader); master->leader = sdsnew(req_runid); master->leader_epoch = sentinel.current_epoch; @@ -2692,7 +2693,8 @@ char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char * * The random addition is useful to desynchronize a bit the slaves * and reduce the chance that no slave gets majority. */ - master->failover_start_time = mstime() + rand() % 2000; + if (strcasecmp(master->leader,server.runid)) + master->failover_start_time = mstime() + rand() % 2000; } *leader_epoch = master->leader_epoch; @@ -2706,17 +2708,19 @@ struct sentinelLeader { /* Helper function for sentinelGetLeader, increment the counter * relative to the specified runid. 
*/ -void sentinelLeaderIncr(dict *counters, char *runid) { +int sentinelLeaderIncr(dict *counters, char *runid) { dictEntry *de = dictFind(counters,runid); uint64_t oldval; if (de) { oldval = dictGetUnsignedIntegerVal(de); dictSetUnsignedIntegerVal(de,oldval+1); + return oldval+1; } else { de = dictAddRaw(counters,runid); redisAssert(de != NULL); dictSetUnsignedIntegerVal(de,1); + return 1; } } @@ -2734,49 +2738,57 @@ char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t epoch) { char *myvote; char *winner = NULL; uint64_t leader_epoch; + uint64_t max_votes = 0; redisAssert(master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS)); counters = dictCreate(&leaderVotesDictType,NULL); - /* Count my vote (and vote for myself if I still did not voted for - * the currnet epoch). */ - myvote = sentinelVoteLeader(master,epoch,server.runid,&leader_epoch); - if (myvote && leader_epoch == epoch) { - sentinelLeaderIncr(counters,myvote); - voters++; - } - /* Count other sentinels votes */ di = dictGetIterator(master->sentinels); while((de = dictNext(di)) != NULL) { sentinelRedisInstance *ri = dictGetVal(de); - if (ri->leader == NULL || ri->leader_epoch != sentinel.current_epoch) - continue; - sentinelLeaderIncr(counters,ri->leader); + if (ri->leader != NULL && ri->leader_epoch == sentinel.current_epoch) + sentinelLeaderIncr(counters,ri->leader); voters++; } dictReleaseIterator(di); - voters_quorum = voters/2+1; /* Check what's the winner. For the winner to win, it needs two conditions: * 1) Absolute majority between voters (50% + 1). * 2) And anyway at least master->quorum votes. */ - { - uint64_t max_votes = 0; /* Max votes so far. 
*/ + di = dictGetIterator(counters); + while((de = dictNext(di)) != NULL) { + uint64_t votes = dictGetUnsignedIntegerVal(de); - di = dictGetIterator(counters); - while((de = dictNext(di)) != NULL) { - uint64_t votes = dictGetUnsignedIntegerVal(de); - - if (max_votes < votes) { - max_votes = votes; - winner = dictGetKey(de); - } + if (votes > max_votes) { + max_votes = votes; + winner = dictGetKey(de); } - dictReleaseIterator(di); - if (winner && (max_votes < voters_quorum || max_votes < master->quorum)) - winner = NULL; } + dictReleaseIterator(di); + + /* Count this Sentinel vote: + * if this Sentinel did not voted yet, either vote for the most + * common voted sentinel, or for itself if no vote exists at all. */ + if (winner) + myvote = sentinelVoteLeader(master,epoch,winner,&leader_epoch); + else + myvote = sentinelVoteLeader(master,epoch,server.runid,&leader_epoch); + + if (myvote && leader_epoch == epoch) { + uint64_t votes = sentinelLeaderIncr(counters,myvote); + + if (votes > max_votes) { + max_votes = votes; + winner = myvote; + } + } + voters++; /* Anyway, count me as one of the voters. */ + + voters_quorum = voters/2+1; + if (winner && (max_votes < voters_quorum || max_votes < master->quorum)) + winner = NULL; + winner = winner ? sdsnew(winner) : NULL; sdsfree(myvote); dictRelease(counters); From 37a51a2568ed8558c60920e1bc08dd5265e74efe Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 16:50:04 +0100 Subject: [PATCH 46/53] Sentinel: distinguish between is-master-down-by-addr requests. Some are just to know if the master is down, and in this case the runid in the request is set to "*", others are actually in order to seek for a vote and get elected. In the latter case the runid is set to the runid of the instance seeking for the vote. 
--- src/redis.c | 2 ++ src/sentinel.c | 20 +++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/redis.c b/src/redis.c index 7932824e..af24505b 100644 --- a/src/redis.c +++ b/src/redis.c @@ -3108,6 +3108,8 @@ int main(int argc, char **argv) { redisLog(REDIS_NOTICE,"The server is now ready to accept connections on port %d", server.port); if (server.sofd > 0) redisLog(REDIS_NOTICE,"The server is now ready to accept connections at %s", server.unixsocket); + } else { + redisLog(REDIS_WARNING,"Sentinel runid is %s", server.runid); } /* Warning the user about suspicious maxmemory setting. */ diff --git a/src/sentinel.c b/src/sentinel.c index 7abd1cfb..972f8921 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2359,8 +2359,9 @@ void sentinelCommand(redisClient *c) { (ri->flags & SRI_MASTER)) isdown = 1; - /* Vote for the master (or fetch the previous vote) */ - if (ri && ri->flags & SRI_MASTER) { + /* Vote for the master (or fetch the previous vote) if the request + * includes a runid, otherwise the sender is not seeking for a vote. */ + if (ri && ri->flags & SRI_MASTER && strcasecmp(c->argv[5]->ptr,"*")) { leader = sentinelVoteLeader(ri,(uint64_t)req_epoch, c->argv[5]->ptr, &leader_epoch); @@ -2370,7 +2371,7 @@ void sentinelCommand(redisClient *c) { * down state, leader, vote epoch. */ addReplyMultiBulkLen(c,3); addReply(c, isdown ? shared.cone : shared.czero); - addReplyBulkCString(c, leader ? leader : "?"); + addReplyBulkCString(c, leader ? 
leader : "*"); addReplyLongLong(c, (long long)leader_epoch); if (leader) sdsfree(leader); } else if (!strcasecmp(c->argv[1]->ptr,"reset")) { @@ -2605,9 +2606,13 @@ void sentinelReceiveIsMasterDownReply(redisAsyncContext *c, void *reply, void *p } else { ri->flags &= ~SRI_MASTER_DOWN; } - sdsfree(ri->leader); - ri->leader = sdsnew(r->element[1]->str); - ri->leader_epoch = r->element[2]->integer; + if (strcmp(r->element[1]->str,"*")) { + /* If the runid in the reply is not "*" the Sentinel actually + * replied with a vote. */ + sdsfree(ri->leader); + ri->leader = sdsnew(r->element[1]->str); + ri->leader_epoch = r->element[2]->integer; + } } } @@ -2660,7 +2665,8 @@ void sentinelAskMasterStateToOtherSentinels(sentinelRedisInstance *master, int f "SENTINEL is-master-down-by-addr %s %s %llu %s", master->addr->ip, port, sentinel.current_epoch, - server.runid); + (master->failover_state > SENTINEL_FAILOVER_STATE_NONE) ? + server.runid : "*"); if (retval == REDIS_OK) ri->pending_commands++; } dictReleaseIterator(di); From b1f5a0b3ece05de66b22e7544c63ac7659b6d06b Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Nov 2013 17:58:11 +0100 Subject: [PATCH 47/53] CONFIG REWRITE: don't add the signature if it already exists. At the end of the file, CONFIG REWRITE adds a comment line that reads: # Generated by CONFIG REWRITE followed by the additional config options required. However this was added again and again at every rewrite in particular conditions (when a given set of options changes at a given time). Now if it was already encountered, it is not added a second time. This is especially important for Sentinel that rewrites the config at every state change. 
--- src/config.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/config.c b/src/config.c index d39546e1..4f7ddda9 100644 --- a/src/config.c +++ b/src/config.c @@ -1154,6 +1154,8 @@ void configGetCommand(redisClient *c) { * */ +#define REDIS_CONFIG_REWRITE_SIGNATURE "# Generated by CONFIG REWRITE" + /* We use the following dictionary type to store where a configuration * option is mentioned in the old configuration file, so it's * like "maxmemory" -> list of line numbers (first line is zero). */ @@ -1230,6 +1232,8 @@ struct rewriteConfigState *rewriteConfigReadOldFile(char *path) { /* Handle comments and empty lines. */ if (line[0] == '#' || line[0] == '\0') { + if (!state->has_tail && !strcmp(line,REDIS_CONFIG_REWRITE_SIGNATURE)) + state->has_tail = 1; rewriteConfigAppendLine(state,line); continue; } @@ -1301,7 +1305,7 @@ void rewriteConfigRewriteLine(struct rewriteConfigState *state, char *option, sd /* Append a new line. */ if (!state->has_tail) { rewriteConfigAppendLine(state, - sdsnew("# Generated by CONFIG REWRITE")); + sdsnew(REDIS_CONFIG_REWRITE_SIGNATURE)); state->has_tail = 1; } rewriteConfigAppendLine(state,line); From a6ebd910d80c0ecfb6c510e06b63dcee926ff07c Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 20 Nov 2013 15:52:44 +0100 Subject: [PATCH 48/53] Sentinel: take the replication offset in slaves state. --- src/sentinel.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index 972f8921..1fef26df 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -165,10 +165,11 @@ typedef struct sentinelRedisInstance { mstime_t master_link_down_time; /* Slave replication link down time. */ int slave_priority; /* Slave priority according to its INFO output. */ mstime_t slave_reconf_sent_time; /* Time at which we sent SLAVE OF */ - struct sentinelRedisInstance *master; /* Master instance if SRI_SLAVE is set. 
*/ + struct sentinelRedisInstance *master; /* Master instance if it's slave. */ char *slave_master_host; /* Master host as reported by INFO */ int slave_master_port; /* Master port as reported by INFO */ int slave_master_link_status; /* Master link status as reported by INFO */ + unsigned long long slave_repl_offset; /* Slave replication offset. */ /* Failover */ char *leader; /* If this is a master instance, this is the runid of the Sentinel that should perform the failover. If @@ -900,6 +901,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * ri->slave_master_host = NULL; ri->slave_master_port = 0; ri->slave_master_link_status = SENTINEL_MASTER_LINK_STATUS_DOWN; + ri->slave_repl_offset = 0; ri->sentinels = dictCreate(&instancesDictType,NULL); ri->quorum = quorum; ri->parallel_syncs = SENTINEL_DEFAULT_PARALLEL_SYNCS; @@ -1738,6 +1740,10 @@ void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) { /* slave_priority: */ if (sdslen(l) >= 15 && !memcmp(l,"slave_priority:",15)) ri->slave_priority = atoi(l+15); + + /* slave_repl_offset: */ + if (sdslen(l) >= 18 && !memcmp(l,"slave_repl_offset:",18)) + ri->slave_repl_offset = strtoull(l+18,NULL,10); } } ri->info_refresh = mstime(); @@ -2262,6 +2268,10 @@ void addReplySentinelRedisInstance(redisClient *c, sentinelRedisInstance *ri) { addReplyBulkCString(c,"slave-priority"); addReplyBulkLongLong(c,ri->slave_priority); fields++; + + addReplyBulkCString(c,"slave-repl-offset"); + addReplyBulkLongLong(c,ri->slave_repl_offset); + fields++; } /* Only sentinels */ From 0101c2bcfe6f03db10a4ee625209cc46ad4f8c8b Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 20 Nov 2013 16:05:36 +0100 Subject: [PATCH 49/53] Sentinel: select slave with best (greater) replication offset. 
--- src/sentinel.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 1fef26df..51614190 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2903,6 +2903,9 @@ int sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { * NULL if no suitable slave was found. */ +/* Helper for sentinelSelectSlave(). This is used by qsort() in order to + * sort suitable slaves in a "better first" order, to take the first of + * the list. */ int compareSlavesForPromotion(const void *a, const void *b) { sentinelRedisInstance **sa = (sentinelRedisInstance **)a, **sb = (sentinelRedisInstance **)b; @@ -2911,8 +2914,16 @@ int compareSlavesForPromotion(const void *a, const void *b) { if ((*sa)->slave_priority != (*sb)->slave_priority) return (*sa)->slave_priority - (*sb)->slave_priority; - /* If priority is the same, select the slave with that has the - * lexicographically smaller runid. Note that we try to handle runid + /* If priority is the same, select the slave with greater replication + * offset (processed more data frmo the master). */ + if ((*sa)->slave_repl_offset > (*sb)->slave_repl_offset) { + return -1; /* a < b */ + } else if ((*sa)->slave_repl_offset < (*sb)->slave_repl_offset) { + return 1; /* b > a */ + } + + /* If the replication offset is the same select the slave with that has + * the lexicographically smaller runid. Note that we try to handle runid * == NULL as there are old Redis versions that don't publish runid in * INFO. A NULL runid is considered bigger than any other runid. */ sa_runid = (*sa)->runid; From 8810167d13cb5d7aef12c2ee1d7c48895cb79626 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 21 Nov 2013 11:31:06 +0100 Subject: [PATCH 50/53] Sentinel: Hello message sending code refactored. 
--- src/sentinel.c | 62 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index 51614190..9ab91d0b 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2049,6 +2049,46 @@ void sentinelReceiveHelloMessages(redisAsyncContext *c, void *reply, void *privd } } +/* Send an "Hello" message via Pub/Sub to the specified 'ri' Redis + * instance in order to broadcast the current configuraiton for this + * master, and to advertise the existence of this Sentinel at the same time. + * + * The message has the following format: + * + * sentinel_ip,sentinel_port,sentinel_runid,current_epoch, + * master_name,master_ip,master_port,master_config_epoch. + * + * Returns REDIS_OK if the PUBLISH was queued correctly, otherwise + * REDIS_ERR is returned. */ +int sentinelSendHello(sentinelRedisInstance *ri) { + char ip[REDIS_IP_STR_LEN]; + char payload[REDIS_IP_STR_LEN+1024]; + int retval; + sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ? ri : ri->master; + sentinelAddr *master_addr = sentinelGetCurrentMasterAddress(master); + + /* Try to obtain our own IP address. */ + if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) == -1) return REDIS_ERR; + + /* Format and send the Hello message. */ + snprintf(payload,sizeof(payload), + "%s,%d,%s,%llu," /* Info about this sentinel. */ + "%s,%s,%d,%lld", /* Info about current master. */ + ip, server.port, server.runid, + (unsigned long long) sentinel.current_epoch, + /* --- */ + master->name,master_addr->ip,master_addr->port, + master->config_epoch); + retval = redisAsyncCommand(ri->cc, + sentinelPublishReplyCallback, NULL, "PUBLISH %s %s", + SENTINEL_HELLO_CHANNEL,payload); + if (retval != REDIS_OK) return REDIS_ERR; + ri->pending_commands++; + return REDIS_OK; +} + +/* Send periodic PING, INFO, and PUBLISH to the Hello channel to + * the specified master or slave instance. 
*/ void sentinelPingInstance(sentinelRedisInstance *ri) { mstime_t now = mstime(); mstime_t info_period; @@ -2096,27 +2136,7 @@ void sentinelPingInstance(sentinelRedisInstance *ri) { (now - ri->last_pub_time) > SENTINEL_PUBLISH_PERIOD) { /* PUBLISH hello messages to masters and slaves. */ - char ip[REDIS_IP_STR_LEN]; - if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) != -1) { - char payload[REDIS_IP_STR_LEN+1024]; - sentinelRedisInstance *master = (ri->flags & SRI_MASTER) ? - ri : ri->master; - sentinelAddr *master_addr = sentinelGetCurrentMasterAddress(master); - - snprintf(payload,sizeof(payload), - "%s,%d,%s,%llu," /* Info about this sentinel. */ - "%s,%s,%d,%lld", /* Info about current master. */ - ip, server.port, server.runid, - (unsigned long long) sentinel.current_epoch, - /* --- */ - master->name,master_addr->ip,master_addr->port, - master->config_epoch); - retval = redisAsyncCommand(ri->cc, - sentinelPublishReplyCallback, NULL, "PUBLISH %s %s", - SENTINEL_HELLO_CHANNEL,payload); - if (retval != REDIS_OK) return; - ri->pending_commands++; - } + sentinelSendHello(ri); } } From d920177f8ddadc4af7fbc8a02438340eea5ca1f3 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 21 Nov 2013 11:35:50 +0100 Subject: [PATCH 51/53] Sentinel: check for disconnected links in sentinelSendHello(). Does not fix any bug as the test is performed by the caller, but better to have the check. --- src/sentinel.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sentinel.c b/src/sentinel.c index 9ab91d0b..b963da10 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2069,6 +2069,7 @@ int sentinelSendHello(sentinelRedisInstance *ri) { /* Try to obtain our own IP address. */ if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) == -1) return REDIS_ERR; + if (ri->flags & SRI_DISCONNECTED) return; /* Format and send the Hello message. 
*/ snprintf(payload,sizeof(payload), From 297de1ab26789f8a30a5d4c8a188636a5b9ea821 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 21 Nov 2013 12:27:14 +0100 Subject: [PATCH 52/53] Sentinel: test for writable config file. This commit introduces a function called when Sentinel is ready for normal operations to avoid putting Sentinel-specific stuff in redis.c. --- src/redis.c | 2 +- src/redis.h | 1 + src/sentinel.c | 13 ++++++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/src/redis.c b/src/redis.c index af24505b..76ca2a21 100644 --- a/src/redis.c +++ b/src/redis.c @@ -3109,7 +3109,7 @@ int main(int argc, char **argv) { if (server.sofd > 0) redisLog(REDIS_NOTICE,"The server is now ready to accept connections at %s", server.unixsocket); } else { - redisLog(REDIS_WARNING,"Sentinel runid is %s", server.runid); + sentinelIsRunning(); } /* Warning the user about suspicious maxmemory setting. */ diff --git a/src/redis.h b/src/redis.h index bdc69522..76f7dd3b 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1223,6 +1223,7 @@ void initSentinelConfig(void); void initSentinel(void); void sentinelTimer(void); char *sentinelHandleConfiguration(char **argv, int argc); +void sentinelIsRunning(void); /* Scripting */ void scriptingInit(void); diff --git a/src/sentinel.c b/src/sentinel.c index b963da10..ca6d0eb5 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -413,6 +413,17 @@ void initSentinel(void) { sentinel.scripts_queue = listCreate(); } +/* This function gets called when the server is in Sentinel mode, started, + * loaded the configuration, and is ready for normal operations. */ +void sentinelIsRunning(void) { + redisLog(REDIS_WARNING,"Sentinel runid is %s", server.runid); + + if (server.configfile == NULL || access(server.configfile,W_OK) == -1) { + redisLog(REDIS_WARNING,"Sentinel started without a config file, or config file not writable. 
Exiting..."); + exit(1); + } +} + /* ============================== sentinelAddr ============================== */ /* Create a sentinelAddr object and return it on success. @@ -2069,7 +2080,7 @@ int sentinelSendHello(sentinelRedisInstance *ri) { /* Try to obtain our own IP address. */ if (anetSockName(ri->cc->c.fd,ip,sizeof(ip),NULL) == -1) return REDIS_ERR; - if (ri->flags & SRI_DISCONNECTED) return; + if (ri->flags & SRI_DISCONNECTED) return REDIS_ERR; /* Format and send the Hello message. */ snprintf(payload,sizeof(payload), From f55ad3038fa4c7130fe59392e2a256432b546f5d Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 21 Nov 2013 12:39:47 +0100 Subject: [PATCH 53/53] Sentinel: manual failover works again. --- src/sentinel.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/sentinel.c b/src/sentinel.c index ca6d0eb5..134e8ee9 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -2452,6 +2452,8 @@ void sentinelCommand(redisClient *c) { addReplySds(c,sdsnew("-NOGOODSLAVE No suitable slave to promote\r\n")); return; } + redisLog(REDIS_WARNING,"Executing user requested FAILOVER of '%s'", + ri->name); sentinelStartFailover(ri); ri->flags |= SRI_FORCE_FAILOVER; addReply(c,shared.ok); @@ -3017,8 +3019,9 @@ void sentinelFailoverWaitStart(sentinelRedisInstance *ri) { isleader = leader && strcasecmp(leader,server.runid) == 0; sdsfree(leader); - /* If I'm not the leader, I can't continue with the failover. */ - if (!isleader) { + /* If I'm not the leader, and it is not a forced failover via + * SENTINEL FAILOVER, then I can't continue with the failover. */ + if (!isleader && !(ri->flags & SRI_FORCE_FAILOVER)) { int election_timeout = SENTINEL_ELECTION_TIMEOUT; /* The election timeout is the MIN between SENTINEL_ELECTION_TIMEOUT