From 210277860645b15bf79815a47fe35f778c83ebc7 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 8 May 2014 16:38:53 +0200 Subject: [PATCH] Sentinel: log when a failover will be attempted again. When a Sentinel performs a failover (successful or not), or when a Sentinel votes for a different Sentinel trying to start a failover, it sets a min delay before it will try to get elected for a failover. While not strictly needed, because if multiple Sentinels will try to failover the same master at the same time, only one configuration will eventually win, this serialization is practically very useful. Normal failovers are cleaner: one Sentinel starts to failover, the others update their config when the Sentinel performing the failover is able to get the selected slave to move from the role of slave to the one of master. However, until now this timeout was implicit, so users could see Sentinels not reacting, after a failed failover, for some time, without giving any feedback in the logs to the poor sysadmin waiting for clues. This commit makes Sentinels more verbose about the delay: when a master is down and a failover attempt is not performed because the delay has not yet elapsed, something like this will be logged: Next failover delay: I will not start a failover before Thu May 8 16:48:59 2014 --- src/sentinel.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/sentinel.c b/src/sentinel.c index 4f2c6c0f..027817d1 100644 --- a/src/sentinel.c +++ b/src/sentinel.c @@ -183,6 +183,8 @@ typedef struct sentinelRedisInstance { mstime_t failover_state_change_time; mstime_t failover_start_time; /* Last failover attempt start time. */ mstime_t failover_timeout; /* Max time to refresh failover state. */ + mstime_t failover_delay_logged; /* For what failover_start_time value we + logged the failover delay. */ struct sentinelRedisInstance *promoted_slave; /* Promoted slave instance. 
*/ /* Scripts executed to notify admin or reconfigure clients: when they * are set to NULL no script is executed. */ @@ -967,6 +969,7 @@ sentinelRedisInstance *createSentinelRedisInstance(char *name, int flags, char * ri->failover_state_change_time = 0; ri->failover_start_time = 0; ri->failover_timeout = SENTINEL_DEFAULT_FAILOVER_TIMEOUT; + ri->failover_delay_logged = 0; ri->promoted_slave = NULL; ri->notification_script = NULL; ri->client_reconfig_script = NULL; @@ -3252,7 +3255,22 @@ int sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) { /* Last failover attempt started too little time ago? */ if (mstime() - master->failover_start_time < - master->failover_timeout*2) return 0; + master->failover_timeout*2) + { + if (master->failover_delay_logged != master->failover_start_time) { + time_t clock = (master->failover_start_time + + master->failover_timeout*2) / 1000; + char ctimebuf[26]; + + ctime_r(&clock,ctimebuf); + ctimebuf[24] = '\0'; /* Remove newline. */ + master->failover_delay_logged = master->failover_start_time; + redisLog(REDIS_WARNING, + "Next failover delay: I will not start a failover before %s", + ctimebuf); + } + return 0; + } sentinelStartFailover(master); return 1;