From edb3987a061a8425bac8afcffe66293cf819a54c Mon Sep 17 00:00:00 2001
From: antirez <antirez@gmail.com>
Date: Tue, 7 Oct 2014 09:51:55 +0200
Subject: [PATCH] Cluster: more chatty slaves when failover is stalled.

---
 src/cluster.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++---
 src/cluster.h | 10 ++++++
 2 files changed, 94 insertions(+), 4 deletions(-)

diff --git a/src/cluster.c b/src/cluster.c
index 149c9d93..71c7d3ae 100644
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -417,6 +417,7 @@ void clusterInit(void) {
     server.cluster->failover_auth_count = 0;
     server.cluster->failover_auth_rank = 0;
     server.cluster->failover_auth_epoch = 0;
+    server.cluster->cant_failover_reason = REDIS_CLUSTER_CANT_FAILOVER_NONE;
     server.cluster->lastVoteEpoch = 0;
     server.cluster->stats_bus_messages_sent = 0;
     server.cluster->stats_bus_messages_received = 0;
@@ -2430,6 +2431,68 @@ int clusterGetSlaveRank(void) {
     return rank;
 }
 
+/* This function is called by clusterHandleSlaveFailover() in order to
+ * let the slave log why it is not able to failover. Sometimes there are
+ * not the conditions, but since the failover function is called again and
+ * again, we can't log the same things continuously.
+ *
+ * This function works by logging only if a given set of conditions are
+ * true:
+ *
+ * 1) The reason for which the failover can't be initiated changed.
+ *    The reasons also include a NONE reason we reset the state to
+ *    when the slave finds that its master is fine (no FAIL flag).
+ * 2) Also, the log is emitted again if the master is still down and
+ *    the reason for not failing over is still the same, but more than
+ *    REDIS_CLUSTER_CANT_FAILOVER_RELOG_PERIOD seconds elapsed.
+ * 3) Finally, the function only logs if the slave is down for more than
+ *    five seconds + NODE_TIMEOUT. This way nothing is logged when a
+ *    failover starts in a reasonable time.
+ *
+ * The function is called with the reason why the slave can't failover
+ * which is one of the integer macros REDIS_CLUSTER_CANT_FAILOVER_*.
+ *
+ * The function is guaranteed to be called only if 'myself' is a slave. */
+void clusterLogCantFailover(int reason) {
+    char *msg;
+    static time_t lastlog_time = 0;
+    mstime_t nolog_fail_time = server.cluster_node_timeout + 5000;
+
+    /* Don't log if we have the same reason for some time. */
+    if (reason == server.cluster->cant_failover_reason &&
+        time(NULL)-lastlog_time < REDIS_CLUSTER_CANT_FAILOVER_RELOG_PERIOD)
+        return;
+
+    server.cluster->cant_failover_reason = reason;
+
+    /* We also don't emit any log if the master failed no long ago, the
+     * goal of this function is to log slaves in a stalled condition for
+     * a long time. */
+    if (myself->slaveof &&
+        nodeFailed(myself->slaveof) &&
+        (mstime() - myself->slaveof->fail_time) < nolog_fail_time) return;
+
+    switch(reason) {
+    case REDIS_CLUSTER_CANT_FAILOVER_DATA_AGE:
+        msg = "Disconnected from master for longer than allowed.";
+        break;
+    case REDIS_CLUSTER_CANT_FAILOVER_WAITING_DELAY:
+        msg = "Waiting the delay before I can start a new failover.";
+        break;
+    case REDIS_CLUSTER_CANT_FAILOVER_EXPIRED:
+        msg = "Failover attempt expired.";
+        break;
+    case REDIS_CLUSTER_CANT_FAILOVER_WAITING_VOTES:
+        msg = "Waiting for votes, but majority still not reached.";
+        break;
+    default:
+        msg = "Unknown reason code.";
+        break;
+    }
+    lastlog_time = time(NULL);
+    redisLog(REDIS_WARNING,"Currently unable to failover: %s", msg);
+}
+
 /* This function is called if we are a slave node and our master serving
  * a non-zero amount of hash slots is in FAIL state.
  *
@@ -2468,7 +2531,13 @@ void clusterHandleSlaveFailover(void) {
     if (nodeIsMaster(myself) ||
         myself->slaveof == NULL ||
         (!nodeFailed(myself->slaveof) && !manual_failover) ||
-        myself->slaveof->numslots == 0) return;
+        myself->slaveof->numslots == 0)
+    {
+        /* There are no reasons to failover, so we set the reason why we
+         * are returning without failing over to NONE. */
+        server.cluster->cant_failover_reason = REDIS_CLUSTER_CANT_FAILOVER_NONE;
+        return;
+    }
 
     /* Set data_age to the number of seconds we are disconnected from
      * the master. */
@@ -2494,7 +2563,10 @@ void clusterHandleSlaveFailover(void) {
         (((mstime_t)server.repl_ping_slave_period * 1000) +
          (server.cluster_node_timeout * server.cluster_slave_validity_factor)))
     {
-        if (!manual_failover) return;
+        if (!manual_failover) {
+            clusterLogCantFailover(REDIS_CLUSTER_CANT_FAILOVER_DATA_AGE);
+            return;
+        }
     }
 
     /* If the previous failover attempt timedout and the retry time has
@@ -2550,10 +2622,16 @@ void clusterHandleSlaveFailover(void) {
     }
 
     /* Return ASAP if we can't still start the election. */
-    if (mstime() < server.cluster->failover_auth_time) return;
+    if (mstime() < server.cluster->failover_auth_time) {
+        clusterLogCantFailover(REDIS_CLUSTER_CANT_FAILOVER_WAITING_DELAY);
+        return;
+    }
 
     /* Return ASAP if the election is too old to be valid. */
-    if (auth_age > auth_timeout) return;
+    if (auth_age > auth_timeout) {
+        clusterLogCantFailover(REDIS_CLUSTER_CANT_FAILOVER_EXPIRED);
+        return;
+    }
 
     /* Ask for votes if needed. */
     if (server.cluster->failover_auth_sent == 0) {
@@ -2608,6 +2686,8 @@ void clusterHandleSlaveFailover(void) {
 
         /* 6) If there was a manual failover in progress, clear the state. */
         resetManualFailover();
+    } else {
+        clusterLogCantFailover(REDIS_CLUSTER_CANT_FAILOVER_WAITING_VOTES);
     }
 }
 
diff --git a/src/cluster.h b/src/cluster.h
index 3287afe7..b05a30de 100644
--- a/src/cluster.h
+++ b/src/cluster.h
@@ -62,6 +62,14 @@ typedef struct clusterLink {
 #define nodeTimedOut(n) ((n)->flags & REDIS_NODE_PFAIL)
 #define nodeFailed(n) ((n)->flags & REDIS_NODE_FAIL)
 
+/* Reasons why a slave is not able to failover. */
+#define REDIS_CLUSTER_CANT_FAILOVER_NONE 0
+#define REDIS_CLUSTER_CANT_FAILOVER_DATA_AGE 1
+#define REDIS_CLUSTER_CANT_FAILOVER_WAITING_DELAY 2
+#define REDIS_CLUSTER_CANT_FAILOVER_EXPIRED 3
+#define REDIS_CLUSTER_CANT_FAILOVER_WAITING_VOTES 4
+#define REDIS_CLUSTER_CANT_FAILOVER_RELOG_PERIOD (60*5) /* seconds. */
+
 /* This structure represent elements of node->fail_reports. */
 typedef struct clusterNodeFailReport {
     struct clusterNode *node;  /* Node reporting the failure condition. */
@@ -107,6 +115,8 @@ typedef struct clusterState {
     int failover_auth_sent;     /* True if we already asked for votes. */
     int failover_auth_rank;     /* This slave rank for current auth request. */
     uint64_t failover_auth_epoch; /* Epoch of the current election. */
+    int cant_failover_reason;   /* Why a slave is currently not able to
+                                   failover. See the CANT_FAILOVER_* macros. */
     /* Manual failover state in common. */
     mstime_t mf_end;            /* Manual failover time limit (ms unixtime).
                                    It is zero if there is no MF in progress. */