Cluster: send a ping to every node we never contacted in timeout/2 seconds.

Usually we try to send just 1 ping every second, however when we detect we are going to have unreliable failure detection because we can't ping some node in time, send an additional ping. This should only happen with very large clusters or when the node timeout is set to a very low value.
2025-05-04 15:02:13 +00:00 · 2013-03-05 12:13:39 +01:00 · 2013-03-05 12:13:39 +01:00 · d728ec6dee
commit d728ec6dee
parent e7628be2a7
1 changed files with 13 additions and 2 deletions
--- a/src/cluster.c
+++ b/src/cluster.c
@@ -1279,22 +1279,33 @@ void clusterCron(void) {
    di = dictGetIterator(server.cluster->nodes);
    while((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);
+        time_t now = time(NULL);
        int delay;

        if (node->flags &
            (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR|REDIS_NODE_HANDSHAKE))
                continue;
+
        /* Check only if we already sent a ping and did not received
         * a reply yet. */
        if (node->ping_sent == 0 ||
            node->ping_sent <= node->pong_received) continue;

+        /* If our ping is older than half the cluster timeout (may happen
+         * in a cluster with many nodes), send a new ping. */
+        if (node->link &&
+            (now - node->ping_sent) > server.cluster->node_timeout/2)
+        {
+            clusterSendPing(node->link, CLUSTERMSG_TYPE_PING);
+            continue;
+        }
+
        /* If we never received a pong, use the ping time to compute
         * the delay. */
        if (node->pong_received) {
-            delay = time(NULL) - node->pong_received;
+            delay = now - node->pong_received;
        } else {
-            delay = time(NULL) - node->ping_sent;
+            delay = now - node->ping_sent;
        }

        if (delay < server.cluster->node_timeout) {