From d728ec6deefa10c4bdb99b190e47368f7f1747f2 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Mar 2013 12:13:39 +0100 Subject: [PATCH] Cluster: send a ping to every node we never contacted in timeout/2 seconds. Usually we try to send just 1 ping every second; however, when we detect we are going to have unreliable failure detection because we can't ping some node in time, send an additional ping. This should only happen with very large clusters or when the node timeout is set to a very low value. --- src/cluster.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index fe8675f2..327cbecc 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1279,22 +1279,33 @@ void clusterCron(void) { di = dictGetIterator(server.cluster->nodes); while((de = dictNext(di)) != NULL) { clusterNode *node = dictGetVal(de); + time_t now = time(NULL); int delay; if (node->flags & (REDIS_NODE_MYSELF|REDIS_NODE_NOADDR|REDIS_NODE_HANDSHAKE)) continue; + /* Check only if we already sent a ping and did not received * a reply yet. */ if (node->ping_sent == 0 || node->ping_sent <= node->pong_received) continue; + /* If our ping is older than half the cluster timeout (may happen + * in a cluster with many nodes), send a new ping. */ + if (node->link && + (now - node->ping_sent) > server.cluster->node_timeout/2) + { + clusterSendPing(node->link, CLUSTERMSG_TYPE_PING); + continue; + } + /* If we never received a pong, use the ping time to compute * the delay. */ if (node->pong_received) { - delay = time(NULL) - node->pong_received; + delay = now - node->pong_received; } else { - delay = time(NULL) - node->ping_sent; + delay = now - node->ping_sent; } if (delay < server.cluster->node_timeout) {