Cluster: better cluster state transition handling.

Before, we relied on the global cluster state to make sure all the hash slots are linked to some node when getNodeByQuery() is called, so finding a hash slot unbound was checked with an assertion. However this is fragile. The cluster state is often updated in the clusterBeforeSleep() function, and not ASAP on state change, so it may happen that we process clients with a cluster state that is 'ok' while certain hash slots are still set to NULL. With this commit the condition is also checked in getNodeByQuery() and reported with the same -CLUSTERDOWN error code but a slightly different error message, so that we have more debugging clues in the future. Root cause of issue #2288.
2025-03-19 00:50:50 +00:00 · 2015-03-20 09:57:10 +01:00 · 2015-03-20 09:57:10 +01:00 · 25c0f5ac63
commit 25c0f5ac63
parent 2ecb5edf34
3 changed files with 21 additions and 2 deletions
--- a/src/cluster.c
+++ b/src/cluster.c
@ -4703,7 +4703,12 @@ void readwriteCommand(redisClient *c) {
 *
 * REDIS_CLUSTER_REDIR_UNSTABLE if the request contains mutliple keys
 * belonging to the same slot, but the slot is not stable (in migration or
- * importing state, likely because a resharding is in progress). */
+ * importing state, likely because a resharding is in progress).
+ *
+ * REDIS_CLUSTER_REDIR_DOWN if the request addresses a slot which is not
+ * bound to any node. In this case the cluster global state should be already
+ * "down" but it is fragile to rely on the update of the global state, so
+ * we also handle it here. */
 clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) {
    clusterNode *n = NULL;
    robj *firstkey = NULL;
@ -4757,7 +4762,18 @@ clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **arg
                firstkey = thiskey;
                slot = thisslot;
                n = server.cluster->slots[slot];
-                redisAssertWithInfo(c,firstkey,n != NULL);
+
+                /* Error: If a slot is not served, we are in "cluster down"
+                 * state. However the state is yet to be updated, so this was
+                 * not trapped earlier in processCommand(). Report the same
+                 * error to the client. */
+                if (n == NULL) {
+                    getKeysFreeResult(keyindex);
+                    if (error_code)
+                        *error_code = REDIS_CLUSTER_REDIR_DOWN;
+                    return NULL;
+                }
+
                /* If we are migrating or importing this slot, we need to check
                 * if we have all the keys in the request (the only way we
                 * can safely serve the request, otherwise we return a TRYAGAIN
--- a/src/cluster.h
+++ b/src/cluster.h
@ -30,6 +30,7 @@
 #define REDIS_CLUSTER_REDIR_UNSTABLE 2      /* Keys in slot resharding. */
 #define REDIS_CLUSTER_REDIR_ASK 3           /* -ASK redirection required. */
 #define REDIS_CLUSTER_REDIR_MOVED 4         /* -MOVED redirection required. */
+#define REDIS_CLUSTER_REDIR_DOWN 5          /* -CLUSTERDOWN error. */

 struct clusterNode;

--- a/src/redis.c
+++ b/src/redis.c
@ -2219,6 +2219,8 @@ int processCommand(redisClient *c) {
                     * but the slot is not "stable" currently as there is
                     * a migration or import in progress. */
                    addReplySds(c,sdsnew("-TRYAGAIN Multiple keys request during rehashing of slot\r\n"));
+                } else if (error_code == REDIS_CLUSTER_REDIR_DOWN) {
+                    addReplySds(c,sdsnew("-CLUSTERDOWN The cluster is down. Hash slot is unbound\r\n"));
                } else {
                    redisPanic("getNodeByQuery() unknown error.");
                }