Merge branch 'clusterfixes' into unstable

antirez 2015-01-21 19:30:22 +01:00
commit af8d1b4bda
5 changed files with 59 additions and 19 deletions

src/cluster.c

@@ -783,8 +783,11 @@ int clusterNodeRemoveSlave(clusterNode *master, clusterNode *slave) {
     for (j = 0; j < master->numslaves; j++) {
         if (master->slaves[j] == slave) {
-            memmove(master->slaves+j,master->slaves+(j+1),
-                (master->numslaves-1)-j);
+            if ((j+1) < master->numslaves) {
+                int remaining_slaves = (master->numslaves - j) - 1;
+                memmove(master->slaves+j,master->slaves+(j+1),
+                        (sizeof(*master->slaves) * remaining_slaves));
+            }
             master->numslaves--;
             return REDIS_OK;
         }
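The bug fixed here: memmove() takes a byte count, but the old call passed (master->numslaves-1)-j, which is a count of array elements. With 8-byte pointers only an eighth of the tail was actually shifted, corrupting the slaves array. The new code also skips the call entirely when the removed slave is the last element, where there is nothing to shift. A minimal sketch of the corrected idiom, with illustrative names that are not from cluster.c:

    #include <string.h>

    /* Remove the element at index j from an array of n pointers by
     * shifting the tail left one slot. Returns the new count. */
    static int remove_at(void **arr, int n, int j) {
        if (j + 1 < n) {                         /* anything after the hole? */
            memmove(arr + j, arr + (j + 1),
                    sizeof(*arr) * (n - j - 1)); /* bytes = elems * size */
        }
        return n - 1;
    }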
@@ -819,15 +822,30 @@ int clusterCountNonFailingSlaves(clusterNode *n) {
     return okslaves;
 }
+/* Low level cleanup of the node structure. Only called by clusterDelNode(). */
 void freeClusterNode(clusterNode *n) {
     sds nodename;
+    int j;
+    /* If the node is a master with associated slaves, we have to set
+     * all the slaves->slaveof fields to NULL (unknown). */
+    if (nodeIsMaster(n)) {
+        for (j = 0; j < n->numslaves; j++)
+            n->slaves[j]->slaveof = NULL;
+    }
+    /* Remove this node from the list of slaves of its master. */
+    if (nodeIsSlave(n) && n->slaveof) clusterNodeRemoveSlave(n->slaveof,n);
+    /* Unlink from the set of nodes. */
     nodename = sdsnewlen(n->name, REDIS_CLUSTER_NAMELEN);
     redisAssert(dictDelete(server.cluster->nodes,nodename) == DICT_OK);
     sdsfree(nodename);
-    if (n->slaveof) clusterNodeRemoveSlave(n->slaveof, n);
+    /* Release link and associated data structures. */
     if (n->link) freeClusterLink(n->link);
     listRelease(n->fail_reports);
+    zfree(n->slaves);
     zfree(n);
 }
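freeClusterNode() now maintains both directions of the master/slave link before releasing memory: a freed master would otherwise leave every slave holding a dangling slaveof pointer, and a freed slave would linger in its master's slaves array. It also frees the slaves array itself (zfree(n->slaves)), which previously leaked. A simplified sketch of the invariant, using toy types rather than the real clusterNode layout:

    #include <stdlib.h>

    typedef struct node {
        struct node **slaves;  /* master side: array of slave pointers */
        int numslaves;
        struct node *slaveof;  /* slave side: back-pointer to master */
    } node;

    void free_node(node *n) {
        /* Clear every back-pointer aimed at n before n goes away. */
        for (int j = 0; j < n->numslaves; j++)
            n->slaves[j]->slaveof = NULL;  /* master now unknown */
        free(n->slaves);                   /* the array is owned by n */
        free(n);
    }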
@@ -840,11 +858,16 @@ int clusterAddNode(clusterNode *node) {
     return (retval == DICT_OK) ? REDIS_OK : REDIS_ERR;
 }
-/* Remove a node from the cluster:
- * 1) Mark all the nodes handled by it as unassigned.
- * 2) Remove all the failure reports sent by this node.
- * 3) Free the node, that will in turn remove it from the hash table
- *    and from the list of slaves of its master, if it is a slave node.
+/* Remove a node from the cluster. The function performs the high level
+ * cleanup, calling freeClusterNode() for the low level cleanup.
+ * Here we do the following:
+ *
+ * 1) Mark all the slots handled by it as unassigned.
+ * 2) Remove all the failure reports sent by this node and referenced by
+ *    other nodes.
+ * 3) Free the node with freeClusterNode() that will in turn remove it
+ *    from the hash table and from the list of slaves of its master, if
+ *    it is a slave node.
  */
 void clusterDelNode(clusterNode *delnode) {
     int j;
@@ -871,11 +894,7 @@ void clusterDelNode(clusterNode *delnode) {
     }
     dictReleaseIterator(di);
-    /* 3) Remove this node from its master's slaves if needed. */
-    if (nodeIsSlave(delnode) && delnode->slaveof)
-        clusterNodeRemoveSlave(delnode->slaveof,delnode);
-    /* 4) Free the node, unlinking it from the cluster. */
+    /* 3) Free the node, unlinking it from the cluster. */
     freeClusterNode(delnode);
 }
@@ -1234,7 +1253,7 @@ void nodeIp2String(char *buf, clusterLink *link) {
  * The function returns 0 if the node address is still the same,
  * otherwise 1 is returned. */
 int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, int port) {
-    char ip[REDIS_IP_STR_LEN];
+    char ip[REDIS_IP_STR_LEN] = {0};
     /* We don't proceed if the link is the same as the sender link, as this
      * function is designed to see if the node link is consistent with the
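The {0} initializer zero-fills the whole buffer, so any later read of bytes that were not explicitly overwritten sees defined zeros instead of stack garbage. This is exactly the class of error that the valgrind --track-origins=yes flag, added to the test harness below, is good at pinpointing. A toy illustration of the idiom (not the cluster.c code):

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        char ip[46] = {0};   /* every byte defined from the start */
        snprintf(ip, sizeof(ip), "%s", "10.0.0.1");
        char prev[46] = {0};
        /* Comparing the full buffers is now safe: the bytes past the
         * string are guaranteed zero, not uninitialized memory. */
        if (memcmp(ip, prev, sizeof(ip)) != 0)
            printf("address changed\n");
        return 0;
    }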
@@ -1611,7 +1630,7 @@ int clusterProcessPacket(clusterLink *link) {
         }
         /* Free this node as we already have it. This will
          * cause the link to be freed as well. */
-        freeClusterNode(link->node);
+        clusterDelNode(link->node);
         return 0;
     }
@@ -2784,6 +2803,7 @@ void clusterHandleSlaveMigration(int max_slaves) {
             }
         }
     }
+    dictReleaseIterator(di);
     /* Step 4: perform the migration if there is a target, and if I'm the
      * candidate. */
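The added dictReleaseIterator(di) plugs an iterator leak: every iterator obtained from the dict API must reach a matching release on every exit path. The pattern as used inside cluster.c (a fragment, not a standalone program):

    dictIterator *di = dictGetSafeIterator(server.cluster->nodes);
    dictEntry *de;

    while ((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);
        /* ... examine node, possibly break out early ... */
    }
    dictReleaseIterator(di);  /* the call this commit adds */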
@@ -2905,7 +2925,7 @@ void clusterCron(void) {
         /* A Node in HANDSHAKE state has a limited lifespan equal to the
          * configured node timeout. */
         if (nodeInHandshake(node) && now - node->ctime > handshake_timeout) {
-            freeClusterNode(node);
+            clusterDelNode(node);
             continue;
         }

tests/cluster/run.tcl

@@ -21,6 +21,7 @@ proc main {} {
 if {[catch main e]} {
     puts $::errorInfo
     if {$::pause_on_error} pause_on_error
+    cleanup
     exit 1
 }

tests/cluster/tests/00-base.tcl

@@ -28,8 +28,10 @@ test "Cluster nodes are reachable" {
 test "Cluster nodes hard reset" {
     foreach_redis_id id {
         catch {R $id flushall} ; # May fail for readonly slaves.
+        R $id MULTI
         R $id cluster reset hard
         R $id cluster set-config-epoch [expr {$id+1}]
+        R $id EXEC
         R $id config set cluster-node-timeout 3000
         R $id config set cluster-slave-validity-factor 10
         R $id config rewrite
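Wrapping the two commands in MULTI/EXEC makes the hard reset and the config epoch assignment execute back to back, so no other command can interleave with the half-reset node state. A hedged sketch of the same sequence from C, assuming the hiredis client library (not part of this commit):

    #include <hiredis/hiredis.h>

    /* Returns 1 on success, 0 on failure. Illustrative only. */
    int hard_reset(redisContext *c, int epoch) {
        redisReply *r;
        if ((r = redisCommand(c, "MULTI")) == NULL) return 0;
        freeReplyObject(r);
        if ((r = redisCommand(c, "CLUSTER RESET HARD")) == NULL) return 0;
        freeReplyObject(r);
        if ((r = redisCommand(c, "CLUSTER SET-CONFIG-EPOCH %d", epoch)) == NULL)
            return 0;
        freeReplyObject(r);
        /* EXEC runs both queued commands with nothing in between. */
        if ((r = redisCommand(c, "EXEC")) == NULL) return 0;
        int ok = (r->type != REDIS_REPLY_ERROR);
        freeReplyObject(r);
        return ok;
    }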

tests/instances.tcl

@@ -68,7 +68,7 @@ proc spawn_instance {type base_port count {conf {}}} {
     }
     if {$::valgrind} {
-        set pid [exec valgrind --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
+        set pid [exec valgrind --track-origins=yes --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
     } else {
         set pid [exec ../../../src/${prgname} $cfgfile &]
     }
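--track-origins=yes makes valgrind report where an uninitialized value was allocated, not just where it was eventually used, at the cost of extra memory and run time. A toy program that produces the kind of report the flag improves; valgrind will point at the declaration of buf as the origin:

    #include <stdio.h>

    int main(void) {
        char buf[16];          /* never initialized */
        if (buf[7] == 'x')     /* conditional jump on an undefined byte */
            printf("x found\n");
        return 0;
    }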
@@ -105,6 +105,7 @@ proc cleanup {} {
 proc abort_sentinel_test msg {
     puts "WARNING: Aborting the test."
     puts ">>>>>>>> $msg"
+    if {$::pause_on_error} pause_on_error
     cleanup
     exit 1
 }
@@ -369,15 +370,31 @@ proc get_instance_id_by_port {type port} {
 # The instance can be restarted with restart-instance.
 proc kill_instance {type id} {
     set pid [get_instance_attrib $type $id pid]
+    set port [get_instance_attrib $type $id port]
     if {$pid == -1} {
         error "You tried to kill $type $id twice."
     }
     exec kill -9 $pid
     set_instance_attrib $type $id pid -1
     set_instance_attrib $type $id link you_tried_to_talk_with_killed_instance
     # Remove the PID from the list of pids to kill at exit.
     set ::pids [lsearch -all -inline -not -exact $::pids $pid]
+    # Wait for the port it was using to become available again, so that
+    # a new server can be started right away on the same port.
+    set retry 10
+    while {[incr retry -1]} {
+        set port_is_free [catch {set s [socket 127.0.0.1 $port]}]
+        if {$port_is_free} break
+        catch {close $s}
+        after 1000
+    }
+    if {$retry == 0} {
+        error "Port $port did not become available after killing the instance."
+    }
 }
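The retry loop probes the port by attempting a TCP connect: if the connect is refused, the old listener is gone and the port can be reused immediately. The same probe written in C, as a sketch mirroring the Tcl loop above (illustrative, not from the test suite):

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <unistd.h>

    /* Returns 1 if nothing accepts connections on 127.0.0.1:port. */
    static int port_is_free(int port) {
        struct sockaddr_in sa = {0};
        sa.sin_family = AF_INET;
        sa.sin_port = htons(port);
        inet_pton(AF_INET, "127.0.0.1", &sa.sin_addr);

        int fd = socket(AF_INET, SOCK_STREAM, 0);
        if (fd == -1) return 0;
        int busy = (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) == 0);
        close(fd);
        return !busy;  /* refused connect => no listener => free */
    }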
# Return true if the instance of the specified type/id is killed.
@@ -401,7 +418,7 @@ proc restart_instance {type id} {
     }
     if {$::valgrind} {
-        set pid [exec valgrind --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
+        set pid [exec valgrind --track-origins=yes --suppressions=../../../src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full ../../../src/${prgname} $cfgfile &]
     } else {
         set pid [exec ../../../src/${prgname} $cfgfile &]
     }

tests/support/server.tcl

@@ -207,7 +207,7 @@ proc start_server {options {code undefined}} {
     set stderr [format "%s/%s" [dict get $config "dir"] "stderr"]
     if {$::valgrind} {
-        set pid [exec valgrind --suppressions=src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full src/redis-server $config_file > $stdout 2> $stderr &]
+        set pid [exec valgrind --track-origins=yes --suppressions=src/valgrind.sup --show-reachable=no --show-possibly-lost=no --leak-check=full src/redis-server $config_file > $stdout 2> $stderr &]
     } else {
         set pid [exec src/redis-server $config_file > $stdout 2> $stderr &]
     }