Cluster: replica migration with delay.

We wait a fixed amount of time (currently 5 seconds), much greater than the usual Cluster node-to-node communication latency, before migrating. This way, when a failover occurs, before detecting the new master as a target for migration, we give its natural slaves (the slaves of the failed-over master) time to announce that they switched to the new master, preventing a useless migration operation.
2025-05-05 07:22:15 +00:00 · 2015-12-11 09:19:06 +01:00 · 2015-12-11 09:19:06 +01:00 · adc2fe6993
commit adc2fe6993
parent 41db54a557
2 changed files with 39 additions and 17 deletions
--- a/src/cluster.c
+++ b/src/cluster.c
@ -671,6 +671,7 @@ clusterNode *createClusterNode(char *nodename, int flags) {
    node->port = 0;
    node->fail_reports = listCreate();
    node->voted_time = 0;
    node->orphaned_time = 0;
    node->repl_offset_time = 0;
    node->repl_offset = 0;
    listSetFreeMethod(node->fail_reports,zfree);
@ -2910,30 +2911,44 @@ void clusterHandleSlaveMigration(int max_slaves) {
    /* Step 3: Idenitfy a candidate for migration, and check if among the
     * masters with the greatest number of ok slaves, I'm the one with the
-     * smaller node ID.
+     * smallest node ID (the "candidate slave").
     *
-     * Note that this means that eventually a replica migration will occurr
+     * Note: this means that eventually a replica migration will occurr
     * since slaves that are reachable again always have their FAIL flag
-     * cleared. At the same time this does not mean that there are no
+     * cleared, so eventually there must be a candidate. At the same time
-     * race conditions possible (two slaves migrating at the same time), but
+     * this does not mean that there are no race conditions possible (two
-     * this is extremely unlikely to happen, and harmless. */
+     * slaves migrating at the same time), but this is unlikely to
     * happen, and harmless when happens. */
    candidate = myself;
    di = dictGetSafeIterator(server.cluster->nodes);
    while((de = dictNext(di)) != NULL) {
        clusterNode *node = dictGetVal(de);
-        int okslaves;
+        int okslaves = 0, is_orphaned = 1;
-        /* Only iterate over working masters. */
+        /* We want to migrate only if this master is working, orphaned, and
-        if (nodeIsSlave(node) || nodeFailed(node)) continue;
+         * used to have slaves or if failed over a master that had slaves
-        /* We want to migrate only if this master used to have slaves or
+         * (MIGRATE_TO flag). This way we only migrate to instances that were
-         * if failed over a master that had slaves. This way we only migrate
+         * supposed to have replicas. */
-         * to instances that were supposed to have replicas. */
+        if (nodeIsSlave(node) || nodeFailed(node)) is_orphaned = 0;
-        if (!(node->flags & CLUSTER_NODE_MIGRATE_TO)) continue;
+        if (!(node->flags & CLUSTER_NODE_MIGRATE_TO)) is_orphaned = 0;
        okslaves = clusterCountNonFailingSlaves(node);
-        if (okslaves == 0 && target == NULL && node->numslots > 0)
+        /* Check number of working slaves. */
-            target = node;
+        if (nodeIsMaster(node)) okslaves = clusterCountNonFailingSlaves(node);
        if (okslaves > 0) is_orphaned = 0;
        if (is_orphaned) {
            if (!target && node->numslots > 0) target = node;
            /* Track the starting time of the orphaned condition for this
             * master. */
            if (!node->orphaned_time) node->orphaned_time = mstime();
        } else {
            node->orphaned_time = 0;
        }
        /* Check if I'm the slave candidate for the migration: attached
         * to a master with the maximum number of slaves and with the smallest
         * node ID. */
        if (okslaves == max_slaves) {
            for (j = 0; j < node->numslaves; j++) {
                if (memcmp(node->slaves[j]->name,
@ -2948,8 +2963,13 @@ void clusterHandleSlaveMigration(int max_slaves) {
    dictReleaseIterator(di);
    /* Step 4: perform the migration if there is a target, and if I'm the
-     * candidate. */
+     * candidate, but only if the master is continuously orphaned for a
-    if (target && candidate == myself) {
+     * couple of seconds, so that during failovers, we give some time to
     * the natural slaves of this instance to advertise their switch from
     * the old master to the new one. */
    if (target && candidate == myself &&
        (mstime()-target->orphaned_time) > CLUSTER_SLAVE_MIGRATION_DELAY)
    {
        serverLog(LL_WARNING,"Migrating to orphaned master %.40s",
            target->name);
        clusterSetMaster(target);
--- a/src/cluster.h
+++ b/src/cluster.h
@ -23,6 +23,7 @@
 #define CLUSTER_DEFAULT_MIGRATION_BARRIER 1
 #define CLUSTER_MF_TIMEOUT 5000 /* Milliseconds to do a manual failover. */
 #define CLUSTER_MF_PAUSE_MULT 2 /* Master pause manual failover mult. */
 #define CLUSTER_SLAVE_MIGRATION_DELAY 5000 /* Delay for slave migration. */
 /* Redirection errors returned by getNodeByQuery(). */
 #define CLUSTER_REDIR_NONE 0          /* Node can serve the request. */
@ -93,6 +94,7 @@ typedef struct clusterNode {
    mstime_t fail_time;      /* Unix time when FAIL flag was set */
    mstime_t voted_time;     /* Last time we voted for a slave of this master */
    mstime_t repl_offset_time;  /* Unix time we received offset for this node */
    mstime_t orphaned_time;     /* Starting time of orphaned master condition */
    long long repl_offset;      /* Last known repl offset for this node. */
    char ip[NET_IP_STR_LEN];  /* Latest known IP address of this node */
    int port;                   /* Latest known port of this node */