Merge pull request #4412 from soloestoy/bugfix-psync2

PSYNC2: safe free backlog when reach the time limit and others
2025-03-19 09:00:51 +00:00 · 2017-11-24 10:56:18 +01:00 · 2017-11-24 10:56:18 +01:00 · 9d86ae4597
commit 9d86ae4597
parent f739c27229 ea2e51c630
3 changed files with 50 additions and 12 deletions
--- a/src/rdb.c
+++ b/src/rdb.c
@ -2000,6 +2000,9 @@ void bgsaveCommand(client *c) {
        }
    }
    rdbSaveInfo rsi, *rsiptr;
    rsiptr = rdbPopulateSaveInfo(&rsi);
    if (server.rdb_child_pid != -1) {
        addReplyError(c,"Background save already in progress");
    } else if (server.aof_child_pid != -1) {
@ -2012,7 +2015,7 @@ void bgsaveCommand(client *c) {
                "Use BGSAVE SCHEDULE in order to schedule a BGSAVE whenever "
                "possible.");
        }
-    } else if (rdbSaveBackground(server.rdb_filename,NULL) == C_OK) {
+    } else if (rdbSaveBackground(server.rdb_filename,rsiptr) == C_OK) {
        addReplyStatus(c,"Background saving started");
    } else {
        addReply(c,shared.err);
@ -2033,22 +2036,44 @@ rdbSaveInfo *rdbPopulateSaveInfo(rdbSaveInfo *rsi) {
    *rsi = rsi_init;
    /* If the instance is a master, we can populate the replication info
-     * in all the cases, even if sometimes in incomplete (but safe) form. */
+     * only when repl_backlog is not NULL. If the repl_backlog is NULL,
-    if (!server.masterhost) {
+     * it means that the instance isn't in any replication chains. In this
-        if (server.repl_backlog) rsi->repl_stream_db = server.slaveseldb;
+     * scenario the replication info is useless, because when a slave
-        /* Note that if repl_backlog is NULL, it means that histories
+     * connect to us, the NULL repl_backlog will trigger a full synchronization,
-         * following from this point will trigger a full synchronization
+     * at the same time we will use a new replid and clear replid2.
-         * generating a SELECT statement, so we can leave the currently
+     * And remember that after free backlog if we reach repl_backlog_time_limit,
-         * selected DB set to -1. This allows a restarted master to reload
+     * we will use a new replid and clear replid2 too. So there is only one
-         * its replication ID/offset when there are no connected slaves. */
+     * scenario which can make repl_stream_db be -1, that is the instance is
     * a master, and it have repl_backlog, but server.slaveseldb is -1. */
    if (!server.masterhost && server.repl_backlog) {
        rsi->repl_stream_db = server.slaveseldb == -1 ? 0 : server.slaveseldb;
        /* Note that when server.slaveseldb is -1, it means that this master
         * didn't apply any write commands after a full synchronization.
         * So we can let repl_stream_db be 0, this allows a restarted slave
         * to reload replication ID/offset, it's safe because the next write
         * command must generate a SELECT statement. */
        return rsi;
    }
-    /* If the instance is a slave we need a connected master in order to
+    /* If the instance is a slave we need a connected master
-     * fetch the currently selected DB. */
+     * in order to fetch the currently selected DB. */
    if (server.master) {
        rsi->repl_stream_db = server.master->db->id;
        return rsi;
    }
    /* It is useful to persist cached_master's db id inside RDB file.
     * When a slave lost master's connection, server.master will be
     * cached as server.cached_master, after that a slave can not
     * increment the master_repl_offset because slave only apply data
     * from connected master, so the cached_master can hold right
     * replication info. But please note that this action is safe
     * only after we fix the free backlog problem, because when a master
     * turn to be a slave, it will use itself as the server.cached_master,
     * that is dangerous if we didn't use a new replication ID after
     * free backlog. */
    if (server.cached_master) {
        rsi->repl_stream_db = server.cached_master->db->id;
        return rsi;
    }
    return NULL;
 }
--- a/src/replication.c
+++ b/src/replication.c
@ -2613,6 +2613,18 @@ void replicationCron(void) {
        time_t idle = server.unixtime - server.repl_no_slaves_since;
        if (idle > server.repl_backlog_time_limit) {
            /* When we free the backlog, we always use a new
             * replication ID and clear the ID2. Since without
             * backlog we can not increment master_repl_offset
             * even do write commands, that may lead to inconsistency
             * when we try to connect a "slave-before" master
             * (if this master is our slave before, our replid
             * equals the master's replid2). As the master have our
             * history, so we can match the master's replid2 and
             * second_replid_offset, that make partial sync work,
             * but the data is inconsistent. */
            changeReplicationId();
            clearReplicationId2();
            freeReplicationBacklog();
            serverLog(LL_NOTICE,
                "Replication backlog freed after %d seconds "
--- a/src/server.c
+++ b/src/server.c
@ -3540,7 +3540,8 @@ void loadDataFromDisk(void) {
                rsi.repl_id_is_set &&
                rsi.repl_offset != -1 &&
                /* Note that older implementations may save a repl_stream_db
-                 * of -1 inside the RDB file. */
+                 * of -1 inside the RDB file in a wrong way, see more information
                 * in function rdbPopulateSaveInfo. */
                rsi.repl_stream_db != -1)
            {
                memcpy(server.replid,rsi.repl_id,sizeof(server.replid));