From 6ddf0ea29340891d9934892b6c239d160d1ee0c4 Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Wed, 1 Nov 2017 17:32:23 +0800 Subject: [PATCH 1/5] PSYNC2: safe free backlog when reach the time limit When we free the backlog, we should use a new replication ID and clear the ID2. Since without backlog we can not increment master_repl_offset even do write commands, that may lead to inconsistency when we try to connect a "slave-before" master (if this master is our slave before, our replid equals the master's replid2). As the master have our history, so we can match the master's replid2 and second_replid_offset, that make partial sync work, but the data is inconsistent. --- src/replication.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/replication.c b/src/replication.c index e0b3d910..fe7b0f73 100644 --- a/src/replication.c +++ b/src/replication.c @@ -2613,6 +2613,18 @@ void replicationCron(void) { time_t idle = server.unixtime - server.repl_no_slaves_since; if (idle > server.repl_backlog_time_limit) { + /* When we free the backlog, we always use a new + * replication ID and clear the ID2. Since without + * backlog we can not increment master_repl_offset + * even do write commands, that may lead to inconsistency + * when we try to connect a "slave-before" master + * (if this master is our slave before, our replid + * equals the master's replid2). As the master have our + * history, so we can match the master's replid2 and + * second_replid_offset, that make partial sync work, + * but the data is inconsistent. */ + changeReplicationId(); + clearReplicationId2(); freeReplicationBacklog(); serverLog(LL_NOTICE, "Replication backlog freed after %d seconds " From 885c4f856e5a41e86ebf2a3233dc3ba2bae6945e Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Wed, 1 Nov 2017 17:52:43 +0800 Subject: [PATCH 2/5] PSYNC2 & RDB: fix the missing rdbSaveInfo for BGSAVE --- src/rdb.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/rdb.c b/src/rdb.c index 125df607..70b13fb9 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1999,6 +1999,9 @@ void bgsaveCommand(client *c) { } } + rdbSaveInfo rsi, *rsiptr; + rsiptr = rdbPopulateSaveInfo(&rsi); + if (server.rdb_child_pid != -1) { addReplyError(c,"Background save already in progress"); } else if (server.aof_child_pid != -1) { @@ -2011,7 +2014,7 @@ void bgsaveCommand(client *c) { "Use BGSAVE SCHEDULE in order to schedule a BGSAVE whenever " "possible."); } - } else if (rdbSaveBackground(server.rdb_filename,NULL) == C_OK) { + } else if (rdbSaveBackground(server.rdb_filename,rsiptr) == C_OK) { addReplyStatus(c,"Background saving started"); } else { addReply(c,shared.err); From b8579c225cd0ea83222938bb188cd74501fb1627 Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Thu, 2 Nov 2017 10:45:33 +0800 Subject: [PATCH 3/5] PSYNC2: clarify the scenario when repl_stream_db can be -1 --- src/rdb.c | 27 +++++++++++++++++++-------- src/server.c | 3 ++- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/rdb.c b/src/rdb.c index 70b13fb9..8de3cd96 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -2035,14 +2035,25 @@ rdbSaveInfo *rdbPopulateSaveInfo(rdbSaveInfo *rsi) { *rsi = rsi_init; /* If the instance is a master, we can populate the replication info - * in all the cases, even if sometimes in incomplete (but safe) form. */ - if (!server.masterhost) { - if (server.repl_backlog) rsi->repl_stream_db = server.slaveseldb; - /* Note that if repl_backlog is NULL, it means that histories - * following from this point will trigger a full synchronization - * generating a SELECT statement, so we can leave the currently - * selected DB set to -1. This allows a restarted master to reload - * its replication ID/offset when there are no connected slaves. */ + * only when repl_backlog is not NULL. If the repl_backlog is NULL, + * it means that the instance isn't in any replication chains. In this + * scenario the replication info is useless, because when a slave + * connect to us, the NULL repl_backlog will trigger a full synchronization, + * at the same time we will use a new replid and clear replid2. + * And remember that after free backlog if we reach repl_backlog_time_limit, + * we will use a new replid and clear replid2 too. So there is only one + * scenario which can make repl_stream_db be -1, that is the instance is + * a master, and it have repl_backlog, but server.slaveseldb is -1. */ + if (!server.masterhost && server.repl_backlog) { + rsi->repl_stream_db = server.slaveseldb; + /* Note that server.slaveseldb may be -1, it means that this master + * didn't apply any write commands after a full synchronization, + * so we can leave the currently selected DB set to -1, because the + * next write command must generate a SELECT statement. This allows + * a restarted slave to reload replication ID/offset even the repl_stream_db + * is -1, but we should not do that, because older implementations + * may save a repl_stream_db as -1 in a wrong way. Maybe we can fix + * it in the next release version. */ return rsi; } diff --git a/src/server.c b/src/server.c index 61291fde..d617d3ec 100644 --- a/src/server.c +++ b/src/server.c @@ -3536,7 +3536,8 @@ void loadDataFromDisk(void) { rsi.repl_id_is_set && rsi.repl_offset != -1 && /* Note that older implementations may save a repl_stream_db - * of -1 inside the RDB file. */ + * of -1 inside the RDB file in a wrong way, see more information + * in function rdbPopulateSaveInfo. */ rsi.repl_stream_db != -1) { memcpy(server.replid,rsi.repl_id,sizeof(server.replid)); From 93037f764209e86f3fcb240a642334fd67552935 Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Wed, 22 Nov 2017 12:05:30 +0800 Subject: [PATCH 4/5] PSYNC2: make repl_stream_db never be -1 it means that after this change all the replication info in RDB is valid, and it can distinguish us from the older version. --- src/rdb.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/rdb.c b/src/rdb.c index 8de3cd96..386b6a78 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -2045,15 +2045,12 @@ rdbSaveInfo *rdbPopulateSaveInfo(rdbSaveInfo *rsi) { * scenario which can make repl_stream_db be -1, that is the instance is * a master, and it have repl_backlog, but server.slaveseldb is -1. */ if (!server.masterhost && server.repl_backlog) { - rsi->repl_stream_db = server.slaveseldb; - /* Note that server.slaveseldb may be -1, it means that this master - * didn't apply any write commands after a full synchronization, - * so we can leave the currently selected DB set to -1, because the - * next write command must generate a SELECT statement. This allows - * a restarted slave to reload replication ID/offset even the repl_stream_db - * is -1, but we should not do that, because older implementations - * may save a repl_stream_db as -1 in a wrong way. Maybe we can fix - * it in the next release version. */ + rsi->repl_stream_db = server.slaveseldb == -1 ? 0 : server.slaveseldb; + /* Note that when server.slaveseldb is -1, it means that this master + * didn't apply any write commands after a full synchronization. + * So we can let repl_stream_db be 0, this allows a restarted slave + * to reload replication ID/offset, it's safe because the next write + * command must generate a SELECT statement. */ return rsi; } From ea2e51c630f972b23d3557f6369b13d983c12f17 Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Sat, 4 Nov 2017 23:05:00 +0800 Subject: [PATCH 5/5] PSYNC2: persist cached_master's dbid inside the RDB --- src/rdb.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/rdb.c b/src/rdb.c index 386b6a78..a7334449 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -2054,11 +2054,25 @@ rdbSaveInfo *rdbPopulateSaveInfo(rdbSaveInfo *rsi) { return rsi; } - /* If the instance is a slave we need a connected master in order to - * fetch the currently selected DB. */ + /* If the instance is a slave we need a connected master + * in order to fetch the currently selected DB. */ if (server.master) { rsi->repl_stream_db = server.master->db->id; return rsi; } + /* It is useful to persist cached_master's db id inside RDB file. + * When a slave lost master's connection, server.master will be + * cached as server.cached_master, after that a slave can not + * increment the master_repl_offset because slave only apply data + * from connected master, so the cached_master can hold right + * replication info. But please note that this action is safe + * only after we fix the free backlog problem, because when a master + * turn to be a slave, it will use itself as the server.cached_master, + * that is dangerous if we didn't use a new replication ID after + * free backlog. */ + if (server.cached_master) { + rsi->repl_stream_db = server.cached_master->db->id; + return rsi; + } return NULL; }