PSYNC2: bugfixing pre release.

1. Master replication offset was cleared after switching configuration
to some other slave, since it was assumed you can't PSYNC after a
switch. This is not the case anymore, and when we successfully PSYNC we need to
have our offset untouched.

2. Secondary replication ID was not reset to "000..." pattern at
startup.

3. Master in error state replying -LOADING or other transient errors
forced the slave to discard the cached master and perform a full resync. This is
now fixed.

4. Better logging of what's happening on failed PSYNCs.
This commit is contained in:
antirez 2016-11-23 16:12:20 +01:00
parent 73dd51c044
commit 5b7d42fff3
2 changed files with 39 additions and 11 deletions

View File

@ -484,10 +484,18 @@ int masterTryPartialResynchronization(client *c) {
{ {
/* Run id "?" is used by slaves that want to force a full resync. */ /* Run id "?" is used by slaves that want to force a full resync. */
if (master_replid[0] != '?') { if (master_replid[0] != '?') {
if (strcasecmp(master_replid, server.replid) &&
strcasecmp(master_replid, server.replid2))
{
serverLog(LL_NOTICE,"Partial resynchronization not accepted: " serverLog(LL_NOTICE,"Partial resynchronization not accepted: "
"Replication ID mismatch (Slave asked for '%s', my replication " "Replication ID mismatch (Slave asked for '%s', my "
"ID is '%s')", "replication IDs are '%s' and '%s')",
master_replid, server.replid); master_replid, server.replid, server.replid2);
} else {
serverLog(LL_NOTICE,"Partial resynchronization not accepted: "
"Requested offset for second ID was %lld, but I can reply "
"up to %lld", psync_offset, server.second_replid_offset);
}
} else { } else {
serverLog(LL_NOTICE,"Full resync requested by slave %s", serverLog(LL_NOTICE,"Full resync requested by slave %s",
replicationGetSlaveName(c)); replicationGetSlaveName(c));
@ -638,7 +646,7 @@ void syncCommand(client *c) {
/* Refuse SYNC requests if we are a slave but the link with our master /* Refuse SYNC requests if we are a slave but the link with our master
* is not ok... */ * is not ok... */
if (server.masterhost && server.repl_state != REPL_STATE_CONNECTED) { if (server.masterhost && server.repl_state != REPL_STATE_CONNECTED) {
addReplyError(c,"Can't SYNC while not connected with my master"); addReplySds(c,sdsnew("-NOMASTERLINK Can't SYNC while not connected with my master\r\n"));
return; return;
} }
@ -1393,8 +1401,9 @@ char *sendSynchronousCommand(int flags, int fd, ...) {
* offset is saved. * offset is saved.
* PSYNC_NOT_SUPPORTED: If the server does not understand PSYNC at all and * PSYNC_NOT_SUPPORTED: If the server does not understand PSYNC at all and
* the caller should fall back to SYNC. * the caller should fall back to SYNC.
* PSYNC_WRITE_ERR: There was an error writing the command to the socket. * PSYNC_WRITE_ERROR: There was an error writing the command to the socket.
* PSYNC_WAIT_REPLY: Call again the function with read_reply set to 1. * PSYNC_WAIT_REPLY: Call again the function with read_reply set to 1.
* PSYNC_TRY_LATER: Master is currently in a transient error condition.
* *
* Notable side effects: * Notable side effects:
* *
@ -1410,6 +1419,7 @@ char *sendSynchronousCommand(int flags, int fd, ...) {
#define PSYNC_CONTINUE 2 #define PSYNC_CONTINUE 2
#define PSYNC_FULLRESYNC 3 #define PSYNC_FULLRESYNC 3
#define PSYNC_NOT_SUPPORTED 4 #define PSYNC_NOT_SUPPORTED 4
#define PSYNC_TRY_LATER 5
int slaveTryPartialResynchronization(int fd, int read_reply) { int slaveTryPartialResynchronization(int fd, int read_reply) {
char *psync_replid; char *psync_replid;
char psync_offset[32]; char psync_offset[32];
@ -1529,9 +1539,21 @@ int slaveTryPartialResynchronization(int fd, int read_reply) {
return PSYNC_CONTINUE; return PSYNC_CONTINUE;
} }
/* If we reach this point we received either an error since the master does /* If we reach this point we received either an error (since the master does
* not understand PSYNC, or an unexpected reply from the master. * not understand PSYNC or because it is in a special state and cannot
* Return PSYNC_NOT_SUPPORTED to the caller in both cases. */ * serve our request), or an unexpected reply from the master.
*
* Return PSYNC_NOT_SUPPORTED on errors we don't understand, otherwise
* return PSYNC_TRY_LATER if we believe this is a transient error. */
if (!strncmp(reply,"-NOMASTERLINK",13) ||
!strncmp(reply,"-LOADING",8))
{
serverLog(LL_NOTICE,
"Master is currently unable to PSYNC "
"but should be in the future: %s", reply);
return PSYNC_TRY_LATER;
}
if (strncmp(reply,"-ERR",4)) { if (strncmp(reply,"-ERR",4)) {
/* If it's not an error, log the unexpected event. */ /* If it's not an error, log the unexpected event. */
@ -1748,6 +1770,12 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
psync_result = slaveTryPartialResynchronization(fd,1); psync_result = slaveTryPartialResynchronization(fd,1);
if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */ if (psync_result == PSYNC_WAIT_REPLY) return; /* Try again later... */
/* If the master is in a transient error, we should try to PSYNC
* from scratch later, so go to the error path. This happens when
* the server is loading the dataset or is not connected with its
* master and so forth. */
if (psync_result == PSYNC_TRY_LATER) goto write_error;
/* Note: if PSYNC does not return WAIT_REPLY, it will take care of /* Note: if PSYNC does not return WAIT_REPLY, it will take care of
* uninstalling the read handler from the file descriptor. */ * uninstalling the read handler from the file descriptor. */
@ -1757,7 +1785,7 @@ void syncWithMaster(aeEventLoop *el, int fd, void *privdata, int mask) {
} }
/* PSYNC failed or is not supported: we want our slaves to resync with us /* PSYNC failed or is not supported: we want our slaves to resync with us
* as well, if we have any sub-slaves. The mater may transfer us an * as well, if we have any sub-slaves. The master may transfer us an
* entirely different data set and we have no way to incrementally feed * entirely different data set and we have no way to incrementally feed
* our slaves after that. */ * our slaves after that. */
disconnectSlaves(); /* Force our slaves to resync with us as well. */ disconnectSlaves(); /* Force our slaves to resync with us as well. */
@ -1911,7 +1939,6 @@ void replicationSetMaster(char *ip, int port) {
* our own parameters, to later PSYNC with the new master. */ * our own parameters, to later PSYNC with the new master. */
if (was_master) replicationCacheMasterUsingMyself(); if (was_master) replicationCacheMasterUsingMyself();
server.repl_state = REPL_STATE_CONNECT; server.repl_state = REPL_STATE_CONNECT;
server.master_repl_offset = 0;
server.repl_down_since = 0; server.repl_down_since = 0;
} }

View File

@ -1311,6 +1311,7 @@ void initServerConfig(void) {
getRandomHexChars(server.runid,CONFIG_RUN_ID_SIZE); getRandomHexChars(server.runid,CONFIG_RUN_ID_SIZE);
server.runid[CONFIG_RUN_ID_SIZE] = '\0'; server.runid[CONFIG_RUN_ID_SIZE] = '\0';
changeReplicationId(); changeReplicationId();
clearReplicationId2();
server.configfile = NULL; server.configfile = NULL;
server.executable = NULL; server.executable = NULL;
server.hz = CONFIG_DEFAULT_HZ; server.hz = CONFIG_DEFAULT_HZ;