From 3fd43062c8127857f98c09a06bf70710b2dc2f68 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 29 Jan 2015 14:17:45 +0100 Subject: [PATCH] Cluster: use a number of gossip sections proportional to cluster size. Otherwise it is impossible to receive the majority of failure reports in the node_timeout*2 window in larger clusters. Still with a 200 nodes cluster, 20 gossip sections are a very reasonable amount of bytes to send. A side effect of this change is also fater cluster nodes joins for large clusters, because the cluster layout makes less time to propagate. --- src/cluster.c | 59 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/src/cluster.c b/src/cluster.c index 2cbb2190..aaba7aa7 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -2037,7 +2037,8 @@ void clusterBroadcastMessage(void *buf, size_t len) { dictReleaseIterator(di); } -/* Build the message header */ +/* Build the message header. hdr must point to a buffer at least + * sizeof(clusterMsg) in bytes. */ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { int totlen = 0; uint64_t offset; @@ -2098,40 +2099,60 @@ void clusterBuildMessageHdr(clusterMsg *hdr, int type) { /* Send a PING or PONG packet to the specified node, making sure to add enough * gossip informations. */ void clusterSendPing(clusterLink *link, int type) { - unsigned char buf[sizeof(clusterMsg)+sizeof(clusterMsgDataGossip)*3]; - clusterMsg *hdr = (clusterMsg*) buf; - int gossipcount = 0, totlen; - /* freshnodes is the number of nodes we can still use to populate the - * gossip section of the ping packet. Basically we start with the nodes - * we have in memory minus two (ourself and the node we are sending the - * message to). Every time we add a node we decrement the counter, so when - * it will drop to <= zero we know there is no more gossip info we can - * send. */ + unsigned char *buf; + clusterMsg *hdr; + int gossipcount = 0; /* Number of gossip sections added so far. */ + int wanted; /* Number of gossip sections we want to append if possible. */ + int totlen; /* Total packet length. */ + /* freshnodes is the max number of nodes we can hope to append at all: + * nodes available minus two (ourself and the node we are sending the + * message to). However practically there may be less valid nodes since + * nodes in handshake state, disconnected, are not considered. */ int freshnodes = dictSize(server.cluster->nodes)-2; + /* How many gossip sections we want to add? 1/10 of the available nodes + * and anyway at least 3. */ + wanted = freshnodes/10; + if (wanted < 3) wanted = 3; + + /* Compute the maxium totlen to allocate our buffer. We'll fix the totlen + * later according to the number of gossip sections we really were able + * to put inside the packet. */ + totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); + totlen += (sizeof(clusterMsgDataGossip)*wanted); + /* Note: clusterBuildMessageHdr() expects the buffer to be always at least + * sizeof(clusterMsg) or more. */ + if (totlen < (int)sizeof(clusterMsg)) totlen = sizeof(clusterMsg); + buf = zcalloc(totlen); + hdr = (clusterMsg*) buf; + + /* Populate the header. */ if (link->node && type == CLUSTERMSG_TYPE_PING) link->node->ping_sent = mstime(); clusterBuildMessageHdr(hdr,type); /* Populate the gossip fields */ - while(freshnodes > 0 && gossipcount < 3) { + int maxiterations = wanted+10; + while(freshnodes > 0 && gossipcount < wanted && maxiterations--) { dictEntry *de = dictGetRandomKey(server.cluster->nodes); clusterNode *this = dictGetVal(de); clusterMsgDataGossip *gossip; int j; + /* Don't include this node: the whole packet header is about us + * already, so we just gossip about other nodes. */ + if (this == myself) continue; + /* In the gossip section don't include: - * 1) Myself. - * 2) Nodes in HANDSHAKE state. + * 1) Nodes in HANDSHAKE state. * 3) Nodes with the NOADDR flag set. * 4) Disconnected nodes if they don't have configured slots. */ - if (this == myself || - this->flags & (REDIS_NODE_HANDSHAKE|REDIS_NODE_NOADDR) || + if (this->flags & (REDIS_NODE_HANDSHAKE|REDIS_NODE_NOADDR) || (this->link == NULL && this->numslots == 0)) { - freshnodes--; /* otherwise we may loop forever. */ - continue; + freshnodes--; /* Tecnically not correct, but saves CPU. */ + continue; } /* Check if we already added this node */ @@ -2154,11 +2175,15 @@ void clusterSendPing(clusterLink *link, int type) { gossip->notused2 = 0; gossipcount++; } + + /* Ready to send... fix the totlen fiend and queue the message in the + * output buffer. */ totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData); totlen += (sizeof(clusterMsgDataGossip)*gossipcount); hdr->count = htons(gossipcount); hdr->totlen = htonl(totlen); clusterSendMessage(link,buf,totlen); + zfree(buf); } /* Send a PONG packet to every connected node that's not in handshake state