2012-11-08 18:25:23 +01:00
/* Redis Cluster implementation.
* Copyright ( c ) 2009 - 2012 , Salvatore Sanfilippo < antirez at gmail dot com >
* All rights reserved .
* Redistribution and use in source and binary forms , with or without
* modification , are permitted provided that the following conditions are met :
* * Redistributions of source code must retain the above copyright notice ,
* this list of conditions and the following disclaimer .
* * Redistributions in binary form must reproduce the above copyright
* notice , this list of conditions and the following disclaimer in the
* documentation and / or other materials provided with the distribution .
* * Neither the name of Redis nor the names of its contributors may be used
* to endorse or promote products derived from this software without
* specific prior written permission .
2011-03-29 17:51:15 +02:00
# include "redis.h"
2013-10-09 15:37:20 +02:00
# include "cluster.h"
2012-04-02 13:10:39 +02:00
# include "endianconv.h"
2011-03-29 17:51:15 +02:00
2011-09-20 00:00:14 +01:00
# include <sys/types.h>
# include <sys/socket.h>
2011-03-29 17:51:15 +02:00
# include <arpa/inet.h>
2011-03-30 14:58:19 +02:00
# include <fcntl.h>
# include <unistd.h>
2012-04-24 11:11:55 +02:00
# include <sys/socket.h>
2014-01-15 10:31:12 +01:00
# include <sys/stat.h>
2011-03-29 17:51:15 +02:00
2014-01-29 11:22:22 +01:00
/* A global reference to myself is handy to make code more clear.
* Myself always points to server . cluster - > myself , that is , the clusterNode
* that represents this node . */
clusterNode * myself = NULL ;
2013-10-09 15:37:20 +02:00
clusterNode * createClusterNode ( char * nodename , int flags ) ;
int clusterAddNode ( clusterNode * node ) ;
2011-03-29 17:51:15 +02:00
void clusterAcceptHandler ( aeEventLoop * el , int fd , void * privdata , int mask ) ;
void clusterReadHandler ( aeEventLoop * el , int fd , void * privdata , int mask ) ;
void clusterSendPing ( clusterLink * link , int type ) ;
void clusterSendFail ( char * nodename ) ;
2013-09-20 09:22:21 +02:00
void clusterSendFailoverAuthIfNeeded ( clusterNode * node , clusterMsg * request ) ;
2011-03-29 17:51:15 +02:00
void clusterUpdateState ( void ) ;
int clusterNodeGetSlotBit ( clusterNode * n , int slot ) ;
2013-09-04 10:25:26 +02:00
sds clusterGenNodesDescription ( int filter ) ;
2011-04-07 17:46:28 +02:00
clusterNode * clusterLookupNode ( char * name ) ;
int clusterNodeAddSlave ( clusterNode * master , clusterNode * slave ) ;
2011-04-07 21:34:41 +02:00
int clusterAddSlot ( clusterNode * n , int slot ) ;
2013-02-21 11:44:58 +01:00
int clusterDelSlot ( int slot ) ;
2013-03-15 16:35:16 +01:00
int clusterDelNodeSlots ( clusterNode * node ) ;
2013-02-22 17:45:49 +01:00
int clusterNodeSetSlotBit ( clusterNode * n , int slot ) ;
2013-03-20 00:30:47 +01:00
void clusterSetMaster ( clusterNode * n ) ;
2013-09-26 16:54:43 +02:00
void clusterHandleSlaveFailover ( void ) ;
2014-01-30 18:05:11 +01:00
void clusterHandleSlaveMigration ( int max_slaves ) ;
2013-02-28 15:41:54 +01:00
int bitmapTestBit ( unsigned char * bitmap , int pos ) ;
2013-10-03 09:55:20 +02:00
void clusterDoBeforeSleep ( int flags ) ;
2013-11-08 16:26:50 +01:00
void clusterSendUpdate ( clusterLink * link , clusterNode * node ) ;
2014-02-05 13:01:24 +01:00
void resetManualFailover ( void ) ;
2014-03-11 11:16:18 +01:00
void clusterCloseAllSlots ( void ) ;
2011-03-29 17:51:15 +02:00
/* -----------------------------------------------------------------------------
* Initialization
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
2014-02-11 10:00:11 +01:00
/* Return the greatest configEpoch found in the cluster. */
uint64_t clusterGetMaxEpoch ( void ) {
uint64_t max = 0 ;
2013-09-25 11:47:13 +02:00
dictIterator * di ;
dictEntry * de ;
di = dictGetSafeIterator ( server . cluster - > nodes ) ;
while ( ( de = dictNext ( di ) ) ! = NULL ) {
clusterNode * node = dictGetVal ( de ) ;
2014-02-11 10:00:11 +01:00
if ( node - > configEpoch > max ) max = node - > configEpoch ;
2013-09-25 11:47:13 +02:00
dictReleaseIterator ( di ) ;
2014-02-11 10:00:11 +01:00
return max ;
2013-09-25 11:47:13 +02:00
2011-03-29 17:51:15 +02:00
int clusterLoadConfig ( char * filename ) {
FILE * fp = fopen ( filename , " r " ) ;
2011-04-07 12:55:02 +02:00
char * line ;
2011-04-07 17:46:28 +02:00
int maxline , j ;
2011-03-30 14:58:19 +02:00
2011-03-29 17:51:15 +02:00
if ( fp = = NULL ) return REDIS_ERR ;
2011-04-07 12:55:02 +02:00
/* Parse the file. Note that single liens of the cluster config file can
* be really long as they include all the hash slots of the node .
2013-03-04 19:45:36 +01:00
* This means in the worst possible case , half of the Redis slots will be
* present in a single line , possibly in importing or migrating state , so
* together with the node ID of the sender / receiver .
* To simplify we allocate 1024 + REDIS_CLUSTER_SLOTS * 128 bytes per line . */
maxline = 1024 + REDIS_CLUSTER_SLOTS * 128 ;
2011-04-07 12:55:02 +02:00
line = zmalloc ( maxline ) ;
while ( fgets ( line , maxline , fp ) ! = NULL ) {
int argc ;
2014-01-15 11:23:34 +01:00
sds * argv ;
2011-04-07 17:46:28 +02:00
clusterNode * n , * master ;
char * p , * s ;
2014-01-15 11:23:34 +01:00
/* Skip blank lines, they can be created either by users manually
* editing nodes . conf or by the config writing process if stopped
* before the truncate ( ) call . */
if ( line [ 0 ] = = ' \n ' ) continue ;
/* Split the line into arguments for processing. */
argv = sdssplitargs ( line , & argc ) ;
if ( argv = = NULL ) goto fmterr ;
2011-04-07 17:46:28 +02:00
/* Create this node if it does not exist */
n = clusterLookupNode ( argv [ 0 ] ) ;
if ( ! n ) {
n = createClusterNode ( argv [ 0 ] , 0 ) ;
clusterAddNode ( n ) ;
/* Address and port */
if ( ( p = strchr ( argv [ 1 ] , ' : ' ) ) = = NULL ) goto fmterr ;
* p = ' \0 ' ;
memcpy ( n - > ip , argv [ 1 ] , strlen ( argv [ 1 ] ) + 1 ) ;
n - > port = atoi ( p + 1 ) ;
/* Parse flags */
p = s = argv [ 2 ] ;
while ( p ) {
p = strchr ( s , ' , ' ) ;
if ( p ) * p = ' \0 ' ;
if ( ! strcasecmp ( s , " myself " ) ) {
2013-02-14 13:20:56 +01:00
redisAssert ( server . cluster - > myself = = NULL ) ;
2014-01-29 11:22:22 +01:00
myself = server . cluster - > myself = n ;
2011-04-07 17:46:28 +02:00
n - > flags | = REDIS_NODE_MYSELF ;
} else if ( ! strcasecmp ( s , " master " ) ) {
n - > flags | = REDIS_NODE_MASTER ;
} else if ( ! strcasecmp ( s , " slave " ) ) {
n - > flags | = REDIS_NODE_SLAVE ;
} else if ( ! strcasecmp ( s , " fail? " ) ) {
n - > flags | = REDIS_NODE_PFAIL ;
} else if ( ! strcasecmp ( s , " fail " ) ) {
n - > flags | = REDIS_NODE_FAIL ;
2013-10-09 16:18:33 +02:00
n - > fail_time = mstime ( ) ;
2011-04-07 17:46:28 +02:00
} else if ( ! strcasecmp ( s , " handshake " ) ) {
n - > flags | = REDIS_NODE_HANDSHAKE ;
} else if ( ! strcasecmp ( s , " noaddr " ) ) {
n - > flags | = REDIS_NODE_NOADDR ;
2011-04-07 19:04:16 +02:00
} else if ( ! strcasecmp ( s , " noflags " ) ) {
/* nothing to do */
2011-04-07 17:46:28 +02:00
} else {
redisPanic ( " Unknown flag in redis cluster config file " ) ;
if ( p ) s = p + 1 ;
/* Get master if any. Set the master and populate master's
* slave list . */
if ( argv [ 3 ] [ 0 ] ! = ' - ' ) {
master = clusterLookupNode ( argv [ 3 ] ) ;
if ( ! master ) {
master = createClusterNode ( argv [ 3 ] , 0 ) ;
clusterAddNode ( master ) ;
n - > slaveof = master ;
clusterNodeAddSlave ( master , n ) ;
2011-04-07 23:06:01 +02:00
/* Set ping sent / pong received timestamps */
2013-10-09 16:18:33 +02:00
if ( atoi ( argv [ 4 ] ) ) n - > ping_sent = mstime ( ) ;
if ( atoi ( argv [ 5 ] ) ) n - > pong_received = mstime ( ) ;
2011-04-07 23:06:01 +02:00
2013-09-25 11:47:13 +02:00
/* Set configEpoch for this node. */
n - > configEpoch = strtoull ( argv [ 6 ] , NULL , 10 ) ;
2011-04-07 17:46:28 +02:00
/* Populate hash slots served by this instance. */
2013-09-25 11:47:13 +02:00
for ( j = 8 ; j < argc ; j + + ) {
2011-04-07 17:46:28 +02:00
int start , stop ;
2011-05-04 09:31:37 +02:00
if ( argv [ j ] [ 0 ] = = ' [ ' ) {
/* Here we handle migrating / importing slots */
int slot ;
char direction ;
clusterNode * cn ;
p = strchr ( argv [ j ] , ' - ' ) ;
redisAssert ( p ! = NULL ) ;
* p = ' \0 ' ;
direction = p [ 1 ] ; /* Either '>' or '<' */
slot = atoi ( argv [ j ] + 1 ) ;
p + = 3 ;
cn = clusterLookupNode ( p ) ;
if ( ! cn ) {
cn = createClusterNode ( p , 0 ) ;
clusterAddNode ( cn ) ;
if ( direction = = ' > ' ) {
2013-02-14 13:20:56 +01:00
server . cluster - > migrating_slots_to [ slot ] = cn ;
2011-05-04 09:31:37 +02:00
} else {
2013-02-14 13:20:56 +01:00
server . cluster - > importing_slots_from [ slot ] = cn ;
2011-05-04 09:31:37 +02:00
continue ;
} else if ( ( p = strchr ( argv [ j ] , ' - ' ) ) ! = NULL ) {
2011-04-07 17:46:28 +02:00
* p = ' \0 ' ;
start = atoi ( argv [ j ] ) ;
stop = atoi ( p + 1 ) ;
} else {
start = stop = atoi ( argv [ j ] ) ;
while ( start < = stop ) clusterAddSlot ( n , start + + ) ;
2011-04-07 12:55:02 +02:00
2013-03-06 12:36:07 +01:00
sdsfreesplitres ( argv , argc ) ;
2011-04-07 12:55:02 +02:00
zfree ( line ) ;
2011-03-29 17:51:15 +02:00
fclose ( fp ) ;
2011-04-07 12:55:02 +02:00
/* Config sanity check */
2013-02-14 13:20:56 +01:00
redisAssert ( server . cluster - > myself ! = NULL ) ;
2014-01-29 11:38:14 +01:00
redisLog ( REDIS_NOTICE , " Node configuration loaded, I'm %.40s " , myself - > name ) ;
2014-02-11 10:00:11 +01:00
/* Set the currentEpoch to the max epoch found in the master.
* FIXME : this should actually be part of the persistent state , as
* documented in the Github issue # 1479. */
server . cluster - > currentEpoch = clusterGetMaxEpoch ( ) ;
2011-03-29 17:51:15 +02:00
return REDIS_OK ;
fmterr :
2013-01-17 01:00:20 +08:00
redisLog ( REDIS_WARNING , " Unrecoverable error: corrupted cluster config file. " ) ;
2011-03-29 17:51:15 +02:00
fclose ( fp ) ;
exit ( 1 ) ;
2011-03-30 14:58:19 +02:00
/* Cluster node configuration is exactly the same as CLUSTER NODES output.
* This function writes the node config and returns 0 , on error - 1
2014-01-15 10:31:12 +01:00
* is returned .
* Note : we need to write the file in an atomic way from the point of view
* of the POSIX filesystem semantics , so that if the server is stopped
* or crashes during the write , we ' ll end with either the old file or the
* new one . Since we have the full payload to write available we can use
* a single write to write the whole file . If the pre - existing file was
* bigger we pad our payload with newlines that are anyway ignored and truncate
* the file afterward . */
2013-10-03 09:55:20 +02:00
int clusterSaveConfig ( int do_fsync ) {
2013-09-04 10:25:26 +02:00
sds ci = clusterGenNodesDescription ( REDIS_NODE_HANDSHAKE ) ;
2014-01-15 10:31:12 +01:00
size_t content_size = sdslen ( ci ) ;
struct stat sb ;
2011-03-30 14:58:19 +02:00
int fd ;
2014-01-15 10:31:12 +01:00
if ( ( fd = open ( server . cluster_configfile , O_WRONLY | O_CREAT , 0644 ) )
2011-04-07 12:55:02 +02:00
= = - 1 ) goto err ;
2014-01-15 10:31:12 +01:00
/* Pad the new payload if the existing file length is greater. */
if ( fstat ( fd , & sb ) ! = - 1 ) {
if ( sb . st_size > content_size ) {
ci = sdsgrowzero ( ci , sb . st_size ) ;
memset ( ci + content_size , ' \n ' , sb . st_size - content_size ) ;
2011-03-30 14:58:19 +02:00
if ( write ( fd , ci , sdslen ( ci ) ) ! = ( ssize_t ) sdslen ( ci ) ) goto err ;
2013-10-03 09:55:20 +02:00
if ( do_fsync ) fsync ( fd ) ;
2014-01-15 10:31:12 +01:00
/* Truncate the file if needed to remove the final \n padding that
* is just garbage . */
if ( content_size ! = sdslen ( ci ) & & ftruncate ( fd , content_size ) = = - 1 ) {
/* ftruncate() failing is not a critical error. */
2011-03-30 14:58:19 +02:00
close ( fd ) ;
sdsfree ( ci ) ;
return 0 ;
err :
2014-01-15 10:31:12 +01:00
if ( fd ! = - 1 ) close ( fd ) ;
2011-03-30 14:58:19 +02:00
sdsfree ( ci ) ;
return - 1 ;
2013-10-03 09:55:20 +02:00
void clusterSaveConfigOrDie ( int do_fsync ) {
if ( clusterSaveConfig ( do_fsync ) = = - 1 ) {
2011-03-30 17:41:13 +02:00
redisLog ( REDIS_WARNING , " Fatal: can't update cluster config file. " ) ;
exit ( 1 ) ;
2011-03-29 17:51:15 +02:00
/* Allocate and initialize server.cluster, load the cluster config file (or
 * set up a brand new 'myself' node when it cannot be loaded), open the
 * cluster bus listening sockets and register the accept handler on them. */
void clusterInit ( void ) {
2013-08-22 14:05:07 +02:00
int saveconf = 0 ;
2011-03-30 16:51:28 +02:00
2013-02-14 13:20:56 +01:00
/* Allocate the cluster state and reset every field to its initial value. */
server . cluster = zmalloc ( sizeof ( clusterState ) ) ;
server . cluster - > myself = NULL ;
2013-09-25 11:47:13 +02:00
server . cluster - > currentEpoch = 0 ;
2013-02-14 13:20:56 +01:00
server . cluster - > state = REDIS_CLUSTER_FAIL ;
2013-02-22 19:18:30 +01:00
server . cluster - > size = 1 ;
2013-12-17 12:22:02 +01:00
server . cluster - > todo_before_sleep = 0 ;
2013-02-14 13:20:56 +01:00
server . cluster - > nodes = dictCreate ( & clusterNodesDictType , NULL ) ;
2013-11-29 17:37:06 +01:00
server . cluster - > nodes_black_list =
dictCreate ( & clusterNodesBlackListDictType , NULL ) ;
2013-03-13 13:10:49 +01:00
server . cluster - > failover_auth_time = 0 ;
server . cluster - > failover_auth_count = 0 ;
2014-01-29 16:51:11 +01:00
server . cluster - > failover_auth_rank = 0 ;
2013-09-26 11:13:17 +02:00
server . cluster - > failover_auth_epoch = 0 ;
2013-09-26 13:00:41 +02:00
server . cluster - > last_vote_epoch = 0 ;
2013-10-02 10:10:08 +02:00
server . cluster - > stats_bus_messages_sent = 0 ;
server . cluster - > stats_bus_messages_received = 0 ;
2014-03-11 11:16:18 +01:00
memset ( server . cluster - > slots , 0 , sizeof ( server . cluster - > slots ) ) ;
clusterCloseAllSlots ( ) ;
2013-02-14 13:20:56 +01:00
/* clusterLoadConfig() returns REDIS_ERR when the file cannot be opened. */
if ( clusterLoadConfig ( server . cluster_configfile ) = = REDIS_ERR ) {
2011-03-29 17:51:15 +02:00
/* No configuration found. We will just use the random name provided
* by the createClusterNode ( ) function . */
2014-01-29 11:22:22 +01:00
/* NOTE(review): the right-hand side of the assignment below is not visible
 * in this view; presumably a createClusterNode() call -- confirm. */
myself = server . cluster - > myself =
2013-02-22 19:24:01 +01:00
2011-03-29 17:51:15 +02:00
redisLog ( REDIS_NOTICE , " No cluster configuration found, I'm %.40s " ,
2014-01-29 11:38:14 +01:00
myself - > name ) ;
clusterAddNode ( myself ) ;
2011-03-30 16:51:28 +02:00
saveconf = 1 ;
2013-10-03 09:55:20 +02:00
/* Persist the freshly created configuration, asking for an fsync. */
if ( saveconf ) clusterSaveConfigOrDie ( 1 ) ;
2013-08-22 14:05:07 +02:00
/* We need a listening TCP port for our cluster messaging needs. */
2013-07-05 11:47:20 +02:00
server . cfd_count = 0 ;
2014-02-19 17:30:07 -05:00
/* Port sanity check II
2014-03-10 10:41:27 +01:00
* The other handshake port check is triggered too late to stop
* us from trying to use a too - high cluster port number . */
2014-02-19 17:30:07 -05:00
/* The cluster port is server.port + REDIS_CLUSTER_PORT_INCR and must not
 * exceed 65535. */
if ( server . port > ( 65535 - REDIS_CLUSTER_PORT_INCR ) ) {
redisLog ( REDIS_WARNING , " Redis port number too high. "
" Cluster communication port is 10,000 port "
" numbers higher than your Redis port. "
" Your Redis port number must be "
" lower than 55535. " ) ;
2014-03-10 10:41:27 +01:00
exit ( 1 ) ;
2014-02-19 17:30:07 -05:00
2013-08-22 14:05:07 +02:00
if ( listenToPort ( server . port + REDIS_CLUSTER_PORT_INCR ,
server . cfd , & server . cfd_count ) = = REDIS_ERR )
exit ( 1 ) ;
2013-08-22 14:53:53 +02:00
} else {
int j ;
/* Register the accept handler on every listening cluster socket. */
for ( j = 0 ; j < server . cfd_count ; j + + ) {
if ( aeCreateFileEvent ( server . el , server . cfd [ j ] , AE_READABLE ,
clusterAcceptHandler , NULL ) = = AE_ERR )
redisPanic ( " Unrecoverable error creating Redis Cluster "
" file event. " ) ;
2011-03-29 17:51:15 +02:00
2013-08-22 14:05:07 +02:00
/* The slots -> keys map is a sorted set. Init it. */
2013-02-14 13:20:56 +01:00
server . cluster - > slots_to_keys = zslCreate ( ) ;
2014-02-05 13:01:24 +01:00
resetManualFailover ( ) ;
2011-03-29 17:51:15 +02:00
/* -----------------------------------------------------------------------------
* CLUSTER communication link
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Allocate a new cluster link associated with 'node' (the pointer is stored
 * as-is). The link starts with the current time as creation time, empty
 * send and receive buffers, and fd set to -1. */
clusterLink *createClusterLink(clusterNode *node) {
    clusterLink *fresh = zmalloc(sizeof(clusterLink));

    fresh->node = node;
    fresh->fd = -1;
    fresh->ctime = mstime();
    fresh->sndbuf = sdsempty();
    fresh->rcvbuf = sdsempty();
    return fresh;
}
/* Free a cluster link, but does not free the associated node of course.
 * This function will just make sure that the original node associated
 * with this link will have the 'link' field set to NULL.
 *
 * If the link owns a socket (fd != -1) its readable/writable file events
 * are removed from the event loop and the socket is closed. A link that
 * never got a socket is released without calling close() on -1. */
void freeClusterLink(clusterLink *link) {
    if (link->fd != -1) {
        aeDeleteFileEvent(server.el, link->fd, AE_WRITABLE);
        aeDeleteFileEvent(server.el, link->fd, AE_READABLE);
        close(link->fd);
    }
    sdsfree(link->sndbuf);
    sdsfree(link->rcvbuf);
    if (link->node)
        link->node->link = NULL;
    zfree(link);
}
/* Event loop callback invoked when a cluster bus listening socket 'fd' is
 * readable: accept the connection, switch it to non-blocking mode with
 * TCP_NODELAY, and register clusterReadHandler() on it.
 *
 * The link is created without an associated node (NULL) and is passed to
 * the read handler as private data. 'el', 'privdata' and 'mask' are unused. */
void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
    int cport, cfd;
    char cip[REDIS_IP_STR_LEN];
    clusterLink *link;
    REDIS_NOTUSED(el);
    REDIS_NOTUSED(mask);
    REDIS_NOTUSED(privdata);

    cfd = anetTcpAccept(server.neterr, fd, cip, sizeof(cip), &cport);
    if (cfd == ANET_ERR) {
        redisLog(REDIS_VERBOSE,"Accepting cluster node: %s", server.neterr);
        return;
    }

    /* Use non-blocking I/O for cluster messages. */
    anetNonBlock(NULL,cfd);
    anetEnableTcpNoDelay(NULL,cfd);

    /* IPV6: might want to wrap a v6 address in [] */
    redisLog(REDIS_VERBOSE,"Accepted cluster node %s:%d", cip, cport);

    /* We don't know which node is on the other side yet, so the link is
     * created with no node attached: it only provides a valid context to
     * read the incoming packet. */
    link = createClusterLink(NULL);
    link->fd = cfd;
    if (aeCreateFileEvent(server.el,cfd,AE_READABLE,clusterReadHandler,link)
        == AE_ERR)
    {
        /* Without a read event nobody would ever release the link or the
         * socket: free them here (freeClusterLink() closes the fd). */
        redisLog(REDIS_VERBOSE,
            "Error registering read handler for cluster node %s:%d",
            cip, cport);
        freeClusterLink(link);
    }
}
/* -----------------------------------------------------------------------------
* Key space handling
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
2013-02-14 12:49:16 +01:00
/* Map a key to its hash slot. The slot is the least significant 14 bits
 * (mask 0x3FFF) of the crc16 of the key, so there are 16384 slots.
 *
 * If the key contains a '{' followed later by a '}' with at least one
 * character in between, only the text between the first '{' and the first
 * '}' after it is hashed. In every other case (no '{', no matching '}', or
 * an empty "{}") the whole key is hashed. */
unsigned int keyHashSlot(char *key, int keylen) {
    int lbrace = 0; /* index of the first '{', or keylen if none */
    int rbrace;     /* index of the first '}' after it, or keylen if none */

    while (lbrace < keylen && key[lbrace] != '{') lbrace++;

    /* No '{' at all: this is the base case, hash the whole key. */
    if (lbrace == keylen) return crc16(key,keylen) & 0x3FFF;

    rbrace = lbrace+1;
    while (rbrace < keylen && key[rbrace] != '}') rbrace++;

    /* Missing '}' or nothing between the braces: hash the whole key. */
    if (rbrace == keylen || rbrace == lbrace+1)
        return crc16(key,keylen) & 0x3FFF;

    /* Hash only what is enclosed between '{' and '}'. */
    return crc16(key+lbrace+1,rbrace-lbrace-1) & 0x3FFF;
}
2011-03-29 17:51:15 +02:00
/* -----------------------------------------------------------------------------
 * CLUSTER node API
 * -------------------------------------------------------------------------- */
/* Create a new cluster node, with the specified flags.
* If " nodename " is NULL this is considered a first handshake and a random
* node name is assigned to this node ( it will be fixed later when we ' ll
* receive the first pong ) .
* The node is created and returned to the user , but it is not automatically
* added to the nodes hash table . */
clusterNode * createClusterNode ( char * nodename , int flags ) {
clusterNode * node = zmalloc ( sizeof ( * node ) ) ;
if ( nodename )
memcpy ( node - > name , nodename , REDIS_CLUSTER_NAMELEN ) ;
2012-03-08 10:08:44 +01:00
getRandomHexChars ( node - > name , REDIS_CLUSTER_NAMELEN ) ;
2013-10-09 16:18:33 +02:00
node - > ctime = mstime ( ) ;
2013-09-25 11:47:13 +02:00
node - > configEpoch = 0 ;
2011-03-29 17:51:15 +02:00
node - > flags = flags ;
memset ( node - > slots , 0 , sizeof ( node - > slots ) ) ;
2013-02-28 15:11:05 +01:00
node - > numslots = 0 ;
2011-03-29 17:51:15 +02:00
node - > numslaves = 0 ;
node - > slaves = NULL ;
node - > slaveof = NULL ;
node - > ping_sent = node - > pong_received = 0 ;
2013-03-05 13:15:05 +01:00
node - > fail_time = 0 ;
2011-03-29 17:51:15 +02:00
node - > link = NULL ;
2013-02-15 12:58:35 +01:00
memset ( node - > ip , 0 , sizeof ( node - > ip ) ) ;
2013-02-14 13:01:28 +01:00
node - > port = 0 ;
2013-02-22 17:43:35 +01:00
node - > fail_reports = listCreate ( ) ;
2013-09-26 13:00:41 +02:00
node - > voted_time = 0 ;
2014-01-28 16:28:07 +01:00
node - > repl_offset_time = 0 ;
node - > repl_offset = 0 ;
2013-02-22 17:43:35 +01:00
listSetFreeMethod ( node - > fail_reports , zfree ) ;
2011-03-29 17:51:15 +02:00
return node ;
2013-02-22 17:43:35 +01:00
/* Record that 'sender' reports 'failing' as being in failure state.
 *
 * If 'failing' already holds a report coming from 'sender', only its
 * timestamp is refreshed and 0 is returned. Otherwise a new report is
 * appended to the fail_reports list of 'failing' and 1 is returned. */
int clusterNodeAddFailureReport(clusterNode *failing, clusterNode *sender) {
    list *reports = failing->fail_reports;
    listIter it;
    listNode *cur;
    clusterNodeFailReport *report;

    /* Look for an existing report from the same sender. */
    listRewind(reports,&it);
    for (cur = listNext(&it); cur != NULL; cur = listNext(&it)) {
        report = cur->value;
        if (report->node != sender) continue;
        report->time = mstime();
        return 0;
    }

    /* First report from this sender: create it. */
    report = zmalloc(sizeof(clusterNodeFailReport));
    report->node = sender;
    report->time = mstime();
    listAddNodeTail(reports,report);
    return 1;
}
2013-02-22 17:43:35 +01:00
2013-02-26 11:19:48 +01:00
/* Remove failure reports that are too old, where too old means reasonably
* older than the global node timeout . Note that anyway for a node to be
* flagged as FAIL we need to have a local PFAIL state that is at least
* older than the global node timeout , so we don ' t just trust the number
* of failure reports from other nodes . */
void clusterNodeCleanupFailureReports ( clusterNode * node ) {
list * l = node - > fail_reports ;
listNode * ln ;
listIter li ;
clusterNodeFailReport * fr ;
2013-10-09 16:18:33 +02:00
/* Maximum age of a report: the node timeout times a multiplier.
 * NOTE(review): the right-hand operand of this multiplication is not
 * visible in this view -- confirm against the full file. */
mstime_t maxtime = server . cluster_node_timeout *
2013-04-04 12:02:48 +02:00
2013-10-09 16:18:33 +02:00
mstime_t now = mstime ( ) ;
2013-02-26 11:19:48 +01:00
/* Walk the list and delete every report older than 'maxtime'. */
listRewind ( l , & li ) ;
while ( ( ln = listNext ( & li ) ) ! = NULL ) {
fr = ln - > value ;
if ( now - fr - > time > maxtime ) listDelNode ( l , ln ) ;
2013-02-25 19:13:22 +01:00
/* Drop the failure report that 'sender' previously filed against 'node',
 * if there is one. This is used when a node tells us via gossip that, from
 * its point of view, 'node' is fine (no FAIL or PFAIL flag).
 *
 * The scan is linear in the number of reports, but when the cluster is
 * healthy the list is empty, so the common case is constant time.
 *
 * Returns 1 if a report was found and removed (old reports of 'node' are
 * also cleaned up in that case), 0 if there was no report from 'sender'. */
int clusterNodeDelFailureReport(clusterNode *node, clusterNode *sender) {
    list *reports = node->fail_reports;
    listIter it;
    listNode *cur, *hit = NULL;

    /* Search for a failure report from this sender. */
    listRewind(reports,&it);
    while ((cur = listNext(&it)) != NULL) {
        clusterNodeFailReport *report = cur->value;

        if (report->node == sender) {
            hit = cur;
            break;
        }
    }
    if (hit == NULL) return 0; /* No failure report from this sender. */

    /* Remove the failure report. */
    listDelNode(reports,hit);
    clusterNodeCleanupFailureReports(node);
    return 1;
}
2013-02-25 19:13:22 +01:00
2013-02-22 17:43:35 +01:00
/* Return how many failure reports are currently held for 'node', after
 * discarding the ones that are too old. Reports come from other nodes, so
 * the count does not include our own PFAIL/FAIL view of 'node'. */
int clusterNodeFailureReportsCount(clusterNode *node) {
    list *reports = node->fail_reports;

    clusterNodeCleanupFailureReports(node);
    return listLength(reports);
}
2011-03-29 17:51:15 +02:00
/* Remove 'slave' from the slaves array of 'master'.
 *
 * The entries following the removed one are shifted down by one position.
 * Returns REDIS_OK if the slave was found and removed, REDIS_ERR if it was
 * not in the array. */
int clusterNodeRemoveSlave(clusterNode *master, clusterNode *slave) {
    int j;

    for (j = 0; j < master->numslaves; j++) {
        if (master->slaves[j] == slave) {
            /* Number of array elements after position j. */
            int remaining = (master->numslaves-1)-j;

            /* memmove() takes a size in bytes, not in elements. */
            if (remaining > 0) {
                memmove(master->slaves+j,master->slaves+(j+1),
                        sizeof(*master->slaves)*remaining);
            }
            master->numslaves--;
            return REDIS_OK;
        }
    }
    return REDIS_ERR;
}
/* Append 'slave' to the slaves array of 'master', growing the array by one
 * element. Returns REDIS_ERR without changes if 'slave' is already listed,
 * REDIS_OK otherwise. */
int clusterNodeAddSlave(clusterNode *master, clusterNode *slave) {
    int idx = 0;

    /* Refuse duplicates. */
    while (idx < master->numslaves) {
        if (master->slaves[idx] == slave) return REDIS_ERR;
        idx++;
    }

    master->slaves = zrealloc(master->slaves,
        sizeof(clusterNode*)*(master->numslaves+1));
    master->slaves[master->numslaves++] = slave;
    return REDIS_OK;
}
/* Release the slaves array of 'n' and mark the node as having no slaves.
 * The slave nodes themselves are not touched. */
void clusterNodeResetSlaves(clusterNode *n) {
    clusterNode **old = n->slaves;

    n->slaves = NULL;
    n->numslaves = 0;
    zfree(old);
}
2011-03-29 17:51:15 +02:00
2014-01-30 18:05:11 +01:00
/* Count the slaves of 'n' for which nodeFailed() is false. */
int clusterCountNonFailingSlaves(clusterNode *n) {
    int healthy = 0;
    int idx;

    for (idx = 0; idx < n->numslaves; idx++) {
        if (nodeFailed(n->slaves[idx])) continue;
        healthy++;
    }
    return healthy;
}
2011-03-29 17:51:15 +02:00
/* Release a node and everything it owns.
 *
 * The node is removed from the nodes hash table (it must be there, this is
 * asserted), detached from its master's slaves array if it has a master,
 * and its link, failure reports list and slaves array are freed. Slaves
 * still pointing to this node as their master get their 'slaveof' field
 * cleared so that they do not reference freed memory. */
void freeClusterNode(clusterNode *n) {
    sds nodename;
    int j, retval;

    /* Our slaves must forget about us before we disappear. */
    for (j = 0; j < n->numslaves; j++) {
        if (n->slaves[j]->slaveof == n)
            n->slaves[j]->slaveof = NULL;
    }

    nodename = sdsnewlen(n->name, REDIS_CLUSTER_NAMELEN);
    retval = dictDelete(server.cluster->nodes,nodename);
    sdsfree(nodename);
    redisAssert(retval == DICT_OK);

    if (n->slaveof) clusterNodeRemoveSlave(n->slaveof, n);
    if (n->link) freeClusterLink(n->link);
    listRelease(n->fail_reports);
    zfree(n->slaves);
    zfree(n);
}
/* Add a node to the nodes hash table, keyed by the first
 * REDIS_CLUSTER_NAMELEN bytes of its name.
 *
 * Returns REDIS_OK if the node was added, REDIS_ERR if dictAdd() refused it.
 * In the latter case the key built for the insertion is released here, as
 * the table did not take ownership of it. */
int clusterAddNode(clusterNode *node) {
    sds key = sdsnewlen(node->name,REDIS_CLUSTER_NAMELEN);

    if (dictAdd(server.cluster->nodes,key,node) != DICT_OK) {
        sdsfree(key);
        return REDIS_ERR;
    }
    return REDIS_OK;
}
2013-02-27 17:55:59 +01:00
/* Remove a node from the cluster:
* 1 ) Mark all the nodes handled by it as unassigned .
* 2 ) Remove all the failure reports sent by this node .
* 3 ) Free the node , that will in turn remove it from the hash table
* and from the list of slaves of its master , if it is a slave node .
void clusterDelNode ( clusterNode * delnode ) {
int j ;
dictIterator * di ;
dictEntry * de ;
/* 1) Mark slots as unassigned. */
for ( j = 0 ; j < REDIS_CLUSTER_SLOTS ; j + + ) {
if ( server . cluster - > importing_slots_from [ j ] = = delnode )
server . cluster - > importing_slots_from [ j ] = NULL ;
if ( server . cluster - > migrating_slots_to [ j ] = = delnode )
server . cluster - > migrating_slots_to [ j ] = NULL ;
if ( server . cluster - > slots [ j ] = = delnode )
clusterDelSlot ( j ) ;
/* 2) Remove failure reports. */
2013-09-04 10:07:50 +02:00
di = dictGetSafeIterator ( server . cluster - > nodes ) ;
2013-02-27 17:55:59 +01:00
while ( ( de = dictNext ( di ) ) ! = NULL ) {
clusterNode * node = dictGetVal ( de ) ;
if ( node = = delnode ) continue ;
clusterNodeDelFailureReport ( node , delnode ) ;
dictReleaseIterator ( di ) ;
2014-02-11 10:34:14 +01:00
/* 3) Remove this node from its master's slaves if needed. */
if ( nodeIsSlave ( delnode ) & & delnode - > slaveof )
clusterNodeRemoveSlave ( delnode - > slaveof , delnode ) ;
/* 4) Free the node, unlinking it from the cluster. */
2013-02-27 17:55:59 +01:00
freeClusterNode ( delnode ) ;
2011-03-29 17:51:15 +02:00
/* Node lookup by name */
clusterNode * clusterLookupNode ( char * name ) {
sds s = sdsnewlen ( name , REDIS_CLUSTER_NAMELEN ) ;
struct dictEntry * de ;
2013-02-14 13:20:56 +01:00
de = dictFind ( server . cluster - > nodes , s ) ;
2011-03-29 17:51:15 +02:00
sdsfree ( s ) ;
if ( de = = NULL ) return NULL ;
2011-11-08 17:07:55 +01:00
return dictGetVal ( de ) ;
2011-03-29 17:51:15 +02:00
/* Change the name of 'node' to 'newname', re-keying it in the nodes table.
 *
 * This is only used after the handshake: when we connect to an IP/PORT as
 * a result of CLUSTER MEET we don't know the node name yet, so a random one
 * is picked, and it is fixed with this function once the PONG is received.
 * The node must currently be in the nodes table (this is asserted). */
void clusterRenameNode(clusterNode *node, char *newname) {
    sds oldkey;
    int removed;

    redisLog(REDIS_DEBUG,"Renaming node %.40s into %.40s",
        node->name, newname);

    /* Unregister the node under its old name... */
    oldkey = sdsnewlen(node->name,REDIS_CLUSTER_NAMELEN);
    removed = dictDelete(server.cluster->nodes,oldkey);
    sdsfree(oldkey);
    redisAssert(removed == DICT_OK);

    /* ...and register it again under the new one. */
    memcpy(node->name,newname,REDIS_CLUSTER_NAMELEN);
    clusterAddNode(node);
}
2013-12-02 11:12:23 +01:00
/* -----------------------------------------------------------------------------
* CLUSTER nodes blacklist
* The nodes blacklist is just a way to ensure that a given node with a given
* Node ID is not re-added before some time elapsed (this time is specified
* by the REDIS_CLUSTER_BLACKLIST_TTL macro).
* This is useful when we want to remove a node from the cluster completely :
* when CLUSTER FORGET is called , it also puts the node into the blacklist so
* that even if we receive gossip messages from other nodes that still remember
* about the node we want to remove , we don ' t re - add it before some time .
* Currently the REDIS_CLUSTER_BLACKLIST_TTL is set to 1 minute , this means
* that redis - trib has 60 seconds to send CLUSTER FORGET messages to nodes
2014-01-15 16:06:54 +01:00
* in the cluster without dealing with the problem of other nodes re - adding
2013-12-02 11:12:23 +01:00
* back the node to nodes we already sent the FORGET command to .
2013-12-05 16:35:32 +01:00
* The data structure used is a hash table with an sds string representing
2013-12-02 11:12:23 +01:00
* the node ID as key , and the time when it is ok to re - add the node as
* value .
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
# define REDIS_CLUSTER_BLACKLIST_TTL 60 /* 1 minute. */
/* Before of the addNode() or Exists() operations we always remove expired
* entries from the black list . This is an O ( N ) operation but it is not a
* problem since add / exists operations are called very infrequently and
* the hash table is supposed to contain very little elements at max .
* However without the cleanup during long uptimes and with some automated
* node add / removal procedures , entries could accumulate . */
void clusterBlacklistCleanup ( void ) {
dictIterator * di ;
dictEntry * de ;
di = dictGetSafeIterator ( server . cluster - > nodes_black_list ) ;
while ( ( de = dictNext ( di ) ) ! = NULL ) {
int64_t expire = dictGetUnsignedIntegerVal ( de ) ;
if ( expire < server . unixtime )
dictDelete ( server . cluster - > nodes_black_list , dictGetKey ( de ) ) ;
dictReleaseIterator ( di ) ;
/* Expire old black list entries, then black list the ID of 'node' for
 * REDIS_CLUSTER_BLACKLIST_TTL seconds from now. If the ID is already in the
 * black list its expire time is simply refreshed. */
void clusterBlacklistAddNode(clusterNode *node) {
    sds key = sdsnewlen(node->name,REDIS_CLUSTER_NAMELEN);
    sds lookup;
    dictEntry *entry;

    clusterBlacklistCleanup();

    /* When the insertion succeeds the table keeps 'key', so a private copy
     * is needed for the lookup below. Otherwise 'key' is still ours and can
     * be used directly. Either way 'lookup' is freed at the end. */
    if (dictAdd(server.cluster->nodes_black_list,key,NULL) == DICT_OK)
        lookup = sdsdup(key);
    else
        lookup = key;

    entry = dictFind(server.cluster->nodes_black_list,lookup);
    dictSetUnsignedIntegerVal(entry,time(NULL)+REDIS_CLUSTER_BLACKLIST_TTL);
    sdsfree(lookup);
}
2013-12-02 11:12:23 +01:00
/* Return non-zero if the specified node ID exists in the blacklist.
* You don ' t need to pass an sds string here , any pointer to 40 bytes
* will work . */
int clusterBlacklistExists ( char * nodeid ) {
sds id = sdsnewlen ( nodeid , REDIS_CLUSTER_NAMELEN ) ;
int retval ;
2014-01-15 16:06:54 +01:00
clusterBlacklistCleanup ( ) ;
2013-12-02 11:12:23 +01:00
retval = dictFind ( server . cluster - > nodes_black_list , id ) ! = NULL ;
sdsfree ( id ) ;
return retval ;
2011-03-29 17:51:15 +02:00
/* -----------------------------------------------------------------------------
* CLUSTER messages exchange - PING / PONG and gossip
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
2013-02-26 14:58:39 +01:00
/* This function checks if a given node should be marked as FAIL.
* It happens if the following conditions are met :
2013-09-20 11:26:44 +02:00
* 1 ) We received enough failure reports from other master nodes via gossip .
* Enough means that the majority of the masters signaled the node is
* down recently .
* 2 ) We believe this node is in PFAIL state .
2013-02-26 14:58:39 +01:00
* If a failure is detected we also inform the whole cluster about this
* event trying to force every other node to set the FAIL flag for the node .
2013-09-20 11:26:44 +02:00
* Note that the form of agreement used here is weak , as we collect the majority
* of masters state during some time , and even if we force agreement by
* propagating the FAIL message , because of partitions we may not reach every
* node . However :
* 1 ) Either we reach the majority and eventually the FAIL state will propagate
* to all the cluster .
* 2 ) Or there is no majority so no slave promotion will be authorized and the
* FAIL flag will be cleared after some time .
2013-02-26 14:58:39 +01:00
void markNodeAsFailingIfNeeded ( clusterNode * node ) {
int failures ;
int needed_quorum = ( server . cluster - > size / 2 ) + 1 ;
2014-01-29 12:17:16 +01:00
if ( ! nodeTimedOut ( node ) ) return ; /* We can reach it. */
if ( nodeFailed ( node ) ) return ; /* Already FAILing. */
2013-02-26 14:58:39 +01:00
2013-09-20 11:26:44 +02:00
failures = clusterNodeFailureReportsCount ( node ) ;
/* Also count myself as a voter if I'm a master. */
2014-01-29 12:17:16 +01:00
if ( nodeIsMaster ( myself ) ) failures + + ;
2013-09-20 11:26:44 +02:00
if ( failures < needed_quorum ) return ; /* No weak agreement from masters. */
2013-02-26 14:58:39 +01:00
redisLog ( REDIS_NOTICE ,
" Marking node %.40s as failing (quorum reached). " , node - > name ) ;
/* Mark the node as failing. */
node - > flags & = ~ REDIS_NODE_PFAIL ;
node - > flags | = REDIS_NODE_FAIL ;
2013-10-09 16:18:33 +02:00
node - > fail_time = mstime ( ) ;
2013-02-26 14:58:39 +01:00
2013-09-20 11:26:44 +02:00
/* Broadcast the failing node name to everybody, forcing all the other
* reachable nodes to flag the node as FAIL . */
2014-01-29 12:17:16 +01:00
if ( nodeIsMaster ( myself ) ) clusterSendFail ( node - > name ) ;
2013-10-03 09:55:20 +02:00
2013-02-26 14:58:39 +01:00
/* This function is called only if a node is marked as FAIL, but we are able
* to reach it again . It checks if there are the conditions to undo the FAIL
2013-03-21 10:47:10 +01:00
* state . */
2013-02-26 14:58:39 +01:00
void clearNodeFailureIfNeeded ( clusterNode * node ) {
2013-12-17 09:45:42 +01:00
mstime_t now = mstime ( ) ;
2013-03-05 15:05:32 +01:00
2014-01-29 12:17:16 +01:00
redisAssert ( nodeFailed ( node ) ) ;
2013-03-05 15:05:32 +01:00
/* For slaves we always clear the FAIL flag if we can contact the
* node again . */
2014-02-10 17:18:16 +01:00
if ( nodeIsSlave ( node ) | | node - > numslots = = 0 ) {
2013-03-05 15:05:32 +01:00
redisLog ( REDIS_NOTICE ,
2014-02-10 17:18:16 +01:00
" Clear FAIL state for node %.40s: %s is reachable again. " ,
2014-02-10 17:21:10 +01:00
node - > name ,
nodeIsSlave ( node ) ? " slave " : " master without slots " ) ;
2013-03-05 15:05:32 +01:00
node - > flags & = ~ REDIS_NODE_FAIL ;
2013-10-03 09:55:20 +02:00
2013-03-05 15:05:32 +01:00
/* If it is a master and...
2013-10-09 16:18:33 +02:00
* 1 ) The FAIL state is old enough .
2013-03-05 15:05:32 +01:00
* 2 ) It is yet serving slots from our point of view ( not failed over ) .
* Apparently no one is going to fix these slots , clear the FAIL flag . */
2014-01-29 12:17:16 +01:00
if ( nodeIsMaster ( node ) & & node - > numslots > 0 & &
2013-04-04 12:02:48 +02:00
( now - node - > fail_time ) >
2013-10-09 16:18:33 +02:00
( server . cluster_node_timeout * REDIS_CLUSTER_FAIL_UNDO_TIME_MULT ) )
2013-03-05 15:05:32 +01:00
2013-02-26 15:03:27 +01:00
redisLog ( REDIS_NOTICE ,
2013-03-05 15:05:32 +01:00
" Clear FAIL state for node %.40s: is reachable again and nobody is serving its slots after some time. " ,
2013-02-26 15:03:27 +01:00
node - > name ) ;
2013-02-26 15:15:44 +01:00
node - > flags & = ~ REDIS_NODE_FAIL ;
2013-10-03 09:55:20 +02:00
2013-02-26 14:58:39 +01:00
2013-09-04 15:52:16 +02:00
/* Return true if we already have a node in HANDSHAKE state matching the
* specified ip address and port number . This function is used in order to
* avoid adding a new handshake node for the same address multiple times . */
int clusterHandshakeInProgress ( char * ip , int port ) {
dictIterator * di ;
dictEntry * de ;
di = dictGetSafeIterator ( server . cluster - > nodes ) ;
while ( ( de = dictNext ( di ) ) ! = NULL ) {
clusterNode * node = dictGetVal ( de ) ;
2014-01-29 12:17:16 +01:00
if ( ! nodeInHandshake ( node ) ) continue ;
2013-09-04 15:52:16 +02:00
if ( ! strcasecmp ( node - > ip , ip ) & & node - > port = = port ) break ;
dictReleaseIterator ( di ) ;
return de ! = NULL ;
2013-12-20 12:37:18 +01:00
/* Start an handshake with the specified address if there is not one
* already in progress . Returns non - zero if the handshake was actually
* started . On error zero is returned and errno is set to one of the
* following values :
* EAGAIN - There is already an handshake in progress for this address .
* EINVAL - IP or port are not valid . */
int clusterStartHandshake ( char * ip , int port ) {
clusterNode * n ;
char norm_ip [ REDIS_IP_STR_LEN ] ;
struct sockaddr_storage sa ;
/* IP sanity check */
if ( inet_pton ( AF_INET , ip ,
& ( ( ( struct sockaddr_in * ) & sa ) - > sin_addr ) ) )
sa . ss_family = AF_INET ;
} else if ( inet_pton ( AF_INET6 , ip ,
& ( ( ( struct sockaddr_in6 * ) & sa ) - > sin6_addr ) ) )
sa . ss_family = AF_INET6 ;
} else {
errno = EINVAL ;
return 0 ;
/* Port sanity check */
if ( port < = 0 | | port > ( 65535 - REDIS_CLUSTER_PORT_INCR ) ) {
errno = EINVAL ;
return 0 ;
/* Set norm_ip as the normalized string representation of the node
* IP address . */
if ( sa . ss_family = = AF_INET )
inet_ntop ( AF_INET ,
( void * ) & ( ( ( struct sockaddr_in * ) & sa ) - > sin_addr ) ,
2014-02-26 18:05:12 -05:00
norm_ip , REDIS_IP_STR_LEN ) ;
2013-12-20 12:37:18 +01:00
inet_ntop ( AF_INET6 ,
( void * ) & ( ( ( struct sockaddr_in6 * ) & sa ) - > sin6_addr ) ,
2014-02-26 18:05:12 -05:00
norm_ip , REDIS_IP_STR_LEN ) ;
2013-12-20 12:37:18 +01:00
if ( clusterHandshakeInProgress ( norm_ip , port ) ) {
errno = EAGAIN ;
return 0 ;
/* Add the node with a random address (NULL as first argument to
* createClusterNode ( ) ) . Everything will be fixed during the
* handskake . */
memcpy ( n - > ip , norm_ip , sizeof ( n - > ip ) ) ;
n - > port = port ;
clusterAddNode ( n ) ;
return 1 ;
2011-03-29 17:51:15 +02:00
/* Process the gossip section of PING or PONG packets.
 * Note that this function assumes that the packet is already sanity-checked
 * by the caller, not in the content of the gossip section, but in the
 * length.
 *
 * For every gossip entry:
 * - If the described node is known, failure reports coming from master
 *   senders are recorded or removed, and an handshake is started with the
 *   advertised address when the node is flagged FAIL/PFAIL locally and the
 *   address differs from the one we have.
 * - If the node is unknown, an handshake is started, provided the sender is
 *   a known node, the entry is not flagged NOADDR and the node ID is not
 *   blacklisted. */
void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
    uint16_t count = ntohs(hdr->count);
    clusterMsgDataGossip *g = (clusterMsgDataGossip *)hdr->data.ping.gossip;
    clusterNode *sender = link->node ? link->node
                                     : clusterLookupNode(hdr->sender);

    while (count--) {
        sds ci = sdsempty();
        uint16_t flags = ntohs(g->flags);
        clusterNode *node;
        size_t cilen;

        /* Build a human readable list of flags for the debug log line. */
        if (flags == 0) ci = sdscat(ci, "noflags,");
        if (flags & REDIS_NODE_MYSELF) ci = sdscat(ci, "myself,");
        if (flags & REDIS_NODE_MASTER) ci = sdscat(ci, "master,");
        if (flags & REDIS_NODE_SLAVE) ci = sdscat(ci, "slave,");
        if (flags & REDIS_NODE_PFAIL) ci = sdscat(ci, "fail?,");
        if (flags & REDIS_NODE_FAIL) ci = sdscat(ci, "fail,");
        if (flags & REDIS_NODE_HANDSHAKE) ci = sdscat(ci, "handshake,");
        if (flags & REDIS_NODE_NOADDR) ci = sdscat(ci, "noaddr,");

        /* Replace the trailing comma with a space. The string is empty when
         * 'flags' is non-zero but contains none of the bits tested above,
         * so the length is checked before indexing the last character. */
        cilen = sdslen(ci);
        if (cilen > 0 && ci[cilen - 1] == ',') ci[cilen - 1] = ' ';

        redisLog(REDIS_DEBUG, "GOSSIP %.40s %s:%d %s",
            g->nodename,
            g->ip,
            ntohs(g->port),
            ci);
        sdsfree(ci);

        /* Update our state accordingly to the gossip sections */
        node = clusterLookupNode(g->nodename);
        if (node) {
            /* We already know this node.
             * Handle failure reports, only when the sender is a master. */
            if (sender && nodeIsMaster(sender) && node != myself) {
                if (flags & (REDIS_NODE_FAIL|REDIS_NODE_PFAIL)) {
                    if (clusterNodeAddFailureReport(node, sender)) {
                        redisLog(REDIS_VERBOSE,
                            "Node %.40s reported node %.40s as not reachable.",
                            sender->name, node->name);
                    }
                    markNodeAsFailingIfNeeded(node);
                } else {
                    if (clusterNodeDelFailureReport(node, sender)) {
                        redisLog(REDIS_VERBOSE,
                            "Node %.40s reported node %.40s is back online.",
                            sender->name, node->name);
                    }
                }
            }

            /* If we already know this node, but it is not reachable, and
             * we see a different address in the gossip section, start an
             * handshake with the (possibly) new address: this will result
             * into a node address update if the handshake will be
             * successful. */
            if (node->flags & (REDIS_NODE_FAIL|REDIS_NODE_PFAIL) &&
                (strcasecmp(node->ip, g->ip) || node->port != ntohs(g->port)))
            {
                clusterStartHandshake(g->ip, ntohs(g->port));
            }
        } else {
            /* If it's not in NOADDR state and we don't have it, we
             * start a handshake process against this IP/PORT pairs.
             *
             * Note that we require that the sender of this gossip message
             * is a well known node in our cluster, otherwise we risk
             * joining another cluster. */
            if (sender &&
                !(flags & REDIS_NODE_NOADDR) &&
                !clusterBlacklistExists(g->nodename))
            {
                clusterStartHandshake(g->ip, ntohs(g->port));
            }
        }

        /* Next node */
        g++;
    }
}
2011-06-18 19:19:10 +01:00
/* IP -> string conversion. 'buf' is supposed to at least be 46 bytes. */
2011-03-29 17:51:15 +02:00
void nodeIp2String ( char * buf , clusterLink * link ) {
2014-02-26 18:05:12 -05:00
anetPeerToString ( link - > fd , buf , REDIS_IP_STR_LEN , NULL ) ;
2011-03-29 17:51:15 +02:00
/* Update the node address to the IP address that can be extracted
 * from link->fd, and at the specified port.
 * Also disconnect the node link so that we'll connect again to the new
 * address.
 *
 * If the ip/port pair are already correct no operation is performed at
 * all.
 *
 * The function returns 0 if the node address is still the same,
 * otherwise 1 is returned. */
int nodeUpdateAddressIfNeeded(clusterNode *node, clusterLink *link, int port) {
    char ip[REDIS_IP_STR_LEN];

    /* We don't proceed if the link is the same as the sender link, as this
     * function is designed to see if the node link is consistent with the
     * symmetric link that is used to receive PINGs from the node.
     *
     * As a side effect this function never frees the passed 'link', so
     * it is safe to call during packet processing. */
    if (link == node->link) return 0;

    nodeIp2String(ip, link);
    if (node->port == port && strcmp(ip, node->ip) == 0) return 0;

    /* IP / port is different, update it. */
    memcpy(node->ip, ip, sizeof(ip));
    node->port = port;
    /* Drop the link towards the old address, if any. */
    if (node->link) freeClusterLink(node->link);
    redisLog(REDIS_WARNING, "Address updated for node %.40s, now %s:%d",
        node->name, node->ip, node->port);

    /* Check if this is our master and we have to change the
     * replication target as well. */
    if (nodeIsSlave(myself) && myself->slaveof == node)
        replicationSetMaster(node->ip, node->port);
    return 1;
}
2011-03-29 17:51:15 +02:00
2013-11-08 17:02:10 +01:00
/* Reconfigure the specified node 'n' as a master. This function is called when
 * a node that we believed to be a slave is now acting as master in order to
 * update the state of the node.
 *
 * Nothing is done if 'n' is already flagged as a master. */
void clusterSetNodeAsMaster(clusterNode *n) {
    if (nodeIsMaster(n)) return;

    /* Detach the node from its old master before switching its role. */
    if (n->slaveof) clusterNodeRemoveSlave(n->slaveof, n);
    n->flags &= ~REDIS_NODE_SLAVE;
    n->flags |= REDIS_NODE_MASTER;
    n->slaveof = NULL;

    /* Update config and state.
     * NOTE(review): the flag list is cut after CLUSTER_TODO_SAVE_CONFIG in
     * the damaged copy being reviewed; the second flag was restored to match
     * upstream Redis - confirm. */
    clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|
                         CLUSTER_TODO_UPDATE_STATE);
}
/* This function is called when we receive a master configuration via a
* PING , PONG or UPDATE packet . What we receive is a node , a configEpoch of the
* node , and the set of slots claimed under this configEpoch .
* What we do is to rebind the slots with newer configuration compared to our
* local configuration , and if needed , we turn ourself into a replica of the
* node ( see the function comments for more info ) .
* The ' sender ' is the node for which we received a configuration update .
* Sometimes it is not actaully the " Sender " of the information , like in the case
* we receive the info via an UPDATE packet . */
2014-03-11 11:49:47 +01:00
void clusterUpdateSlotsConfigWith ( clusterNode * sender , uint64_t senderConfigEpoch , unsigned char * slots ) {
2013-11-08 17:02:10 +01:00
int j ;
clusterNode * curmaster , * newmaster = NULL ;
/* Here we set curmaster to this node or the node this node
* replicates to if it ' s a slave . In the for loop we are
* interested to check if slots are taken away from curmaster . */
2014-01-29 12:17:16 +01:00
curmaster = nodeIsMaster ( myself ) ? myself : myself - > slaveof ;
2013-11-08 17:02:10 +01:00
for ( j = 0 ; j < REDIS_CLUSTER_SLOTS ; j + + ) {
if ( bitmapTestBit ( slots , j ) ) {
2014-03-11 11:49:47 +01:00
/* The slot is already bound to the sender of this message. */
if ( server . cluster - > slots [ j ] = = sender ) continue ;
/* The slot is in importing state, it should be modified only
* manually via redis - trib ( example : a resharding is in progress
* and the migrating side slot was already closed and is advertising
* a new config . We still want the slot to be closed manually ) . */
if ( server . cluster - > importing_slots_from [ j ] ) continue ;
2013-11-08 17:02:10 +01:00
/* We rebind the slot to the new node claiming it if:
2014-03-11 11:32:40 +01:00
* 1 ) The slot was unassigned or the new node claims it with a
* greater configEpoch .
* 2 ) We are not currently importing the slot . */
2013-11-08 17:02:10 +01:00
if ( server . cluster - > slots [ j ] = = NULL | |
2014-02-10 18:01:58 +01:00
server . cluster - > slots [ j ] - > configEpoch < senderConfigEpoch )
2013-11-08 17:02:10 +01:00
2014-03-11 11:49:47 +01:00
/* Was this slot mine, and still contains keys? Something
* odd happened , put the slot in importing state so that
* redis - trib fix can detect the condition ( and no further
* updates will be processed before the slot gets fixed ) . */
if ( server . cluster - > slots [ j ] = = myself & &
countKeysInSlot ( j ) & &
sender ! = myself )
redisLog ( REDIS_WARNING , " Slot update for a slot I still have keys received. Putting the slot in IMPORTING state. Please run the 'redis-trib fix' command. " ) ;
server . cluster - > importing_slots_from [ j ] = sender ;
2013-11-08 17:02:10 +01:00
if ( server . cluster - > slots [ j ] = = curmaster )
newmaster = sender ;
clusterDelSlot ( j ) ;
clusterAddSlot ( sender , j ) ;
clusterDoBeforeSleep ( CLUSTER_TODO_SAVE_CONFIG |
/* If at least one slot was reassigned from a node to another node
* with a greater configEpoch , it is possible that :
* 1 ) We are a master left without slots . This means that we were
* failed over and we should turn into a replica of the new
* master .
* 2 ) We are a slave and our master is left without slots . We need
* to replicate to the new slots owner . */
if ( newmaster & & curmaster - > numslots = = 0 ) {
redisLog ( REDIS_WARNING , " Configuration change detected. Reconfiguring myself as a replica of %.40s " , sender - > name ) ;
clusterSetMaster ( sender ) ;
clusterDoBeforeSleep ( CLUSTER_TODO_SAVE_CONFIG |
2011-03-29 17:51:15 +02:00
/* When this function is called, there is a packet to process starting
* at node - > rcvbuf . Releasing the buffer is up to the caller , so this
* function should just handle the higher level stuff of processing the
* packet , modifying the cluster state if needed .
* The function returns 1 if the link is still valid after the packet
* was processed , otherwise 0 if the link was freed since the packet
* processing lead to some inconsistency error ( for instance a PONG
* received from the wrong sender ID ) . */
int clusterProcessPacket ( clusterLink * link ) {
clusterMsg * hdr = ( clusterMsg * ) link - > rcvbuf ;
uint32_t totlen = ntohl ( hdr - > totlen ) ;
uint16_t type = ntohs ( hdr - > type ) ;
2013-03-20 10:48:42 +01:00
uint16_t flags = ntohs ( hdr - > flags ) ;
2013-11-05 12:01:07 +01:00
uint64_t senderCurrentEpoch = 0 , senderConfigEpoch = 0 ;
2011-03-29 17:51:15 +02:00
clusterNode * sender ;
2013-10-02 10:10:08 +02:00
server . cluster - > stats_bus_messages_received + + ;
2011-10-05 16:02:45 +02:00
redisLog ( REDIS_DEBUG , " --- Processing packet of type %d, %lu bytes " ,
type , ( unsigned long ) totlen ) ;
2011-10-07 16:34:16 +02:00
/* Perform sanity checks */
2014-02-10 15:54:19 +01:00
if ( totlen < 16 ) return 1 ; /* At least signature, version, totlen, count. */
if ( ntohs ( hdr - > ver ) ! = 0 ) return 1 ; /* Can't handle versions other than 0. */
2011-03-29 17:51:15 +02:00
if ( totlen > sdslen ( link - > rcvbuf ) ) return 1 ;
if ( type = = CLUSTERMSG_TYPE_PING | | type = = CLUSTERMSG_TYPE_PONG | |
uint16_t count = ntohs ( hdr - > count ) ;
uint32_t explen ; /* expected length of this packet */
explen = sizeof ( clusterMsg ) - sizeof ( union clusterMsgData ) ;
explen + = ( sizeof ( clusterMsgDataGossip ) * count ) ;
if ( totlen ! = explen ) return 1 ;
2013-03-13 17:27:06 +01:00
} else if ( type = = CLUSTERMSG_TYPE_FAIL ) {
2011-03-29 17:51:15 +02:00
uint32_t explen = sizeof ( clusterMsg ) - sizeof ( union clusterMsgData ) ;
explen + = sizeof ( clusterMsgDataFail ) ;
if ( totlen ! = explen ) return 1 ;
2013-03-13 17:27:06 +01:00
} else if ( type = = CLUSTERMSG_TYPE_PUBLISH ) {
2011-10-07 16:34:16 +02:00
uint32_t explen = sizeof ( clusterMsg ) - sizeof ( union clusterMsgData ) ;
explen + = sizeof ( clusterMsgDataPublish ) +
ntohl ( hdr - > data . publish . msg . channel_len ) +
ntohl ( hdr - > data . publish . msg . message_len ) ;
if ( totlen ! = explen ) return 1 ;
2013-03-14 16:42:56 +01:00
2014-02-05 13:01:24 +01:00
2013-03-13 17:31:19 +01:00
uint32_t explen = sizeof ( clusterMsg ) - sizeof ( union clusterMsgData ) ;
2013-11-08 17:02:10 +01:00
if ( totlen ! = explen ) return 1 ;
} else if ( type = = CLUSTERMSG_TYPE_UPDATE ) {
uint32_t explen = sizeof ( clusterMsg ) - sizeof ( union clusterMsgData ) ;
explen + = sizeof ( clusterMsgDataUpdate ) ;
2013-03-13 17:31:19 +01:00
if ( totlen ! = explen ) return 1 ;
2011-10-07 16:34:16 +02:00
2011-03-29 17:51:15 +02:00
2013-09-27 09:55:41 +02:00
/* Check if the sender is a known node. */
2011-03-29 17:51:15 +02:00
sender = clusterLookupNode ( hdr - > sender ) ;
2014-01-29 12:17:16 +01:00
if ( sender & & ! nodeInHandshake ( sender ) ) {
2013-09-27 09:55:41 +02:00
/* Update our curretEpoch if we see a newer epoch in the cluster. */
2013-09-25 12:36:29 +02:00
senderCurrentEpoch = ntohu64 ( hdr - > currentEpoch ) ;
senderConfigEpoch = ntohu64 ( hdr - > configEpoch ) ;
if ( senderCurrentEpoch > server . cluster - > currentEpoch )
server . cluster - > currentEpoch = senderCurrentEpoch ;
2013-09-27 09:55:41 +02:00
/* Update the sender configEpoch if it is publishing a newer one. */
2013-09-30 10:13:33 +02:00
if ( senderConfigEpoch > sender - > configEpoch ) {
2013-09-27 09:55:41 +02:00
sender - > configEpoch = senderConfigEpoch ;
2013-10-03 09:55:20 +02:00
2013-09-30 10:13:33 +02:00
2014-01-29 16:01:00 +01:00
/* Update the replication offset info for this node. */
sender - > repl_offset = ntohu64 ( hdr - > offset ) ;
sender - > repl_offset_time = mstime ( ) ;
2014-02-05 13:01:24 +01:00
/* If we are a slave performing a manual failover and our master
* sent its offset while already paused , populate the MF state . */
if ( server . cluster - > mf_end & &
nodeIsSlave ( myself ) & &
myself - > slaveof = = sender & &
hdr - > mflags [ 0 ] & CLUSTERMSG_FLAG0_PAUSED & &
server . cluster - > mf_master_offset = = 0 )
server . cluster - > mf_master_offset = sender - > repl_offset ;
redisLog ( REDIS_WARNING , " Received replication offset for paused master manual failover: %lld " , server . cluster - > mf_master_offset ) ;
2013-09-25 12:36:29 +02:00
2013-04-11 18:19:48 +02:00
2013-09-25 12:36:29 +02:00
/* Process packets by type. */
2011-03-29 17:51:15 +02:00
if ( type = = CLUSTERMSG_TYPE_PING | | type = = CLUSTERMSG_TYPE_MEET ) {
2013-02-27 12:27:15 +01:00
redisLog ( REDIS_DEBUG , " Ping packet received: %p " , ( void * ) link - > node ) ;
2011-03-29 17:51:15 +02:00
/* Add this node if it is new for us and the msg type is MEET.
* In this stage we don ' t try to add the node with the right
* flags , slaveof pointer , and so forth , as this details will be
2013-06-11 21:33:00 +02:00
* resolved when we ' ll receive PONGs from the node . */
2011-03-29 17:51:15 +02:00
if ( ! sender & & type = = CLUSTERMSG_TYPE_MEET ) {
clusterNode * node ;
node = createClusterNode ( NULL , REDIS_NODE_HANDSHAKE ) ;
nodeIp2String ( node - > ip , link ) ;
node - > port = ntohs ( hdr - > port ) ;
clusterAddNode ( node ) ;
2013-10-03 09:55:20 +02:00
clusterDoBeforeSleep ( CLUSTER_TODO_SAVE_CONFIG ) ;
2011-03-29 17:51:15 +02:00
/* Get info from the gossip section */
clusterProcessGossipSection ( hdr , link ) ;
/* Anyway reply with a PONG */
clusterSendPing ( link , CLUSTERMSG_TYPE_PONG ) ;
2013-04-11 18:19:48 +02:00
/* PING or PONG: process config information. */
2013-08-22 11:53:28 +02:00
if ( type = = CLUSTERMSG_TYPE_PING | | type = = CLUSTERMSG_TYPE_PONG | |
2013-04-11 18:19:48 +02:00
redisLog ( REDIS_DEBUG , " %s packet received: %p " ,
type = = CLUSTERMSG_TYPE_PING ? " ping " : " pong " ,
( void * ) link - > node ) ;
2011-03-29 17:51:15 +02:00
if ( link - > node ) {
2014-01-29 12:17:16 +01:00
if ( nodeInHandshake ( link - > node ) ) {
2011-03-29 17:51:15 +02:00
/* If we already have this node, try to change the
* IP / port of the node with the new one . */
if ( sender ) {
redisLog ( REDIS_WARNING ,
" Handshake error: we already know node %.40s, updating the address if needed. " , sender - > name ) ;
2013-06-12 10:50:07 -07:00
if ( nodeUpdateAddressIfNeeded ( sender , link , ntohs ( hdr - > port ) ) )
2013-10-03 09:55:20 +02:00
clusterDoBeforeSleep ( CLUSTER_TODO_SAVE_CONFIG |
2013-06-12 10:50:07 -07:00
/* Free this node as we alrady have it. This will
* cause the link to be freed as well . */
freeClusterNode ( link - > node ) ;
2011-03-29 17:51:15 +02:00
return 0 ;
/* First thing to do is replacing the random name with the
2013-12-05 16:35:32 +01:00
* right node name if this was a handshake stage . */
2011-03-29 17:51:15 +02:00
clusterRenameNode ( link - > node , hdr - > sender ) ;
redisLog ( REDIS_DEBUG , " Handshake with node %.40s completed. " ,
link - > node - > name ) ;
link - > node - > flags & = ~ REDIS_NODE_HANDSHAKE ;
2013-03-25 13:03:01 +01:00
link - > node - > flags | = flags & ( REDIS_NODE_MASTER | REDIS_NODE_SLAVE ) ;
2013-10-03 09:55:20 +02:00
clusterDoBeforeSleep ( CLUSTER_TODO_SAVE_CONFIG ) ;
2011-03-29 17:51:15 +02:00
} else if ( memcmp ( link - > node - > name , hdr - > sender ,
/* If the reply has a non matching node ID we
* disconnect this node and set it as not having an associated
* address . */
redisLog ( REDIS_DEBUG , " PONG contains mismatching sender ID " ) ;
link - > node - > flags | = REDIS_NODE_NOADDR ;
2013-02-27 17:09:33 +01:00
link - > node - > ip [ 0 ] = ' \0 ' ;
link - > node - > port = 0 ;
2011-03-29 17:51:15 +02:00
freeClusterLink ( link ) ;
2013-10-03 09:55:20 +02:00
clusterDoBeforeSleep ( CLUSTER_TODO_SAVE_CONFIG ) ;
2011-03-29 17:51:15 +02:00
return 0 ;
2013-03-13 18:38:08 +01:00
2013-06-12 10:50:07 -07:00
/* Update the node address if it changed. */
if ( sender & & type = = CLUSTERMSG_TYPE_PING & &
2014-01-29 12:17:16 +01:00
! nodeInHandshake ( sender ) & &
2013-06-12 10:50:07 -07:00
nodeUpdateAddressIfNeeded ( sender , link , ntohs ( hdr - > port ) ) )
2013-10-03 09:55:20 +02:00
2013-06-12 10:50:07 -07:00
2011-03-29 17:51:15 +02:00
/* Update our info about the node */
2013-04-11 18:55:58 +02:00
if ( link - > node & & type = = CLUSTERMSG_TYPE_PONG ) {
2013-10-09 16:18:33 +02:00
link - > node - > pong_received = mstime ( ) ;
2013-04-11 18:55:58 +02:00
link - > node - > ping_sent = 0 ;
/* The PFAIL condition can be reversed without external
2013-06-11 21:33:00 +02:00
* help if it is momentary ( that is , if it does not
2013-04-11 18:55:58 +02:00
* turn into a FAIL state ) .
* The FAIL condition is also reversible under specific
* conditions detected by clearNodeFailureIfNeeded ( ) . */
2014-01-29 12:17:16 +01:00
if ( nodeTimedOut ( link - > node ) ) {
2013-04-11 18:55:58 +02:00
link - > node - > flags & = ~ REDIS_NODE_PFAIL ;
2013-10-03 09:55:20 +02:00
clusterDoBeforeSleep ( CLUSTER_TODO_SAVE_CONFIG |
2014-01-29 12:17:16 +01:00
} else if ( nodeFailed ( link - > node ) ) {
2013-04-11 18:55:58 +02:00
clearNodeFailureIfNeeded ( link - > node ) ;
2011-03-29 17:51:15 +02:00
2013-11-08 10:32:16 +01:00
/* Check for role switch: slave -> master or master -> slave. */
2011-03-29 17:51:15 +02:00
if ( sender ) {
if ( ! memcmp ( hdr - > slaveof , REDIS_NODE_NULL_NAME ,
sizeof ( hdr - > slaveof ) ) )
2013-03-15 16:35:16 +01:00
/* Node is a master. */
2013-11-08 17:02:10 +01:00
clusterSetNodeAsMaster ( sender ) ;
2011-03-29 17:51:15 +02:00
} else {
2013-03-15 16:35:16 +01:00
/* Node is a slave. */
2011-03-29 17:51:15 +02:00
clusterNode * master = clusterLookupNode ( hdr - > slaveof ) ;
2014-01-29 12:17:16 +01:00
if ( nodeIsMaster ( sender ) ) {
2013-03-20 00:30:47 +01:00
/* Master turned into a slave! Reconfigure the node. */
2013-03-15 16:35:16 +01:00
clusterDelNodeSlots ( sender ) ;
2013-03-19 16:01:30 +01:00
sender - > flags & = ~ REDIS_NODE_MASTER ;
sender - > flags | = REDIS_NODE_SLAVE ;
2013-03-20 00:30:47 +01:00
2013-03-19 16:01:30 +01:00
/* Remove the list of slaves from the node. */
if ( sender - > numslaves ) clusterNodeResetSlaves ( sender ) ;
2013-03-20 00:30:47 +01:00
/* Update config and state. */
2013-10-03 09:55:20 +02:00
clusterDoBeforeSleep ( CLUSTER_TODO_SAVE_CONFIG |
2013-03-15 16:35:16 +01:00
2013-03-19 16:01:30 +01:00
/* Master node changed for this slave? */
2014-02-10 18:33:34 +01:00
if ( master & & sender - > slaveof ! = master ) {
2013-03-25 15:01:25 +01:00
if ( sender - > slaveof )
clusterNodeRemoveSlave ( sender - > slaveof , sender ) ;
2013-03-05 11:50:11 +01:00
clusterNodeAddSlave ( master , sender ) ;
sender - > slaveof = master ;
2013-10-02 12:27:12 +02:00
/* Update config. */
2013-10-03 09:55:20 +02:00
clusterDoBeforeSleep ( CLUSTER_TODO_SAVE_CONFIG ) ;
2013-03-05 11:50:11 +01:00
2011-03-29 17:51:15 +02:00
2013-03-15 16:15:40 +01:00
/* Update our info about served slots.
2013-11-08 10:32:16 +01:00
2013-03-15 16:15:40 +01:00
* Note : this MUST happen after we update the master / slave state
* so that REDIS_NODE_MASTER flag will be set . */
2013-11-08 10:32:16 +01:00
/* Many checks are only needed if the set of served slots this
2013-12-25 17:57:36 +01:00
* instance claims is different compared to the set of slots we have
* for it . Check this ASAP to avoid other computational expansive
* checks later . */
clusterNode * sender_master = NULL ; /* Sender or its master if slave. */
2013-11-08 10:32:16 +01:00
int dirty_slots = 0 ; /* Sender claimed slots don't match my view? */
if ( sender ) {
2014-01-29 12:17:16 +01:00
sender_master = nodeIsMaster ( sender ) ? sender : sender - > slaveof ;
2013-11-08 10:32:16 +01:00
if ( sender_master ) {
dirty_slots = memcmp ( sender_master - > slots ,
hdr - > myslots , sizeof ( hdr - > myslots ) ) ! = 0 ;
2013-12-25 17:57:36 +01:00
/* 1) If the sender of the message is a master, and we detected that
* the set of slots it claims changed , scan the slots to see if we
* need to update our configuration . */
2014-01-29 12:17:16 +01:00
if ( sender & & nodeIsMaster ( sender ) & & dirty_slots )
2013-11-08 17:02:10 +01:00
clusterUpdateSlotsConfigWith ( sender , senderConfigEpoch , hdr - > myslots ) ;
2013-11-08 10:32:16 +01:00
2013-12-25 17:57:36 +01:00
/* 2) We also check for the reverse condition, that is, the sender
* claims to serve slots we know are served by a master with a
* greater configEpoch . If this happens we inform the sender .
2013-11-08 10:32:16 +01:00
2013-12-25 17:57:36 +01:00
* This is useful because sometimes after a partition heals , a
* reappearing master may be the last one to claim a given set of
* hash slots , but with a configuration that other instances know to
* be deprecated . Example :
2013-11-08 10:32:16 +01:00
* A and B are master and slave for slots 1 , 2 , 3.
* A is partitioned away , B gets promoted .
* B is partitioned away , and A returns available .
* Usually B would PING A publishing its set of served slots and its
2013-12-25 17:57:36 +01:00
* configEpoch , but because of the partition B can ' t inform A of the
* new configuration , so other nodes that have an updated table must
* do it . In this way A will stop to act as a master ( or can try to
* failover if there are the conditions to win the election ) . */
2013-11-08 10:32:16 +01:00
if ( sender & & dirty_slots ) {
int j ;
for ( j = 0 ; j < REDIS_CLUSTER_SLOTS ; j + + ) {
if ( bitmapTestBit ( hdr - > myslots , j ) ) {
if ( server . cluster - > slots [ j ] = = sender | |
server . cluster - > slots [ j ] = = NULL ) continue ;
if ( server . cluster - > slots [ j ] - > configEpoch >
senderConfigEpoch )
2014-02-11 10:18:24 +01:00
redisLog ( REDIS_VERBOSE ,
2013-11-08 16:26:50 +01:00
" Node %.40s has old slots configuration, sending "
2013-11-08 17:27:59 +01:00
" an UPDATE message about %.40s " ,
2013-11-08 16:26:50 +01:00
sender - > name , server . cluster - > slots [ j ] - > name ) ;
clusterSendUpdate ( sender - > link , server . cluster - > slots [ j ] ) ;
2013-11-08 17:25:49 +01:00
/* TODO: instead of exiting the loop send every other
* UPDATE packet for other nodes that are the new owner
* of sender ' s slots . */
break ;
2013-11-08 10:32:16 +01:00
2013-09-30 11:44:23 +02:00
2011-03-29 17:51:15 +02:00
/* Get info from the gossip section */
clusterProcessGossipSection ( hdr , link ) ;
2013-10-02 09:42:35 +02:00
} else if ( type = = CLUSTERMSG_TYPE_FAIL ) {
2011-03-29 17:51:15 +02:00
clusterNode * failing ;
2013-10-02 09:42:35 +02:00
if ( sender ) {
failing = clusterLookupNode ( hdr - > data . fail . about . nodename ) ;
2013-12-25 17:57:36 +01:00
if ( failing & &
! ( failing - > flags & ( REDIS_NODE_FAIL | REDIS_NODE_MYSELF ) ) )
2013-10-02 09:42:35 +02:00
redisLog ( REDIS_NOTICE ,
" FAIL message received from %.40s about %.40s " ,
hdr - > sender , hdr - > data . fail . about . nodename ) ;
failing - > flags | = REDIS_NODE_FAIL ;
2013-10-09 16:18:33 +02:00
failing - > fail_time = mstime ( ) ;
2013-10-02 09:42:35 +02:00
failing - > flags & = ~ REDIS_NODE_PFAIL ;
2013-10-03 09:55:20 +02:00
2013-10-02 09:42:35 +02:00
} else {
2011-03-29 17:51:15 +02:00
redisLog ( REDIS_NOTICE ,
2013-10-02 09:42:35 +02:00
" Ignoring FAIL message from unknonw node %.40s about %.40s " ,
2011-03-29 17:51:15 +02:00
hdr - > sender , hdr - > data . fail . about . nodename ) ;
2011-10-07 16:34:16 +02:00
} else if ( type = = CLUSTERMSG_TYPE_PUBLISH ) {
robj * channel , * message ;
uint32_t channel_len , message_len ;
2013-10-03 09:55:20 +02:00
/* Don't bother creating useless objects if there are no
* Pub / Sub subscribers . */
2013-12-25 17:57:36 +01:00
if ( dictSize ( server . pubsub_channels ) | |
listLength ( server . pubsub_patterns ) )
2011-10-07 16:34:16 +02:00
channel_len = ntohl ( hdr - > data . publish . msg . channel_len ) ;
message_len = ntohl ( hdr - > data . publish . msg . message_len ) ;
channel = createStringObject (
( char * ) hdr - > data . publish . msg . bulk_data , channel_len ) ;
message = createStringObject (
2013-12-25 17:57:36 +01:00
( char * ) hdr - > data . publish . msg . bulk_data + channel_len ,
message_len ) ;
2011-10-07 16:34:16 +02:00
pubsubPublishMessage ( channel , message ) ;
decrRefCount ( channel ) ;
decrRefCount ( message ) ;
2013-03-14 21:21:58 +01:00
2013-06-12 10:50:07 -07:00
if ( ! sender ) return 1 ; /* We don't know that node. */
2013-09-20 09:22:21 +02:00
clusterSendFailoverAuthIfNeeded ( sender , hdr ) ;
2013-03-14 21:21:58 +01:00
} else if ( type = = CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK ) {
2013-06-12 10:50:07 -07:00
if ( ! sender ) return 1 ; /* We don't know that node. */
2013-10-01 15:40:20 +02:00
/* We consider this vote only if the sender is a master serving
2013-10-01 17:21:28 +02:00
* a non zero number of slots , and its currentEpoch is greater or
* equal to epoch where this node started the election . */
2014-01-29 12:17:16 +01:00
if ( nodeIsMaster ( sender ) & & sender - > numslots > 0 & &
2013-10-01 17:21:28 +02:00
senderCurrentEpoch > = server . cluster - > failover_auth_epoch )
2013-09-26 13:00:41 +02:00
2013-03-14 21:21:58 +01:00
server . cluster - > failover_auth_count + + ;
2013-09-26 16:54:43 +02:00
/* Maybe we reached a quorum here, set a flag to make sure
* we check ASAP . */
2013-10-03 09:55:20 +02:00
2013-09-26 13:00:41 +02:00
2014-02-05 13:01:24 +01:00
} else if ( type = = CLUSTERMSG_TYPE_MFSTART ) {
/* This message is acceptable only if I'm a master and the sender
* is one of my slaves . */
if ( ! sender | | sender - > slaveof ! = myself ) return 1 ;
/* Manual failover requested from slaves. Initialize the state
* accordingly . */
resetManualFailover ( ) ;
server . cluster - > mf_end = mstime ( ) + REDIS_CLUSTER_MF_TIMEOUT ;
server . cluster - > mf_slave = sender ;
pauseClients ( mstime ( ) + ( REDIS_CLUSTER_MF_TIMEOUT * 2 ) ) ;
redisLog ( REDIS_WARNING , " Manual failover requested by slave %.40s. " ,
sender - > name ) ;
2013-11-08 17:02:10 +01:00
} else if ( type = = CLUSTERMSG_TYPE_UPDATE ) {
clusterNode * n ; /* The node the update is about. */
uint64_t reportedConfigEpoch = ntohu64 ( hdr - > data . update . nodecfg . configEpoch ) ;
if ( ! sender ) return 1 ; /* We don't know the sender. */
n = clusterLookupNode ( hdr - > data . update . nodecfg . nodename ) ;
if ( ! n ) return 1 ; /* We don't know the reported node. */
if ( n - > configEpoch > = reportedConfigEpoch ) return 1 ; /* Nothing new. */
/* If in our current config the node is a slave, set it as a master. */
2014-01-29 12:17:16 +01:00
if ( nodeIsSlave ( n ) ) clusterSetNodeAsMaster ( n ) ;
2013-11-08 17:02:10 +01:00
2013-12-25 17:57:36 +01:00
/* Check the bitmap of served slots and update our
* config accordingly. */
2013-11-08 17:02:10 +01:00
clusterUpdateSlotsConfigWith ( n , reportedConfigEpoch ,
hdr - > data . update . nodecfg . slots ) ;
2011-03-29 17:51:15 +02:00
} else {
2011-10-07 15:37:34 +02:00
redisLog ( REDIS_WARNING , " Received unknown packet type: %d " , type ) ;
2011-03-29 17:51:15 +02:00
return 1 ;
/* Called when an I/O error (or an invalid packet) is detected on a cluster
 * bus link.
 *
 * The only action taken here is releasing the link via freeClusterLink().
 * According to the original notes for this function, the cluster cron is
 * expected to notice the missing link later and try to reconnect, while a
 * temporary node used to accept a query is freed completely on error. */
void handleLinkIOError(clusterLink *link) {
    freeClusterLink(link);
}
/* Writable-event handler for a cluster bus link.
 *
 * Flushes as much of link->sndbuf as a single write() accepts. The bytes
 * that were written are trimmed from the head of the buffer, and once the
 * buffer is empty the AE_WRITABLE event is removed so the handler is not
 * called again until new data is queued. A write() result <= 0 is treated
 * as a link error and the link is released.
 *
 * No attempt is made to optimize this for speed, as this is a very low
 * traffic channel. */
void clusterWriteHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
    clusterLink *link = (clusterLink*) privdata;
    ssize_t sent;
    REDIS_NOTUSED(mask);

    sent = write(fd, link->sndbuf, sdslen(link->sndbuf));
    if (sent > 0) {
        /* Drop what was sent, keep the rest for the next writable event. */
        sdsrange(link->sndbuf, sent, -1);
        if (sdslen(link->sndbuf) == 0)
            aeDeleteFileEvent(server.el, link->fd, AE_WRITABLE);
        return;
    }

    redisLog(REDIS_DEBUG, "I/O error writing to node link: %s",
        strerror(errno));
    handleLinkIOError(link);
}
/* Readable-event handler for a cluster bus link.
 *
 * Data is accumulated in link->rcvbuf. The first 8 bytes of a message are
 * read first, since they carry the signature and the total length. Once
 * exactly 8 bytes are buffered, the signature and length are validated, then
 * the remainder of the message is read in chunks of at most
 * sizeof(clusterMsg) bytes. When the buffer holds exactly 'totlen' bytes the
 * packet is handed to clusterProcessPacket(), the buffer is reset, and the
 * loop goes on with the next message.
 *
 * The function returns when there is no more data (EAGAIN), when the link
 * is released because of an error, or when clusterProcessPacket() reports
 * that the link is no longer valid. */
void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask) {
    char buf[sizeof(clusterMsg)];
    clusterLink *link = (clusterLink*) privdata;
    clusterMsg *hdr;
    ssize_t nread;
    int want, have;
    REDIS_NOTUSED(mask);

    for (;;) { /* Read as long as there is data to read. */
        have = sdslen(link->rcvbuf);
        if (have < 8) {
            /* Still missing part of the 8 bytes holding signature and
             * total message length. */
            want = 8 - have;
        } else {
            hdr = (clusterMsg*) link->rcvbuf;
            /* Sanity check signature and length as soon as the first
             * 8 bytes are available. */
            if (have == 8 &&
                (memcmp(hdr->sig, "RCmb", 4) != 0 ||
                 ntohl(hdr->totlen) < CLUSTERMSG_MIN_LEN))
            {
                redisLog(REDIS_WARNING,
                    "Bad message length or signature received "
                    "from Cluster bus.");
                handleLinkIOError(link);
                return;
            }
            /* Read the rest of the message, one local buffer at a time. */
            want = ntohl(hdr->totlen) - have;
            if (want > sizeof(buf)) want = sizeof(buf);
        }

        nread = read(fd, buf, want);
        if (nread == -1 && errno == EAGAIN) return; /* No more data ready. */

        if (nread <= 0) {
            /* I/O error or connection closed by the peer. */
            redisLog(REDIS_DEBUG, "I/O error reading from node link: %s",
                (nread == 0) ? "connection closed" : strerror(errno));
            handleLinkIOError(link);
            return;
        }

        /* Append the new data; sdscatlen() may move the buffer, so the
         * header pointer is recomputed. */
        link->rcvbuf = sdscatlen(link->rcvbuf, buf, nread);
        hdr = (clusterMsg*) link->rcvbuf;
        have += nread;

        /* Total length obtained? Process this packet. */
        if (have >= 8 && have == ntohl(hdr->totlen)) {
            if (!clusterProcessPacket(link)) return; /* Link no longer valid. */
            sdsfree(link->rcvbuf);
            link->rcvbuf = sdsempty();
        }
    }
}
2013-09-26 16:54:43 +02:00
/* Put stuff into the send buffer.
* It is guaranteed that this function will never have as a side effect
* the link to be invalidated , so it is safe to call this function
* from event handlers that will do stuff with the same link later . */
2011-03-29 17:51:15 +02:00
void clusterSendMessage ( clusterLink * link , unsigned char * msg , size_t msglen ) {
if ( sdslen ( link - > sndbuf ) = = 0 & & msglen ! = 0 )
aeCreateFileEvent ( server . el , link - > fd , AE_WRITABLE ,
clusterWriteHandler , link ) ;
link - > sndbuf = sdscatlen ( link - > sndbuf , msg , msglen ) ;
2013-10-02 10:10:08 +02:00
server . cluster - > stats_bus_messages_sent + + ;
2011-03-29 17:51:15 +02:00
2013-03-15 15:36:36 +01:00
/* Send a message to all the nodes that are part of the cluster having
2013-09-26 16:54:43 +02:00
* a connected link .
* It is guaranteed that this function will never have as a side effect
* some node - > link to be invalidated , so it is safe to call this function
* from event handlers that will do stuff with node links later . */
2011-10-07 15:37:34 +02:00
void clusterBroadcastMessage ( void * buf , size_t len ) {
dictIterator * di ;
dictEntry * de ;
2013-09-04 10:07:50 +02:00
di = dictGetSafeIterator ( server . cluster - > nodes ) ;
2011-10-07 15:37:34 +02:00
while ( ( de = dictNext ( di ) ) ! = NULL ) {
2011-11-08 17:07:55 +01:00
clusterNode * node = dictGetVal ( de ) ;
2011-10-07 15:37:34 +02:00
if ( ! node - > link ) continue ;
2013-03-15 15:36:36 +01:00
if ( node - > flags & ( REDIS_NODE_MYSELF | REDIS_NODE_HANDSHAKE ) )
continue ;
2011-10-07 15:37:34 +02:00
clusterSendMessage ( node - > link , buf , len ) ;
dictReleaseIterator ( di ) ;
2011-03-29 17:51:15 +02:00
/* Build the message header.
 *
 * 'hdr' is zeroed and then filled with the signature, the message 'type',
 * and the fields describing this node: name, slots bitmap, master name,
 * port, flags, cluster state, epochs, replication offset and message flags.
 *
 * The total length is computed here only for FAIL and UPDATE messages.
 * For every other type 'totlen' is left at zero and must be set by the
 * caller. */
void clusterBuildMessageHdr(clusterMsg *hdr, int type) {
    int totlen = 0;
    uint64_t offset;
    clusterNode *master;

    /* If this node is a master, we send its slots bitmap and configEpoch.
     * If this node is a slave we send the master's information instead (the
     * node is flagged as slave so the receiver knows that it is NOT really
     * in charge of these slots). */
    master = (nodeIsSlave(myself) && myself->slaveof) ?
              myself->slaveof : myself;

    memset(hdr,0,sizeof(*hdr));
    hdr->sig[0] = 'R';
    hdr->sig[1] = 'C';
    hdr->sig[2] = 'm';
    hdr->sig[3] = 'b';
    hdr->type = htons(type);
    memcpy(hdr->sender,myself->name,REDIS_CLUSTER_NAMELEN);

    memcpy(hdr->myslots,master->slots,sizeof(hdr->myslots));
    memset(hdr->slaveof,0,REDIS_CLUSTER_NAMELEN);
    if (myself->slaveof != NULL)
        memcpy(hdr->slaveof,myself->slaveof->name,REDIS_CLUSTER_NAMELEN);
    hdr->port = htons(server.port);
    hdr->flags = htons(myself->flags);
    hdr->state = server.cluster->state;

    /* Set the currentEpoch and configEpochs. */
    hdr->currentEpoch = htonu64(server.cluster->currentEpoch);
    hdr->configEpoch = htonu64(master->configEpoch);

    /* Set the replication offset: a slave advertises its own slave offset,
     * any other node its master replication offset. The two assignments
     * must be mutually exclusive, otherwise the slave offset would always
     * be overwritten. */
    if (nodeIsSlave(myself))
        offset = replicationGetSlaveOffset();
    else
        offset = server.master_repl_offset;
    hdr->offset = htonu64(offset);

    /* Set the message flags. */
    if (nodeIsMaster(myself) && server.cluster->mf_end)
        hdr->mflags[0] |= CLUSTERMSG_FLAG0_PAUSED;

    /* Compute the message length for certain messages. For other messages
     * this is up to the caller. */
    if (type == CLUSTERMSG_TYPE_FAIL) {
        totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
        totlen += sizeof(clusterMsgDataFail);
    } else if (type == CLUSTERMSG_TYPE_UPDATE) {
        totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
        totlen += sizeof(clusterMsgDataUpdate);
    }
    hdr->totlen = htonl(totlen);
    /* For PING, PONG, and MEET, fixing the totlen field is up to the caller. */
}
2011-03-29 17:51:15 +02:00
/* Send a PING or PONG packet (selected by 'type') on the specified link,
 * adding up to three gossip sections about randomly selected nodes.
 *
 * When the link is attached to a node and the packet is a PING, the node
 * 'ping_sent' time is updated to the current time. */
void clusterSendPing(clusterLink *link, int type) {
    unsigned char buf[sizeof(clusterMsg)];
    clusterMsg *hdr = (clusterMsg*) buf;
    int gossipcount = 0, totlen;
    /* freshnodes is the number of nodes we can still use to populate the
     * gossip section of the ping packet. Basically we start with the nodes
     * we have in memory minus two (ourself and the node we are sending the
     * message to). Every time we add a node we decrement the counter, so when
     * it will drop to <= zero we know there is no more gossip info we can
     * send. */
    int freshnodes = dictSize(server.cluster->nodes)-2;

    if (link->node && type == CLUSTERMSG_TYPE_PING)
        link->node->ping_sent = mstime();
    clusterBuildMessageHdr(hdr,type);

    /* Populate the gossip fields */
    while(freshnodes > 0 && gossipcount < 3) {
        struct dictEntry *de = dictGetRandomKey(server.cluster->nodes);
        clusterNode *this = dictGetVal(de);
        clusterMsgDataGossip *gossip;
        int j;

        /* In the gossip section don't include:
         * 1) Myself.
         * 2) Nodes in HANDSHAKE state.
         * 3) Nodes with the NOADDR flag set.
         * 4) Disconnected nodes if they don't have configured slots.
         */
        if (this == myself ||
            this->flags & (REDIS_NODE_HANDSHAKE|REDIS_NODE_NOADDR) ||
            (this->link == NULL && this->numslots == 0))
        {
            freshnodes--; /* otherwise we may loop forever. */
            continue;
        }

        /* Check if we already added this node */
        for (j = 0; j < gossipcount; j++) {
            if (memcmp(hdr->data.ping.gossip[j].nodename,this->name,
                    REDIS_CLUSTER_NAMELEN) == 0) break;
        }
        if (j != gossipcount) continue;

        /* Add it */
        freshnodes--;
        gossip = &(hdr->data.ping.gossip[gossipcount]);
        memcpy(gossip->nodename,this->name,REDIS_CLUSTER_NAMELEN);
        gossip->ping_sent = htonl(this->ping_sent);
        gossip->pong_received = htonl(this->pong_received);
        memcpy(gossip->ip,this->ip,sizeof(this->ip));
        gossip->port = htons(this->port);
        gossip->flags = htons(this->flags);
        gossipcount++;
    }

    /* The header builder leaves totlen to the caller for this message
     * type: header plus one gossip section per selected node. */
    totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
    totlen += (sizeof(clusterMsgDataGossip)*gossipcount);
    hdr->count = htons(gossipcount);
    hdr->totlen = htonl(totlen);
    clusterSendMessage(link,buf,totlen);
}
2013-09-26 16:54:43 +02:00
/* Send a PONG packet to every connected node that's not in handshake state
* and for which we have a valid link .
2013-03-15 15:43:53 +01:00
2013-09-26 16:54:43 +02:00
* In Redis Cluster pongs are not used just for failure detection , but also
* to carry important configuration information . So broadcasting a pong is
2013-03-15 15:43:53 +01:00
* useful when something changes in the configuration and we want to make
2014-01-29 11:08:52 +01:00
* the cluster aware ASAP ( for instance after a slave promotion ) .
* The ' target ' argument specifies the receiving instances using the
* defines below :
* CLUSTER_BROADCAST_ALL - > All known instances .
* CLUSTER_BROADCAST_LOCAL_SLAVES - > All slaves in my master - slaves ring .
void clusterBroadcastPong ( int target ) {
2013-03-15 15:43:53 +01:00
dictIterator * di ;
dictEntry * de ;
2013-09-04 10:07:50 +02:00
di = dictGetSafeIterator ( server . cluster - > nodes ) ;
2013-03-15 15:43:53 +01:00
while ( ( de = dictNext ( di ) ) ! = NULL ) {
clusterNode * node = dictGetVal ( de ) ;
2013-09-26 16:54:43 +02:00
if ( ! node - > link ) continue ;
2014-01-29 12:17:16 +01:00
if ( node = = myself | | nodeInHandshake ( node ) ) continue ;
2014-01-29 11:08:52 +01:00
int local_slave =
2014-01-29 12:17:16 +01:00
nodeIsSlave ( node ) & & node - > slaveof & &
2014-01-29 11:08:52 +01:00
( node - > slaveof = = myself | | node - > slaveof = = myself - > slaveof ) ;
if ( ! local_slave ) continue ;
2013-03-15 15:43:53 +01:00
clusterSendPing ( node - > link , CLUSTERMSG_TYPE_PONG ) ;
dictReleaseIterator ( di ) ;
2011-10-07 15:37:34 +02:00
/* Send a PUBLISH message.
 *
 * If 'link' is not NULL the message is sent only to that link; if it is
 * NULL the message is broadcasted to the whole cluster.
 *
 * 'channel' and 'message' are decoded with getDecodedObject() and the
 * resulting references are released before returning. The packet is built
 * in a stack buffer when it fits, otherwise in a temporary heap buffer that
 * is freed before returning. */
void clusterSendPublish(clusterLink *link, robj *channel, robj *message) {
    unsigned char buf[sizeof(clusterMsg)], *payload;
    clusterMsg *hdr = (clusterMsg*) buf;
    uint32_t totlen;
    uint32_t channel_len, message_len;

    channel = getDecodedObject(channel);
    message = getDecodedObject(message);
    channel_len = sdslen(channel->ptr);
    message_len = sdslen(message->ptr);

    clusterBuildMessageHdr(hdr,CLUSTERMSG_TYPE_PUBLISH);
    totlen = sizeof(clusterMsg)-sizeof(union clusterMsgData);
    totlen += sizeof(clusterMsgDataPublish) + channel_len + message_len;

    hdr->data.publish.msg.channel_len = htonl(channel_len);
    hdr->data.publish.msg.message_len = htonl(message_len);
    hdr->totlen = htonl(totlen);

    /* Try to use the local buffer if possible */
    if (totlen < sizeof(buf)) {
        payload = buf;
    } else {
        payload = zmalloc(totlen);
        memcpy(payload,hdr,sizeof(*hdr));
        hdr = (clusterMsg*) payload;
    }

    /* Channel name first, message body right after it. */
    memcpy(hdr->data.publish.msg.bulk_data,channel->ptr,channel_len);
    memcpy(hdr->data.publish.msg.bulk_data+channel_len,
        message->ptr,message_len);

    /* Either a single destination or everybody, never both. */
    if (link)
        clusterSendMessage(link,payload,totlen);
    else
        clusterBroadcastMessage(payload,totlen);

    decrRefCount(channel);
    decrRefCount(message);
    if (payload != buf) zfree(payload);
}
2011-03-29 17:51:15 +02:00
/* Send a FAIL message to all the nodes we are able to contact.
* The FAIL message is sent when we detect that a node is failing
* ( REDIS_NODE_PFAIL ) and we also receive a gossip confirmation of this :
* we switch the node state to REDIS_NODE_FAIL and ask all the other
* nodes to do the same ASAP . */
void clusterSendFail ( char * nodename ) {
2013-11-08 17:19:19 +01:00
unsigned char buf [ sizeof ( clusterMsg ) ] ;
2011-03-29 17:51:15 +02:00
clusterMsg * hdr = ( clusterMsg * ) buf ;
clusterBuildMessageHdr ( hdr , CLUSTERMSG_TYPE_FAIL ) ;
memcpy ( hdr - > data . fail . about . nodename , nodename , REDIS_CLUSTER_NAMELEN ) ;
clusterBroadcastMessage ( buf , ntohl ( hdr - > totlen ) ) ;
2013-11-08 16:26:50 +01:00
/* Send an UPDATE message on 'link' describing the slots configuration of
 * 'node': its name, configEpoch and slots bitmap are included.
 *
 * Nothing is sent when 'link' is NULL. */
void clusterSendUpdate(clusterLink *link, clusterNode *node) {
    unsigned char buf[sizeof(clusterMsg)];
    clusterMsg *msg = (clusterMsg*) buf;
    size_t msglen;

    if (link == NULL) return;

    /* For UPDATE messages the header builder also fills in totlen. */
    clusterBuildMessageHdr(msg, CLUSTERMSG_TYPE_UPDATE);
    memcpy(msg->data.update.nodecfg.nodename, node->name,
        REDIS_CLUSTER_NAMELEN);
    msg->data.update.nodecfg.configEpoch = htonu64(node->configEpoch);
    memcpy(msg->data.update.nodecfg.slots, node->slots, sizeof(node->slots));

    msglen = ntohl(msg->totlen);
    clusterSendMessage(link, buf, msglen);
}
2011-10-07 15:37:34 +02:00
/* -----------------------------------------------------------------------------
 * CLUSTER Pub/Sub support
 *
 * For now we do very little, just propagating PUBLISH messages across the whole
 * cluster. In the future we'll try to get smarter and avoid propagating those
 * messages to hosts without receivers for a given channel.
 * -------------------------------------------------------------------------- */
/* Propagate a PUBLISH of 'message' on 'channel' to the other nodes. A NULL
 * link is passed to clusterSendPublish(), which selects the broadcast path. */
void clusterPropagatePublish(robj *channel, robj *message) {
    clusterLink *everybody = NULL;

    clusterSendPublish(everybody, channel, message);
}
2013-03-13 12:44:02 +01:00
/* -----------------------------------------------------------------------------
* SLAVE node specific functions
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
2013-03-13 17:21:20 +01:00
/* This function sends a FAILOVE_AUTH_REQUEST message to every node in order to
* see if there is the quorum for this slave instance to failover its failing
* master .
* Note that we send the failover request to everybody , master and slave nodes ,
* but only the masters are supposed to reply to our query . */
void clusterRequestFailoverAuth ( void ) {
2013-11-08 17:19:19 +01:00
unsigned char buf [ sizeof ( clusterMsg ) ] ;
2013-03-13 17:21:20 +01:00
clusterMsg * hdr = ( clusterMsg * ) buf ;
uint32_t totlen ;
clusterBuildMessageHdr ( hdr , CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST ) ;
2014-02-05 13:10:03 +01:00
/* If this is a manual failover, set the CLUSTERMSG_FLAG0_FORCEACK bit
* in the header to communicate the nodes receiving the message that
* they should authorized the failover even if the master is working . */
if ( server . cluster - > mf_end ) hdr - > mflags [ 0 ] | = CLUSTERMSG_FLAG0_FORCEACK ;
2013-03-13 17:21:20 +01:00
totlen = sizeof ( clusterMsg ) - sizeof ( union clusterMsgData ) ;
hdr - > totlen = htonl ( totlen ) ;
2013-03-14 21:27:12 +01:00
clusterBroadcastMessage ( buf , totlen ) ;
2013-03-13 17:21:20 +01:00
2013-09-30 16:19:44 +02:00
/* Send a FAILOVER_AUTH_ACK message (a header-only packet) to the specified
 * node. Nothing is sent if the node has no link. */
void clusterSendFailoverAuth(clusterNode *node) {
    unsigned char buf[sizeof(clusterMsg)];
    clusterMsg *ack = (clusterMsg*) buf;
    uint32_t acklen = sizeof(clusterMsg)-sizeof(union clusterMsgData);

    if (node->link == NULL) return;

    clusterBuildMessageHdr(ack, CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK);
    ack->totlen = htonl(acklen);
    clusterSendMessage(node->link, buf, acklen);
}
2013-03-14 16:31:57 +01:00
2014-02-05 13:01:24 +01:00
/* Send a MFSTART message (a header-only packet) to the specified node.
 * Nothing is sent if the node has no link. */
void clusterSendMFStart(clusterNode *node) {
    unsigned char buf[sizeof(clusterMsg)];
    clusterMsg *msg = (clusterMsg*) buf;
    uint32_t msglen = sizeof(clusterMsg)-sizeof(union clusterMsgData);

    if (node->link == NULL) return;

    clusterBuildMessageHdr(msg, CLUSTERMSG_TYPE_MFSTART);
    msg->totlen = htonl(msglen);
    clusterSendMessage(node->link, buf, msglen);
}
2013-09-26 13:00:41 +02:00
/* Vote for the node asking for our vote if there are the conditions. */
2013-09-20 09:22:21 +02:00
void clusterSendFailoverAuthIfNeeded ( clusterNode * node , clusterMsg * request ) {
2013-03-13 19:08:03 +01:00
clusterNode * master = node - > slaveof ;
2013-10-08 12:45:35 +02:00
uint64_t requestCurrentEpoch = ntohu64 ( request - > currentEpoch ) ;
uint64_t requestConfigEpoch = ntohu64 ( request - > configEpoch ) ;
unsigned char * claimed_slots = request - > myslots ;
2014-02-05 13:10:03 +01:00
int force_ack = request - > mflags [ 0 ] & CLUSTERMSG_FLAG0_FORCEACK ;
2013-10-08 12:45:35 +02:00
int j ;
2013-09-26 13:00:41 +02:00
/* IF we are not a master serving at least 1 slot, we don't have the
* right to vote , as the cluster size in Redis Cluster is the number
2013-11-29 16:17:05 +01:00
* of masters serving at least one slot , and quorum is the cluster
* size + 1 */
2014-01-29 12:17:16 +01:00
if ( nodeIsSlave ( myself ) | | myself - > numslots = = 0 ) return ;
2013-09-26 13:00:41 +02:00
/* Request epoch must be >= our currentEpoch. */
2014-01-20 11:19:04 +01:00
if ( requestCurrentEpoch < server . cluster - > currentEpoch ) return ;
2013-03-13 19:08:03 +01:00
2013-09-26 13:00:41 +02:00
/* I already voted for this epoch? Return ASAP. */
2014-01-20 11:19:04 +01:00
if ( server . cluster - > last_vote_epoch = = server . cluster - > currentEpoch ) return ;
2013-09-26 13:00:41 +02:00
2014-02-05 13:10:03 +01:00
/* Node must be a slave and its master down.
* The master can be non failing if the request is flagged
* with CLUSTERMSG_FLAG0_FORCEACK ( manual failover ) . */
if ( nodeIsMaster ( node ) | | master = = NULL | |
( ! nodeFailed ( master ) & & ! force_ack ) ) return ;
2013-03-13 19:08:03 +01:00
2013-09-26 13:00:41 +02:00
/* We did not voted for a slave about this master for two
* times the node timeout . This is not strictly needed for correctness
* of the algorithm but makes the base case more linear . */
2013-10-09 16:18:33 +02:00
if ( mstime ( ) - node - > slaveof - > voted_time < server . cluster_node_timeout * 2 )
return ;
2013-03-13 19:08:03 +01:00
2013-11-29 16:17:05 +01:00
/* The slave requesting the vote must have a configEpoch for the claimed
* slots that is > = the one of the masters currently serving the same
* slots in the current configuration . */
2013-10-08 12:45:35 +02:00
for ( j = 0 ; j < REDIS_CLUSTER_SLOTS ; j + + ) {
if ( bitmapTestBit ( claimed_slots , j ) = = 0 ) continue ;
if ( server . cluster - > slots [ j ] = = NULL | |
server . cluster - > slots [ j ] - > configEpoch < = requestConfigEpoch ) continue ;
/* If we reached this point we found a slot that in our current slots
* is served by a master with a greater configEpoch than the one claimed
* by the slave requesting our vote . Refuse to vote for this slave . */
return ;
2013-09-26 13:00:41 +02:00
/* We can vote for this slave. */
2013-09-30 16:19:44 +02:00
clusterSendFailoverAuth ( node ) ;
2013-09-26 13:00:41 +02:00
server . cluster - > last_vote_epoch = server . cluster - > currentEpoch ;
2013-10-09 16:18:33 +02:00
node - > slaveof - > voted_time = mstime ( ) ;
2013-03-13 18:38:08 +01:00
2014-01-29 16:39:04 +01:00
/* This function returns the "rank" of this instance, a slave, in the context
* of its master - slaves ring . The rank of the slave is given by the number of
* other slaves for the same master that have a better replication offset
* compared to the local one ( better means , greater , so they claim more data ) .
* A slave with rank 0 is the one with the greatest ( most up to date )
* replication offset , and so forth . Note that because how the rank is computed
* multiple slaves may have the same rank , in case they have the same offset .
* The slave rank is used to add a delay to start an election in order to
* get voted and replace a failing master . Slaves with better replication
* offsets are more likely to win . */
int clusterGetSlaveRank ( void ) {
long long myoffset ;
int j , rank = 0 ;
clusterNode * master ;
redisAssert ( nodeIsSlave ( myself ) ) ;
master = myself - > slaveof ;
if ( master = = NULL ) return 0 ; /* Never called by slaves without master. */
myoffset = replicationGetSlaveOffset ( ) ;
for ( j = 0 ; j < master - > numslaves ; j + + )
if ( master - > slaves [ j ] ! = myself & &
master - > slaves [ j ] - > repl_offset > myoffset ) rank + + ;
return rank ;
2013-03-13 12:44:02 +01:00
/* This function is called if we are a slave node and our master serving
2013-03-14 16:39:02 +01:00
* a non - zero amount of hash slots is in FAIL state .
2013-03-13 12:44:02 +01:00
* The goal of this function is:
* 1 ) To check if we are able to perform a failover , is our data updated ?
2013-09-26 11:13:17 +02:00
* 2 ) Try to get elected by masters .
2013-09-26 13:00:41 +02:00
* 3 ) Perform the failover informing all the other nodes .
2013-03-13 12:44:02 +01:00
// Slave-side failover driver. In order it: checks the preconditions, checks
// that the local data is recent enough, schedules an election time, asks for
// votes, and once failover_auth_count reaches the quorum promotes this node
// to master and claims the slots of the old master.
// NOTE(review): this copy of the function has no closing braces, has one
// unterminated comment, and is interleaved with commit timestamp lines, so it
// is not compilable as-is.
void clusterHandleSlaveFailover ( void ) {
2013-10-09 16:18:33 +02:00
mstime_t data_age ;
2013-09-26 11:13:17 +02:00
mstime_t auth_age = mstime ( ) - server . cluster - > failover_auth_time ;
2013-03-13 13:10:49 +01:00
int needed_quorum = ( server . cluster - > size / 2 ) + 1 ;
2014-02-05 16:01:52 +01:00
int manual_failover = server . cluster - > mf_end ! = 0 & &
server . cluster - > mf_can_start ;
2013-03-15 16:53:41 +01:00
int j ;
2014-03-10 09:57:52 +01:00
mstime_t auth_timeout , auth_retry_time ;
/* Compute the failover timeout (the max time we have to send votes
* and wait for replies ) , and the failover retry time ( the time to wait
* before waiting again .
* Timeout is MIN ( NODE_TIMEOUT * 2 , 2000 ) milliseconds .
* Retry is two times the Timeout .
// NOTE(review): the statements below raise auth_timeout to at least 2000,
// which is MAX(NODE_TIMEOUT*2, 2000) rather than the MIN stated above.
auth_timeout = server . cluster_node_timeout * 2 ;
if ( auth_timeout < 2000 ) auth_timeout = 2000 ;
auth_retry_time = auth_timeout * 2 ;
2013-03-13 13:10:49 +01:00
2013-09-26 16:54:43 +02:00
/* Pre conditions to run the function:
* 1 ) We are a slave .
2014-02-05 13:01:24 +01:00
* 2 ) Our master is flagged as FAIL , or this is a manual failover .
2013-09-26 16:54:43 +02:00
* 3 ) It is serving slots . */
2014-01-29 12:17:16 +01:00
if ( nodeIsMaster ( myself ) | |
2014-01-29 11:38:14 +01:00
myself - > slaveof = = NULL | |
2014-02-05 16:01:52 +01:00
( ! nodeFailed ( myself - > slaveof ) & & ! manual_failover ) | |
2014-01-29 11:38:14 +01:00
myself - > slaveof - > numslots = = 0 ) return ;
2013-09-26 16:54:43 +02:00
2014-01-30 16:34:23 +01:00
/* Set data_age to the number of seconds we are disconnected from
* the master . */
// NOTE(review): both branches multiply the difference by 1000 and the result
// is later compared with cluster_node_timeout, so data_age is presumably in
// milliseconds, not seconds - confirm.
if ( server . repl_state = = REDIS_REPL_CONNECTED ) {
data_age = ( server . unixtime - server . master - > lastinteraction ) * 1000 ;
} else {
data_age = ( server . unixtime - server . repl_down_since ) * 1000 ;
2013-09-26 11:13:17 +02:00
/* Remove the node timeout from the data age as it is fine that we are
* disconnected from our master at least for the time it was down to be
* flagged as FAIL , that ' s the baseline . */
if ( data_age > server . cluster_node_timeout )
data_age - = server . cluster_node_timeout ;
2013-03-13 13:10:49 +01:00
/* Check if our data is recent enough. For now we just use a fixed
* constant of ten times the node timeout since the cluster should
* react much faster to a master down . */
2013-04-04 12:02:48 +02:00
if ( data_age >
2013-12-22 10:05:16 +01:00
( server . repl_ping_slave_period * 1000 ) +
( server . cluster_node_timeout * REDIS_CLUSTER_SLAVE_VALIDITY_MULT ) )
2013-04-04 12:02:48 +02:00
return ;
2013-03-13 13:10:49 +01:00
2014-03-10 09:57:52 +01:00
/* If the previous failover attempt timedout and the retry time has
* elapsed , we can setup a new one . */
if ( auth_age > auth_retry_time ) {
2013-09-26 11:13:17 +02:00
server . cluster - > failover_auth_time = mstime ( ) +
500 + /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */
random ( ) % 500 ; /* Random delay between 0 and 500 milliseconds. */
2013-03-13 13:10:49 +01:00
server . cluster - > failover_auth_count = 0 ;
2013-09-26 11:13:17 +02:00
server . cluster - > failover_auth_sent = 0 ;
2014-01-29 16:51:11 +01:00
server . cluster - > failover_auth_rank = clusterGetSlaveRank ( ) ;
/* We add another delay that is proportional to the slave rank.
* Specifically 1 second * rank . This way slaves that have a probably
* less updated replication offset , are penalized . */
server . cluster - > failover_auth_time + =
server . cluster - > failover_auth_rank * 1000 ;
2014-02-05 13:01:24 +01:00
/* However if this is a manual failover, no delay is needed. */
if ( server . cluster - > mf_end ) {
server . cluster - > failover_auth_time = mstime ( ) ;
server . cluster - > failover_auth_rank = 0 ;
2013-11-29 16:17:05 +01:00
redisLog ( REDIS_WARNING ,
2014-01-29 17:16:10 +01:00
" Start of election delayed for %lld milliseconds "
" (rank #%d, offset %lld). " ,
2014-01-29 16:51:11 +01:00
server . cluster - > failover_auth_time - mstime ( ) ,
2014-01-29 17:16:10 +01:00
server . cluster - > failover_auth_rank ,
replicationGetSlaveOffset ( ) ) ;
2014-01-29 17:19:55 +01:00
/* Now that we have a scheduled election, broadcast our offset
* to all the other slaves so that they ' ll updated their offsets
* if our offset is better . */
// NOTE(review): no statement follows the comment above in this copy; the
// broadcast it describes appears to be missing - confirm against the
// canonical file.
2014-01-29 16:51:11 +01:00
return ;
/* It is possible that we received more updated offsets from other
* slaves for the same master since we computed our election delay .
* Update the delay if our rank changed . */
2014-02-05 13:01:24 +01:00
if ( server . cluster - > failover_auth_sent = = 0 & &
server . cluster - > mf_end = = 0 )
2014-01-29 16:51:11 +01:00
int newrank = clusterGetSlaveRank ( ) ;
// Only a worse (greater) rank adds delay: 1000 ms per lost position.
if ( newrank > server . cluster - > failover_auth_rank ) {
long long added_delay =
( newrank - server . cluster - > failover_auth_rank ) * 1000 ;
server . cluster - > failover_auth_time + = added_delay ;
server . cluster - > failover_auth_rank = newrank ;
redisLog ( REDIS_WARNING ,
" Slave rank updated to #%d, added %lld milliseconds of delay. " ,
newrank , added_delay ) ;
2013-09-26 11:13:17 +02:00
/* Return ASAP if we can't still start the election. */
if ( mstime ( ) < server . cluster - > failover_auth_time ) return ;
/* Return ASAP if the election is too old to be valid. */
2014-03-10 09:57:52 +01:00
if ( auth_age > auth_timeout ) return ;
2013-09-26 11:13:17 +02:00
/* Ask for votes if needed. */
if ( server . cluster - > failover_auth_sent = = 0 ) {
// A new election bumps currentEpoch and records it as failover_auth_epoch.
server . cluster - > currentEpoch + + ;
server . cluster - > failover_auth_epoch = server . cluster - > currentEpoch ;
redisLog ( REDIS_WARNING , " Starting a failover election for epoch %llu. " ,
2013-11-05 12:01:07 +01:00
( unsigned long long ) server . cluster - > currentEpoch ) ;
2013-03-14 16:39:02 +01:00
clusterRequestFailoverAuth ( ) ;
2013-09-26 11:13:17 +02:00
server . cluster - > failover_auth_sent = 1 ;
2013-10-03 09:55:20 +02:00
clusterDoBeforeSleep ( CLUSTER_TODO_SAVE_CONFIG |
// NOTE(review): the call above is cut off after the bitwise OR; the remaining
// flag operands and the closing parenthesis are missing in this copy.
2013-03-13 13:10:49 +01:00
return ; /* Wait for replies. */
/* Check if we reached the quorum. */
2013-03-15 13:20:23 +01:00
if ( server . cluster - > failover_auth_count > = needed_quorum ) {
2014-01-29 11:38:14 +01:00
// Keep a pointer to the old master: slaveof is cleared below, but its slot
// bitmap is still needed in step 2.
clusterNode * oldmaster = myself - > slaveof ;
2013-03-15 16:53:41 +01:00
2013-03-14 16:39:02 +01:00
redisLog ( REDIS_WARNING ,
2013-09-30 11:51:58 +02:00
" Failover election won: I'm the new master. " ) ;
2013-03-15 16:11:34 +01:00
/* We have the quorum, perform all the steps to correctly promote
* this slave to a master .
* 1 ) Turn this node into a master . */
2014-01-29 11:38:14 +01:00
clusterNodeRemoveSlave ( myself - > slaveof , myself ) ;
myself - > flags & = ~ REDIS_NODE_SLAVE ;
myself - > flags | = REDIS_NODE_MASTER ;
myself - > slaveof = NULL ;
2013-03-15 16:11:34 +01:00
replicationUnsetMaster ( ) ;
2013-03-15 16:53:41 +01:00
/* 2) Claim all the slots assigned to our master. */
for ( j = 0 ; j < REDIS_CLUSTER_SLOTS ; j + + ) {
if ( clusterNodeGetSlotBit ( oldmaster , j ) ) {
clusterDelSlot ( j ) ;
2014-01-29 11:38:14 +01:00
clusterAddSlot ( myself , j ) ;
2013-03-15 16:53:41 +01:00
2013-09-30 10:13:58 +02:00
/* 3) Update my configEpoch to the epoch of the election. */
2014-01-29 11:38:14 +01:00
myself - > configEpoch = server . cluster - > failover_auth_epoch ;
2013-09-26 11:13:17 +02:00
2013-09-30 10:13:58 +02:00
/* 4) Update state and save config. */
2013-03-15 16:53:41 +01:00
clusterUpdateState ( ) ;
2013-10-03 09:55:20 +02:00
clusterSaveConfigOrDie ( 1 ) ;
2013-09-30 10:13:58 +02:00
/* 5) Pong all the other nodes so that they can update the state
* accordingly and detect that we switched to master role . */
2014-01-29 11:08:52 +01:00
clusterBroadcastPong ( CLUSTER_BROADCAST_ALL ) ;
2014-02-05 13:01:24 +01:00
/* 6) If there was a manual failover in progress, clear the state. */
resetManualFailover ( ) ;
2013-03-13 13:10:49 +01:00
2013-03-13 12:44:02 +01:00
2014-01-30 18:05:11 +01:00
/* -----------------------------------------------------------------------------
* CLUSTER slave migration
* Slave migration is the process that allows a slave of a master that is
* already covered by at least another slave , to " migrate " to a master that
* is orphaned , that is , left with no working slaves .
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* This function is responsible to decide if this replica should be migrated
* to a different ( orphaned ) master . It is called by the clusterCron ( ) function
* only if :
* 1 ) We are a slave node .
* 2 ) It was detected that there is at least one orphaned master in
* the cluster .
* 3 ) We are a slave of one of the masters with the greatest number of
* slaves .
* This checks are performed by the caller since it requires to iterate
* the nodes anyway , so we spend time into clusterHandleSlaveMigration ( )
* if definitely needed .
* The function is called with a pre - computed max_slaves , that is the max
* number of working ( not in FAIL state ) slaves for a single master .
* Additional conditions for migration are examined inside the function .
// Decide whether this slave should re-attach itself to an orphaned master.
// max_slaves is supplied by the caller; here it is compared with the count of
// non failing slaves of each working master to select the candidate set.
// NOTE(review): this copy has no closing braces and one truncated call, so it
// is not compilable as-is.
void clusterHandleSlaveMigration ( int max_slaves ) {
int j , okslaves = 0 ;
clusterNode * mymaster = myself - > slaveof , * target = NULL , * candidate = NULL ;
dictIterator * di ;
dictEntry * de ;
/* Step 1: Don't migrate if the cluster state is not ok. */
if ( server . cluster - > state ! = REDIS_CLUSTER_OK ) return ;
2014-01-31 11:12:34 +01:00
/* Step 2: Don't migrate if my master will not be left with at least
* ' migration - barrier ' slaves after my migration . */
2014-01-30 18:05:11 +01:00
if ( mymaster = = NULL ) return ;
// Count the slaves of my master that are neither failed nor timed out.
for ( j = 0 ; j < mymaster - > numslaves ; j + + )
if ( ! nodeFailed ( mymaster - > slaves [ j ] ) & &
! nodeTimedOut ( mymaster - > slaves [ j ] ) ) okslaves + + ;
2014-01-31 11:12:34 +01:00
if ( okslaves < = server . cluster_migration_barrier ) return ;
2014-01-30 18:05:11 +01:00
/* Step 3: Identify a candidate for migration, and check if among the
* masters with the greatest number of ok slaves , I ' m the one with the
* smaller node ID .
* Note that this means that eventually a replica migration will occur
* since slaves that are reachable again always have their FAIL flag
* cleared . At the same time this does not mean that there are no
* race conditions possible ( two slaves migrating at the same time ) , but
* this is extremely unlikely to happen , and harmless . */
candidate = myself ;
di = dictGetSafeIterator ( server . cluster - > nodes ) ;
while ( ( de = dictNext ( di ) ) ! = NULL ) {
clusterNode * node = dictGetVal ( de ) ;
// This inner okslaves shadows the function-level variable of the same name.
int okslaves ;
/* Only iterate over working masters. */
if ( nodeIsSlave ( node ) | | nodeFailed ( node ) ) continue ;
okslaves = clusterCountNonFailingSlaves ( node ) ;
2014-02-10 17:08:37 +01:00
// The first slot-serving master found with zero ok slaves becomes the target.
if ( okslaves = = 0 & & target = = NULL & & node - > numslots > 0 ) target = node ;
2014-01-30 18:05:11 +01:00
if ( okslaves = = max_slaves ) {
for ( j = 0 ; j < node - > numslaves ; j + + ) {
if ( memcmp ( node - > slaves [ j ] - > name ,
candidate - > name ,
// NOTE(review): the memcmp call above is cut off; its length argument and the
// comparison that closes the condition are missing in this copy.
candidate = node - > slaves [ j ] ;
// NOTE(review): no dictReleaseIterator call for di is visible in this copy -
// confirm against the canonical file.
/* Step 4: perform the migration if there is a target, and if I'm the
* candidate . */
if ( target & & candidate = = myself ) {
redisLog ( REDIS_WARNING , " Migrating to orphaned master %.40s " ,
target - > name ) ;
clusterSetMaster ( target ) ;
2014-02-05 13:01:24 +01:00
/* -----------------------------------------------------------------------------
* CLUSTER manual failover
* This are the important steps performed by slaves during a manual failover :
* 1 ) User send CLUSTER FAILOVER command . The failover state is initialized
* setting mf_end to the millisecond unix time at which we ' ll abort the
* attempt .
* 2 ) Slave sends a MFSTART message to the master requesting to pause clients
* for two times the manual failover timeout REDIS_CLUSTER_MF_TIMEOUT .
* When master is paused for manual failover , it also starts to flag
* packets with CLUSTERMSG_FLAG0_PAUSED .
* 3 ) Slave waits for master to send its replication offset flagged as PAUSED .
* 4 ) If slave received the offset from the master , and its offset matches ,
* mf_can_start is set to 1 , and clusterHandleSlaveFailover ( ) will perform
* the failover as usually , with the difference that the vote request
* will be modified to force masters to vote for a slave that has a
* working master .
* From the point of view of the master things are simpler : when a
* PAUSE_CLIENTS packet is received the master sets mf_end as well and
* the sender in mf_slave . During the time limit for the manual failover
* the master will just send PINGs more often to this slave , flagged with
* the PAUSED flag , so that the slave will set mf_master_offset when receiving
* a packet from the master with this flag set .
* The goal of the manual failover is to perform a fast failover without
* data loss due to the asynchronous master - slave replication .
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
/* Reset the manual failover state. This works for both masters and slaves
* as all the state about manual failover is cleared .
* The function can be used both to initialize the manual failover state at
* startup or to abort a manual failover in progress . */
void resetManualFailover(void) {
    /* If clients are paused because of the manual failover, zero the pause
     * deadline and call clientsArePaused() again purely for its side effect. */
    if (server.cluster->mf_end && clientsArePaused()) {
        server.clients_pause_end_time = 0;
        clientsArePaused(); /* Just use the side effect of the function. */
    }

    /* Clear every piece of manual failover state unconditionally, so the
     * function can also be used to initialize the state at startup. */
    server.cluster->mf_end = 0; /* No manual failover in progress. */
    server.cluster->mf_can_start = 0;
    server.cluster->mf_slave = NULL;
    server.cluster->mf_master_offset = 0;
}
/* If a manual failover timed out, abort it. */
void manualFailoverCheckTimeout ( void ) {
2014-02-05 15:45:15 +01:00
if ( server . cluster - > mf_end & & server . cluster - > mf_end < mstime ( ) ) {
2014-02-05 13:01:24 +01:00
redisLog ( REDIS_WARNING , " Manual failover timed out. " ) ;
resetManualFailover ( ) ;
/* This function is called from the cluster cron function in order to go
* forward with a manual failover state machine . */
void clusterHandleManualFailover(void) {
    /* Return ASAP if no manual failover is in progress. */
    if (server.cluster->mf_end == 0) return;

    /* If mf_can_start is non-zero, the failover was already triggered so the
     * next steps are performed by clusterHandleSlaveFailover(). */
    if (server.cluster->mf_can_start) return;

    if (server.cluster->mf_master_offset == 0) return; /* Wait for offset... */

    if (server.cluster->mf_master_offset == replicationGetSlaveOffset()) {
        /* Our replication offset matches the master replication offset
         * announced after clients were paused. We can start the failover. */
        server.cluster->mf_can_start = 1;
        redisLog(REDIS_WARNING,
            "All master replication stream processed, "
            "manual failover can start.");
    }
}
2011-03-29 17:51:15 +02:00
/* -----------------------------------------------------------------------------
* CLUSTER cron job
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
2013-10-09 16:18:33 +02:00
/* This is executed 10 times every second */
2011-03-29 17:51:15 +02:00
// Periodic cluster maintenance. In order it: drops expired handshake nodes
// and reconnects nodes without a link, pings one of a few randomly sampled
// nodes every tenth call, scans all nodes to refresh stale links and flag
// nodes as PFAIL when a ping got no reply within the node timeout, then runs
// the manual failover, slave failover and slave migration handlers.
// NOTE(review): this copy has no closing braces and two truncated statements,
// so it is not compilable as-is.
void clusterCron ( void ) {
dictIterator * di ;
dictEntry * de ;
2014-01-30 18:05:11 +01:00
int update_state = 0 ;
int orphaned_masters ; /* How many masters there are without ok slaves. */
int max_slaves ; /* Max number of ok slaves for a single master. */
int this_slaves ; /* Number of ok slaves for our master (if we are slave). */
2013-10-09 16:18:33 +02:00
mstime_t min_pong = 0 , now = mstime ( ) ;
2013-04-11 18:55:58 +02:00
clusterNode * min_pong_node = NULL ;
2013-10-09 16:29:14 +02:00
static unsigned long long iteration = 0 ;
2013-10-11 10:34:32 +02:00
mstime_t handshake_timeout ;
2013-10-09 16:29:14 +02:00
iteration + + ; /* Number of times this function was called so far. */
2011-03-29 17:51:15 +02:00
2013-12-05 16:35:32 +01:00
/* The handshake timeout is the time after which a handshake node that was
2013-10-11 10:34:32 +02:00
* not turned into a normal node is removed from the nodes . Usually it is
* just the NODE_TIMEOUT value , but when NODE_TIMEOUT is too small we use
* the value of 1 second . */
handshake_timeout = server . cluster_node_timeout ;
if ( handshake_timeout < 1000 ) handshake_timeout = 1000 ;
2013-01-17 01:00:20 +08:00
/* Check if we have disconnected nodes and re-establish the connection. */
2013-08-21 15:51:10 +02:00
di = dictGetSafeIterator ( server . cluster - > nodes ) ;
2011-03-29 17:51:15 +02:00
while ( ( de = dictNext ( di ) ) ! = NULL ) {
2011-11-08 17:07:55 +01:00
clusterNode * node = dictGetVal ( de ) ;
2011-03-29 17:51:15 +02:00
if ( node - > flags & ( REDIS_NODE_MYSELF | REDIS_NODE_NOADDR ) ) continue ;
2013-09-04 12:41:21 +02:00
/* A Node in HANDSHAKE state has a limited lifespan equal to the
* configured node timeout . */
2014-01-29 12:17:16 +01:00
if ( nodeInHandshake ( node ) & & now - node - > ctime > handshake_timeout ) {
2013-09-04 12:41:21 +02:00
freeClusterNode ( node ) ;
continue ;
2011-03-29 17:51:15 +02:00
if ( node - > link = = NULL ) {
int fd ;
2013-10-09 16:18:33 +02:00
mstime_t old_ping_sent ;
2011-03-29 17:51:15 +02:00
clusterLink * link ;
2014-03-03 10:57:27 -05:00
// Connect to the cluster bus port (node port plus REDIS_CLUSTER_PORT_INCR),
// passing the first configured bind address if there is one.
fd = anetTcpNonBlockBindConnect ( server . neterr , node - > ip ,
2014-03-10 10:33:53 +01:00
node - > port + REDIS_CLUSTER_PORT_INCR ,
server . bindaddr_count ? server . bindaddr [ 0 ] : NULL ) ;
2014-03-03 10:57:27 -05:00
if ( fd = = - 1 ) {
redisLog ( REDIS_DEBUG , " Unable to connect to "
2014-03-10 10:32:28 +01:00
" Cluster Node [%s]:%d -> %s " , node - > ip ,
node - > port + REDIS_CLUSTER_PORT_INCR ,
server . neterr ) ;
2014-03-03 10:57:27 -05:00
continue ;
2011-03-29 17:51:15 +02:00
link = createClusterLink ( node ) ;
link - > fd = fd ;
node - > link = link ;
aeCreateFileEvent ( server . el , link - > fd , AE_READABLE , clusterReadHandler , link ) ;
2013-04-11 18:55:58 +02:00
/* Queue a PING in the new connection ASAP: this is crucial
* to avoid false positives in failure detection .
* If the node is flagged as MEET , we send a MEET message instead
2011-03-29 17:51:15 +02:00
* of a PING one , to force the receiver to add us in its node
* table . */
2013-04-11 19:12:29 +02:00
old_ping_sent = node - > ping_sent ;
2011-03-29 17:51:15 +02:00
clusterSendPing ( link , node - > flags & REDIS_NODE_MEET ?
// NOTE(review): the call above is cut off after the question mark; both
// message type operands and the closing parenthesis are missing in this copy.
2013-04-11 19:12:29 +02:00
if ( old_ping_sent ) {
/* If there was an active ping before the link was
* disconnected , we want to restore the ping time , otherwise
* replaced by the clusterSendPing ( ) call . */
node - > ping_sent = old_ping_sent ;
2011-03-29 17:51:15 +02:00
/* We can clear the flag after the first packet is sent.
* If we ' ll never receive a PONG , we ' ll never send new packets
* to this node . Instead after the PONG is received and we
* are no longer in meet / handshake status , we want to send
* normal PING packets . */
node - > flags & = ~ REDIS_NODE_MEET ;
2013-02-20 13:18:51 +01:00
redisLog ( REDIS_DEBUG , " Connecting with Node %.40s at %s:%d " , node - > name , node - > ip , node - > port + REDIS_CLUSTER_PORT_INCR ) ;
2011-03-29 17:51:15 +02:00
dictReleaseIterator ( di ) ;
2013-10-09 16:29:14 +02:00
/* Ping some random node 1 time every 10 iterations, so that we usually ping
* one random node every second . */
if ( ! ( iteration % 10 ) ) {
2014-01-30 18:05:11 +01:00
int j ;
2013-10-09 16:29:14 +02:00
/* Check a few random nodes and ping the one with the oldest
* pong_received time . */
for ( j = 0 ; j < 5 ; j + + ) {
de = dictGetRandomKey ( server . cluster - > nodes ) ;
clusterNode * this = dictGetVal ( de ) ;
/* Don't ping nodes disconnected or with a ping currently active. */
if ( this - > link = = NULL | | this - > ping_sent ! = 0 ) continue ;
if ( this - > flags & ( REDIS_NODE_MYSELF | REDIS_NODE_HANDSHAKE ) ) continue ;
// Track the sampled node with the smallest pong_received value.
if ( min_pong_node = = NULL | | min_pong > this - > pong_received ) {
min_pong_node = this ;
min_pong = this - > pong_received ;
if ( min_pong_node ) {
redisLog ( REDIS_DEBUG , " Pinging node %.40s " , min_pong_node - > name ) ;
clusterSendPing ( min_pong_node - > link , CLUSTERMSG_TYPE_PING ) ;
2011-03-29 17:51:15 +02:00
2014-01-30 18:05:11 +01:00
/* Iterate nodes to check if we need to flag something as failing.
* This loop is also responsible to :
* 1 ) Check if there are orphaned masters ( masters without non failing
* slaves ) .
* 2 ) Count the max number of non failing slaves for a single master .
* 3 ) Count the number of slaves for our master , if we are a slave . */
orphaned_masters = 0 ;
max_slaves = 0 ;
this_slaves = 0 ;
2013-09-04 10:07:50 +02:00
di = dictGetSafeIterator ( server . cluster - > nodes ) ;
2011-03-29 17:51:15 +02:00
while ( ( de = dictNext ( di ) ) ! = NULL ) {
2011-11-08 17:07:55 +01:00
clusterNode * node = dictGetVal ( de ) ;
2013-10-09 16:18:33 +02:00
now = mstime ( ) ; /* Use an updated time at every iteration. */
2013-12-17 10:27:12 +01:00
mstime_t delay ;
2011-03-29 17:51:15 +02:00
if ( node - > flags &
2011-04-07 23:10:32 +02:00
continue ;
// NOTE(review): the flag mask of the condition above is missing in this copy,
// so which nodes are skipped cannot be told from here.
2013-03-05 12:13:39 +01:00
2014-01-30 18:23:31 +01:00
/* Orphaned master check, useful only if the current instance
* is a slave that may migrate to another master . */
if ( nodeIsSlave ( myself ) & & nodeIsMaster ( node ) & & ! nodeFailed ( node ) ) {
int okslaves = clusterCountNonFailingSlaves ( node ) ;
2014-02-10 17:08:37 +01:00
if ( okslaves = = 0 & & node - > numslots > 0 ) orphaned_masters + + ;
2014-01-30 18:23:31 +01:00
if ( okslaves > max_slaves ) max_slaves = okslaves ;
if ( nodeIsSlave ( myself ) & & myself - > slaveof = = node )
this_slaves = okslaves ;
2013-04-11 18:55:58 +02:00
/* If we are waiting for the PONG more than half the cluster
* timeout , reconnect the link : maybe there is a connection
* issue even if the node is alive . */
if ( node - > link & & /* is connected */
2013-10-09 16:18:33 +02:00
now - node - > link - > ctime >
2013-05-03 12:37:45 +02:00
server . cluster_node_timeout & & /* was not already reconnected */
2013-04-11 18:55:58 +02:00
node - > ping_sent & & /* we already sent a ping */
node - > pong_received < node - > ping_sent & & /* still waiting pong */
/* and we are waiting for the pong more than timeout/2 */
now - node - > ping_sent > server . cluster_node_timeout / 2 )
/* Disconnect the link, it will be reconnected automatically. */
freeClusterLink ( node - > link ) ;
/* If we have currently no active ping in this instance, and the
* received PONG is older than half the cluster timeout , send
* a new ping now , to ensure all the nodes are pinged without
* a too big delay . */
2013-03-05 12:13:39 +01:00
if ( node - > link & &
2013-04-11 18:55:58 +02:00
node - > ping_sent = = 0 & &
( now - node - > pong_received ) > server . cluster_node_timeout / 2 )
2013-03-05 12:13:39 +01:00
clusterSendPing ( node - > link , CLUSTERMSG_TYPE_PING ) ;
continue ;
// NOTE(review): with the braces missing in this copy, it cannot be told from
// here whether the continue above is conditional on the preceding if.
2014-02-05 13:01:24 +01:00
/* If we are a master and one of the slaves requested a manual
* failover , ping it continuously . */
if ( server . cluster - > mf_end & &
nodeIsMaster ( myself ) & &
server . cluster - > mf_slave = = node & &
node - > link )
clusterSendPing ( node - > link , CLUSTERMSG_TYPE_PING ) ;
continue ;
2013-04-11 18:55:58 +02:00
/* Check only if we have an active ping for this instance. */
if ( node - > ping_sent = = 0 ) continue ;
2013-04-08 19:40:20 +02:00
2013-04-11 18:55:58 +02:00
/* Compute the delay of the PONG. Note that if we already received
* the PONG , then node - > ping_sent is zero , so can ' t reach this
* code at all . */
delay = now - node - > ping_sent ;
2013-02-26 15:15:44 +01:00
2013-04-11 18:55:58 +02:00
if ( delay > server . cluster_node_timeout ) {
2013-01-17 01:00:20 +08:00
/* Timeout reached. Set the node as possibly failing if it is
2011-04-07 23:06:01 +02:00
* not already in this state . */
2011-04-07 23:10:32 +02:00
if ( ! ( node - > flags & ( REDIS_NODE_PFAIL | REDIS_NODE_FAIL ) ) ) {
2011-03-29 17:51:15 +02:00
redisLog ( REDIS_DEBUG , " *** NODE %.40s possibly failing " ,
node - > name ) ;
node - > flags | = REDIS_NODE_PFAIL ;
2013-03-07 15:40:53 +01:00
// A new PFAIL flag forces a cluster state refresh at the end of the function.
update_state = 1 ;
2011-03-29 17:51:15 +02:00
dictReleaseIterator ( di ) ;
2013-03-05 16:12:08 +01:00
/* If we are a slave node but the replication is still turned off,
* enable it if we know the address of our master and it appears to
* be up . */
2014-01-29 12:17:16 +01:00
if ( nodeIsSlave ( myself ) & &
2013-03-05 16:12:08 +01:00
server . masterhost = = NULL & &
2014-01-29 11:38:14 +01:00
myself - > slaveof & &
2014-01-29 12:17:16 +01:00
nodeHasAddr ( myself - > slaveof ) )
2013-03-05 16:12:08 +01:00
2014-01-29 11:38:14 +01:00
replicationSetMaster ( myself - > slaveof - > ip , myself - > slaveof - > port ) ;
2013-03-05 16:12:08 +01:00
2013-03-07 15:40:53 +01:00
2014-02-05 13:01:24 +01:00
/* Abort a manual failover if the timeout is reached. */
manualFailoverCheckTimeout ( ) ;
2014-01-30 18:05:11 +01:00
if ( nodeIsSlave ( myself ) ) {
2014-02-05 13:01:24 +01:00
clusterHandleManualFailover ( ) ;
2014-01-30 18:05:11 +01:00
clusterHandleSlaveFailover ( ) ;
/* If there are orphaned slaves, and we are a slave among the masters
* with the max number of non - failing slaves , consider migrating to
* the orphaned masters . Note that it does not make sense to try
* a migration if there is no master with at least * two * working
* slaves . */
if ( orphaned_masters & & max_slaves > = 2 & & this_slaves = = max_slaves )
clusterHandleSlaveMigration ( max_slaves ) ;
2014-01-15 12:26:12 +01:00
// Refresh the cluster state when a node was just flagged, and on every call
// while the state is FAIL.
if ( update_state | | server . cluster - > state = = REDIS_CLUSTER_FAIL )
clusterUpdateState ( ) ;
2013-09-26 16:54:43 +02:00
/* This function is called before the event handler returns to sleep for
* events . It is useful to perform operations that must be done ASAP in
* reaction to events fired but that are not safe to perform inside event
2013-10-03 09:55:20 +02:00
* handlers , or to perform potentially expensive tasks that we need to do
* a single time before replying to clients . */
2013-09-26 16:54:43 +02:00
void clusterBeforeSleep ( void ) {
2013-10-03 09:55:20 +02:00
/* Handle failover, this is needed when it is likely that there is already
* the quorum from masters in order to react fast . */
if ( server . cluster - > todo_before_sleep & CLUSTER_TODO_HANDLE_FAILOVER )
2013-03-13 12:44:02 +01:00
clusterHandleSlaveFailover ( ) ;
2013-10-03 09:55:20 +02:00
/* Update the cluster state. */
if ( server . cluster - > todo_before_sleep & CLUSTER_TODO_UPDATE_STATE )
clusterUpdateState ( ) ;
/* Save the config, possibly using fsync. */
if ( server . cluster - > todo_before_sleep & CLUSTER_TODO_SAVE_CONFIG ) {
int fsync = server . cluster - > todo_before_sleep & CLUSTER_TODO_FSYNC_CONFIG ;
clusterSaveConfigOrDie ( fsync ) ;
2013-03-13 12:44:02 +01:00
2013-10-03 09:55:20 +02:00
/* Reset our flags. */
server . cluster - > todo_before_sleep = 0 ;
/* Queue one or more CLUSTER_TODO_* actions, OR-ed together in 'flags', to be
 * performed by the next clusterBeforeSleep() call. Flags accumulate until
 * clusterBeforeSleep() clears them. */
void clusterDoBeforeSleep(int flags) {
    server.cluster->todo_before_sleep |= flags;
}
2011-03-29 17:51:15 +02:00
/* -----------------------------------------------------------------------------
* Slots management
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
2013-10-08 12:45:35 +02:00
/* Test bit 'pos' in a generic bitmap. Return 1 if the bit is set,
2013-02-28 15:23:09 +01:00
* otherwise 0. */
int bitmapTestBit(unsigned char *bitmap, int pos) {
    off_t byte = pos/8; /* Index of the byte holding the bit. */
    int bit = pos&7;    /* Bit index inside that byte, 0 is the LSB. */

    /* Normalize to 0/1 rather than returning the raw masked value. */
    return (bitmap[byte] & (1<<bit)) != 0;
}
/* Set the bit at position 'pos' in a bitmap. */
void bitmapSetBit(unsigned char *bitmap, int pos) {
    off_t byte = pos/8; /* Index of the byte holding the bit. */
    int bit = pos&7;    /* Bit index inside that byte, 0 is the LSB. */

    /* OR-ing leaves every other bit of the byte untouched. */
    bitmap[byte] |= 1<<bit;
}
/* Clear the bit at position 'pos' in a bitmap. */
void bitmapClearBit(unsigned char *bitmap, int pos) {
    off_t byte = pos/8; /* Index of the byte holding the bit. */
    int bit = pos&7;    /* Bit index inside that byte, 0 is the LSB. */

    /* AND with the inverted mask leaves every other bit untouched. */
    bitmap[byte] &= ~(1<<bit);
}
2011-03-29 17:51:15 +02:00
/* Set the slot bit and return the old value. */
int clusterNodeSetSlotBit(clusterNode *n, int slot) {
    int old = bitmapTestBit(n->slots,slot);

    bitmapSetBit(n->slots,slot);
    /* Keep the cached slot counter in sync: count only 0 -> 1 transitions. */
    if (!old) n->numslots++;
    return old;
}
/* Clear the slot bit and return the old value. */
int clusterNodeClearSlotBit(clusterNode *n, int slot) {
    int old = bitmapTestBit(n->slots,slot);

    bitmapClearBit(n->slots,slot);
    /* Keep the cached slot counter in sync: count only 1 -> 0 transitions. */
    if (old) n->numslots--;
    return old;
}
/* Return the slot bit from the cluster node structure. */
int clusterNodeGetSlotBit(clusterNode *n, int slot) {
    /* Returns 1 if the bit for 'slot' is set in the node's bitmap, else 0. */
    return bitmapTestBit(n->slots,slot);
}
2011-03-29 17:51:15 +02:00
/* Add the specified slot to the list of slots that node 'n' will
* serve . Return REDIS_OK if the operation ended with success .
* If the slot is already assigned to another instance this is considered
* an error and REDIS_ERR is returned . */
int clusterAddSlot(clusterNode *n, int slot) {
    /* Refuse to steal a slot that already has an owner in the cluster table:
     * the caller must delete it first. */
    if (server.cluster->slots[slot]) return REDIS_ERR;

    /* Update both views: the per-node bitmap and the slot -> node table. */
    clusterNodeSetSlotBit(n,slot);
    server.cluster->slots[slot] = n;
    return REDIS_OK;
}
2011-05-06 13:38:27 +02:00
/* Delete the specified slot marking it as unassigned.
* Returns REDIS_OK if the slot was assigned , otherwise if the slot was
* already unassigned REDIS_ERR is returned . */
int clusterDelSlot ( int slot ) {
2013-02-14 13:20:56 +01:00
clusterNode * n = server . cluster - > slots [ slot ] ;
2011-05-06 13:38:27 +02:00
if ( ! n ) return REDIS_ERR ;
redisAssert ( clusterNodeClearSlotBit ( n , slot ) = = 1 ) ;
2013-02-14 13:20:56 +01:00
server . cluster - > slots [ slot ] = NULL ;
2011-05-06 13:38:27 +02:00
return REDIS_OK ;
2013-03-15 16:35:16 +01:00
/* Delete all the slots associated with the specified node.
* The number of deleted slots is returned . */
int clusterDelNodeSlots(clusterNode *node) {
    int deleted = 0, j;

    for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
        /* Count only the slots this node actually owned. The previous code
         * incremented the counter on every iteration, so the function always
         * returned REDIS_CLUSTER_SLOTS instead of the number of deletions. */
        if (clusterNodeGetSlotBit(node,j)) {
            clusterDelSlot(j);
            deleted++;
        }
    }
    return deleted;
}
2014-03-11 11:16:18 +01:00
/* Clear the migrating / importing state for all the slots.
* This is useful at initialization and when turning a master into slave . */
void clusterCloseAllSlots(void) {
    /* Zero both tables in full. NOTE(review): sizeof yields the whole table
     * size only if these members are arrays, not pointers - confirm against
     * the struct declaration. */
    memset(server.cluster->migrating_slots_to,0,
        sizeof(server.cluster->migrating_slots_to));
    memset(server.cluster->importing_slots_from,0,
        sizeof(server.cluster->importing_slots_from));
}
2011-03-29 17:51:15 +02:00
/* -----------------------------------------------------------------------------
* Cluster state evaluation function
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
2013-12-20 09:56:18 +01:00
2014-01-20 11:52:52 +01:00
/* The following are defines that are only used in the evaluation function
* and are based on heuristics . Actually the main point about the rejoin and
* writable delay is that they should be a few orders of magnitude larger
* than the network latency . */
2013-12-20 09:56:18 +01:00
2014-01-15 12:34:33 +01:00
2014-01-20 11:52:52 +01:00
2013-12-20 09:56:18 +01:00
2011-03-29 17:51:15 +02:00
void clusterUpdateState ( void ) {
2013-12-20 09:56:18 +01:00
int j , new_state ;
2013-03-07 15:36:59 +01:00
int unreachable_masters = 0 ;
2013-12-20 09:56:18 +01:00
static mstime_t among_minority_time ;
2014-01-20 11:52:52 +01:00
static mstime_t first_call_time = 0 ;
/* If this is a master node, wait some time before turning the state
* into OK , since it is not a good idea to rejoin the cluster as a writable
* master , after a reboot , without giving the cluster a chance to
* reconfigure this node . Note that the delay is calculated starting from
* the first call to this function and not since the server start , in order
* to don ' t count the DB loading time . */
if ( first_call_time = = 0 ) first_call_time = mstime ( ) ;
2014-01-29 12:17:16 +01:00
if ( nodeIsMaster ( myself ) & &
2014-01-20 11:52:52 +01:00
mstime ( ) - first_call_time < REDIS_CLUSTER_WRITABLE_DELAY ) return ;
2011-03-29 17:51:15 +02:00
2013-03-06 18:24:43 +01:00
/* Start assuming the state is OK. We'll turn it into FAIL if there
* are the right conditions . */
2013-12-20 09:56:18 +01:00
new_state = REDIS_CLUSTER_OK ;
2013-03-06 18:24:43 +01:00
2013-02-22 17:43:35 +01:00
/* Check if all the slots are covered. */
2011-03-29 17:51:15 +02:00
for ( j = 0 ; j < REDIS_CLUSTER_SLOTS ; j + + ) {
2013-02-14 13:20:56 +01:00
if ( server . cluster - > slots [ j ] = = NULL | |
server . cluster - > slots [ j ] - > flags & ( REDIS_NODE_FAIL ) )
2011-03-29 17:51:15 +02:00
2013-12-20 09:56:18 +01:00
new_state = REDIS_CLUSTER_FAIL ;
2011-03-29 17:51:15 +02:00
break ;
2013-02-22 17:43:35 +01:00
2013-02-22 19:18:30 +01:00
/* Compute the cluster size, that is the number of master nodes
2013-03-07 15:36:59 +01:00
* serving at least a single slot .
* At the same time count the number of unreachable masters with
* at least one node . */
2013-02-22 19:18:30 +01:00
dictIterator * di ;
dictEntry * de ;
server . cluster - > size = 0 ;
2013-09-04 10:07:50 +02:00
di = dictGetSafeIterator ( server . cluster - > nodes ) ;
2013-02-22 19:18:30 +01:00
while ( ( de = dictNext ( di ) ) ! = NULL ) {
clusterNode * node = dictGetVal ( de ) ;
2014-01-29 12:17:16 +01:00
if ( nodeIsMaster ( node ) & & node - > numslots ) {
2013-02-22 19:18:30 +01:00
server . cluster - > size + + ;
2013-03-07 15:36:59 +01:00
if ( node - > flags & ( REDIS_NODE_FAIL | REDIS_NODE_PFAIL ) )
unreachable_masters + + ;
2013-02-22 19:18:30 +01:00
dictReleaseIterator ( di ) ;
2013-03-07 15:22:32 +01:00
2013-03-07 15:36:59 +01:00
/* If we can't reach at least half the masters, change the cluster state
2013-09-27 09:55:41 +02:00
* to FAIL , as we are not even able to mark nodes as FAIL in this side
2013-12-20 09:56:18 +01:00
* of the netsplit because of lack of majority . */
2013-03-07 15:36:59 +01:00
int needed_quorum = ( server . cluster - > size / 2 ) + 1 ;
2013-12-20 09:56:18 +01:00
if ( unreachable_masters > = needed_quorum ) {
new_state = REDIS_CLUSTER_FAIL ;
among_minority_time = mstime ( ) ;
2013-03-07 15:36:59 +01:00
2013-03-07 15:22:32 +01:00
/* Log a state change */
2013-12-20 09:56:18 +01:00
if ( new_state ! = server . cluster - > state ) {
mstime_t rejoin_delay = server . cluster_node_timeout ;
/* If the instance is a master and was partitioned away with the
* minority , don ' t let it accept queries for some time after the
* partition heals , to make sure there is enough time to receive
* a configuration update . */
if ( rejoin_delay > REDIS_CLUSTER_MAX_REJOIN_DELAY )
2014-01-15 12:34:33 +01:00
if ( rejoin_delay < REDIS_CLUSTER_MIN_REJOIN_DELAY )
2013-12-20 09:56:18 +01:00
if ( new_state = = REDIS_CLUSTER_OK & &
2014-01-29 12:17:16 +01:00
nodeIsMaster ( myself ) & &
2013-12-20 09:56:18 +01:00
mstime ( ) - among_minority_time < rejoin_delay )
return ;
/* Change the state and log the event. */
2013-03-07 15:22:32 +01:00
redisLog ( REDIS_WARNING , " Cluster state changed: %s " ,
2013-12-20 09:56:18 +01:00
new_state = = REDIS_CLUSTER_OK ? " ok " : " fail " ) ;
server . cluster - > state = new_state ;
2011-03-29 17:51:15 +02:00
2013-02-25 11:20:17 +01:00
/* This function is called after the node startup in order to verify that data
* loaded from disk is in agreement with the cluster configuration :
* 1 ) If we find keys about hash slots we have no responsibility for , the
* following happens :
* A ) If no other node is in charge according to the current cluster
* configuration , we add these slots to our node .
* B ) If according to our config other nodes are already in charge for
* this lots , we set the slots as IMPORTING from our point of view
* in order to justify we have those slots , and in order to make
* redis - trib aware of the issue , so that it can try to fix it .
* 2 ) If we find data in a DB different than DB0 we return REDIS_ERR to
* signal the caller it should quit the server with an error message
* or take other actions .
* The function always returns REDIS_OK even if it will try to correct
* the error described in " 1 " . However if data is found in DB different
* from DB0 , REDIS_ERR is returned .
* The function also uses the logging facility in order to warn the user
* about desynchronizations between the data we have in memory and the
* cluster configuration . */
int verifyClusterConfigWithData ( void ) {
2013-02-25 11:43:49 +01:00
int j ;
int update_config = 0 ;
2013-03-04 19:47:00 +01:00
/* If this node is a slave, don't perform the check at all as we
* completely depend on the replication stream . */
2014-01-29 12:17:16 +01:00
if ( nodeIsSlave ( myself ) ) return REDIS_OK ;
2013-03-04 19:47:00 +01:00
2013-02-25 11:43:49 +01:00
/* Make sure we only have keys in DB0. */
for ( j = 1 ; j < server . dbnum ; j + + ) {
if ( dictSize ( server . db [ j ] . dict ) ) return REDIS_ERR ;
/* Check that all the slots we see populated memory have a corresponding
* entry in the cluster table . Otherwise fix the table . */
for ( j = 0 ; j < REDIS_CLUSTER_SLOTS ; j + + ) {
if ( ! countKeysInSlot ( j ) ) continue ; /* No keys in this slot. */
/* Check if we are assigned to this slot or if we are importing it.
* In both cases check the next slot as the configuration makes
* sense . */
2014-01-29 11:38:14 +01:00
if ( server . cluster - > slots [ j ] = = myself | |
2013-02-25 11:43:49 +01:00
server . cluster - > importing_slots_from [ j ] ! = NULL ) continue ;
/* If we are here data and cluster config don't agree, and we have
* slot ' j ' populated even if we are not importing it , nor we are
* assigned to this slot . Fix this condition . */
update_config + + ;
/* Case A: slot is unassigned. Take responsability for it. */
if ( server . cluster - > slots [ j ] = = NULL ) {
redisLog ( REDIS_WARNING , " I've keys about slot %d that is "
" unassigned. Taking responsability "
" for it. " , j ) ;
2014-01-29 11:38:14 +01:00
clusterAddSlot ( myself , j ) ;
2013-02-25 11:43:49 +01:00
} else {
redisLog ( REDIS_WARNING , " I've keys about slot %d that is "
" already assigned to a different node. "
" Setting it in importing state. " , j ) ;
server . cluster - > importing_slots_from [ j ] = server . cluster - > slots [ j ] ;
2013-10-03 09:55:20 +02:00
if ( update_config ) clusterSaveConfigOrDie ( 1 ) ;
2013-02-25 11:20:17 +01:00
return REDIS_OK ;
2013-03-04 13:15:09 +01:00
/* -----------------------------------------------------------------------------
* SLAVE nodes handling
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
2014-03-11 11:22:06 +01:00
/* Set the specified node 'n' as master for this node.
* If this node is currently a master , it is turned into a slave . */
2013-03-04 13:15:09 +01:00
void clusterSetMaster ( clusterNode * n ) {
redisAssert ( n ! = myself ) ;
2013-03-20 11:32:35 +01:00
redisAssert ( myself - > numslots = = 0 ) ;
2013-03-04 13:15:09 +01:00
2014-01-29 12:17:16 +01:00
if ( nodeIsMaster ( myself ) ) {
2013-03-04 13:15:09 +01:00
myself - > flags & = ~ REDIS_NODE_MASTER ;
myself - > flags | = REDIS_NODE_SLAVE ;
2014-03-11 11:22:06 +01:00
clusterCloseAllSlots ( ) ;
2014-01-22 18:46:06 +01:00
} else {
if ( myself - > slaveof )
clusterNodeRemoveSlave ( myself - > slaveof , myself ) ;
2013-03-04 13:15:09 +01:00
myself - > slaveof = n ;
2014-01-22 18:46:06 +01:00
clusterNodeAddSlave ( n , myself ) ;
2013-03-04 15:27:58 +01:00
replicationSetMaster ( n - > ip , n - > port ) ;
2014-02-05 13:01:24 +01:00
resetManualFailover ( ) ;
2013-03-04 13:15:09 +01:00
2011-03-29 17:51:15 +02:00
/* -----------------------------------------------------------------------------
* CLUSTER command
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
2014-01-22 18:09:06 +01:00
/* Generate a csv-alike representation of the specified cluster node.
 * See clusterGenNodesDescription() top comment for more information.
 *
 * The line is made of the following space separated fields:
 *
 *   <name> <ip>:<port> <flags> <master name or "-"> <ping_sent>
 *   <pong_received> <configEpoch> <connected|disconnected> [<slots> ...]
 *
 * Slots are emitted as single numbers or as "start-end" ranges. For the
 * MYSELF node, slots being migrated or imported are appended as
 * "[slot->-node]" and "[slot-<-node]" entries.
 *
 * The function returns the string representation as a newly allocated SDS
 * string (no trailing newline is added). */
sds clusterGenNodeDescription(clusterNode *node) {
    int j, start;
    sds ci;

    /* Node coordinates */
    ci = sdscatprintf(sdsempty(),"%.40s %s:%d ",
        node->name,
        node->ip,
        node->port);

    /* Flags: a comma separated list, the trailing comma is then turned
     * into the field separator. */
    if (node->flags == 0) ci = sdscat(ci,"noflags,");
    if (node->flags & REDIS_NODE_MYSELF) ci = sdscat(ci,"myself,");
    if (node->flags & REDIS_NODE_MASTER) ci = sdscat(ci,"master,");
    if (node->flags & REDIS_NODE_SLAVE) ci = sdscat(ci,"slave,");
    if (node->flags & REDIS_NODE_PFAIL) ci = sdscat(ci,"fail?,");
    if (node->flags & REDIS_NODE_FAIL) ci = sdscat(ci,"fail,");
    if (node->flags & REDIS_NODE_HANDSHAKE) ci = sdscat(ci,"handshake,");
    if (node->flags & REDIS_NODE_NOADDR) ci = sdscat(ci,"noaddr,");
    if (ci[sdslen(ci)-1] == ',') ci[sdslen(ci)-1] = ' ';

    /* Slave of... or just "-" */
    if (node->slaveof)
        ci = sdscatprintf(ci,"%.40s ",node->slaveof->name);
    else
        ci = sdscatprintf(ci,"- ");

    /* Latency from the POV of this node, link status */
    ci = sdscatprintf(ci,"%lld %lld %llu %s",
        (long long) node->ping_sent,
        (long long) node->pong_received,
        (unsigned long long) node->configEpoch,
        (node->link || node->flags & REDIS_NODE_MYSELF) ?
                    "connected" : "disconnected");

    /* Slots served by this instance */
    start = -1;
    for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
        int bit;

        if ((bit = clusterNodeGetSlotBit(node,j)) != 0) {
            if (start == -1) start = j;
        }
        if (start != -1 && (!bit || j == REDIS_CLUSTER_SLOTS-1)) {
            /* A range is closed either by the first slot not served, in
             * which case it ends at j-1, or by the last slot of the table
             * when it is served. Only in the latter case 'j' is moved one
             * past the end, so that j-1 is always the last served slot. */
            if (bit && j == REDIS_CLUSTER_SLOTS-1) j++;

            if (start == j-1) {
                ci = sdscatprintf(ci," %d",start);
            } else {
                ci = sdscatprintf(ci," %d-%d",start,j-1);
            }
            start = -1;
        }
    }

    /* Just for MYSELF node we also dump info about slots that
     * we are migrating to other instances or importing from other
     * instances. */
    if (node->flags & REDIS_NODE_MYSELF) {
        for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
            if (server.cluster->migrating_slots_to[j]) {
                ci = sdscatprintf(ci," [%d->-%.40s]",j,
                    server.cluster->migrating_slots_to[j]->name);
            } else if (server.cluster->importing_slots_from[j]) {
                ci = sdscatprintf(ci," [%d-<-%.40s]",j,
                    server.cluster->importing_slots_from[j]->name);
            }
        }
    }
    return ci;
}
2013-09-04 10:25:26 +02:00
/* Generate a csv-alike representation of the nodes we are aware of,
* including the " myself " node , and return an SDS string containing the
* representation ( it is up to the caller to free it ) .
* All the nodes matching at least one of the node flags specified in
* " filter " are excluded from the output , so using zero as a filter will
* include all the known nodes in the representation , including nodes in
* the HANDSHAKE state .
* The representation obtained using this function is used for the output
* of the CLUSTER NODES function , and as format for the cluster
* configuration file ( nodes . conf ) for a given node . */
sds clusterGenNodesDescription ( int filter ) {
2014-01-22 18:09:06 +01:00
sds ci = sdsempty ( ) , ni ;
2011-03-30 14:58:19 +02:00
dictIterator * di ;
dictEntry * de ;
2013-09-04 10:07:50 +02:00
di = dictGetSafeIterator ( server . cluster - > nodes ) ;
2011-03-30 14:58:19 +02:00
while ( ( de = dictNext ( di ) ) ! = NULL ) {
2011-11-08 17:07:55 +01:00
clusterNode * node = dictGetVal ( de ) ;
2011-03-30 14:58:19 +02:00
2013-09-04 10:25:26 +02:00
if ( node - > flags & filter ) continue ;
2014-01-22 18:09:06 +01:00
ni = clusterGenNodeDescription ( node ) ;
ci = sdscatsds ( ci , ni ) ;
sdsfree ( ni ) ;
2011-04-07 19:04:16 +02:00
ci = sdscatlen ( ci , " \n " , 1 ) ;
2011-03-30 14:58:19 +02:00
dictReleaseIterator ( di ) ;
return ci ;
2011-05-06 15:44:09 +02:00
/* Parse the object 'o' as a hash slot number.
 *
 * On success the slot, in the range 0 .. REDIS_CLUSTER_SLOTS-1, is
 * returned. If the object is not a valid integer or is out of range, an
 * error is sent to the client 'c' and -1 is returned. */
int getSlotOrReply(redisClient *c, robj *o) {
    long long value;

    if (getLongLongFromObject(o,&value) == REDIS_OK &&
        value >= 0 && value < REDIS_CLUSTER_SLOTS)
    {
        return (int) value;
    }
    addReplyError(c,"Invalid or out of range slot");
    return -1;
}
2011-03-29 17:51:15 +02:00
/* CLUSTER <subcommand> [<arg> ...]
 *
 * Implementation of the CLUSTER command. The subcommand name is matched
 * case insensitively together with the exact (or minimum) number of
 * arguments; anything else is answered with a generic error.
 *
 * Supported subcommands:
 *
 *   MEET <ip> <port>                  Start a handshake with a node.
 *   NODES                             Reply with the nodes description.
 *   FLUSHSLOTS                        Drop all our slots (DB must be empty).
 *   ADDSLOTS <slot> [slot] ...        Assign unassigned slots to this node.
 *   DELSLOTS <slot> [slot] ...        Set assigned slots as unassigned.
 *   SETSLOT <slot> MIGRATING <node>   Mark a slot we own as migrating.
 *   SETSLOT <slot> IMPORTING <node>   Mark a slot we don't own as importing.
 *   SETSLOT <slot> STABLE             Clear migrating / importing state.
 *   SETSLOT <slot> NODE <node>        Bind the slot to the given node.
 *   INFO                              Reply with cluster state and stats.
 *   SAVECONFIG                        Save the cluster config on disk.
 *   KEYSLOT <key>                     Reply with the hash slot of a key.
 *   COUNTKEYSINSLOT <slot>            Number of local keys in the slot.
 *   GETKEYSINSLOT <slot> <count>      Up to <count> local keys in the slot.
 *   FORGET <node>                     Remove a node from the nodes table.
 *   REPLICATE <node>                  Turn this node into a slave of <node>.
 *   SLAVES <node>                     Describe the slaves of a master.
 *   FAILOVER                          Start a manual failover (slaves only).
 *
 * When cluster support is disabled every subcommand returns an error. */
void clusterCommand(redisClient *c) {
    if (server.cluster_enabled == 0) {
        addReplyError(c,"This instance has cluster support disabled");
        return;
    }

    if (!strcasecmp(c->argv[1]->ptr,"meet") && c->argc == 4) {
        /* CLUSTER MEET <ip> <port> */
        long port;

        /* The ...OrReply() helper replies to the client by itself on
         * error: pass our message to it instead of sending a second error,
         * which would desynchronize the reply stream. */
        if (getLongFromObjectOrReply(c,c->argv[3],&port,
                "Invalid TCP port specified") != REDIS_OK)
        {
            return;
        }

        if (clusterStartHandshake(c->argv[2]->ptr,port) == 0 &&
            errno == EINVAL)
        {
            addReplyError(c,"Invalid node address specified");
        } else {
            addReply(c,shared.ok);
        }
    } else if (!strcasecmp(c->argv[1]->ptr,"nodes") && c->argc == 2) {
        /* CLUSTER NODES */
        robj *o;
        sds ci = clusterGenNodesDescription(0);

        o = createObject(REDIS_STRING,ci);
        addReplyBulk(c,o);
        decrRefCount(o);
    } else if (!strcasecmp(c->argv[1]->ptr,"flushslots") && c->argc == 2) {
        /* CLUSTER FLUSHSLOTS */
        if (dictSize(server.db[0].dict) != 0) {
            addReplyError(c,"DB must be empty to perform CLUSTER FLUSHSLOTS.");
            return;
        }
        clusterDelNodeSlots(myself);
        clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
        addReply(c,shared.ok);
    } else if ((!strcasecmp(c->argv[1]->ptr,"addslots") ||
               !strcasecmp(c->argv[1]->ptr,"delslots")) && c->argc >= 3)
    {
        /* CLUSTER ADDSLOTS <slot> [slot] ... */
        /* CLUSTER DELSLOTS <slot> [slot] ... */
        int j, slot;
        unsigned char *slots = zmalloc(REDIS_CLUSTER_SLOTS);
        int del = !strcasecmp(c->argv[1]->ptr,"delslots");

        memset(slots,0,REDIS_CLUSTER_SLOTS);
        /* Check that all the arguments are parsable and that all the
         * slots are not already busy (or not already unassigned, for
         * DELSLOTS). Nothing is modified until every argument is valid. */
        for (j = 2; j < c->argc; j++) {
            if ((slot = getSlotOrReply(c,c->argv[j])) == -1) {
                zfree(slots);
                return;
            }
            if (del && server.cluster->slots[slot] == NULL) {
                addReplyErrorFormat(c,"Slot %d is already unassigned", slot);
                zfree(slots);
                return;
            } else if (!del && server.cluster->slots[slot]) {
                addReplyErrorFormat(c,"Slot %d is already busy", slot);
                zfree(slots);
                return;
            }
            if (slots[slot]++ == 1) {
                addReplyErrorFormat(c,"Slot %d specified multiple times",
                    slot);
                zfree(slots);
                return;
            }
        }
        for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
            if (slots[j]) {
                int retval;

                /* If this slot was set as importing we can clear this
                 * state as now we are the real owner of the slot. */
                if (server.cluster->importing_slots_from[j])
                    server.cluster->importing_slots_from[j] = NULL;

                retval = del ? clusterDelSlot(j) :
                               clusterAddSlot(myself,j);
                redisAssertWithInfo(c,NULL,retval == REDIS_OK);
            }
        }
        zfree(slots);
        clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
        addReply(c,shared.ok);
    } else if (!strcasecmp(c->argv[1]->ptr,"setslot") && c->argc >= 4) {
        /* SETSLOT 10 MIGRATING <node ID> */
        /* SETSLOT 10 IMPORTING <node ID> */
        /* SETSLOT 10 STABLE */
        /* SETSLOT 10 NODE <node ID> */
        int slot;
        clusterNode *n;

        if ((slot = getSlotOrReply(c,c->argv[2])) == -1) return;

        if (!strcasecmp(c->argv[3]->ptr,"migrating") && c->argc == 5) {
            if (server.cluster->slots[slot] != myself) {
                addReplyErrorFormat(c,"I'm not the owner of hash slot %d",slot);
                return;
            }
            if ((n = clusterLookupNode(c->argv[4]->ptr)) == NULL) {
                addReplyErrorFormat(c,"I don't know about node %s",
                    (char*)c->argv[4]->ptr);
                return;
            }
            server.cluster->migrating_slots_to[slot] = n;
        } else if (!strcasecmp(c->argv[3]->ptr,"importing") && c->argc == 5) {
            if (server.cluster->slots[slot] == myself) {
                addReplyErrorFormat(c,
                    "I'm already the owner of hash slot %d",slot);
                return;
            }
            if ((n = clusterLookupNode(c->argv[4]->ptr)) == NULL) {
                /* Report the node ID (argv[4]), not the action name. */
                addReplyErrorFormat(c,"I don't know about node %s",
                    (char*)c->argv[4]->ptr);
                return;
            }
            server.cluster->importing_slots_from[slot] = n;
        } else if (!strcasecmp(c->argv[3]->ptr,"stable") && c->argc == 4) {
            /* CLUSTER SETSLOT <SLOT> STABLE */
            server.cluster->importing_slots_from[slot] = NULL;
            server.cluster->migrating_slots_to[slot] = NULL;
        } else if (!strcasecmp(c->argv[3]->ptr,"node") && c->argc == 5) {
            /* CLUSTER SETSLOT <SLOT> NODE <NODE ID> */
            n = clusterLookupNode(c->argv[4]->ptr);

            if (!n) {
                addReplyErrorFormat(c,"Unknown node %s",
                    (char*)c->argv[4]->ptr);
                return;
            }
            /* If this hash slot was served by 'myself' before to switch
             * make sure there are no longer local keys for this hash slot. */
            if (server.cluster->slots[slot] == myself && n != myself) {
                if (countKeysInSlot(slot) != 0) {
                    addReplyErrorFormat(c, "Can't assign hashslot %d to a different node while I still hold keys for this hash slot.", slot);
                    return;
                }
            }
            /* If this slot is in migrating status but we have no keys
             * for it assigning the slot to another node will clear
             * the migrating status. */
            if (countKeysInSlot(slot) == 0 &&
                server.cluster->migrating_slots_to[slot])
                server.cluster->migrating_slots_to[slot] = NULL;

            /* If this node was importing this slot, assigning the slot to
             * itself also clears the importing status. */
            if (n == myself &&
                server.cluster->importing_slots_from[slot])
            {
                /* This slot was manually migrated, set this node configEpoch
                 * to a new epoch so that the new version can be propagated
                 * by the cluster.
                 *
                 * FIXME: the new version should be agreed otherwise a race
                 * is possible if while a manual resharding is in progress
                 * the master is failed over by a slave. */
                uint64_t maxEpoch = clusterGetMaxEpoch();

                if (myself->configEpoch == 0 ||
                    myself->configEpoch != maxEpoch)
                {
                    server.cluster->currentEpoch++;
                    myself->configEpoch = server.cluster->currentEpoch;
                    clusterDoBeforeSleep(CLUSTER_TODO_FSYNC_CONFIG);
                }
                server.cluster->importing_slots_from[slot] = NULL;
            }
            clusterDelSlot(slot);
            clusterAddSlot(n,slot);
        } else {
            addReplyError(c,"Invalid CLUSTER SETSLOT action or number of arguments");
            return;
        }
        clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG|CLUSTER_TODO_UPDATE_STATE);
        addReply(c,shared.ok);
    } else if (!strcasecmp(c->argv[1]->ptr,"info") && c->argc == 2) {
        /* CLUSTER INFO */
        char *statestr[] = {"ok","fail","needhelp"};
        int slots_assigned = 0, slots_ok = 0, slots_pfail = 0, slots_fail = 0;
        int j;

        /* Classify every assigned slot according to its owner's flags. */
        for (j = 0; j < REDIS_CLUSTER_SLOTS; j++) {
            clusterNode *n = server.cluster->slots[j];

            if (n == NULL) continue;
            slots_assigned++;
            if (nodeFailed(n)) {
                slots_fail++;
            } else if (nodeTimedOut(n)) {
                slots_pfail++;
            } else {
                slots_ok++;
            }
        }

        sds info = sdscatprintf(sdsempty(),
            "cluster_state:%s\r\n"
            "cluster_slots_assigned:%d\r\n"
            "cluster_slots_ok:%d\r\n"
            "cluster_slots_pfail:%d\r\n"
            "cluster_slots_fail:%d\r\n"
            "cluster_known_nodes:%lu\r\n"
            "cluster_size:%d\r\n"
            "cluster_current_epoch:%llu\r\n"
            "cluster_stats_messages_sent:%lld\r\n"
            "cluster_stats_messages_received:%lld\r\n"
            , statestr[server.cluster->state],
            slots_assigned,
            slots_ok,
            slots_pfail,
            slots_fail,
            dictSize(server.cluster->nodes),
            server.cluster->size,
            (unsigned long long) server.cluster->currentEpoch,
            server.cluster->stats_bus_messages_sent,
            server.cluster->stats_bus_messages_received
        );
        /* Emit the info as a bulk reply: length header, payload, CRLF. */
        addReplySds(c,sdscatprintf(sdsempty(),"$%lu\r\n",
            (unsigned long)sdslen(info)));
        addReplySds(c,info);
        addReply(c,shared.crlf);
    } else if (!strcasecmp(c->argv[1]->ptr,"saveconfig") && c->argc == 2) {
        /* CLUSTER SAVECONFIG */
        int retval = clusterSaveConfig(1);

        /* Exactly one reply must be sent: OK or the error. */
        if (retval == 0)
            addReply(c,shared.ok);
        else
            addReplyErrorFormat(c,"error saving the cluster node config: %s",
                strerror(errno));
    } else if (!strcasecmp(c->argv[1]->ptr,"keyslot") && c->argc == 3) {
        /* CLUSTER KEYSLOT <key> */
        sds key = c->argv[2]->ptr;

        addReplyLongLong(c,keyHashSlot(key,sdslen(key)));
    } else if (!strcasecmp(c->argv[1]->ptr,"countkeysinslot") && c->argc == 3) {
        /* CLUSTER COUNTKEYSINSLOT <slot> */
        long long slot;

        if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != REDIS_OK)
            return;
        if (slot < 0 || slot >= REDIS_CLUSTER_SLOTS) {
            addReplyError(c,"Invalid slot");
            return;
        }
        addReplyLongLong(c,countKeysInSlot(slot));
    } else if (!strcasecmp(c->argv[1]->ptr,"getkeysinslot") && c->argc == 4) {
        /* CLUSTER GETKEYSINSLOT <slot> <count> */
        long long maxkeys, slot;
        unsigned int numkeys, j;
        robj **keys;

        if (getLongLongFromObjectOrReply(c,c->argv[2],&slot,NULL) != REDIS_OK)
            return;
        if (getLongLongFromObjectOrReply(c,c->argv[3],&maxkeys,NULL) != REDIS_OK)
            return;
        if (slot < 0 || slot >= REDIS_CLUSTER_SLOTS || maxkeys < 0) {
            addReplyError(c,"Invalid slot or number of keys");
            return;
        }

        keys = zmalloc(sizeof(robj*)*maxkeys);
        numkeys = getKeysInSlot(slot, keys, maxkeys);
        addReplyMultiBulkLen(c,numkeys);
        for (j = 0; j < numkeys; j++) addReplyBulk(c,keys[j]);
        zfree(keys);
    } else if (!strcasecmp(c->argv[1]->ptr,"forget") && c->argc == 3) {
        /* CLUSTER FORGET <NODE ID> */
        clusterNode *n = clusterLookupNode(c->argv[2]->ptr);

        if (!n) {
            addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
            return;
        } else if (n == myself) {
            addReplyError(c,"I tried hard but I can't forget myself...");
            return;
        } else if (nodeIsSlave(myself) && myself->slaveof == n) {
            addReplyError(c,"Can't forget my master!");
            return;
        }
        clusterBlacklistAddNode(n);
        clusterDelNode(n);
        clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
        addReply(c,shared.ok);
    } else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc == 3) {
        /* CLUSTER REPLICATE <NODE ID> */
        clusterNode *n = clusterLookupNode(c->argv[2]->ptr);

        /* Lookup the specified node in our table. */
        if (!n) {
            addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
            return;
        }

        /* I can't replicate myself. */
        if (n == myself) {
            addReplyError(c,"Can't replicate myself");
            return;
        }

        /* Can't replicate a slave. */
        if (n->slaveof != NULL) {
            addReplyError(c,"I can only replicate a master, not a slave.");
            return;
        }

        /* If the instance is currently a master, it should have no assigned
         * slots nor keys to accept to replicate some other node.
         * Slaves can switch to another master without issues. */
        if (nodeIsMaster(myself) &&
            (myself->numslots != 0 || dictSize(server.db[0].dict) != 0)) {
            addReplyError(c,"To set a master the node must be empty and without assigned slots.");
            return;
        }

        /* Set the master. */
        clusterSetMaster(n);
        clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE|CLUSTER_TODO_SAVE_CONFIG);
        addReply(c,shared.ok);
    } else if (!strcasecmp(c->argv[1]->ptr,"slaves") && c->argc == 3) {
        /* CLUSTER SLAVES <NODE ID> */
        clusterNode *n = clusterLookupNode(c->argv[2]->ptr);
        int j;

        /* Lookup the specified node in our table. */
        if (!n) {
            addReplyErrorFormat(c,"Unknown node %s", (char*)c->argv[2]->ptr);
            return;
        }

        if (nodeIsSlave(n)) {
            addReplyError(c,"The specified node is not a master");
            return;
        }

        /* One bulk reply per slave, in the CLUSTER NODES line format. */
        addReplyMultiBulkLen(c,n->numslaves);
        for (j = 0; j < n->numslaves; j++) {
            sds ni = clusterGenNodeDescription(n->slaves[j]);

            addReplyBulkCString(c,ni);
            sdsfree(ni);
        }
    } else if (!strcasecmp(c->argv[1]->ptr,"failover") && c->argc == 2) {
        /* CLUSTER FAILOVER */
        if (nodeIsMaster(myself)) {
            addReplyError(c,"You should send CLUSTER FAILOVER to a slave");
            return;
        } else if (myself->slaveof == NULL || nodeFailed(myself->slaveof) ||
                   myself->slaveof->link == NULL)
        {
            addReplyError(c,"Master is down or failed, "
                            "please use CLUSTER FAILOVER FORCE");
            return;
        }
        resetManualFailover();
        server.cluster->mf_end = mstime() + REDIS_CLUSTER_MF_TIMEOUT;
        clusterSendMFStart(myself->slaveof);
        redisLog(REDIS_WARNING,"Manual failover user request accepted.");
        addReply(c,shared.ok);
    } else {
        addReplyError(c,"Wrong CLUSTER subcommand or number of arguments");
    }
}
/* -----------------------------------------------------------------------------
2012-04-01 12:51:40 +02:00
* DUMP , RESTORE and MIGRATE commands
2011-03-29 17:51:15 +02:00
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
2012-04-01 12:51:40 +02:00
/* Serialize the object 'o' in DUMP format into the io stream pointed by
 * 'payload', which is initialized here with a new empty SDS buffer.
 * This function can't fail (serialization errors trigger an assertion).
 *
 * The payload is an RDB-like serialization, that is, an object type byte
 * followed by the serialized object, plus a footer:
 *
 * ----------------+---------------------+---------------+
 * ... RDB payload | 2 bytes RDB version | 8 bytes CRC64 |
 * ----------------+---------------------+---------------+
 *
 * RDB version and CRC are both in little endian. The CRC covers everything
 * that precedes it, the version bytes included. This is the format
 * understood by RESTORE. */
void createDumpPayload(rio *payload, robj *o) {
    unsigned char version[2];
    uint64_t checksum;
    sds out;

    /* Type byte + serialized object. */
    rioInitWithBuffer(payload,sdsempty());
    redisAssert(rdbSaveObjectType(payload,o));
    redisAssert(rdbSaveObject(payload,o));
    out = payload->io.buffer.ptr;

    /* RDB version, least significant byte first. */
    version[0] = REDIS_RDB_VERSION & 0xff;
    version[1] = (REDIS_RDB_VERSION >> 8) & 0xff;
    out = sdscatlen(out,version,2);

    /* CRC64 of everything written so far. */
    checksum = crc64(0,(unsigned char*)out,sdslen(out));
    memrev64ifbe(&checksum);
    out = sdscatlen(out,&checksum,8);

    /* sdscatlen() may have reallocated the buffer. */
    payload->io.buffer.ptr = out;
}
2012-04-01 12:51:40 +02:00
/* Verify that the RDB version of the dump payload matches the one of this Redis
2012-04-02 13:10:39 +02:00
* instance and that the checksum is ok .
2012-04-01 12:51:40 +02:00
* If the DUMP payload looks valid REDIS_OK is returned , otherwise REDIS_ERR
* is returned . */
int verifyDumpPayload ( unsigned char * p , size_t len ) {
2012-04-02 13:10:39 +02:00
unsigned char * footer ;
2012-04-01 12:51:40 +02:00
uint16_t rdbver ;
2012-04-02 13:10:39 +02:00
uint64_t crc ;
2012-04-01 12:51:40 +02:00
2012-04-02 13:10:39 +02:00
/* At least 2 bytes of RDB version and 8 of CRC64 should be present. */
2012-04-01 12:51:40 +02:00
if ( len < 10 ) return REDIS_ERR ;
footer = p + ( len - 10 ) ;
2012-04-02 10:52:39 +02:00
/* Verify RDB version */
2012-04-02 10:46:24 +02:00
rdbver = ( footer [ 1 ] < < 8 ) | footer [ 0 ] ;
2012-04-01 12:51:40 +02:00
if ( rdbver ! = REDIS_RDB_VERSION ) return REDIS_ERR ;
2012-04-02 10:52:39 +02:00
2012-04-02 13:10:39 +02:00
/* Verify CRC64 */
2012-04-09 12:33:57 +02:00
crc = crc64 ( 0 , p , len - 8 ) ;
2012-04-02 13:10:39 +02:00
memrev64ifbe ( & crc ) ;
return ( memcmp ( & crc , footer + 2 , 8 ) = = 0 ) ? REDIS_OK : REDIS_ERR ;
2012-04-01 12:51:40 +02:00
/* DUMP keyname
 *
 * Reply with the serialized (DUMP format) value stored at the key, or with
 * a null bulk reply if the key does not exist.
 *
 * DUMP is actually not used by Redis Cluster but it is the obvious
 * complement of RESTORE and can be useful for different applications. */
void dumpCommand(redisClient *c) {
    rio payload;
    robj *value, *reply;

    /* Check if the key is here. */
    value = lookupKeyRead(c->db,c->argv[1]);
    if (value == NULL) {
        addReply(c,shared.nullbulk);
        return;
    }

    /* Create the DUMP encoded representation. */
    createDumpPayload(&payload,value);

    /* Transfer to the client: the string object takes the payload buffer,
     * and is released once the reply is queued. */
    reply = createObject(REDIS_STRING,payload.io.buffer.ptr);
    addReplyBulk(c,reply);
    decrRefCount(reply);
}
2012-11-07 10:57:23 +01:00
/* RESTORE key ttl serialized-value [REPLACE] */
2011-03-29 17:51:15 +02:00
void restoreCommand ( redisClient * c ) {
2014-01-09 11:09:23 +01:00
long long ttl ;
2011-05-13 17:31:00 +02:00
rio payload ;
2012-11-07 10:57:23 +01:00
int j , type , replace = 0 ;
2011-05-13 22:14:39 +02:00
robj * obj ;
2011-03-29 17:51:15 +02:00
2012-11-07 10:57:23 +01:00
/* Parse additional options */
for ( j = 4 ; j < c - > argc ; j + + ) {
if ( ! strcasecmp ( c - > argv [ j ] - > ptr , " replace " ) ) {
replace = 1 ;
} else {
addReply ( c , shared . syntaxerr ) ;
return ;
2011-03-29 17:51:15 +02:00
/* Make sure this key does not already exist here... */
2012-11-07 10:57:23 +01:00
if ( ! replace & & lookupKeyWrite ( c - > db , c - > argv [ 1 ] ) ! = NULL ) {
2011-03-29 17:51:15 +02:00
addReplyError ( c , " Target key name is busy. " ) ;
return ;
/* Check if the TTL value makes sense */
2014-01-09 11:09:23 +01:00
if ( getLongLongFromObjectOrReply ( c , c - > argv [ 2 ] , & ttl , NULL ) ! = REDIS_OK ) {
2011-03-29 17:51:15 +02:00
return ;
} else if ( ttl < 0 ) {
addReplyError ( c , " Invalid TTL value, must be >= 0 " ) ;
return ;
2012-04-02 13:10:39 +02:00
/* Verify RDB version and data checksum. */
2014-01-09 11:09:23 +01:00
if ( verifyDumpPayload ( c - > argv [ 3 ] - > ptr , sdslen ( c - > argv [ 3 ] - > ptr ) ) = = REDIS_ERR )
2012-04-01 12:51:40 +02:00
addReplyError ( c , " DUMP payload version or checksum are wrong " ) ;
return ;
2011-09-22 16:00:40 +02:00
rioInitWithBuffer ( & payload , c - > argv [ 3 ] - > ptr ) ;
2011-05-13 22:14:39 +02:00
if ( ( ( type = rdbLoadObjectType ( & payload ) ) = = - 1 ) | |
( ( obj = rdbLoadObject ( type , & payload ) ) = = NULL ) )
2011-04-05 13:57:28 +02:00
2011-05-13 22:14:39 +02:00
addReplyError ( c , " Bad data format " ) ;
2011-03-29 17:51:15 +02:00
return ;
2012-11-07 10:57:23 +01:00
/* Remove the old key if needed. */
if ( replace ) dbDelete ( c - > db , c - > argv [ 1 ] ) ;
2011-03-29 17:51:15 +02:00
/* Create the key and set the TTL if any */
2011-05-13 22:14:39 +02:00
dbAdd ( c - > db , c - > argv [ 1 ] , obj ) ;
2012-04-02 11:14:47 +02:00
if ( ttl ) setExpire ( c - > db , c - > argv [ 1 ] , mstime ( ) + ttl ) ;
2011-10-20 11:17:30 +02:00
signalModifiedKey ( c - > db , c - > argv [ 1 ] ) ;
2011-03-29 17:51:15 +02:00
addReply ( c , shared . ok ) ;
2011-10-03 15:45:14 +02:00
server . dirty + + ;
2011-03-29 17:51:15 +02:00
2012-11-12 00:45:10 +01:00
/* MIGRATE socket cache implementation.
 *
 * We keep a map between "host:port" and the TCP socket we recently used to
 * connect to that instance.
 * These sockets are closed when the maximum number of cached items is
 * reached, and also in serverCron() when they have been unused for more
 * than a few seconds. */
# define MIGRATE_SOCKET_CACHE_ITEMS 64 /* Max number of items in the cache. */
# define MIGRATE_SOCKET_CACHE_TTL 10 /* Close cached sockets after 10 sec. */
/* One cached connection, stored as a value of server.migrate_cached_sockets. */
typedef struct migrateCachedSocket {
/* Socket connected to the target instance. */
int fd ;
/* server.unixtime at the moment the socket was last handed out. */
time_t last_use_time ;
} migrateCachedSocket ;
/* Return a TCP socket connected with the target instance, possibly returning
 * a cached one.
 *
 * 'host' and 'port' identify the target: they are used both to build the
 * "host:port" cache key and to establish the connection, so the cached
 * entry always refers to the endpoint we actually connected to.
 * 'timeout' is the maximum time to wait for the connection to complete,
 * as accepted by aeWait().
 *
 * This function is responsible for sending errors to the client 'c' if a
 * connection can't be established. In this case -1 is returned.
 * Otherwise on success the socket is returned, and the caller should not
 * attempt to free it after usage.
 *
 * If the caller detects an error while using the socket, migrateCloseSocket()
 * should be called so that the connection will be created from scratch
 * the next time. */
int migrateGetSocket(redisClient *c, robj *host, robj *port, long timeout) {
    int fd;
    sds name = sdsempty();
    migrateCachedSocket *cs;

    /* Check if we have an already cached socket for this host:port pair. */
    name = sdscatlen(name,host->ptr,sdslen(host->ptr));
    name = sdscatlen(name,":",1);
    name = sdscatlen(name,port->ptr,sdslen(port->ptr));
    cs = dictFetchValue(server.migrate_cached_sockets,name);
    if (cs) {
        sdsfree(name);
        cs->last_use_time = server.unixtime;
        return cs->fd;
    }

    /* No cached socket, create one. */
    if (dictSize(server.migrate_cached_sockets) == MIGRATE_SOCKET_CACHE_ITEMS) {
        /* Too many items, drop one at random. */
        dictEntry *de = dictGetRandomKey(server.migrate_cached_sockets);
        cs = dictGetVal(de);
        close(cs->fd);
        zfree(cs);
        dictDelete(server.migrate_cached_sockets,dictGetKey(de));
    }

    /* Create the socket. Connect to the host/port we were asked for rather
     * than reading them again from the client argument vector. */
    fd = anetTcpNonBlockConnect(server.neterr,host->ptr,atoi(port->ptr));
    if (fd == -1) {
        sdsfree(name);
        addReplyErrorFormat(c,"Can't connect to target node: %s",
            server.neterr);
        return -1;
    }
    anetEnableTcpNoDelay(server.neterr,fd);

    /* Check if it connects within the specified timeout. */
    if ((aeWait(fd,AE_WRITABLE,timeout) & AE_WRITABLE) == 0) {
        sdsfree(name);
        addReplySds(c,sdsnew("-IOERR error or timeout connecting to the client\r\n"));
        close(fd);
        return -1;
    }

    /* Add to the cache and return it to the caller. 'name' is not freed
     * here since it is handed to the dictionary as the entry key. */
    cs = zmalloc(sizeof(*cs));
    cs->fd = fd;
    cs->last_use_time = server.unixtime;
    dictAdd(server.migrate_cached_sockets,name,cs);
    return fd;
}
/* Close and forget the cached connection for host:port, if there is one.
 * It is a no-op when no socket is cached for that pair. */
void migrateCloseSocket(robj *host, robj *port) {
    migrateCachedSocket *cached;
    sds key = sdsempty();

    /* Rebuild the same "host:port" key used when the socket was cached. */
    key = sdscatlen(key,host->ptr,sdslen(host->ptr));
    key = sdscatlen(key,":",1);
    key = sdscatlen(key,port->ptr,sdslen(port->ptr));

    cached = dictFetchValue(server.migrate_cached_sockets,key);
    if (cached != NULL) {
        close(cached->fd);
        zfree(cached);
        dictDelete(server.migrate_cached_sockets,key);
    }
    sdsfree(key);
}
void migrateCloseTimedoutSockets ( void ) {
dictIterator * di = dictGetSafeIterator ( server . migrate_cached_sockets ) ;
dictEntry * de ;
while ( ( de = dictNext ( di ) ) ! = NULL ) {
migrateCachedSocket * cs = dictGetVal ( de ) ;
if ( ( server . unixtime - cs - > last_use_time ) > MIGRATE_SOCKET_CACHE_TTL ) {
close ( cs - > fd ) ;
zfree ( cs ) ;
dictDelete ( server . migrate_cached_sockets , dictGetKey ( de ) ) ;
dictReleaseIterator ( di ) ;
2012-11-07 15:32:27 +01:00
/* MIGRATE host port key dbid timeout [COPY | REPLACE] */
2011-03-29 17:51:15 +02:00
void migrateCommand ( redisClient * c ) {
2012-11-14 11:30:24 +01:00
int fd , copy , replace , j ;
2011-03-29 17:51:15 +02:00
long timeout ;
long dbid ;
2012-11-14 11:30:24 +01:00
long long ttl , expireat ;
2011-03-29 17:51:15 +02:00
robj * o ;
2011-05-13 17:31:00 +02:00
rio cmd , payload ;
2012-11-14 11:30:24 +01:00
int retry_num = 0 ;
try_again :
/* Initialization */
copy = 0 ;
replace = 0 ;
ttl = 0 ;
2011-03-29 17:51:15 +02:00
2012-11-07 15:32:27 +01:00
/* Parse additional options */
for ( j = 6 ; j < c - > argc ; j + + ) {
if ( ! strcasecmp ( c - > argv [ j ] - > ptr , " copy " ) ) {
copy = 1 ;
} else if ( ! strcasecmp ( c - > argv [ j ] - > ptr , " replace " ) ) {
replace = 1 ;
} else {
addReply ( c , shared . syntaxerr ) ;
return ;
2011-03-29 17:51:15 +02:00
/* Sanity check */
if ( getLongFromObjectOrReply ( c , c - > argv [ 5 ] , & timeout , NULL ) ! = REDIS_OK )
return ;
if ( getLongFromObjectOrReply ( c , c - > argv [ 4 ] , & dbid , NULL ) ! = REDIS_OK )
return ;
2012-11-12 15:04:54 +01:00
if ( timeout < = 0 ) timeout = 1000 ;
2011-03-29 17:51:15 +02:00
/* Check if the key is here. If not we reply with success as there is
* nothing to migrate ( for instance the key expired in the meantime ) , but
* we include such information in the reply string . */
if ( ( o = lookupKeyRead ( c - > db , c - > argv [ 3 ] ) ) = = NULL ) {
2011-10-17 16:44:08 +02:00
addReplySds ( c , sdsnew ( " +NOKEY \r \n " ) ) ;
2011-03-29 17:51:15 +02:00
return ;
/* Connect */
2012-11-12 00:45:10 +01:00
fd = migrateGetSocket ( c , c - > argv [ 1 ] , c - > argv [ 2 ] , timeout ) ;
if ( fd = = - 1 ) return ; /* error sent to the client by migrateGetSocket() */
2011-03-29 17:51:15 +02:00
2012-04-01 12:51:40 +02:00
/* Create RESTORE payload and generate the protocol to call the command. */
2011-09-22 16:00:40 +02:00
rioInitWithBuffer ( & cmd , sdsempty ( ) ) ;
2011-10-04 18:43:03 +02:00
redisAssertWithInfo ( c , NULL , rioWriteBulkCount ( & cmd , ' * ' , 2 ) ) ;
redisAssertWithInfo ( c , NULL , rioWriteBulkString ( & cmd , " SELECT " , 6 ) ) ;
redisAssertWithInfo ( c , NULL , rioWriteBulkLongLong ( & cmd , dbid ) ) ;
2011-03-29 17:51:15 +02:00
2012-04-03 15:10:42 +02:00
expireat = getExpire ( c - > db , c - > argv [ 3 ] ) ;
if ( expireat ! = - 1 ) {
ttl = expireat - mstime ( ) ;
if ( ttl < 1 ) ttl = 1 ;
2012-11-07 15:32:27 +01:00
redisAssertWithInfo ( c , NULL , rioWriteBulkCount ( & cmd , ' * ' , replace ? 5 : 4 ) ) ;
2013-02-20 17:36:54 +01:00
if ( server . cluster_enabled )
redisAssertWithInfo ( c , NULL ,
rioWriteBulkString ( & cmd , " RESTORE-ASKING " , 14 ) ) ;
redisAssertWithInfo ( c , NULL , rioWriteBulkString ( & cmd , " RESTORE " , 7 ) ) ;
2012-06-05 21:50:10 +02:00
redisAssertWithInfo ( c , NULL , sdsEncodedObject ( c - > argv [ 3 ] ) ) ;
2011-10-04 18:43:03 +02:00
redisAssertWithInfo ( c , NULL , rioWriteBulkString ( & cmd , c - > argv [ 3 ] - > ptr , sdslen ( c - > argv [ 3 ] - > ptr ) ) ) ;
2012-04-10 16:46:29 +02:00
redisAssertWithInfo ( c , NULL , rioWriteBulkLongLong ( & cmd , ttl ) ) ;
2011-03-29 17:51:15 +02:00
2013-01-17 01:00:20 +08:00
/* Emit the payload argument, that is the serialized object using
2012-11-07 15:32:27 +01:00
* the DUMP format . */
2012-04-01 12:51:40 +02:00
createDumpPayload ( & payload , o ) ;
redisAssertWithInfo ( c , NULL , rioWriteBulkString ( & cmd , payload . io . buffer . ptr ,
sdslen ( payload . io . buffer . ptr ) ) ) ;
2011-05-13 17:31:00 +02:00
sdsfree ( payload . io . buffer . ptr ) ;
2012-11-07 15:32:27 +01:00
/* Add the REPLACE option to the RESTORE command if it was specified
* as a MIGRATE option . */
if ( replace )
redisAssertWithInfo ( c , NULL , rioWriteBulkString ( & cmd , " REPLACE " , 7 ) ) ;
2013-01-17 01:00:20 +08:00
/* Transfer the query to the other node in 64K chunks. */
2012-11-14 11:30:24 +01:00
errno = 0 ;
2011-03-29 17:51:15 +02:00
2011-05-13 17:31:00 +02:00
sds buf = cmd . io . buffer . ptr ;
size_t pos = 0 , towrite ;
int nwritten = 0 ;
while ( ( towrite = sdslen ( buf ) - pos ) > 0 ) {
towrite = ( towrite > ( 64 * 1024 ) ? ( 64 * 1024 ) : towrite ) ;
2012-04-03 12:17:40 +02:00
nwritten = syncWrite ( fd , buf + pos , towrite , timeout ) ;
2011-05-13 17:31:00 +02:00
if ( nwritten ! = ( signed ) towrite ) goto socket_wr_err ;
pos + = nwritten ;
2011-03-29 17:51:15 +02:00
2011-05-13 17:31:00 +02:00
/* Read back the reply. */
2011-03-29 17:51:15 +02:00
char buf1 [ 1024 ] ;
char buf2 [ 1024 ] ;
/* Read the two replies */
if ( syncReadLine ( fd , buf1 , sizeof ( buf1 ) , timeout ) < = 0 )
goto socket_rd_err ;
if ( syncReadLine ( fd , buf2 , sizeof ( buf2 ) , timeout ) < = 0 )
2011-05-13 17:31:00 +02:00
goto socket_rd_err ;
2011-03-29 17:51:15 +02:00
if ( buf1 [ 0 ] = = ' - ' | | buf2 [ 0 ] = = ' - ' ) {
addReplyErrorFormat ( c , " Target instance replied with error: %s " ,
( buf1 [ 0 ] = = ' - ' ) ? buf1 + 1 : buf2 + 1 ) ;
} else {
2011-10-03 15:51:10 +02:00
robj * aux ;
2012-11-07 15:32:27 +01:00
if ( ! copy ) {
/* No COPY option: remove the local key, signal the change. */
dbDelete ( c - > db , c - > argv [ 3 ] ) ;
signalModifiedKey ( c - > db , c - > argv [ 3 ] ) ;
2011-03-29 17:51:15 +02:00
addReply ( c , shared . ok ) ;
2011-10-03 15:51:10 +02:00
server . dirty + + ;
/* Translate MIGRATE as DEL for replication/AOF. */
2011-10-17 16:39:05 +02:00
aux = createStringObject ( " DEL " , 3 ) ;
2011-10-03 15:51:10 +02:00
rewriteClientCommandVector ( c , 2 , aux , c - > argv [ 3 ] ) ;
decrRefCount ( aux ) ;
2011-03-29 17:51:15 +02:00
2011-05-13 17:31:00 +02:00
sdsfree ( cmd . io . buffer . ptr ) ;
2011-04-01 18:59:28 +02:00
return ;
2011-03-29 17:51:15 +02:00
socket_wr_err :
2011-05-13 17:31:00 +02:00
sdsfree ( cmd . io . buffer . ptr ) ;
2012-11-12 00:45:10 +01:00
migrateCloseSocket ( c - > argv [ 1 ] , c - > argv [ 2 ] ) ;
2012-11-14 11:30:24 +01:00
if ( errno ! = ETIMEDOUT & & retry_num + + = = 0 ) goto try_again ;
addReplySds ( c ,
sdsnew ( " -IOERR error or timeout writing to target instance \r \n " ) ) ;
2011-04-01 18:59:28 +02:00
return ;
2011-03-29 17:51:15 +02:00
socket_rd_err :
2011-05-13 17:31:00 +02:00
sdsfree ( cmd . io . buffer . ptr ) ;
2012-11-12 00:45:10 +01:00
migrateCloseSocket ( c - > argv [ 1 ] , c - > argv [ 2 ] ) ;
2012-11-14 11:30:24 +01:00
if ( errno ! = ETIMEDOUT & & retry_num + + = = 0 ) goto try_again ;
addReplySds ( c ,
sdsnew ( " -IOERR error or timeout reading from target node \r \n " ) ) ;
2011-04-01 18:59:28 +02:00
return ;
2014-01-14 16:33:14 +01:00
/* -----------------------------------------------------------------------------
* Cluster functions related to serving / redirecting clients
* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
2011-10-17 17:35:23 +02:00
/* ASKING
 *
 * Clients must send ASKING right before the command they were redirected
 * for with a -ASK error, so that the target instance accepts it. See the
 * Redis Cluster specification for more information. The command is only
 * available when cluster support is enabled. */
void askingCommand(redisClient *c) {
    if (server.cluster_enabled != 0) {
        c->flags |= REDIS_ASKING;
        addReply(c,shared.ok);
    } else {
        addReplyError(c,"This instance has cluster support disabled");
    }
}
2014-01-14 16:33:14 +01:00
/* READONLY
 *
 * Used by clients to enter the read-only mode. In this mode slaves will not
 * redirect clients as long as they only issue read-only commands against
 * keys served by the slave's master. The command is only available when
 * cluster support is enabled. */
void readonlyCommand(redisClient *c) {
    if (server.cluster_enabled != 0) {
        c->flags |= REDIS_READONLY;
        addReply(c,shared.ok);
    } else {
        addReplyError(c,"This instance has cluster support disabled");
    }
}
/* READWRITE
 *
 * Leave the read-only mode entered with READONLY by clearing the
 * corresponding client flag. Always replies with OK. */
void readwriteCommand(redisClient *c) {
    c->flags = c->flags & ~REDIS_READONLY;
    addReply(c,shared.ok);
}
2011-03-29 17:51:15 +02:00
2013-02-15 11:50:54 +01:00
/* Return the pointer to the cluster node that is able to serve the command.
 * For the function to succeed the command should only target either:
 *
 * 1) A single key (even multiple times like LPOPRPUSH mylist mylist).
 * 2) Multiple keys in the same hash slot, while the slot is stable (no
 *    resharding in progress).
 *
 * On success the function returns the node that is able to serve the request.
 * If the node is not 'myself' a redirection must be performed. The kind of
 * redirection is specified setting the integer passed by reference
 * 'error_code', which will be set to REDIS_CLUSTER_REDIR_ASK or
 * REDIS_CLUSTER_REDIR_MOVED.
 *
 * When the node is 'myself' 'error_code' is set to REDIS_CLUSTER_REDIR_NONE.
 *
 * If the command fails NULL is returned, and the reason of the failure is
 * provided via 'error_code', which will be set to:
 *
 * REDIS_CLUSTER_REDIR_CROSS_SLOT if the request contains multiple keys that
 * don't belong to the same hash slot.
 *
 * REDIS_CLUSTER_REDIR_UNSTABLE if the request contains multiple keys
 * belonging to the same slot, but the slot is not stable (in migration or
 * importing state, likely because a resharding is in progress).
 *
 * Both 'hashslot' and 'error_code' may be NULL if the caller is not
 * interested in them. When the request contains at least one key and
 * 'hashslot' is not NULL, the slot of the request is stored there. */
clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *error_code) {
    clusterNode *n = NULL;
    robj *firstkey = NULL;
    int multiple_keys = 0;
    multiState *ms, _ms;
    multiCmd mc;
    int i, slot = 0, migrating_slot = 0, importing_slot = 0, missing_keys = 0;

    /* Set error code optimistically for the base case. */
    if (error_code) *error_code = REDIS_CLUSTER_REDIR_NONE;

    /* We handle all the cases as if they were EXEC commands, so we have
     * a common code path for everything */
    if (cmd->proc == execCommand) {
        /* If REDIS_MULTI flag is not set EXEC is just going to return an
         * error. */
        if (!(c->flags & REDIS_MULTI)) return myself;
        ms = &c->mstate;
    } else {
        /* In order to have a single codepath create a fake Multi State
         * structure if the client is not in MULTI/EXEC state, this way
         * we have a single codepath below. */
        ms = &_ms;
        _ms.commands = &mc;
        _ms.count = 1;
        mc.argv = argv;
        mc.argc = argc;
        mc.cmd = cmd;
    }

    /* Check that all the keys are in the same hash slot, and obtain this
     * slot and the node associated. */
    for (i = 0; i < ms->count; i++) {
        struct redisCommand *mcmd;
        robj **margv;
        int margc, *keyindex, numkeys, j;

        mcmd = ms->commands[i].cmd;
        margc = ms->commands[i].argc;
        margv = ms->commands[i].argv;

        keyindex = getKeysFromCommand(mcmd,margv,margc,&numkeys);
        for (j = 0; j < numkeys; j++) {
            robj *thiskey = margv[keyindex[j]];
            int thisslot = keyHashSlot((char*)thiskey->ptr,
                                       sdslen(thiskey->ptr));

            if (firstkey == NULL) {
                /* This is the first key we see. Check what is the slot
                 * and node. */
                firstkey = thiskey;
                slot = thisslot;
                n = server.cluster->slots[slot];
                redisAssertWithInfo(c,firstkey,n != NULL);
                /* If we are migrating or importing this slot, we need to check
                 * if we have all the keys in the request (the only way we
                 * can safely serve the request, otherwise we return a TRYAGAIN
                 * error). To do so we set the importing/migrating state and
                 * increment a counter for every missing key. */
                if (n == myself &&
                    server.cluster->migrating_slots_to[slot] != NULL)
                {
                    migrating_slot = 1;
                } else if (server.cluster->importing_slots_from[slot] != NULL) {
                    importing_slot = 1;
                }
            } else {
                /* If it is not the first key, it must either be the same
                 * key again, or a different key hashing to the same slot. */
                if (!equalStringObjects(firstkey,thiskey)) {
                    if (slot != thisslot) {
                        /* Error: multiple keys from different slots.
                         * Always fail here, reporting the reason to the
                         * caller when it asked for it. */
                        getKeysFreeResult(keyindex);
                        if (error_code)
                            *error_code = REDIS_CLUSTER_REDIR_CROSS_SLOT;
                        return NULL;
                    } else {
                        /* Flag this request as one with multiple different
                         * keys. */
                        multiple_keys = 1;
                    }
                }
            }

            /* Migrating / Importing slot? Count keys we don't have. */
            if ((migrating_slot || importing_slot) &&
                lookupKeyRead(&server.db[0],thiskey) == NULL)
            {
                missing_keys++;
            }
        }
        getKeysFreeResult(keyindex);
    }

    /* No key at all in command? then we can serve the request
     * without redirections or errors. */
    if (n == NULL) return myself;

    /* Return the hashslot by reference. */
    if (hashslot) *hashslot = slot;

    /* This request is about a slot we are migrating into another instance?
     * If we don't have all the keys, send an ASK redirection to the node
     * the slot is being migrated to. */
    if (migrating_slot && missing_keys) {
        if (error_code) *error_code = REDIS_CLUSTER_REDIR_ASK;
        return server.cluster->migrating_slots_to[slot];
    }

    /* If we are receiving the slot, and the client correctly flagged the
     * request as "ASKING", we can serve the request. However if the request
     * involves multiple keys and we don't have them all, the only option is
     * to send a TRYAGAIN error. */
    if (importing_slot &&
        ((c->flags & REDIS_ASKING) || (cmd->flags & REDIS_CMD_ASKING)))
    {
        if (multiple_keys && missing_keys) {
            if (error_code) *error_code = REDIS_CLUSTER_REDIR_UNSTABLE;
            return NULL;
        } else {
            return myself;
        }
    }

    /* Handle the read-only client case reading from a slave: if this
     * node is a slave and the request is about an hash slot our master
     * is serving, we can reply without redirection. */
    if ((c->flags & REDIS_READONLY) &&
        (cmd->flags & REDIS_CMD_READONLY) &&
        nodeIsSlave(myself) &&
        myself->slaveof == n)
    {
        return myself;
    }

    /* Base case: just return the right node. However if this node is not
     * myself, set error_code to MOVED since we need to issue a redirection. */
    if (n != myself && error_code) *error_code = REDIS_CLUSTER_REDIR_MOVED;
    return n;
}
2011-03-29 17:51:15 +02:00