diff --git a/src/Makefile.dep b/src/Makefile.dep index e945efb0..9ec6d9c9 100644 --- a/src/Makefile.dep +++ b/src/Makefile.dep @@ -6,112 +6,114 @@ ae_kqueue.o: ae_kqueue.c ae_select.o: ae_select.c anet.o: anet.c fmacros.h anet.h aof.o: aof.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h bio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h bio.h bio.o: bio.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h bio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h bio.h bitops.o: bitops.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h cluster.o: cluster.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h endianconv.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h cluster.h endianconv.h config.o: config.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h cluster.h crc16.o: crc16.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h 
adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h crc64.o: crc64.c db.o: db.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h cluster.h debug.o: debug.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h sha1.h crc64.h bio.h -dict.o: dict.c fmacros.h dict.h zmalloc.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h sha1.h crc64.h bio.h +dict.o: dict.c fmacros.h dict.h zmalloc.h redisassert.h endianconv.o: endianconv.c intset.o: intset.c intset.h zmalloc.h endianconv.h config.h lzf_c.o: lzf_c.c lzfP.h lzf_d.o: lzf_d.c lzfP.h memtest.o: memtest.c config.h multi.o: multi.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h networking.o: networking.c redis.h fmacros.h config.h \ - ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \ - adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h rdb.h \ - rio.h + ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \ + adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h rdb.h \ + rio.h notify.o: notify.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h 
version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h object.o: object.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h pqsort.o: pqsort.c pubsub.o: pubsub.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h rand.o: rand.c rdb.o: rdb.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h lzf.h zipmap.h \ - endianconv.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h lzf.h zipmap.h \ + endianconv.h redis-benchmark.o: redis-benchmark.c fmacros.h ae.h \ - ../deps/hiredis/hiredis.h sds.h adlist.h zmalloc.h + ../deps/hiredis/hiredis.h sds.h adlist.h zmalloc.h redis-check-aof.o: redis-check-aof.c fmacros.h config.h redis-check-dump.o: redis-check-dump.c lzf.h crc64.h redis-cli.o: redis-cli.c fmacros.h version.h ../deps/hiredis/hiredis.h \ - sds.h zmalloc.h ../deps/linenoise/linenoise.h help.h anet.h ae.h + sds.h zmalloc.h ../deps/linenoise/linenoise.h help.h anet.h ae.h redis.o: redis.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h slowlog.h bio.h \ - asciilogo.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h 
version.h util.h rdb.h rio.h cluster.h slowlog.h \ + bio.h asciilogo.h release.o: release.c release.h version.h crc64.h replication.o: replication.c redis.h fmacros.h config.h \ - ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \ - adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h rdb.h \ - rio.h -rio.o: rio.c fmacros.h rio.h sds.h util.h crc64.h + ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h sds.h dict.h \ + adlist.h zmalloc.h anet.h ziplist.h intset.h version.h util.h rdb.h \ + rio.h +rio.o: rio.c fmacros.h rio.h sds.h util.h crc64.h config.h redis.h \ + ../deps/lua/src/lua.h ../deps/lua/src/luaconf.h ae.h dict.h adlist.h \ + zmalloc.h anet.h ziplist.h intset.h version.h rdb.h scripting.o: scripting.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h sha1.h rand.h \ - ../deps/lua/src/lauxlib.h ../deps/lua/src/lua.h \ - ../deps/lua/src/lualib.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h sha1.h rand.h \ + ../deps/lua/src/lauxlib.h ../deps/lua/src/lua.h ../deps/lua/src/lualib.h sds.o: sds.c sds.h zmalloc.h sentinel.o: sentinel.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h \ - ../deps/hiredis/hiredis.h ../deps/hiredis/async.h \ - ../deps/hiredis/hiredis.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h \ + ../deps/hiredis/hiredis.h ../deps/hiredis/async.h \ + ../deps/hiredis/hiredis.h setproctitle.o: setproctitle.c sha1.o: sha1.c sha1.h config.h slowlog.o: slowlog.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h 
util.h rdb.h rio.h slowlog.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h slowlog.h sort.o: sort.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h pqsort.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h pqsort.h syncio.o: syncio.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h t_hash.o: t_hash.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h t_list.o: t_list.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h t_set.o: t_set.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h t_string.o: t_string.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h + ../deps/lua/src/luaconf.h ae.h 
sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h t_zset.o: t_zset.c redis.h fmacros.h config.h ../deps/lua/src/lua.h \ - ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ - ziplist.h intset.h version.h util.h rdb.h rio.h -util.o: util.c fmacros.h util.h -ziplist.o: ziplist.c zmalloc.h util.h ziplist.h endianconv.h config.h + ../deps/lua/src/luaconf.h ae.h sds.h dict.h adlist.h zmalloc.h anet.h \ + ziplist.h intset.h version.h util.h rdb.h rio.h +util.o: util.c fmacros.h util.h sds.h +ziplist.o: ziplist.c zmalloc.h util.h sds.h ziplist.h endianconv.h \ + config.h redisassert.h zipmap.o: zipmap.c zmalloc.h endianconv.h config.h zmalloc.o: zmalloc.c config.h zmalloc.h diff --git a/src/cluster.c b/src/cluster.c index 23d4196d..9c0d3e40 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -29,6 +29,7 @@ */ #include "redis.h" +#include "cluster.h" #include "endianconv.h" #include @@ -38,6 +39,8 @@ #include #include +clusterNode *createClusterNode(char *nodename, int flags); +int clusterAddNode(clusterNode *node); void clusterAcceptHandler(aeEventLoop *el, int fd, void *privdata, int mask); void clusterReadHandler(aeEventLoop *el, int fd, void *privdata, int mask); void clusterSendPing(clusterLink *link, int type); diff --git a/src/cluster.h b/src/cluster.h new file mode 100644 index 00000000..d46b105f --- /dev/null +++ b/src/cluster.h @@ -0,0 +1,181 @@ +#ifndef __REDIS_CLUSTER_H +#define __REDIS_CLUSTER_H + +/*----------------------------------------------------------------------------- + * Redis cluster data structures, defines, exported API. 
+ *----------------------------------------------------------------------------*/ + +#define REDIS_CLUSTER_SLOTS 16384 +#define REDIS_CLUSTER_OK 0 /* Everything looks ok */ +#define REDIS_CLUSTER_FAIL 1 /* The cluster can't work */ +#define REDIS_CLUSTER_NAMELEN 40 /* sha1 hex length */ +#define REDIS_CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ +#define REDIS_CLUSTER_IPLEN INET6_ADDRSTRLEN /* IPv6 address string length */ + +/* The following defines are amounts of time, sometimes expressed as + * multipliers of the node timeout value (when ending with MULT). */ +#define REDIS_CLUSTER_DEFAULT_NODE_TIMEOUT 15 +#define REDIS_CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. */ +#define REDIS_CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ +#define REDIS_CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */ +#define REDIS_CLUSTER_SLAVE_VALIDITY_MULT 10 /* Slave data validity. */ +#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT 4 /* Auth request retry time. */ +#define REDIS_CLUSTER_FAILOVER_DELAY 5 /* Seconds */ + +struct clusterNode; + +/* clusterLink encapsulates everything needed to talk with a remote node. */ +typedef struct clusterLink { + time_t ctime; /* Link creation time */ + int fd; /* TCP socket file descriptor */ + sds sndbuf; /* Packet send buffer */ + sds rcvbuf; /* Packet reception buffer */ + struct clusterNode *node; /* Node related to this link if any, or NULL */ +} clusterLink; + +/* Node flags */ +#define REDIS_NODE_MASTER 1 /* The node is a master */ +#define REDIS_NODE_SLAVE 2 /* The node is a slave */ +#define REDIS_NODE_PFAIL 4 /* Failure? 
Need acknowledge */ +#define REDIS_NODE_FAIL 8 /* The node is believed to be malfunctioning */ +#define REDIS_NODE_MYSELF 16 /* This node is myself */ +#define REDIS_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */ +#define REDIS_NODE_NOADDR 64 /* We don't know the address of this node */ +#define REDIS_NODE_MEET 128 /* Send a MEET message to this node */ +#define REDIS_NODE_PROMOTED 256 /* Master was a slave promoted by failover */ +#define REDIS_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" + +/* This structure represents elements of node->fail_reports. */ +struct clusterNodeFailReport { + struct clusterNode *node; /* Node reporting the failure condition. */ + time_t time; /* Time of the last report from this node. */ +} typedef clusterNodeFailReport; + +struct clusterNode { + time_t ctime; /* Node object creation time. */ + char name[REDIS_CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ + int flags; /* REDIS_NODE_... 
*/ + uint64_t configEpoch; /* Last configEpoch observed for this node */ + unsigned char slots[REDIS_CLUSTER_SLOTS/8]; /* slots handled by this node */ + int numslots; /* Number of slots handled by this node */ + int numslaves; /* Number of slave nodes, if this is a master */ + struct clusterNode **slaves; /* pointers to slave nodes */ + struct clusterNode *slaveof; /* pointer to the master node */ + time_t ping_sent; /* Unix time we sent latest ping */ + time_t pong_received; /* Unix time we received the pong */ + time_t fail_time; /* Unix time when FAIL flag was set */ + time_t voted_time; /* Last time we voted for a slave of this master */ + char ip[REDIS_IP_STR_LEN]; /* Latest known IP address of this node */ + int port; /* Latest known port of this node */ + clusterLink *link; /* TCP/IP link with this node */ + list *fail_reports; /* List of nodes signaling this as failing */ +}; +typedef struct clusterNode clusterNode; + +typedef struct clusterState { + clusterNode *myself; /* This node */ + uint64_t currentEpoch; + int state; /* REDIS_CLUSTER_OK, REDIS_CLUSTER_FAIL, ... */ + int size; /* Num of master nodes with at least one slot */ + dict *nodes; /* Hash table of name -> clusterNode structures */ + clusterNode *migrating_slots_to[REDIS_CLUSTER_SLOTS]; + clusterNode *importing_slots_from[REDIS_CLUSTER_SLOTS]; + clusterNode *slots[REDIS_CLUSTER_SLOTS]; + zskiplist *slots_to_keys; + /* The following fields are used to take the slave state on elections. */ + mstime_t failover_auth_time;/* Time at which we'll try to get elected in ms*/ + int failover_auth_count; /* Number of votes received so far. */ + int failover_auth_sent; /* True if we already asked for votes. */ + uint64_t failover_auth_epoch; /* Epoch of the current election. */ + /* The following fields are used by masters to take state on elections. */ + uint64_t last_vote_epoch; /* Epoch of the last vote granted. */ + int todo_before_sleep; /* Things to do in clusterBeforeSleep(). 
*/ + long long stats_bus_messages_sent; /* Num of msg sent via cluster bus. */ + long long stats_bus_messages_received; /* Num of msg received via cluster bus. */ +} clusterState; + +/* clusterState todo_before_sleep flags. */ +#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0) +#define CLUSTER_TODO_UPDATE_STATE (1<<1) +#define CLUSTER_TODO_SAVE_CONFIG (1<<2) +#define CLUSTER_TODO_FSYNC_CONFIG (1<<3) + +/* Redis cluster messages header */ + +/* Note that the PING, PONG and MEET messages are actually the same exact + * kind of packet. PONG is the reply to ping, in the exact format as a PING, + * while MEET is a special PING that forces the receiver to add the sender + * as a node (if it is not already in the list). */ +#define CLUSTERMSG_TYPE_PING 0 /* Ping */ +#define CLUSTERMSG_TYPE_PONG 1 /* Pong (reply to Ping) */ +#define CLUSTERMSG_TYPE_MEET 2 /* Meet "let's join" message */ +#define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */ +#define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */ +#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */ +#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you can failover. */ + +/* Initially we don't know our "name", but we'll find it once we connect + * to the first node, using the getsockname() function. Then we'll use this + * address for all the next messages. */ +typedef struct { + char nodename[REDIS_CLUSTER_NAMELEN]; + uint32_t ping_sent; + uint32_t pong_received; + char ip[16]; /* IP address last time it was seen */ + uint16_t port; /* port last time it was seen */ + uint16_t flags; + uint32_t notused; /* for 64 bit alignment */ +} clusterMsgDataGossip; + +typedef struct { + char nodename[REDIS_CLUSTER_NAMELEN]; +} clusterMsgDataFail; + +typedef struct { + uint32_t channel_len; + uint32_t message_len; + unsigned char bulk_data[8]; /* defined as 8 just for alignment concerns. 
*/ +} clusterMsgDataPublish; + +union clusterMsgData { + /* PING, MEET and PONG */ + struct { + /* Array of N clusterMsgDataGossip structures */ + clusterMsgDataGossip gossip[1]; + } ping; + + /* FAIL */ + struct { + clusterMsgDataFail about; + } fail; + + /* PUBLISH */ + struct { + clusterMsgDataPublish msg; + } publish; +}; + +typedef struct { + uint32_t totlen; /* Total length of this message */ + uint16_t type; /* Message type */ + uint16_t count; /* Only used for some kinds of messages. */ + uint64_t currentEpoch; /* The epoch according to the sending node. */ + uint64_t configEpoch; /* The config epoch if it's a master, or the last epoch + advertised by its master if it is a slave. */ + char sender[REDIS_CLUSTER_NAMELEN]; /* Name of the sender node */ + unsigned char myslots[REDIS_CLUSTER_SLOTS/8]; + char slaveof[REDIS_CLUSTER_NAMELEN]; + char notused1[32]; /* 32 bytes reserved for future usage. */ + uint16_t port; /* Sender TCP base port */ + uint16_t flags; /* Sender node flags */ + unsigned char state; /* Cluster state from the POV of the sender */ + unsigned char notused2[3]; /* Reserved for future use. For alignment. 
*/ + union clusterMsgData data; +} clusterMsg; + +#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData)) + +/* ----------------------- API exported outside cluster.c ------------------------- */ +clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); + +#endif /* __REDIS_CLUSTER_H */ diff --git a/src/config.c b/src/config.c index 2707bfae..64f836c7 100644 --- a/src/config.c +++ b/src/config.c @@ -29,6 +29,7 @@ */ #include "redis.h" +#include "cluster.h" #include #include diff --git a/src/db.c b/src/db.c index 02f8dd3a..9c0349bd 100644 --- a/src/db.c +++ b/src/db.c @@ -28,6 +28,7 @@ */ #include "redis.h" +#include "cluster.h" #include #include diff --git a/src/redis.c b/src/redis.c index bd547cd3..bc75b1c9 100644 --- a/src/redis.c +++ b/src/redis.c @@ -28,6 +28,7 @@ */ #include "redis.h" +#include "cluster.h" #include "slowlog.h" #include "bio.h" diff --git a/src/redis.h b/src/redis.h index 94decca9..88792f77 100644 --- a/src/redis.h +++ b/src/redis.h @@ -565,184 +565,12 @@ typedef struct redisOpArray { int numops; } redisOpArray; -/*----------------------------------------------------------------------------- - * Redis cluster data structures - *----------------------------------------------------------------------------*/ - -#define REDIS_CLUSTER_SLOTS 16384 -#define REDIS_CLUSTER_OK 0 /* Everything looks ok */ -#define REDIS_CLUSTER_FAIL 1 /* The cluster can't work */ -#define REDIS_CLUSTER_NAMELEN 40 /* sha1 hex length */ -#define REDIS_CLUSTER_PORT_INCR 10000 /* Cluster port = baseport + PORT_INCR */ -#define REDIS_CLUSTER_IPLEN INET6_ADDRSTRLEN /* IPv6 address string length */ - -/* The following defines are amunt of time, sometimes expressed as - * multiplicators of the node timeout value (when ending with MULT). */ -#define REDIS_CLUSTER_DEFAULT_NODE_TIMEOUT 15 -#define REDIS_CLUSTER_FAIL_REPORT_VALIDITY_MULT 2 /* Fail report validity. 
*/ -#define REDIS_CLUSTER_FAIL_UNDO_TIME_MULT 2 /* Undo fail if master is back. */ -#define REDIS_CLUSTER_FAIL_UNDO_TIME_ADD 10 /* Some additional time. */ -#define REDIS_CLUSTER_SLAVE_VALIDITY_MULT 10 /* Slave data validity. */ -#define REDIS_CLUSTER_FAILOVER_AUTH_RETRY_MULT 4 /* Auth request retry time. */ -#define REDIS_CLUSTER_FAILOVER_DELAY 5 /* Seconds */ - -struct clusterNode; - -/* clusterLink encapsulates everything needed to talk with a remote node. */ -typedef struct clusterLink { - time_t ctime; /* Link creation time */ - int fd; /* TCP socket file descriptor */ - sds sndbuf; /* Packet send buffer */ - sds rcvbuf; /* Packet reception buffer */ - struct clusterNode *node; /* Node related to this link if any, or NULL */ -} clusterLink; - -/* Node flags */ -#define REDIS_NODE_MASTER 1 /* The node is a master */ -#define REDIS_NODE_SLAVE 2 /* The node is a slave */ -#define REDIS_NODE_PFAIL 4 /* Failure? Need acknowledge */ -#define REDIS_NODE_FAIL 8 /* The node is believed to be malfunctioning */ -#define REDIS_NODE_MYSELF 16 /* This node is myself */ -#define REDIS_NODE_HANDSHAKE 32 /* We have still to exchange the first ping */ -#define REDIS_NODE_NOADDR 64 /* We don't know the address of this node */ -#define REDIS_NODE_MEET 128 /* Send a MEET message to this node */ -#define REDIS_NODE_PROMOTED 256 /* Master was a slave propoted by failover */ -#define REDIS_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" - -/* This structure represent elements of node->fail_reports. */ -struct clusterNodeFailReport { - struct clusterNode *node; /* Node reporting the failure condition. */ - time_t time; /* Time of the last report from this node. */ -} typedef clusterNodeFailReport; - -struct clusterNode { - time_t ctime; /* Node object creation time. 
*/ - char name[REDIS_CLUSTER_NAMELEN]; /* Node name, hex string, sha1-size */ - int flags; /* REDIS_NODE_... */ - uint64_t configEpoch; /* Last configEpoch observed for this node */ - unsigned char slots[REDIS_CLUSTER_SLOTS/8]; /* slots handled by this node */ - int numslots; /* Number of slots handled by this node */ - int numslaves; /* Number of slave nodes, if this is a master */ - struct clusterNode **slaves; /* pointers to slave nodes */ - struct clusterNode *slaveof; /* pointer to the master node */ - time_t ping_sent; /* Unix time we sent latest ping */ - time_t pong_received; /* Unix time we received the pong */ - time_t fail_time; /* Unix time when FAIL flag was set */ - time_t voted_time; /* Last time we voted for a slave of this master */ - char ip[REDIS_IP_STR_LEN]; /* Latest known IP address of this node */ - int port; /* Latest known port of this node */ - clusterLink *link; /* TCP/IP link with this node */ - list *fail_reports; /* List of nodes signaling this as failing */ -}; -typedef struct clusterNode clusterNode; - -typedef struct { - clusterNode *myself; /* This node */ - uint64_t currentEpoch; - int state; /* REDIS_CLUSTER_OK, REDIS_CLUSTER_FAIL, ... */ - int size; /* Num of master nodes with at least one slot */ - dict *nodes; /* Hash table of name -> clusterNode structures */ - clusterNode *migrating_slots_to[REDIS_CLUSTER_SLOTS]; - clusterNode *importing_slots_from[REDIS_CLUSTER_SLOTS]; - clusterNode *slots[REDIS_CLUSTER_SLOTS]; - zskiplist *slots_to_keys; - /* The following fields are used to take the slave state on elections. */ - mstime_t failover_auth_time;/* Time at which we'll try to get elected in ms*/ - int failover_auth_count; /* Number of votes received so far. */ - int failover_auth_sent; /* True if we already asked for votes. */ - uint64_t failover_auth_epoch; /* Epoch of the current election. */ - /* The followign fields are uesd by masters to take state on elections. 
*/ - uint64_t last_vote_epoch; /* Epoch of the last vote granted. */ - int todo_before_sleep; /* Things to do in clusterBeforeSleep(). */ - long long stats_bus_messages_sent; /* Num of msg sent via cluster bus. */ - long long stats_bus_messages_received; /* Num of msg received via cluster bus. */ -} clusterState; - -/* clusterState todo_before_sleep flags. */ -#define CLUSTER_TODO_HANDLE_FAILOVER (1<<0) -#define CLUSTER_TODO_UPDATE_STATE (1<<1) -#define CLUSTER_TODO_SAVE_CONFIG (1<<2) -#define CLUSTER_TODO_FSYNC_CONFIG (1<<3) - -/* Redis cluster messages header */ - -/* Note that the PING, PONG and MEET messages are actually the same exact - * kind of packet. PONG is the reply to ping, in the exact format as a PING, - * while MEET is a special PING that forces the receiver to add the sender - * as a node (if it is not already in the list). */ -#define CLUSTERMSG_TYPE_PING 0 /* Ping */ -#define CLUSTERMSG_TYPE_PONG 1 /* Pong (reply to Ping) */ -#define CLUSTERMSG_TYPE_MEET 2 /* Meet "let's join" message */ -#define CLUSTERMSG_TYPE_FAIL 3 /* Mark node xxx as failing */ -#define CLUSTERMSG_TYPE_PUBLISH 4 /* Pub/Sub Publish propagation */ -#define CLUSTERMSG_TYPE_FAILOVER_AUTH_REQUEST 5 /* May I failover? */ -#define CLUSTERMSG_TYPE_FAILOVER_AUTH_ACK 6 /* Yes, you can failover. */ - -/* Initially we don't know our "name", but we'll find it once we connect - * to the first node, using the getsockname() function. Then we'll use this - * address for all the next messages. 
*/ -typedef struct { - char nodename[REDIS_CLUSTER_NAMELEN]; - uint32_t ping_sent; - uint32_t pong_received; - char ip[16]; /* IP address last time it was seen */ - uint16_t port; /* port last time it was seen */ - uint16_t flags; - uint32_t notused; /* for 64 bit alignment */ -} clusterMsgDataGossip; - -typedef struct { - char nodename[REDIS_CLUSTER_NAMELEN]; -} clusterMsgDataFail; - -typedef struct { - uint32_t channel_len; - uint32_t message_len; - unsigned char bulk_data[8]; /* defined as 8 just for alignment concerns. */ -} clusterMsgDataPublish; - -union clusterMsgData { - /* PING, MEET and PONG */ - struct { - /* Array of N clusterMsgDataGossip structures */ - clusterMsgDataGossip gossip[1]; - } ping; - - /* FAIL */ - struct { - clusterMsgDataFail about; - } fail; - - /* PUBLISH */ - struct { - clusterMsgDataPublish msg; - } publish; -}; - -typedef struct { - uint32_t totlen; /* Total length of this message */ - uint16_t type; /* Message type */ - uint16_t count; /* Only used for some kind of messages. */ - uint64_t currentEpoch; /* The epoch accordingly to the sending node. */ - uint64_t configEpoch; /* The config epoch if it's a master, or the last epoch - advertised by its master if it is a slave. */ - char sender[REDIS_CLUSTER_NAMELEN]; /* Name of the sender node */ - unsigned char myslots[REDIS_CLUSTER_SLOTS/8]; - char slaveof[REDIS_CLUSTER_NAMELEN]; - char notused1[32]; /* 32 bytes reserved for future usage. */ - uint16_t port; /* Sender TCP base port */ - uint16_t flags; /* Sender node flags */ - unsigned char state; /* Cluster state from the POV of the sender */ - unsigned char notused2[3]; /* Reserved for future use. For alignment. 
*/ - union clusterMsgData data; -} clusterMsg; - -#define CLUSTERMSG_MIN_LEN (sizeof(clusterMsg)-sizeof(union clusterMsgData)) - /*----------------------------------------------------------------------------- * Global server state *----------------------------------------------------------------------------*/ +struct clusterState; + struct redisServer { /* General */ char *configfile; /* Absolute config file path, or NULL */ @@ -942,7 +770,7 @@ struct redisServer { int cluster_enabled; /* Is cluster enabled? */ int cluster_node_timeout; /* Cluster node timeout. */ char *cluster_configfile; /* Cluster auto-generated config file name. */ - clusterState *cluster; /* State of the cluster */ + struct clusterState *cluster; /* State of the cluster */ /* Scripting */ lua_State *lua; /* The Lua interpreter. We use just one for all clients */ redisClient *lua_client; /* The "fake client" to query Redis from Lua */ @@ -1380,10 +1208,7 @@ int *zunionInterGetKeys(struct redisCommand *cmd,robj **argv, int argc, int *num void clusterInit(void); unsigned short crc16(const char *buf, int len); unsigned int keyHashSlot(char *key, int keylen); -clusterNode *createClusterNode(char *nodename, int flags); -int clusterAddNode(clusterNode *node); void clusterCron(void); -clusterNode *getNodeByQuery(redisClient *c, struct redisCommand *cmd, robj **argv, int argc, int *hashslot, int *ask); void clusterPropagatePublish(robj *channel, robj *message); void migrateCloseTimedoutSockets(void); void clusterBeforeSleep(void);