From 22bcee5178c68255d1ba7da908405161c4ffbdf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Fievet?= <_@sebastien-fievet.fr> Date: Wed, 26 Jul 2017 14:11:05 +0200 Subject: [PATCH 001/102] Fix some typos --- redis.conf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/redis.conf b/redis.conf index c54dba39..7afd35a4 100644 --- a/redis.conf +++ b/redis.conf @@ -606,7 +606,7 @@ slave-priority 100 # deletion of the object. It means that the server stops processing new commands # in order to reclaim all the memory associated with an object in a synchronous # way. If the key deleted is associated with a small object, the time needed -# in order to execute th DEL command is very small and comparable to most other +# in order to execute the DEL command is very small and comparable to most other # O(1) or O(log_N) commands in Redis. However if the key is associated with an # aggregated value containing millions of elements, the server can block for # a long time (even seconds) in order to complete the operation. @@ -621,7 +621,7 @@ slave-priority 100 # It's up to the design of the application to understand when it is a good # idea to use one or the other. However the Redis server sometimes has to # delete keys or flush the whole database as a side effect of other operations. -# Specifically Redis deletes objects independently of an user call in the +# Specifically Redis deletes objects independently of a user call in the # following scenarios: # # 1) On eviction, because of the maxmemory and maxmemory policy configurations, @@ -914,7 +914,7 @@ lua-time-limit 5000 # Docker and other containers). # # In order to make Redis Cluster working in such environments, a static -# configuration where each node known its public address is needed. The +# configuration where each node knows its public address is needed. 
The # following two options are used for this scope, and are: # # * cluster-announce-ip From 005d9fa8615bfffb550c62f94a88056e06858e74 Mon Sep 17 00:00:00 2001 From: Bo Cai Date: Wed, 26 Jul 2017 21:24:28 +0800 Subject: [PATCH 002/102] redis-cli.c typo: helpe -> helper. Signed-off-by: Bo Cai --- src/redis-cli.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/redis-cli.c b/src/redis-cli.c index 61068483..524d879e 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -624,7 +624,7 @@ int isColorTerm(void) { return t != NULL && strstr(t,"xterm") != NULL; } -/* Helpe function for sdsCatColorizedLdbReply() appending colorize strings +/* Helper function for sdsCatColorizedLdbReply() appending colorize strings * to an SDS string. */ sds sdscatcolor(sds o, char *s, size_t len, char *color) { if (!isColorTerm()) return sdscatlen(o,s,len); From 00954f4d48f1e2fa42dc2a4ae6090ec083996c11 Mon Sep 17 00:00:00 2001 From: Bo Cai Date: Wed, 26 Jul 2017 21:33:29 +0800 Subject: [PATCH 003/102] redis-cli.c typo: Requets -> Requests. Signed-off-by: Bo Cai --- src/redis-cli.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/redis-cli.c b/src/redis-cli.c index 61068483..d4d9d631 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -2364,7 +2364,7 @@ static void statMode(void) { sprintf(buf,"%ld",aux); printf("%-8s",buf); - /* Requets */ + /* Requests */ aux = getLongInfoField(reply->str,"total_commands_processed"); sprintf(buf,"%ld (+%ld)",aux,requests == 0 ? 
0 : aux-requests); printf("%-19s",buf); From 2e6f28500941d7a6f24f3831e9cabcb576d290fd Mon Sep 17 00:00:00 2001 From: Shaun Webb Date: Thu, 27 Jul 2017 09:37:37 +0900 Subject: [PATCH 004/102] Fix typo --- src/cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster.c b/src/cluster.c index a9fedce0..4e013313 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -650,7 +650,7 @@ unsigned int keyHashSlot(char *key, int keylen) { for (e = s+1; e < keylen; e++) if (key[e] == '}') break; - /* No '}' or nothing betweeen {} ? Hash the whole key. */ + /* No '}' or nothing between {} ? Hash the whole key. */ if (e == keylen || e == s+1) return crc16(key,keylen) & 0x3FFF; /* If we are here there is both a { and a } on its right. Hash From f06f10e66bfe5afdb013798e02c6aec7937435a5 Mon Sep 17 00:00:00 2001 From: Shaun Webb Date: Thu, 27 Jul 2017 15:27:46 +0900 Subject: [PATCH 005/102] Typo fix --- redis.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/redis.conf b/redis.conf index c54dba39..249399e9 100644 --- a/redis.conf +++ b/redis.conf @@ -59,7 +59,7 @@ # internet, binding to all the interfaces is dangerous and will expose the # instance to everybody on the internet. So by default we uncomment the # following bind directive, that will force Redis to listen only into -# the IPv4 lookback interface address (this means Redis will be able to +# the IPv4 loopback interface address (this means Redis will be able to # accept connections only from clients running into the same computer it # is running). # From 0c6ea46f2bfc0d2ee6ef7b2e5e04ce6502d9fa27 Mon Sep 17 00:00:00 2001 From: Felix Krause Date: Fri, 28 Jul 2017 13:04:52 -0400 Subject: [PATCH 006/102] Update link to https and use inline link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 70a15790..42ab4785 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -This README is just a fast *quick start* document. 
You can find more detailed documentation at http://redis.io. +This README is just a fast *quick start* document. You can find more detailed documentation at [redis.io](https://redis.io). What is Redis? -------------- From 447b373fc9d8d7cd3f66de42f7a809fb40941bfc Mon Sep 17 00:00:00 2001 From: "jeesyn.liu" Date: Tue, 8 Aug 2017 17:45:51 +0800 Subject: [PATCH 007/102] fix a typo --- src/anet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anet.c b/src/anet.c index 53a56b0d..e9530398 100644 --- a/src/anet.c +++ b/src/anet.c @@ -237,7 +237,7 @@ int anetResolveIP(char *err, char *host, char *ipbuf, size_t ipbuf_len) { static int anetSetReuseAddr(char *err, int fd) { int yes = 1; - /* Make sure connection-intensive things like the redis benckmark + /* Make sure connection-intensive things like the redis benchmark * will be able to close/open sockets a zillion of times */ if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) == -1) { anetSetError(err, "setsockopt SO_REUSEADDR: %s", strerror(errno)); From 6b9f02ac1263b21675ecd4a05cc12d3da41e8a0f Mon Sep 17 00:00:00 2001 From: Chris Lamb Date: Sat, 12 Aug 2017 22:21:03 -0700 Subject: [PATCH 008/102] Correct spelling of "faield". --- tests/instances.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/instances.tcl b/tests/instances.tcl index 2ba67ac1..357b3481 100644 --- a/tests/instances.tcl +++ b/tests/instances.tcl @@ -318,7 +318,7 @@ proc end_tests {} { puts "GOOD! No errors." exit 0 } else { - puts "WARNING $::failed tests faield." + puts "WARNING $::failed test(s) failed." 
exit 1 } } From 6eb996540c662832370ddaac25484c4cbbca4e7a Mon Sep 17 00:00:00 2001 From: rouzier Date: Fri, 13 Oct 2017 13:20:45 -0400 Subject: [PATCH 009/102] Fix file descriptor leak and error handling --- src/redis-check-rdb.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/redis-check-rdb.c b/src/redis-check-rdb.c index 4027536e..71ac50d0 100644 --- a/src/redis-check-rdb.c +++ b/src/redis-check-rdb.c @@ -193,12 +193,12 @@ int redis_check_rdb(char *rdbfilename, FILE *fp) { buf[9] = '\0'; if (memcmp(buf,"REDIS",5) != 0) { rdbCheckError("Wrong signature trying to load DB from file"); - return 1; + goto err; } rdbver = atoi(buf+5); if (rdbver < 1 || rdbver > RDB_VERSION) { rdbCheckError("Can't handle RDB format version %d",rdbver); - return 1; + goto err; } startLoading(fp); @@ -270,7 +270,7 @@ int redis_check_rdb(char *rdbfilename, FILE *fp) { } else { if (!rdbIsObjectType(type)) { rdbCheckError("Invalid object type: %d", type); - return 1; + goto err; } rdbstate.key_type = type; } @@ -307,6 +307,7 @@ int redis_check_rdb(char *rdbfilename, FILE *fp) { rdbCheckInfo("RDB file was saved with checksum disabled: no check performed."); } else if (cksum != expected) { rdbCheckError("RDB CRC error"); + goto err; } else { rdbCheckInfo("Checksum OK"); } @@ -321,6 +322,8 @@ eoferr: /* unexpected end of file is handled here with a fatal exit */ } else { rdbCheckError("Unexpected EOF reading RDB file"); } +err: + if (closefile) fclose(fp); return 1; } From 62689ef0cf3c805b100ff5260485368e1c9b683c Mon Sep 17 00:00:00 2001 From: David Carlier Date: Sun, 19 Nov 2017 16:23:42 +0000 Subject: [PATCH 010/102] Fix undefined behavior constant defined. 
--- src/lzfP.h | 6 +++++- src/setproctitle.c | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/lzfP.h b/src/lzfP.h index c6d2e096..93c27b42 100644 --- a/src/lzfP.h +++ b/src/lzfP.h @@ -79,7 +79,11 @@ * Unconditionally aligning does not cost very much, so do it if unsure */ #ifndef STRICT_ALIGN -# define STRICT_ALIGN !(defined(__i386) || defined (__amd64)) +# if !(defined(__i386) || defined (__amd64)) +# define STRICT_ALIGN 1 +# else +# define STRICT_ALIGN 0 +# endif #endif /* diff --git a/src/setproctitle.c b/src/setproctitle.c index f44253e1..6563242d 100644 --- a/src/setproctitle.c +++ b/src/setproctitle.c @@ -39,7 +39,11 @@ #include /* errno program_invocation_name program_invocation_short_name */ #if !defined(HAVE_SETPROCTITLE) -#define HAVE_SETPROCTITLE (defined __NetBSD__ || defined __FreeBSD__ || defined __OpenBSD__) +#if (defined __NetBSD__ || defined __FreeBSD__ || defined __OpenBSD__) +#define HAVE_SETPROCTITLE 1 +#else +#define HAVE_SETPROCTITLE 0 +#endif #endif From dfc42ec4471ea3f90c73c123b655e4a10c0e922e Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Fri, 13 Oct 2017 10:52:10 +0800 Subject: [PATCH 011/102] LFU: fix the missing of config get and rewrite --- src/config.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/config.c b/src/config.c index 06d869be..50993666 100644 --- a/src/config.c +++ b/src/config.c @@ -330,13 +330,13 @@ void loadServerConfigFromString(char *config) { } } else if (!strcasecmp(argv[0],"lfu-log-factor") && argc == 2) { server.lfu_log_factor = atoi(argv[1]); - if (server.maxmemory_samples < 0) { + if (server.lfu_log_factor < 0) { err = "lfu-log-factor must be 0 or greater"; goto loaderr; } } else if (!strcasecmp(argv[0],"lfu-decay-time") && argc == 2) { server.lfu_decay_time = atoi(argv[1]); - if (server.maxmemory_samples < 1) { + if (server.lfu_decay_time < 0) { err = "lfu-decay-time must be 0 or greater"; goto loaderr; } @@ -1221,6 +1221,8 @@ void 
configGetCommand(client *c) { /* Numerical values */ config_get_numerical_field("maxmemory",server.maxmemory); config_get_numerical_field("maxmemory-samples",server.maxmemory_samples); + config_get_numerical_field("lfu-log-factor",server.lfu_log_factor); + config_get_numerical_field("lfu-decay-time",server.lfu_decay_time); config_get_numerical_field("timeout",server.maxidletime); config_get_numerical_field("active-defrag-threshold-lower",server.active_defrag_threshold_lower); config_get_numerical_field("active-defrag-threshold-upper",server.active_defrag_threshold_upper); @@ -1992,6 +1994,8 @@ int rewriteConfig(char *path) { rewriteConfigBytesOption(state,"maxmemory",server.maxmemory,CONFIG_DEFAULT_MAXMEMORY); rewriteConfigEnumOption(state,"maxmemory-policy",server.maxmemory_policy,maxmemory_policy_enum,CONFIG_DEFAULT_MAXMEMORY_POLICY); rewriteConfigNumericalOption(state,"maxmemory-samples",server.maxmemory_samples,CONFIG_DEFAULT_MAXMEMORY_SAMPLES); + rewriteConfigNumericalOption(state,"lfu-log-factor",server.lfu_log_factor,CONFIG_DEFAULT_LFU_LOG_FACTOR); + rewriteConfigNumericalOption(state,"lfu-decay-time",server.lfu_decay_time,CONFIG_DEFAULT_LFU_DECAY_TIME); rewriteConfigNumericalOption(state,"active-defrag-threshold-lower",server.active_defrag_threshold_lower,CONFIG_DEFAULT_DEFRAG_THRESHOLD_LOWER); rewriteConfigNumericalOption(state,"active-defrag-threshold-upper",server.active_defrag_threshold_upper,CONFIG_DEFAULT_DEFRAG_THRESHOLD_UPPER); rewriteConfigBytesOption(state,"active-defrag-ignore-bytes",server.active_defrag_ignore_bytes,CONFIG_DEFAULT_DEFRAG_IGNORE_BYTES); From 53cea97204ebc8d863ca99db4c9705ce0f87892f Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Fri, 13 Oct 2017 11:09:48 +0800 Subject: [PATCH 012/102] LFU: change lfu* parameters to int --- src/evict.c | 2 +- src/server.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/evict.c b/src/evict.c index 5ce5ca07..0a04ed1b 100644 --- a/src/evict.c +++ b/src/evict.c @@ -334,7 
+334,7 @@ uint8_t LFULogIncr(uint8_t counter) { unsigned long LFUDecrAndReturn(robj *o) { unsigned long ldt = o->lru >> 8; unsigned long counter = o->lru & 255; - if (LFUTimeElapsed(ldt) >= server.lfu_decay_time && counter) { + if (LFUTimeElapsed(ldt) >= (unsigned long)server.lfu_decay_time && counter) { if (counter > LFU_INIT_VAL*2) { counter /= 2; if (counter < LFU_INIT_VAL*2) counter = LFU_INIT_VAL*2; diff --git a/src/server.h b/src/server.h index e3b56075..aa04344c 100644 --- a/src/server.h +++ b/src/server.h @@ -1118,8 +1118,8 @@ struct redisServer { unsigned long long maxmemory; /* Max number of memory bytes to use */ int maxmemory_policy; /* Policy for key eviction */ int maxmemory_samples; /* Pricision of random sampling */ - unsigned int lfu_log_factor; /* LFU logarithmic counter factor. */ - unsigned int lfu_decay_time; /* LFU counter decay factor. */ + int lfu_log_factor; /* LFU logarithmic counter factor. */ + int lfu_decay_time; /* LFU counter decay factor. */ /* Blocked clients */ unsigned int bpop_blocked_clients; /* Number of clients blocked by lists */ list *unblocked_clients; /* list of clients to unblock before next loop */ From 583c31472577fb8175e17ee0ce243972f4dd8425 Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Sun, 15 Oct 2017 20:17:55 +0800 Subject: [PATCH 013/102] LFU: do some changes about LFU to find hotkeys Firstly, use access time to replace the decreas time of LFU. For function LFUDecrAndReturn, it should only try to get decremented counter, not update LFU fields, we will update it in an explicit way. And we will times halve the counter according to the times of elapsed time than server.lfu_decay_time. Everytime a key is accessed, we should update the LFU including update access time, and increment the counter after call function LFUDecrAndReturn. If a key is overwritten, the LFU should be also updated. 
Then we can use `OBJECT freq` command to get a key's frequence, and LFUDecrAndReturn should be called in `OBJECT freq` command in case of the key has not been accessed for a long time, because we update the access time only when the key is read or overwritten. --- src/db.c | 16 +++++++++++++--- src/evict.c | 31 ++++++++++++++++++------------- src/object.c | 8 ++++++-- src/server.h | 3 ++- 4 files changed, 39 insertions(+), 19 deletions(-) diff --git a/src/db.c b/src/db.c index 71c642d0..4d6999be 100644 --- a/src/db.c +++ b/src/db.c @@ -38,6 +38,15 @@ * C-level DB API *----------------------------------------------------------------------------*/ +/* Update LFU when an object is accessed. + * Firstly, decrement the counter if the decrement time is reached. + * Then logarithmically increment the counter, and update the access time. */ +void updateLFU(robj *val) { + unsigned long counter = LFUDecrAndReturn(val); + counter = LFULogIncr(counter); + val->lru = (LFUGetTimeInMinutes()<<8) | counter; +} + /* Low level key lookup API, not actually called directly from commands * implementations that should instead rely on lookupKeyRead(), * lookupKeyWrite() and lookupKeyReadWithFlags(). */ @@ -54,9 +63,7 @@ robj *lookupKey(redisDb *db, robj *key, int flags) { !(flags & LOOKUP_NOTOUCH)) { if (server.maxmemory_policy & MAXMEMORY_FLAG_LFU) { - unsigned long ldt = val->lru >> 8; - unsigned long counter = LFULogIncr(val->lru & 255); - val->lru = (ldt << 8) | counter; + updateLFU(val); } else { val->lru = LRU_CLOCK(); } @@ -180,6 +187,9 @@ void dbOverwrite(redisDb *db, robj *key, robj *val) { int saved_lru = old->lru; dictReplace(db->dict, key->ptr, val); val->lru = saved_lru; + /* LFU should be not only copied but also updated + * when a key is overwritten. 
*/ + updateLFU(val); } else { dictReplace(db->dict, key->ptr, val); } diff --git a/src/evict.c b/src/evict.c index 0a04ed1b..55b13212 100644 --- a/src/evict.c +++ b/src/evict.c @@ -60,8 +60,6 @@ struct evictionPoolEntry { static struct evictionPoolEntry *EvictionPoolLRU; -unsigned long LFUDecrAndReturn(robj *o); - /* ---------------------------------------------------------------------------- * Implementation of eviction, aging and LRU * --------------------------------------------------------------------------*/ @@ -302,8 +300,8 @@ unsigned long LFUGetTimeInMinutes(void) { return (server.unixtime/60) & 65535; } -/* Given an object last decrement time, compute the minimum number of minutes - * that elapsed since the last decrement. Handle overflow (ldt greater than +/* Given an object last access time, compute the minimum number of minutes + * that elapsed since the last access. Handle overflow (ldt greater than * the current 16 bits minutes time) considering the time as wrapping * exactly once. */ unsigned long LFUTimeElapsed(unsigned long ldt) { @@ -324,24 +322,31 @@ uint8_t LFULogIncr(uint8_t counter) { return counter; } -/* If the object decrement time is reached, decrement the LFU counter and - * update the decrement time field. Return the object frequency counter. +/* If the object decrement time is reached decrement the LFU counter but + * do not update LFU fields of the object, we update the access time + * and counter in an explicit way when the object is really accessed. + * And we will times halve the counter according to the times of + * elapsed time than server.lfu_decay_time. + * Return the object frequency counter. * * This function is used in order to scan the dataset for the best object * to fit: as we check for the candidate, we incrementally decrement the * counter of the scanned objects if needed. 
*/ -#define LFU_DECR_INTERVAL 1 unsigned long LFUDecrAndReturn(robj *o) { unsigned long ldt = o->lru >> 8; unsigned long counter = o->lru & 255; - if (LFUTimeElapsed(ldt) >= (unsigned long)server.lfu_decay_time && counter) { - if (counter > LFU_INIT_VAL*2) { - counter /= 2; - if (counter < LFU_INIT_VAL*2) counter = LFU_INIT_VAL*2; + long halve_times = server.lfu_decay_time ? LFUTimeElapsed(ldt) / server.lfu_decay_time : 0; + if (halve_times > 0 && counter) { + if (halve_times == 1) { + if (counter > LFU_INIT_VAL*2) { + counter /= 2; + if (counter < LFU_INIT_VAL*2) counter = LFU_INIT_VAL*2; + } else { + counter--; + } } else { - counter--; + counter = counter >> halve_times; } - o->lru = (LFUGetTimeInMinutes()<<8) | counter; } return counter; } diff --git a/src/object.c b/src/object.c index 8c33d7ef..d2f8d53c 100644 --- a/src/object.c +++ b/src/object.c @@ -1050,10 +1050,14 @@ void objectCommand(client *c) { if ((o = objectCommandLookupOrReply(c,c->argv[2],shared.nullbulk)) == NULL) return; if (!(server.maxmemory_policy & MAXMEMORY_FLAG_LFU)) { - addReplyError(c,"A non-LFU maxmemory policy is selected, access frequency not tracked. Please note that when switching between policies at runtime LRU and LFU data will take some time to adjust."); + addReplyError(c,"An LFU maxmemory policy is not selected, access frequency not tracked. Please note that when switching between policies at runtime LRU and LFU data will take some time to adjust."); return; } - addReplyLongLong(c,o->lru&255); + /* LFUDecrAndReturn should be called + * in case of the key has not been accessed for a long time, + * because we update the access time only + * when the key is read or overwritten. */ + addReplyLongLong(c,LFUDecrAndReturn(o)); } else { addReplyErrorFormat(c, "Unknown subcommand or wrong number of arguments for '%s'. 
Try OBJECT help", (char *)c->argv[1]->ptr); diff --git a/src/server.h b/src/server.h index aa04344c..9b7da1d3 100644 --- a/src/server.h +++ b/src/server.h @@ -586,7 +586,7 @@ typedef struct redisObject { unsigned encoding:4; unsigned lru:LRU_BITS; /* LRU time (relative to global lru_clock) or * LFU data (least significant 8 bits frequency - * and most significant 16 bits decreas time). */ + * and most significant 16 bits access time). */ int refcount; void *ptr; } robj; @@ -1802,6 +1802,7 @@ void evictionPoolAlloc(void); #define LFU_INIT_VAL 5 unsigned long LFUGetTimeInMinutes(void); uint8_t LFULogIncr(uint8_t value); +unsigned long LFUDecrAndReturn(robj *o); /* Keys hashing / comparison functions for dict.c hash tables. */ uint64_t dictSdsHash(const void *key); From 9f131c9a895fac9418cceb3a340627f2dac2162a Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Thu, 19 Oct 2017 14:04:39 +0800 Subject: [PATCH 014/102] LFU: add hotkeys option to redis-cli --- src/redis-cli.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/src/redis-cli.c b/src/redis-cli.c index 84eabf39..a6f8c113 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -107,6 +107,7 @@ static struct config { char *pattern; char *rdb_filename; int bigkeys; + int hotkeys; int stdinarg; /* get last arg from stdin. (-x option) */ char *auth; int output; /* output mode, see OUTPUT_* defines */ @@ -1129,6 +1130,8 @@ static int parseOptions(int argc, char **argv) { config.pipe_timeout = atoi(argv[++i]); } else if (!strcmp(argv[i],"--bigkeys")) { config.bigkeys = 1; + } else if (!strcmp(argv[i],"--hotkeys")) { + config.hotkeys = 1; } else if (!strcmp(argv[i],"--eval") && !lastarg) { config.eval = argv[++i]; } else if (!strcmp(argv[i],"--ldb")) { @@ -1229,6 +1232,8 @@ static void usage(void) { " no reply is received within seconds.\n" " Default timeout: %d. 
Use 0 to wait forever.\n" " --bigkeys Sample Redis keys looking for big keys.\n" +" --hotkeys Sample Redis keys looking for hot keys.\n" +" only works when maxmemory-policy is *lfu.\n" " --scan List all keys using the SCAN command.\n" " --pattern Useful with --scan to specify a SCAN pattern.\n" " --intrinsic-latency Run a test to measure intrinsic system latency.\n" @@ -2343,6 +2348,129 @@ static void findBigKeys(void) { exit(0); } +static void getKeyFreqs(redisReply *keys, unsigned long long *freqs) { + redisReply *reply; + unsigned int i; + + /* Pipeline OBJECT freq commands */ + for(i=0;ielements;i++) { + redisAppendCommand(context, "OBJECT freq %s", keys->element[i]->str); + } + + /* Retrieve freqs */ + for(i=0;ielements;i++) { + if(redisGetReply(context, (void**)&reply)!=REDIS_OK) { + fprintf(stderr, "Error getting freq for key '%s' (%d: %s)\n", + keys->element[i]->str, context->err, context->errstr); + exit(1); + } else if(reply->type != REDIS_REPLY_INTEGER) { + if(reply->type == REDIS_REPLY_ERROR) { + fprintf(stderr, "Error: %s\n", reply->str); + exit(1); + } else { + fprintf(stderr, "Warning: OBJECT freq on '%s' failed (may have been deleted)\n", keys->element[i]->str); + freqs[i] = 0; + } + } else { + freqs[i] = reply->integer; + } + freeReplyObject(reply); + } +} + +#define HOTKEYS_SAMPLE 16 +static void findHotKeys(void) { + redisReply *keys, *reply; + unsigned long long counters[HOTKEYS_SAMPLE] = {0}; + sds hotkeys[HOTKEYS_SAMPLE] = {NULL}; + unsigned long long sampled = 0, total_keys, *freqs = NULL, it = 0; + unsigned int arrsize = 0, i, k; + double pct; + + /* Total keys pre scanning */ + total_keys = getDbSize(); + + /* Status message */ + printf("\n# Scanning the entire keyspace to find hot keys as well as\n"); + printf("# average sizes per key type. 
You can use -i 0.1 to sleep 0.1 sec\n"); + printf("# per 100 SCAN commands (not usually needed).\n\n"); + + /* SCAN loop */ + do { + /* Calculate approximate percentage completion */ + pct = 100 * (double)sampled/total_keys; + + /* Grab some keys and point to the keys array */ + reply = sendScan(&it); + keys = reply->element[1]; + + /* Reallocate our freqs array if we need to */ + if(keys->elements > arrsize) { + freqs = zrealloc(freqs, sizeof(unsigned long long)*keys->elements); + + if(!freqs) { + fprintf(stderr, "Failed to allocate storage for keys!\n"); + exit(1); + } + + arrsize = keys->elements; + } + + getKeyFreqs(keys, freqs); + + /* Now update our stats */ + for(i=0;ielements;i++) { + sampled++; + /* Update overall progress */ + if(sampled % 1000000 == 0) { + printf("[%05.2f%%] Sampled %llu keys so far\n", pct, sampled); + } + + /* Use eviction pool here */ + k = 0; + while (k < HOTKEYS_SAMPLE && freqs[i] > counters[k]) k++; + if (k == 0) continue; + k--; + if (k == 0 || counters[k] == 0) { + sdsfree(hotkeys[k]); + } else { + sdsfree(hotkeys[0]); + memmove(counters,counters+1,sizeof(counters[0])*k); + memmove(hotkeys,hotkeys+1,sizeof(hotkeys[0])*k); + } + counters[k] = freqs[i]; + hotkeys[k] = sdsnew(keys->element[i]->str); + printf( + "[%05.2f%%] Hot key '%s' found so far with counter %llu\n", + pct, keys->element[i]->str, freqs[i]); + } + + /* Sleep if we've been directed to do so */ + if(sampled && (sampled %100) == 0 && config.interval) { + usleep(config.interval); + } + + freeReplyObject(reply); + } while(it != 0); + + if (freqs) zfree(freqs); + + /* We're done */ + printf("\n-------- summary -------\n\n"); + + printf("Sampled %llu keys in the keyspace!\n", sampled); + + for (i=1; i<= HOTKEYS_SAMPLE; i++) { + k = HOTKEYS_SAMPLE - i; + if(counters[k]>0) { + printf("hot key found with counter: %llu\tkeyname: %s\n", counters[k], hotkeys[k]); + sdsfree(hotkeys[k]); + } + } + + exit(0); +} + 
/*------------------------------------------------------------------------------ * Stats mode *--------------------------------------------------------------------------- */ @@ -2720,6 +2848,7 @@ int main(int argc, char **argv) { config.pipe_mode = 0; config.pipe_timeout = REDIS_CLI_DEFAULT_PIPE_TIMEOUT; config.bigkeys = 0; + config.hotkeys = 0; config.stdinarg = 0; config.auth = NULL; config.eval = NULL; @@ -2780,6 +2909,12 @@ int main(int argc, char **argv) { findBigKeys(); } + /* Find hot keys */ + if (config.hotkeys) { + if (cliConnect(0) == REDIS_ERR) exit(1); + findHotKeys(); + } + /* Stat mode */ if (config.stat_mode) { if (cliConnect(0) == REDIS_ERR) exit(1); From 06ca9d683920da19ad53532f8cd55b54584027bc Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 27 Nov 2017 19:04:58 +0100 Subject: [PATCH 015/102] LFU: Fix LFUDecrAndReturn() to just decrement. Splitting the popularity in half actually just needs decrementing the counter because the counter is logarithmic. --- src/evict.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/src/evict.c b/src/evict.c index 55b13212..bf485ddc 100644 --- a/src/evict.c +++ b/src/evict.c @@ -335,19 +335,9 @@ uint8_t LFULogIncr(uint8_t counter) { unsigned long LFUDecrAndReturn(robj *o) { unsigned long ldt = o->lru >> 8; unsigned long counter = o->lru & 255; - long halve_times = server.lfu_decay_time ? LFUTimeElapsed(ldt) / server.lfu_decay_time : 0; - if (halve_times > 0 && counter) { - if (halve_times == 1) { - if (counter > LFU_INIT_VAL*2) { - counter /= 2; - if (counter < LFU_INIT_VAL*2) counter = LFU_INIT_VAL*2; - } else { - counter--; - } - } else { - counter = counter >> halve_times; - } - } + unsigned long num_periods = server.lfu_decay_time ? LFUTimeElapsed(ldt) / server.lfu_decay_time : 0; + if (num_periods) + counter = (num_periods > counter) ? 
0 : counter - num_periods; return counter; } From 8c7f90e91e1e6324db8a1e9ca20e3dc95ce049d0 Mon Sep 17 00:00:00 2001 From: Itamar Haber Date: Tue, 28 Nov 2017 18:18:45 +0200 Subject: [PATCH 016/102] Standardizes arity handling of DEBUG --- src/debug.c | 5 ----- src/server.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/debug.c b/src/debug.c index 5c3fd347..a058737c 100644 --- a/src/debug.c +++ b/src/debug.c @@ -262,11 +262,6 @@ void computeDatasetDigest(unsigned char *final) { } void debugCommand(client *c) { - if (c->argc == 1) { - addReplyError(c,"You must specify a subcommand for DEBUG. Try DEBUG HELP for info."); - return; - } - if (!strcasecmp(c->argv[1]->ptr,"help")) { void *blenp = addDeferredMultiBulkLength(c); int blen = 0; diff --git a/src/server.c b/src/server.c index 6bc8bc66..30c5297a 100644 --- a/src/server.c +++ b/src/server.c @@ -258,7 +258,7 @@ struct redisCommand redisCommandTable[] = { {"persist",persistCommand,2,"wF",0,NULL,1,1,1,0,0}, {"slaveof",slaveofCommand,3,"ast",0,NULL,0,0,0,0,0}, {"role",roleCommand,1,"lst",0,NULL,0,0,0,0,0}, - {"debug",debugCommand,-1,"as",0,NULL,0,0,0,0,0}, + {"debug",debugCommand,-2,"as",0,NULL,0,0,0,0,0}, {"config",configCommand,-2,"lat",0,NULL,0,0,0,0,0}, {"subscribe",subscribeCommand,-2,"pslt",0,NULL,0,0,0,0,0}, {"unsubscribe",unsubscribeCommand,-1,"pslt",0,NULL,0,0,0,0,0}, From c44732ac58befabf0d6f72f2dae0005ffef7d3c4 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 28 Nov 2017 18:25:14 +0100 Subject: [PATCH 017/102] adlist: fix listJoin() in the case the second list is empty. See #4192, the original PR removed lines of code that are actually needed, so thanks to @chunqiulfq for reporting the problem, but merging solution from @jeesyn after checking, together with @artix75, that the logic covers all the cases. 
--- src/adlist.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/adlist.c b/src/adlist.c index e87d25ce..ec5f8bbf 100644 --- a/src/adlist.c +++ b/src/adlist.c @@ -353,7 +353,7 @@ void listJoin(list *l, list *o) { else l->head = o->head; - l->tail = o->tail; + if (o->tail) l->tail = o->tail; l->len += o->len; /* Setup other as an empty list. */ From 851e9fc48b5155de9fb47f0185b3523b59373a97 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 28 Nov 2017 18:38:49 +0100 Subject: [PATCH 018/102] t_hash.c: clarify calling two times the same function. --- src/t_hash.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/t_hash.c b/src/t_hash.c index 700a6233..be73932c 100644 --- a/src/t_hash.c +++ b/src/t_hash.c @@ -287,8 +287,8 @@ int hashTypeDelete(robj *o, sds field) { if (fptr != NULL) { fptr = ziplistFind(fptr, (unsigned char*)field, sdslen(field), 1); if (fptr != NULL) { - zl = ziplistDelete(zl,&fptr); - zl = ziplistDelete(zl,&fptr); + zl = ziplistDelete(zl,&fptr); /* Delete the key. */ + zl = ziplistDelete(zl,&fptr); /* Delete the value. 
*/ o->ptr = zl; deleted = 1; } From 43be967690d7b778cf829540b504a9662177511d Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Thu, 30 Nov 2017 18:11:05 +0800 Subject: [PATCH 019/102] networking: optimize unlinkClient() in freeClient() --- src/networking.c | 14 ++++++++++---- src/replication.c | 1 + src/server.h | 1 + 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/networking.c b/src/networking.c index aeaeca96..10e8b48e 100644 --- a/src/networking.c +++ b/src/networking.c @@ -135,7 +135,12 @@ client *createClient(int fd) { c->peerid = NULL; listSetFreeMethod(c->pubsub_patterns,decrRefCountVoid); listSetMatchMethod(c->pubsub_patterns,listMatchObjects); - if (fd != -1) listAddNodeTail(server.clients,c); + if (fd != -1) { + listAddNodeTail(server.clients,c); + c->client_list_node = listLast(server.clients); + } else { + c->client_list_node = NULL; + } initClientMultiState(c); return c; } @@ -743,9 +748,10 @@ void unlinkClient(client *c) { * fd is already set to -1. */ if (c->fd != -1) { /* Remove from the list of active clients. */ - ln = listSearchKey(server.clients,c); - serverAssert(ln != NULL); - listDelNode(server.clients,ln); + if (c->client_list_node) { + listDelNode(server.clients,c->client_list_node); + c->client_list_node = NULL; + } /* Unregister async I/O handlers and close the socket. */ aeDeleteFileEvent(server.el,c->fd,AE_READABLE); diff --git a/src/replication.c b/src/replication.c index cf4db3e3..1207e060 100644 --- a/src/replication.c +++ b/src/replication.c @@ -2206,6 +2206,7 @@ void replicationResurrectCachedMaster(int newfd) { /* Re-add to the list of clients. 
*/ listAddNodeTail(server.clients,server.master); + server.master->client_list_node = listLast(server.clients); if (aeCreateFileEvent(server.el, newfd, AE_READABLE, readQueryFromClient, server.master)) { serverLog(LL_WARNING,"Error resurrecting the cached master, impossible to add the readable handler: %s", strerror(errno)); diff --git a/src/server.h b/src/server.h index e3b56075..82cb9a7e 100644 --- a/src/server.h +++ b/src/server.h @@ -722,6 +722,7 @@ typedef struct client { dict *pubsub_channels; /* channels a client is interested in (SUBSCRIBE) */ list *pubsub_patterns; /* patterns a client is interested in (SUBSCRIBE) */ sds peerid; /* Cached peer ID. */ + listNode *client_list_node; /* list node in client list */ /* Response buffer */ int bufpos; From d8f8701032b8f87b998129e03dddf992baa803d2 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 30 Nov 2017 18:08:21 +0100 Subject: [PATCH 020/102] Be more verbose when DEBUG RESTART fails. --- src/server.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/server.c b/src/server.c index 6bc8bc66..e56b542a 100644 --- a/src/server.c +++ b/src/server.c @@ -1549,16 +1549,29 @@ int restartServer(int flags, mstime_t delay) { /* Check if we still have accesses to the executable that started this * server instance. */ - if (access(server.executable,X_OK) == -1) return C_ERR; + if (access(server.executable,X_OK) == -1) { + serverLog(LL_WARNING,"Can't restart: this process has no " + "permissions to execute %s", server.executable); + return C_ERR; + } /* Config rewriting. */ if (flags & RESTART_SERVER_CONFIG_REWRITE && server.configfile && - rewriteConfig(server.configfile) == -1) return C_ERR; + rewriteConfig(server.configfile) == -1) + { + serverLog(LL_WARNING,"Can't restart: configuration rewrite process " + "failed"); + return C_ERR; + } /* Perform a proper shutdown. 
*/ if (flags & RESTART_SERVER_GRACEFULLY && - prepareForShutdown(SHUTDOWN_NOFLAGS) != C_OK) return C_ERR; + prepareForShutdown(SHUTDOWN_NOFLAGS) != C_OK) + { + serverLog(LL_WARNING,"Can't restart: error preparing for shutdown"); + return C_ERR; + } /* Close all file descriptors, with the exception of stdin, stdout, strerr * which are useful if we restart a Redis server which is not daemonized. */ From 3b9be93fdab81e27d68814aa794807897055af0d Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 30 Nov 2017 18:30:06 +0100 Subject: [PATCH 021/102] Prevent corruption of server.executable after DEBUG RESTART. Doing the following ended with a broken server.executable: 1. Start Redis with src/redis-server 2. Send CONFIG SET DIR /tmp/ 3. Send DEBUG RESTART At this point we called execve with an argv[0] that is no longer related to the new path. So after the restart the absolute path of the executable is recomputed in the wrong way. With this fix we pass the absolute path already computed as argv[0]. --- src/server.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/server.c b/src/server.c index e56b542a..7498a25f 100644 --- a/src/server.c +++ b/src/server.c @@ -1583,6 +1583,8 @@ int restartServer(int flags, mstime_t delay) { /* Execute the server with the original command line. */ if (delay) usleep(delay*1000); + zfree(server.exec_argv[0]); + server.exec_argv[0] = zstrdup(server.executable); execve(server.executable,server.exec_argv,environ); /* If an error occurred here, there is nothing we can do, but exit. */ From 6fb04d46374d9b79452a9e66786a16974077248b Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 30 Nov 2017 18:37:07 +0100 Subject: [PATCH 022/102] Regression test: Slave restart with EVALSHA in backlog issue #4483. 
--- tests/integration/psync2.tcl | 65 +++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/tests/integration/psync2.tcl b/tests/integration/psync2.tcl index d91969e3..3d9e5527 100644 --- a/tests/integration/psync2.tcl +++ b/tests/integration/psync2.tcl @@ -10,7 +10,7 @@ start_server {} { # Config set debug_msg 0 ; # Enable additional debug messages - set no_exit 0; ; # Do not exit at end of the test + set no_exit 0 ; # Do not exit at end of the test set duration 20 ; # Total test seconds @@ -175,6 +175,69 @@ start_server {} { assert {$sync_count == $new_sync_count} } + test "PSYNC2: Slave RDB restart with EVALSHA in backlog issue #4483" { + # Pick a random slave + set slave_id [expr {($master_id+1)%5}] + set sync_count [status $R($master_id) sync_full] + + # Make sure to replicate the first EVAL while the salve is online + # so that it's part of the scripts the master believes it's safe + # to propagate as EVALSHA. + $R($master_id) EVAL {return redis.call("incr","__mycounter")} 0 + $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0 + + # Wait for the two to sync + wait_for_condition 50 1000 { + [$R($master_id) debug digest] == [$R($slave_id) debug digest] + } else { + fail "Slave not reconnecting" + } + + # Prevent the slave from receiving master updates, and at + # the same time send a new script several times to the + # master, so that we'll end with EVALSHA into the backlog. + $R($slave_id) slaveof 127.0.0.1 0 + + $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0 + $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0 + $R($master_id) EVALSHA e6e0b547500efcec21eddb619ac3724081afee89 0 + + catch { + $R($slave_id) config rewrite + $R($slave_id) debug restart + } + + # Reconfigure the slave correctly again, when it's back online. 
+ set retry 50 + while {$retry} { + if {[catch { + $R($slave_id) slaveof $master_host $master_port + }]} { + after 1000 + } else { + break + } + incr retry -1 + } + + # The master should be back at 4 slaves eventually + wait_for_condition 50 1000 { + [status $R($master_id) connected_slaves] == 4 + } else { + fail "Slave not reconnecting" + } + set new_sync_count [status $R($master_id) sync_full] + assert {$sync_count == $new_sync_count} + + # However if the slave started with the full state of the + # scripting engine, we should now have the same digest. + wait_for_condition 50 1000 { + [$R($master_id) debug digest] == [$R($slave_id) debug digest] + } else { + fail "Debug digest mismatch between master and slave in post-restart handshake" + } + } + if {$no_exit} { while 1 { puts -nonewline .; flush stdout; after 1000} } From f11a7585a8498689e8fd1afbcab4fdc2ba38c38f Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 29 Nov 2017 15:09:07 +0100 Subject: [PATCH 023/102] PSYNC2: Save Lua scripts state into RDB file. This is currently needed in order to fix #4483, but this can be useful in other contexts, so maybe later we may want to remove the conditionals and always save/load scripts. Note that we are using the "lua" AUX field here, in order to guarantee backward compatibility of the RDB file. The unknown AUX fields must be discarded by past versions of Redis. --- src/rdb.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ src/server.h | 1 + 2 files changed, 48 insertions(+) diff --git a/src/rdb.c b/src/rdb.c index 00106cac..d1495e79 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -943,6 +943,27 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) { } di = NULL; /* So that we don't release it again on error. */ + /* If we are storing the replication information on disk, persist + * the script cache as well: on successful PSYNC after a restart, we need + * to be able to process any EVALSHA inside the replication backlog the + * master will send us. 
*/ + if (rsi && dictSize(server.lua_scripts)) { + di = dictGetIterator(server.lua_scripts); + while((de = dictNext(di)) != NULL) { + sds sha = dictGetKey(de); + robj *body = dictGetVal(de); + /* Concatenate the SHA1 and the Lua script together. Because the + * SHA1 is fixed length, we will always be able to load it back + * telling apart the name from the body. */ + sds combo = sdsdup(sha); + combo = sdscatlen(combo,body->ptr,sdslen(body->ptr)); + if (rdbSaveAuxField(rdb,"lua",3,combo,sdslen(combo)) == -1) + goto werr; + sdsfree(combo); + } + dictReleaseIterator(di); + } + /* EOF opcode */ if (rdbSaveType(rdb,RDB_OPCODE_EOF) == -1) goto werr; @@ -1589,6 +1610,32 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) { } } else if (!strcasecmp(auxkey->ptr,"repl-offset")) { if (rsi) rsi->repl_offset = strtoll(auxval->ptr,NULL,10); + } else if (!strcasecmp(auxkey->ptr,"lua")) { + /* Load the string combining the function name and body + * back in memory. The format is basically: + * . To load it back we need + * to create the function name as "f_" and load the + * body as a Redis string object. */ + sds combo = auxval->ptr; + if (sdslen(combo) < 40) { + rdbExitReportCorruptRDB( + "Lua script stored into the RDB file has invalid " + "length < 40 bytes: '%s'", combo); + } + char funcname[42]; + funcname[0] = 'f'; + funcname[1] = '_'; + memcpy(funcname+2,combo,40); + robj *body = createRawStringObject(combo+40,sdslen(combo)-40); + + /* Register the function. */ + if (luaCreateFunction(NULL,server.lua,funcname,body) == C_ERR) { + rdbExitReportCorruptRDB( + "Can't load Lua script from RDB file! " + "Script SHA1: %.42s BODY: %s", + combo, combo+42); + } + decrRefCount(body); } else { /* We ignore fields we don't understand, as by AUX field * contract. 
*/ diff --git a/src/server.h b/src/server.h index 9b7da1d3..11eb36f3 100644 --- a/src/server.h +++ b/src/server.h @@ -1781,6 +1781,7 @@ void scriptingInit(int setup); int ldbRemoveChild(pid_t pid); void ldbKillForkedSessions(void); int ldbPendingChildren(void); +int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body); /* Blocked clients */ void processUnblockedClients(void); From 28dfdca7335721de53ab296d80f005d7a7d2aa8c Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 29 Nov 2017 16:21:14 +0100 Subject: [PATCH 024/102] PSYNC2: luaCreateFunction() should handle NULL client parameter. See #4483. This is needed because luaCreateFunction() is now called from RDB loading code outside a client context. --- src/scripting.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/scripting.c b/src/scripting.c index d9f95406..64de1edc 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -1160,16 +1160,21 @@ int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body) { funcdef = sdscatlen(funcdef,"\nend",4); if (luaL_loadbuffer(lua,funcdef,sdslen(funcdef),"@user_script")) { - addReplyErrorFormat(c,"Error compiling script (new function): %s\n", - lua_tostring(lua,-1)); + if (c != NULL) { + addReplyErrorFormat(c, + "Error compiling script (new function): %s\n", + lua_tostring(lua,-1)); + } lua_pop(lua,1); sdsfree(funcdef); return C_ERR; } sdsfree(funcdef); if (lua_pcall(lua,0,0,0)) { - addReplyErrorFormat(c,"Error running script (new function): %s\n", - lua_tostring(lua,-1)); + if (c != NULL) { + addReplyErrorFormat(c,"Error running script (new function): %s\n", + lua_tostring(lua,-1)); + } lua_pop(lua,1); return C_ERR; } @@ -1180,7 +1185,7 @@ int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body) { { int retval = dictAdd(server.lua_scripts, sdsnewlen(funcname+2,40),body); - serverAssertWithInfo(c,NULL,retval == DICT_OK); + serverAssertWithInfo(c ? 
c : server.lua_client,NULL,retval == DICT_OK); incrRefCount(body); } return C_OK; From 452ad2e928524cfe42856e869effd2d8b37ae280 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 29 Nov 2017 16:38:16 +0100 Subject: [PATCH 025/102] PSYNC2: just store script bodies into RDB. Related to #4483. As suggested by @soloestoy, we can retrieve the SHA1 from the body. Given that in the new implementation using AUX fields we ended copying around a lot to create new objects and strings, extremize such concept and trade CPU for space inside the RDB file. --- src/rdb.c | 34 ++++------------------------------ src/scripting.c | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 30 deletions(-) diff --git a/src/rdb.c b/src/rdb.c index d1495e79..19ba59ab 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -950,16 +950,9 @@ int rdbSaveRio(rio *rdb, int *error, int flags, rdbSaveInfo *rsi) { if (rsi && dictSize(server.lua_scripts)) { di = dictGetIterator(server.lua_scripts); while((de = dictNext(di)) != NULL) { - sds sha = dictGetKey(de); robj *body = dictGetVal(de); - /* Concatenate the SHA1 and the Lua script together. Because the - * SHA1 is fixed length, we will always be able to load it back - * telling apart the name from the body. */ - sds combo = sdsdup(sha); - combo = sdscatlen(combo,body->ptr,sdslen(body->ptr)); - if (rdbSaveAuxField(rdb,"lua",3,combo,sdslen(combo)) == -1) + if (rdbSaveAuxField(rdb,"lua",3,body->ptr,sdslen(body->ptr)) == -1) goto werr; - sdsfree(combo); } dictReleaseIterator(di); } @@ -1611,31 +1604,12 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) { } else if (!strcasecmp(auxkey->ptr,"repl-offset")) { if (rsi) rsi->repl_offset = strtoll(auxval->ptr,NULL,10); } else if (!strcasecmp(auxkey->ptr,"lua")) { - /* Load the string combining the function name and body - * back in memory. The format is basically: - * . To load it back we need - * to create the function name as "f_" and load the - * body as a Redis string object. 
*/ - sds combo = auxval->ptr; - if (sdslen(combo) < 40) { - rdbExitReportCorruptRDB( - "Lua script stored into the RDB file has invalid " - "length < 40 bytes: '%s'", combo); - } - char funcname[42]; - funcname[0] = 'f'; - funcname[1] = '_'; - memcpy(funcname+2,combo,40); - robj *body = createRawStringObject(combo+40,sdslen(combo)-40); - - /* Register the function. */ - if (luaCreateFunction(NULL,server.lua,funcname,body) == C_ERR) { + /* Load the script back in memory. */ + if (luaCreateFunction(NULL,server.lua,NULL,auxval) == C_ERR) { rdbExitReportCorruptRDB( "Can't load Lua script from RDB file! " - "Script SHA1: %.42s BODY: %s", - combo, combo+42); + "BODY: %s", auxval->ptr); } - decrRefCount(body); } else { /* We ignore fields we don't understand, as by AUX field * contract. */ diff --git a/src/scripting.c b/src/scripting.c index 64de1edc..1ef91a4d 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -1147,11 +1147,26 @@ int redis_math_randomseed (lua_State *L) { * * f_ * + * If 'funcname' is NULL, the function name is created by the function + * on the fly doing the SHA1 of the body, this means that passing the funcname + * is just an optimization in case it's already at hand. + * + * The function increments the reference count of the 'body' object as a + * side effect of a successful call. + * * On success C_OK is returned, and nothing is left on the Lua stack. * On error C_ERR is returned and an appropriate error is set in the * client context. 
*/ int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body) { sds funcdef = sdsempty(); + char fname[42]; + + if (funcname == NULL) { + fname[0] = 'f'; + fname[1] = '_'; + sha1hex(fname+2,body->ptr,sdslen(body->ptr)); + funcname = fname; + } funcdef = sdscat(funcdef,"function "); funcdef = sdscatlen(funcdef,funcname,42); From 045d65c3af460a71d2b89b84f5e0b85d98320a77 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 29 Nov 2017 17:11:46 +0100 Subject: [PATCH 026/102] PSYNC2: Fix off by one buffer size in luaCreateFunction(). --- src/scripting.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripting.c b/src/scripting.c index 1ef91a4d..848629e2 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -1159,7 +1159,7 @@ int redis_math_randomseed (lua_State *L) { * client context. */ int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body) { sds funcdef = sdsempty(); - char fname[42]; + char fname[43]; if (funcname == NULL) { fname[0] = 'f'; From 79866a6361829ed0602dedff9cb378c66977227a Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 30 Aug 2017 12:40:27 +0200 Subject: [PATCH 027/102] Streams: 12 commits squashed into the initial Streams implementation. 
--- src/Makefile | 2 +- src/listpack.c | 783 ++++++++++++++++++++++++++++++++++++++++++ src/listpack.h | 61 ++++ src/listpack_malloc.h | 44 +++ src/object.c | 7 + src/rax.c | 24 +- src/rax.h | 1 + src/rdb.h | 3 +- src/server.c | 2 + src/server.h | 9 + src/stream.h | 21 ++ src/t_stream.c | 376 ++++++++++++++++++++ 12 files changed, 1323 insertions(+), 10 deletions(-) create mode 100644 src/listpack.c create mode 100644 src/listpack.h create mode 100644 src/listpack_malloc.h create mode 100644 src/stream.h create mode 100644 src/t_stream.c diff --git a/src/Makefile b/src/Makefile index 86e0b3fe..b896b126 100644 --- a/src/Makefile +++ b/src/Makefile @@ -144,7 +144,7 @@ endif REDIS_SERVER_NAME=redis-server REDIS_SENTINEL_NAME=redis-sentinel -REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o +REDIS_SERVER_OBJ=adlist.o quicklist.o ae.o anet.o dict.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o crc16.o endianconv.o slowlog.o scripting.o bio.o rio.o rand.o memtest.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o 
siphash.o rax.o t_stream.o listpack.c REDIS_CLI_NAME=redis-cli REDIS_CLI_OBJ=anet.o adlist.o redis-cli.o zmalloc.o release.o anet.o ae.o crc64.o REDIS_BENCHMARK_NAME=redis-benchmark diff --git a/src/listpack.c b/src/listpack.c new file mode 100644 index 00000000..e2702b65 --- /dev/null +++ b/src/listpack.c @@ -0,0 +1,783 @@ +/* Listpack -- A lists of strings serialization format + * + * This file implements the specification you can find at: + * + * https://github.com/antirez/listpack + * + * Copyright (c) 2017, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include +#include +#include +#include +#include + +#include "listpack.h" +#include "listpack_malloc.h" + +#define LP_HDR_SIZE 6 /* 32 bit total len + 16 bit number of elements. */ +#define LP_HDR_NUMELE_UNKNOWN UINT16_MAX +#define LP_MAX_INT_ENCODING_LEN 9 +#define LP_MAX_BACKLEN_SIZE 5 +#define LP_MAX_ENTRY_BACKLEN 34359738367ULL +#define LP_ENCODING_INT 0 +#define LP_ENCODING_STRING 1 + +#define LP_ENCODING_7BIT_UINT 0 +#define LP_ENCODING_7BIT_UINT_MASK 0x80 +#define LP_ENCODING_IS_7BIT_UINT(byte) (((byte)&LP_ENCODING_7BIT_UINT_MASK)==LP_ENCODING_7BIT_UINT) + +#define LP_ENCODING_6BIT_STR 0x80 +#define LP_ENCODING_6BIT_STR_MASK 0xC0 +#define LP_ENCODING_IS_6BIT_STR(byte) (((byte)&LP_ENCODING_6BIT_STR_MASK)==LP_ENCODING_6BIT_STR) + +#define LP_ENCODING_13BIT_INT 0xC0 +#define LP_ENCODING_13BIT_INT_MASK 0xE0 +#define LP_ENCODING_IS_13BIT_INT(byte) (((byte)&LP_ENCODING_13BIT_INT_MASK)==LP_ENCODING_13BIT_INT) + +#define LP_ENCODING_12BIT_STR 0xE0 +#define LP_ENCODING_12BIT_STR_MASK 0xF0 +#define LP_ENCODING_IS_12BIT_STR(byte) (((byte)&LP_ENCODING_12BIT_STR_MASK)==LP_ENCODING_12BIT_STR) + +#define LP_ENCODING_16BIT_INT 0xF1 +#define LP_ENCODING_16BIT_INT_MASK 0xFF +#define LP_ENCODING_IS_16BIT_INT(byte) (((byte)&LP_ENCODING_16BIT_INT_MASK)==LP_ENCODING_16BIT_INT) + +#define LP_ENCODING_24BIT_INT 0xF2 +#define LP_ENCODING_24BIT_INT_MASK 0xFF +#define LP_ENCODING_IS_24BIT_INT(byte) 
(((byte)&LP_ENCODING_24BIT_INT_MASK)==LP_ENCODING_24BIT_INT) + +#define LP_ENCODING_32BIT_INT 0xF3 +#define LP_ENCODING_32BIT_INT_MASK 0xFF +#define LP_ENCODING_IS_32BIT_INT(byte) (((byte)&LP_ENCODING_32BIT_INT_MASK)==LP_ENCODING_32BIT_INT) + +#define LP_ENCODING_64BIT_INT 0xF4 +#define LP_ENCODING_64BIT_INT_MASK 0xFF +#define LP_ENCODING_IS_64BIT_INT(byte) (((byte)&LP_ENCODING_64BIT_INT_MASK)==LP_ENCODING_64BIT_INT) + +#define LP_ENCODING_32BIT_STR 0xF0 +#define LP_ENCODING_32BIT_STR_MASK 0xFF +#define LP_ENCODING_IS_32BIT_STR(byte) (((byte)&LP_ENCODING_32BIT_STR_MASK)==LP_ENCODING_32BIT_STR) + +#define LP_EOF 0xFF + +#define LP_ENCODING_6BIT_STR_LEN(p) ((p)[0] & 0x3F) +#define LP_ENCODING_12BIT_STR_LEN(p) ((((p)[0] & 0xF) << 8) | (p)[1]) +#define LP_ENCODING_32BIT_STR_LEN(p) (((uint32_t)(p)[1]<<0) | \ + ((uint32_t)(p)[2]<<8) | \ + ((uint32_t)(p)[3]<<16) | \ + ((uint32_t)(p)[4]<<24)) + +#define lpGetTotalBytes(p) (((uint32_t)(p)[0]<<0) | \ + ((uint32_t)(p)[1]<<8) | \ + ((uint32_t)(p)[2]<<16) | \ + ((uint32_t)(p)[3]<<24)) + +#define lpGetNumElements(p) (((uint32_t)(p)[4]<<0) | \ + ((uint32_t)(p)[5]<<8)) +#define lpSetTotalBytes(p,v) do { \ + (p)[0] = (v)&0xff; \ + (p)[1] = ((v)>>8)&0xff; \ + (p)[2] = ((v)>>16)&0xff; \ + (p)[3] = ((v)>>24)&0xff; \ +} while(0) + +#define lpSetNumElements(p,v) do { \ + (p)[4] = (v)&0xff; \ + (p)[5] = ((v)>>8)&0xff; \ +} while(0) + +/* Convert a string into a signed 64 bit integer. + * The function returns 1 if the string could be parsed into a (non-overflowing) + * signed 64 bit int, 0 otherwise. The 'value' will be set to the parsed value + * when the function returns success. + * + * Note that this function demands that the string strictly represents + * a int64 value: no spaces or other characters before or after the string + * representing the number are accepted, nor zeroes at the start if not + * for the string "0" representing the zero number. 
+ * + * Because of its strictness, it is safe to use this function to check if + * you can convert a string into a long long, and obtain back the string + * from the number without any loss in the string representation. * + * + * ----------------------------------------------------------------------------- + * + * Credits: this function was adapted from the Redis source code, file + * "utils.c", function string2ll(), and is copyright: + * + * Copyright(C) 2011, Pieter Noordhuis + * Copyright(C) 2011, Salvatore Sanfilippo + * + * The function is released under the BSD 3-clause license. + */ +int lpStringToInt64(const char *s, unsigned long slen, int64_t *value) { + const char *p = s; + unsigned long plen = 0; + int negative = 0; + uint64_t v; + + if (plen == slen) + return 0; + + /* Special case: first and only digit is 0. */ + if (slen == 1 && p[0] == '0') { + if (value != NULL) *value = 0; + return 1; + } + + if (p[0] == '-') { + negative = 1; + p++; plen++; + + /* Abort on only a negative sign. */ + if (plen == slen) + return 0; + } + + /* First digit should be 1-9, otherwise the string should just be 0. */ + if (p[0] >= '1' && p[0] <= '9') { + v = p[0]-'0'; + p++; plen++; + } else if (p[0] == '0' && slen == 1) { + *value = 0; + return 1; + } else { + return 0; + } + + while (plen < slen && p[0] >= '0' && p[0] <= '9') { + if (v > (UINT64_MAX / 10)) /* Overflow. */ + return 0; + v *= 10; + + if (v > (UINT64_MAX - (p[0]-'0'))) /* Overflow. */ + return 0; + v += p[0]-'0'; + + p++; plen++; + } + + /* Return if not all bytes were used. */ + if (plen < slen) + return 0; + + if (negative) { + if (v > ((uint64_t)(-(INT64_MIN+1))+1)) /* Overflow. */ + return 0; + if (value != NULL) *value = -v; + } else { + if (v > INT64_MAX) /* Overflow. */ + return 0; + if (value != NULL) *value = v; + } + return 1; +} + +/* Create a new, empty listpack. + * On success the new listpack is returned, otherwise an error is returned. 
*/ +unsigned char *lpNew(void) { + unsigned char *lp = lp_malloc(LP_HDR_SIZE+1); + if (lp == NULL) return NULL; + lpSetTotalBytes(lp,LP_HDR_SIZE+1); + lpSetNumElements(lp,0); + lp[LP_HDR_SIZE] = LP_EOF; + return lp; +} + +/* Free the specified listpack. */ +void lpFree(unsigned char *lp) { + lp_free(lp); +} + +/* Given an element 'ele' of size 'size', determine if the element can be + * represented inside the listpack encoded as integer, and returns + * LP_ENCODING_INT if so. Otherwise returns LP_ENCODING_STR if no integer + * encoding is possible. + * + * If the LP_ENCODING_INT is returned, the function stores the integer encoded + * representation of the element in the 'intenc' buffer. + * + * Regardless of the returned encoding, 'enclen' is populated by reference to + * the number of bytes that the string or integer encoded element will require + * in order to be represented. */ +int lpEncodeGetType(unsigned char *ele, uint32_t size, unsigned char *intenc, uint64_t *enclen) { + int64_t v; + if (lpStringToInt64((const char*)ele, size, &v)) { + if (v >= 0 && v <= 127) { + /* Single byte 0-127 integer. */ + intenc[0] = v; + *enclen = 1; + } else if (v >= -4096 && v <= 4095) { + /* 13 bit integer. */ + if (v < 0) v = ((int64_t)1<<13)+v; + intenc[0] = (v>>8)|LP_ENCODING_13BIT_INT; + intenc[1] = v&0xff; + *enclen = 2; + } else if (v >= -32768 && v <= 32767) { + /* 16 bit integer. */ + if (v < 0) v = ((int64_t)1<<16)+v; + intenc[0] = LP_ENCODING_16BIT_INT; + intenc[1] = v&0xff; + intenc[2] = v>>8; + *enclen = 3; + } else if (v >= -8388608 && v <= 8388607) { + /* 24 bit integer. */ + if (v < 0) v = ((int64_t)1<<24)+v; + intenc[0] = LP_ENCODING_24BIT_INT; + intenc[1] = v&0xff; + intenc[2] = (v>>8)&0xff; + intenc[3] = v>>16; + *enclen = 4; + } else if (v >= -2147483648 && v <= 2147483647) { + /* 32 bit integer. 
/* Store a reverse-encoded variable length field, representing the length
 * of the previous element of size 'l', in the target buffer 'buf'.
 * Each output byte carries 7 bits of payload; the continuation bit is set
 * on every byte except the most significant one, so the field can be
 * parsed right-to-left by lpDecodeBacklen().
 * The function returns the number of bytes used to encode it, from
 * 1 to 5. If 'buf' is NULL the function just returns the number of bytes
 * needed in order to encode the backlen.
 *
 * Fix: the upper bounds were off by one (l < 16383 and so on), sending
 * the largest value that still fits in N 7-bit chunks (16383, 2097151,
 * 268435455) down the N+1 byte path and wasting a byte. The correct
 * exclusive bounds are the powers of two: 2^14, 2^21, 2^28. */
unsigned long lpEncodeBacklen(unsigned char *buf, uint64_t l) {
    if (l <= 127) {
        /* 7 bits: single byte, no continuation bit. */
        if (buf) buf[0] = l;
        return 1;
    } else if (l < 16384) {
        /* Up to 14 bits: two bytes. */
        if (buf) {
            buf[0] = l>>7;
            buf[1] = (l&127)|128;
        }
        return 2;
    } else if (l < 2097152) {
        /* Up to 21 bits: three bytes. */
        if (buf) {
            buf[0] = l>>14;
            buf[1] = ((l>>7)&127)|128;
            buf[2] = (l&127)|128;
        }
        return 3;
    } else if (l < 268435456) {
        /* Up to 28 bits: four bytes. */
        if (buf) {
            buf[0] = l>>21;
            buf[1] = ((l>>14)&127)|128;
            buf[2] = ((l>>7)&127)|128;
            buf[3] = (l&127)|128;
        }
        return 4;
    } else {
        /* Up to 35 bits: five bytes (LP_MAX_ENTRY_BACKLEN). */
        if (buf) {
            buf[0] = l>>28;
            buf[1] = ((l>>21)&127)|128;
            buf[2] = ((l>>14)&127)|128;
            buf[3] = ((l>>7)&127)|128;
            buf[4] = (l&127)|128;
        }
        return 5;
    }
}
*/ +uint64_t lpDecodeBacklen(unsigned char *p) { + uint64_t val = 0; + uint64_t shift = 0; + do { + val |= (uint64_t)(p[0] & 127) << shift; + if (!(p[0] & 128)) break; + shift += 7; + p--; + if (shift > 28) return UINT64_MAX; + } while(1); + return val; +} + +/* Encode the string element pointed by 's' of size 'len' in the target + * buffer 's'. The function should be called with 'buf' having always enough + * space for encoding the string. This is done by calling lpEncodeGetType() + * before calling this function. */ +void lpEncodeString(unsigned char *buf, unsigned char *s, uint32_t len) { + if (len < 64) { + buf[0] = len | LP_ENCODING_6BIT_STR; + memcpy(buf+1,s,len); + } else if (len < 4096) { + buf[0] = (len >> 8) | LP_ENCODING_12BIT_STR; + buf[1] = len & 0xff; + memcpy(buf+2,s,len); + } else { + buf[0] = LP_ENCODING_32BIT_STR; + buf[1] = len & 0xff; + buf[2] = (len >> 8) & 0xff; + buf[3] = (len >> 16) & 0xff; + buf[4] = (len >> 24) & 0xff; + memcpy(buf+4,s,len); + } +} + +/* Return the encoded length of the listpack element pointed by 'p'. If the + * element encoding is wrong then 0 is returned. */ +uint32_t lpCurrentEncodedSize(unsigned char *p) { + if (LP_ENCODING_IS_7BIT_UINT(p[0])) return 1; + if (LP_ENCODING_IS_6BIT_STR(p[0])) return 1+LP_ENCODING_6BIT_STR_LEN(p); + if (LP_ENCODING_IS_13BIT_INT(p[0])) return 2; + if (LP_ENCODING_IS_16BIT_INT(p[0])) return 3; + if (LP_ENCODING_IS_24BIT_INT(p[0])) return 4; + if (LP_ENCODING_IS_32BIT_INT(p[0])) return 5; + if (LP_ENCODING_IS_64BIT_INT(p[0])) return 9; + if (LP_ENCODING_IS_12BIT_STR(p[0])) return 2+LP_ENCODING_12BIT_STR_LEN(p); + if (LP_ENCODING_IS_32BIT_STR(p[0])) return 5+LP_ENCODING_32BIT_STR_LEN(p); + if (p[0] == LP_EOF) return 1; + return 0; +} + +/* Skip the current entry returning the next. 
It is invalid to call this + * function if the current element is the EOF element at the end of the + * listpack, however, while this function is used to implement lpNext(), + * it does not return NULL when the EOF element is encountered. */ +unsigned char *lpSkip(unsigned char *p) { + unsigned long entrylen = lpCurrentEncodedSize(p); + entrylen += lpEncodeBacklen(NULL,entrylen); + p += entrylen; + return p; +} + +/* If 'p' points to an element of the listpack, calling lpNext() will return + * the pointer to the next element (the one on the right), or NULL if 'p' + * already pointed to the last element of the listpack. */ +unsigned char *lpNext(unsigned char *lp, unsigned char *p) { + ((void) lp); /* lp is not used for now. However lpPrev() uses it. */ + p = lpSkip(p); + if (p[0] == LP_EOF) return NULL; + return p; +} + +/* If 'p' points to an element of the listpack, calling lpPrev() will return + * the pointer to the preivous element (the one on the left), or NULL if 'p' + * already pointed to the first element of the listpack. */ +unsigned char *lpPrev(unsigned char *lp, unsigned char *p) { + if (p-lp == LP_HDR_SIZE) return NULL; + p--; /* Seek the first backlen byte of the last element. */ + uint64_t prevlen = lpDecodeBacklen(p); + prevlen += lpEncodeBacklen(NULL,prevlen); + return p-prevlen+1; /* Seek the first byte of the previous entry. */ +} + +/* Return a pointer to the first element of the listpack, or NULL if the + * listpack has no elements. */ +unsigned char *lpFirst(unsigned char *lp) { + lp += LP_HDR_SIZE; /* Skip the header. */ + if (lp[0] == LP_EOF) return NULL; + return lp; +} + +/* Return a pointer to the last element of the listpack, or NULL if the + * listpack has no elements. */ +unsigned char *lpLast(unsigned char *lp) { + unsigned char *p = lp+lpGetTotalBytes(lp)-1; /* Seek EOF element. */ + return lpPrev(lp,p); /* Will return NULL if EOF is the only element. */ +} + +/* Return the number of elements inside the listpack. 
This function attempts + * to use the cached value when within range, otherwise a full scan is + * needed. As a side effect of calling this function, the listpack header + * could be modified, because if the count is found to be already within + * the 'numele' header field range, the new value is set. */ +uint32_t lpLength(unsigned char *lp) { + uint32_t numele = lpGetNumElements(lp); + if (numele != LP_HDR_NUMELE_UNKNOWN) return numele; + + /* Too many elements inside the listpack. We need to scan in order + * to get the total number. */ + uint32_t count = 0; + unsigned char *p = lpFirst(lp); + while(p) { + count++; + p = lpNext(lp,p); + } + + /* If the count is again within range of the header numele field, + * set it. */ + if (count < LP_HDR_NUMELE_UNKNOWN) lpSetNumElements(lp,count); + return count; +} + +/* Return the listpack element pointed by 'p'. + * + * The function changes behavior depending on the passed 'intbuf' value. + * Specifically, if 'intbuf' is NULL: + * + * If the element is internally encoded as an integer, the function returns + * NULL and populates the integer value by reference in 'count'. Otherwise if + * the element is encoded as a string a pointer to the string (pointing inside + * the listpack itself) is returned, and 'count' is set to the length of the + * string. + * + * If instead 'intbuf' points to a buffer passed by the caller, that must be + * at least LP_INTBUF_SIZE bytes, the function always returns the element as + * it was a string (returning the pointer to the string and setting the + * 'count' argument to the string length by reference). However if the element + * is encoded as an integer, the 'intbuf' buffer is used in order to store + * the string representation. + * + * The user should use one or the other form depending on what the value will + * be used for. If there is immediate usage for an integer value returned + * by the function, than to pass a buffer (and convert it back to a number) + * is of course useless. 
+ * + * If the function is called against a badly encoded ziplist, so that there + * is no valid way to parse it, the function returns like if there was an + * integer encoded with value 12345678900000000 + , this may + * be an hint to understand that something is wrong. To crash in this case is + * not sensible because of the different requirements of the application using + * this lib. + * + * Similarly, there is no error returned since the listpack normally can be + * assumed to be valid, so that would be a very high API cost. However a function + * in order to check the integrity of the listpack at load time is provided, + * check lpIsValid(). */ +unsigned char *lpGet(unsigned char *p, int64_t *count, unsigned char *intbuf) { + int64_t val; + uint64_t uval, negstart, negmax; + + if (LP_ENCODING_IS_7BIT_UINT(p[0])) { + negstart = UINT64_MAX; /* 7 bit ints are always positive. */ + negmax = 0; + uval = p[0] & 0x7f; + } else if (LP_ENCODING_IS_6BIT_STR(p[0])) { + *count = LP_ENCODING_6BIT_STR_LEN(p); + return p+1; + } else if (LP_ENCODING_IS_13BIT_INT(p[0])) { + uval = ((p[0]&0x1f)<<8) | p[1]; + negstart = (uint64_t)1<<12; + negmax = 8191; + } else if (LP_ENCODING_IS_16BIT_INT(p[0])) { + uval = (uint64_t)p[1] | + (uint64_t)p[2]<<8; + negstart = (uint64_t)1<<15; + negmax = UINT16_MAX; + } else if (LP_ENCODING_IS_24BIT_INT(p[0])) { + uval = (uint64_t)p[1] | + (uint64_t)p[2]<<8 | + (uint64_t)p[3]<<16; + negstart = (uint64_t)1<<23; + negmax = UINT32_MAX>>8; + } else if (LP_ENCODING_IS_32BIT_INT(p[0])) { + uval = (uint64_t)p[1] | + (uint64_t)p[2]<<8 | + (uint64_t)p[3]<<16 | + (uint64_t)p[4]<<24; + negstart = (uint64_t)1<<31; + negmax = UINT32_MAX; + } else if (LP_ENCODING_IS_64BIT_INT(p[0])) { + uval = (uint64_t)p[1] | + (uint64_t)p[2]<<8 | + (uint64_t)p[3]<<16 | + (uint64_t)p[4]<<24 | + (uint64_t)p[5]<<32 | + (uint64_t)p[6]<<40 | + (uint64_t)p[7]<<48 | + (uint64_t)p[8]<<56; + negstart = (uint64_t)1<<63; + negmax = UINT64_MAX; + } else if 
(LP_ENCODING_IS_12BIT_STR(p[0])) { + *count = LP_ENCODING_12BIT_STR_LEN(p); + return p+2; + } else if (LP_ENCODING_IS_32BIT_STR(p[0])) { + *count = LP_ENCODING_32BIT_STR_LEN(p); + return p+5; + } else { + uval = 12345678900000000ULL + p[0]; + negstart = UINT64_MAX; + negmax = 0; + } + + /* We reach this code path only for integer encodings. + * Convert the unsigned value to the signed one using two's complement + * rule. */ + if (uval >= negstart) { + /* This three steps conversion should avoid undefined behaviors + * in the unsigned -> signed conversion. */ + uval = negmax-uval; + val = uval; + val = -val-1; + } else { + val = uval; + } + + /* Return the string representation of the integer or the value itself + * depending on intbuf being NULL or not. */ + if (intbuf) { + *count = snprintf((char*)intbuf,LP_INTBUF_SIZE,"%lld",val); + return intbuf; + } else { + *count = val; + return NULL; + } +} + +/* Insert, delete or replace the specified element 'ele' of length 'len' at + * the specified position 'p', with 'p' being a listpack element pointer + * obtained with lpFirst(), lpLast(), lpIndex(), lpNext(), lpPrev() or + * lpSeek(). + * + * The element is inserted before, after, or replaces the element pointed + * by 'p' depending on the 'where' argument, that can be LP_BEFORE, LP_AFTER + * or LP_REPLACE. + * + * If 'ele' is set to NULL, the function removes the element pointed by 'p' + * instead of inserting one. + * + * Returns NULL on out of memory or when the listpack total length would exceed + * the max allowed size of 2^32-1, otherwise the new pointer to the listpack + * holding the new element is returned (and the old pointer passed is no longer + * considered valid) + * + * If 'newp' is not NULL, at the end of a successful call '*newp' will be set + * to the address of the element just added, so that it will be possible to + * continue an iteration with lpNext() and lpPrev(). 
+ * + * For deletion operations ('ele' set to NULL) 'newp' is set to the next + * element, on the right of the deleted one, or to NULL if the deleted element + * was the last one. */ +unsigned char *lpInsert(unsigned char *lp, unsigned char *ele, uint32_t size, unsigned char *p, int where, unsigned char **newp) { + unsigned char intenc[LP_MAX_INT_ENCODING_LEN]; + unsigned char backlen[LP_MAX_BACKLEN_SIZE]; + + uint64_t enclen; /* The length of the encoded element. */ + + /* An element pointer set to NULL means deletion, which is conceptually + * replacing the element with a zero-length element. So whatever we + * get passed as 'where', set it to LP_REPLACE. */ + if (ele == NULL) where = LP_REPLACE; + + /* If we need to insert after the current element, we just jump to the + * next element (that could be the EOF one) and handle the case of + * inserting before. So the function will actually deal with just two + * cases: LP_BEFORE and LP_REPLACE. */ + if (where == LP_AFTER) { + p = lpSkip(p); + where = LP_BEFORE; + } + + /* Store the offset of the element 'p', so that we can obtain its + * address again after a reallocation. */ + unsigned long poff = p-lp; + + /* Calling lpEncodeGetType() results into the encoded version of the + * element to be stored into 'intenc' in case it is representable as + * an integer: in that case, the function returns LP_ENCODING_INT. + * Otherwise if LP_ENCODING_STR is returned, we'll have to call + * lpEncodeString() to actually write the encoded string on place later. + * + * Whatever the returned encoding is, 'enclen' is populated with the + * length of the encoded element. */ + int enctype; + if (ele) { + enctype = lpEncodeGetType(ele,size,intenc,&enclen); + } else { + enctype = -1; + enclen = 0; + } + + /* We need to also encode the backward-parsable length of the element + * and append it to the end: this allows to traverse the listpack from + * the end to the start. */ + unsigned long backlen_size = ele ? 
lpEncodeBacklen(backlen,enclen) : 0; + uint64_t old_listpack_bytes = lpGetTotalBytes(lp); + uint32_t replaced_len = 0; + if (where == LP_REPLACE) { + replaced_len = lpCurrentEncodedSize(p); + replaced_len += lpEncodeBacklen(NULL,replaced_len); + } + + uint64_t new_listpack_bytes = old_listpack_bytes + enclen + backlen_size + - replaced_len; + if (new_listpack_bytes > UINT32_MAX) return NULL; + + /* We now need to reallocate in order to make space or shrink the + * allocation (in case 'when' value is LP_REPLACE and the new element is + * smaller). However we do that before memmoving the memory to + * make room for the new element if the final allocation will get + * larger, or we do it after if the final allocation will get smaller. */ + + unsigned char *dst = lp + poff; /* May be updated after reallocation. */ + + /* Realloc before: we need more room. */ + if (new_listpack_bytes > old_listpack_bytes) { + if ((lp = lp_realloc(lp,new_listpack_bytes)) == NULL) return NULL; + dst = lp + poff; + } + + /* Setup the listpack relocating the elements to make the exact room + * we need to store the new one. */ + if (where == LP_BEFORE) { + memmove(dst+enclen+backlen_size,dst,old_listpack_bytes-poff); + } else { /* LP_REPLACE. */ + long lendiff = (enclen+backlen_size)-replaced_len; + memmove(dst+replaced_len+lendiff, + dst+replaced_len, + old_listpack_bytes-poff-replaced_len); + } + + /* Realloc after: we need to free space. */ + if (new_listpack_bytes < old_listpack_bytes) { + if ((lp = lp_realloc(lp,new_listpack_bytes)) == NULL) return NULL; + dst = lp + poff; + } + + /* Store the entry. */ + if (newp) { + *newp = dst; + /* In case of deletion, set 'newp' to NULL if the next element is + * the EOF element. 
*/ + if (!ele && dst[0] == LP_EOF) *newp = NULL; + } + if (ele) { + if (enctype == LP_ENCODING_INT) { + memcpy(dst,intenc,enclen); + } else { + lpEncodeString(dst,ele,size); + } + dst += enclen; + memcpy(dst,backlen,backlen_size); + dst += backlen_size; + } + + /* Update header. */ + if (where != LP_REPLACE || ele == NULL) { + uint32_t num_elements = lpGetNumElements(lp); + if (num_elements != LP_HDR_NUMELE_UNKNOWN) { + if (ele) + lpSetNumElements(lp,num_elements+1); + else + lpSetNumElements(lp,num_elements-1); + } + } + lpSetTotalBytes(lp,new_listpack_bytes); + return lp; +} + +/* Append the specified element 'ele' of length 'len' at the end of the + * listpack. It is implemented in terms of lpInsert(), so the return value is + * the same as lpInsert(). */ +unsigned char *lpAppend(unsigned char *lp, unsigned char *ele, uint32_t size) { + uint64_t listpack_bytes = lpGetTotalBytes(lp); + unsigned char *eofptr = lp + listpack_bytes - 1; + return lpInsert(lp,ele,size,eofptr,LP_BEFORE,NULL); +} + +/* Remove the element pointed by 'p', and return the resulting listpack. + * If 'newp' is not NULL, the next element pointer (to the right of the + * deleted one) is returned by reference. If the deleted element was the + * last one, '*newp' is set to NULL. */ +unsigned char *lpDelete(unsigned char *lp, unsigned char *p, unsigned char **newp) { + return lpInsert(lp,NULL,0,p,LP_REPLACE,newp); +} + +/* Return the total number of bytes the listpack is composed of. */ +uint32_t lpBytes(unsigned char *lp) { + return lpGetTotalBytes(lp); +} + +/* Seek the specified element and returns the pointer to the seeked element. + * Positive indexes specify the zero-based element to seek from the head to + * the tail, negative indexes specify elements starting from the tail, where + * -1 means the last element, -2 the penultimate and so forth. If the index + * is out of range, NULL is returned. 
*/ +unsigned char *lpSeek(unsigned char *lp, long index) { + int forward = 1; /* Seek forward by default. */ + + /* We want to seek from left to right or the other way around + * depending on the listpack length and the element position. + * However if the listpack length cannot be obtained in constant time, + * we always seek from left to right. */ + uint32_t numele = lpGetNumElements(lp); + if (numele != LP_HDR_NUMELE_UNKNOWN) { + if (index < 0) index = (long)numele+index; + if (index < 0) return NULL; /* Index still < 0 means out of range. */ + if (index >= numele) return NULL; /* Out of range the other side. */ + /* We want to scan right-to-left if the element we are looking for + * is past the half of the listpack. */ + if (index > numele/2) { + forward = 0; + /* Left to right scanning always expects a negative index. Convert + * our index to negative form. */ + index -= numele; + } + } else { + /* If the listpack length is unspecified, for negative indexes we + * want to always scan left-to-right. */ + if (index < 0) forward = 0; + } + + /* Forward and backward scanning is trivially based on lpNext()/lpPrev(). */ + if (forward) { + unsigned char *ele = lpFirst(lp); + while (index > 0 && ele) { + ele = lpNext(lp,ele); + index--; + } + return ele; + } else { + unsigned char *ele = lpLast(lp); + while (index < -1 && ele) { + ele = lpPrev(lp,ele); + index++; + } + return ele; + } +} + diff --git a/src/listpack.h b/src/listpack.h new file mode 100644 index 00000000..af67b4b4 --- /dev/null +++ b/src/listpack.h @@ -0,0 +1,61 @@ +/* Listpack -- A lists of strings serialization format + * + * This file implements the specification you can find at: + * + * https://github.com/antirez/listpack + * + * Copyright (c) 2017, Salvatore Sanfilippo + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __LISTPACK_H +#define __LISTPACK_H + +#include + +#define LP_INTBUF_SIZE 21 /* 20 digits of -2^63 + 1 null term = 21. 
*/ + +/* lpInsert() where argument possible values: */ +#define LP_BEFORE 0 +#define LP_AFTER 1 +#define LP_REPLACE 2 + +unsigned char *lpNew(void); +void lpFree(unsigned char *lp); +unsigned char *lpInsert(unsigned char *lp, unsigned char *ele, uint32_t size, unsigned char *p, int where, unsigned char **newp); +unsigned char *lpAppend(unsigned char *lp, unsigned char *ele, uint32_t size); +unsigned char *lpDelete(unsigned char *lp, unsigned char *p, unsigned char **newp); +uint32_t lpLength(unsigned char *lp); +unsigned char *lpGet(unsigned char *p, int64_t *count, unsigned char *intbuf); +unsigned char *lpFirst(unsigned char *lp); +unsigned char *lpLast(unsigned char *lp); +unsigned char *lpNext(unsigned char *lp, unsigned char *p); +unsigned char *lpPrev(unsigned char *lp, unsigned char *p); +uint32_t lpBytes(unsigned char *lp); +unsigned char *lpSeek(unsigned char *lp, long index); + +#endif diff --git a/src/listpack_malloc.h b/src/listpack_malloc.h new file mode 100644 index 00000000..a3a077fc --- /dev/null +++ b/src/listpack_malloc.h @@ -0,0 +1,44 @@ +/* Listpack -- A lists of strings serialization format + * https://github.com/antirez/listpack + * + * Copyright (c) 2017, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* Allocator selection. + * + * This file is used in order to change the Rax allocator at compile time. + * Just define the following defines to what you want to use. Also add + * the include of your alternate allocator if needed (not needed in order + * to use the default libc allocator). 
*/ + +#ifndef LISTPACK_ALLOC_H +#define LISTPACK_ALLOC_H +#define lp_malloc malloc +#define lp_realloc realloc +#define lp_free free +#endif diff --git a/src/object.c b/src/object.c index d2f8d53c..8eeb5c6c 100644 --- a/src/object.c +++ b/src/object.c @@ -232,6 +232,13 @@ robj *createZsetZiplistObject(void) { return o; } +robj *createStreamObject(void) { + stream *s = streamNew(); + robj *o = createObject(OBJ_STREAM,s); + o->encoding = OBJ_ENCODING_STREAM; + return o; +} + robj *createModuleObject(moduleType *mt, void *value) { moduleValue *mv = zmalloc(sizeof(*mv)); mv->type = mt; diff --git a/src/rax.c b/src/rax.c index dda008df..b4f5ae05 100644 --- a/src/rax.c +++ b/src/rax.c @@ -131,7 +131,7 @@ static inline void raxStackFree(raxStack *ts) { } /* ---------------------------------------------------------------------------- - * Radis tree implementation + * Radix tree implementation * --------------------------------------------------------------------------*/ /* Allocate a new non compressed node with the specified number of children. @@ -873,7 +873,8 @@ raxNode *raxRemoveChild(raxNode *parent, raxNode *child) { memmove(((char*)cp)-1,cp,(parent->size-taillen-1)*sizeof(raxNode**)); /* Move the remaining "tail" pointer at the right position as well. */ - memmove(((char*)c)-1,c+1,taillen*sizeof(raxNode**)+parent->iskey*sizeof(void*)); + size_t valuelen = (parent->iskey && !parent->isnull) ? sizeof(void*) : 0; + memmove(((char*)c)-1,c+1,taillen*sizeof(raxNode**)+valuelen); /* 4. Update size. */ parent->size--; @@ -1175,7 +1176,7 @@ void raxIteratorDelChars(raxIterator *it, size_t count) { * The function returns 1 on success or 0 on out of memory. 
*/ int raxIteratorNextStep(raxIterator *it, int noup) { if (it->flags & RAX_ITER_EOF) { - return 0; + return 1; } else if (it->flags & RAX_ITER_JUST_SEEKED) { it->flags &= ~RAX_ITER_JUST_SEEKED; return 1; @@ -1187,10 +1188,6 @@ int raxIteratorNextStep(raxIterator *it, int noup) { size_t orig_stack_items = it->stack.items; raxNode *orig_node = it->node; - /* Clear the EOF flag: it will be set again if the EOF condition - * is still valid. */ - it->flags &= ~RAX_ITER_EOF; - while(1) { int children = it->node->iscompr ? 1 : it->node->size; if (!noup && children) { @@ -1291,7 +1288,7 @@ int raxSeekGreatest(raxIterator *it) { * effect to the one of raxIteratorPrevSte(). */ int raxIteratorPrevStep(raxIterator *it, int noup) { if (it->flags & RAX_ITER_EOF) { - return 0; + return 1; } else if (it->flags & RAX_ITER_JUST_SEEKED) { it->flags &= ~RAX_ITER_JUST_SEEKED; return 1; @@ -1412,6 +1409,7 @@ int raxSeek(raxIterator *it, const char *op, unsigned char *ele, size_t len) { it->node = it->rt->head; if (!raxSeekGreatest(it)) return 0; assert(it->node->iskey); + it->data = raxGetData(it->node); return 1; } @@ -1430,6 +1428,7 @@ int raxSeek(raxIterator *it, const char *op, unsigned char *ele, size_t len) { /* We found our node, since the key matches and we have an * "equal" condition. */ if (!raxIteratorAddChars(it,ele,len)) return 0; /* OOM. */ + it->data = raxGetData(it->node); } else if (lt || gt) { /* Exact key not found or eq flag not set. We have to set as current * key the one represented by the node we stopped at, and perform @@ -1502,6 +1501,7 @@ int raxSeek(raxIterator *it, const char *op, unsigned char *ele, size_t len) { * the previous sub-tree. */ if (nodechar < keychar) { if (!raxSeekGreatest(it)) return 0; + it->data = raxGetData(it->node); } else { if (!raxIteratorAddChars(it,it->node->data,it->node->size)) return 0; @@ -1647,6 +1647,14 @@ void raxStop(raxIterator *it) { raxStackFree(&it->stack); } +/* Return if the iterator is in an EOF state. 
This happens when raxSeek() + * failed to seek an appropriate element, so that raxNext() or raxPrev() + * will return zero, or when an EOF condition was reached while iterating + * with raxNext() and raxPrev(). */ +int raxEOF(raxIterator *it) { + return it->flags & RAX_ITER_EOF; +} + /* ----------------------------- Introspection ------------------------------ */ /* This function is mostly used for debugging and learning purposes. diff --git a/src/rax.h b/src/rax.h index 6f91f4c1..f6985c37 100644 --- a/src/rax.h +++ b/src/rax.h @@ -155,6 +155,7 @@ int raxPrev(raxIterator *it); int raxRandomWalk(raxIterator *it, size_t steps); int raxCompare(raxIterator *iter, const char *op, unsigned char *key, size_t key_len); void raxStop(raxIterator *it); +int raxEOF(raxIterator *it); void raxShow(rax *rax); #endif diff --git a/src/rdb.h b/src/rdb.h index 62a13f44..bf115045 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -89,10 +89,11 @@ #define RDB_TYPE_ZSET_ZIPLIST 12 #define RDB_TYPE_HASH_ZIPLIST 13 #define RDB_TYPE_LIST_QUICKLIST 14 +#define RDB_TYPE_STREAM_LISTPACKS 15 /* NOTE: WHEN ADDING NEW RDB TYPE, UPDATE rdbIsObjectType() BELOW */ /* Test if a type is an object type. */ -#define rdbIsObjectType(t) ((t >= 0 && t <= 7) || (t >= 9 && t <= 14)) +#define rdbIsObjectType(t) ((t >= 0 && t <= 7) || (t >= 9 && t <= 15)) /* Special RDB opcodes (saved/loaded with rdbSaveType/rdbLoadType). 
*/ #define RDB_OPCODE_AUX 250 diff --git a/src/server.c b/src/server.c index 7498a25f..2c3647db 100644 --- a/src/server.c +++ b/src/server.c @@ -302,6 +302,8 @@ struct redisCommand redisCommandTable[] = { {"pfcount",pfcountCommand,-2,"r",0,NULL,1,-1,1,0,0}, {"pfmerge",pfmergeCommand,-2,"wm",0,NULL,1,-1,1,0,0}, {"pfdebug",pfdebugCommand,-3,"w",0,NULL,0,0,0,0,0}, + {"xadd",xaddCommand,-4,"wmF",0,NULL,1,1,1,0,0}, + {"xrange",xrangeCommand,-4,"r",0,NULL,1,1,1,0,0}, {"post",securityWarningCommand,-1,"lt",0,NULL,0,0,0,0,0}, {"host:",securityWarningCommand,-1,"lt",0,NULL,0,0,0,0,0}, {"latency",latencyCommand,-2,"aslt",0,NULL,0,0,0,0,0} diff --git a/src/server.h b/src/server.h index 11eb36f3..38a76d00 100644 --- a/src/server.h +++ b/src/server.h @@ -59,6 +59,7 @@ typedef long long mstime_t; /* millisecond time type. */ #include "anet.h" /* Networking the easy way */ #include "ziplist.h" /* Compact list data structure */ #include "intset.h" /* Compact integer set structure */ +#include "stream.h" /* Stream data type header file. */ #include "version.h" /* Version macro */ #include "util.h" /* Misc functions useful in many places */ #include "latency.h" /* Latency monitor API */ @@ -451,6 +452,7 @@ typedef long long mstime_t; /* millisecond time type. */ #define OBJ_SET 2 #define OBJ_ZSET 3 #define OBJ_HASH 4 +#define OBJ_STREAM 5 /* The "module" object type is a special one that signals that the object * is one directly managed by a Redis module. 
In this case the value points @@ -575,6 +577,7 @@ typedef struct RedisModuleDigest { #define OBJ_ENCODING_SKIPLIST 7 /* Encoded as skiplist */ #define OBJ_ENCODING_EMBSTR 8 /* Embedded sds string encoding */ #define OBJ_ENCODING_QUICKLIST 9 /* Encoded as linked list of ziplists */ +#define OBJ_ENCODING_STREAM 10 /* Encoded as a radix tree of listpacks */ #define LRU_BITS 24 #define LRU_CLOCK_MAX ((1<lru */ @@ -1414,6 +1417,9 @@ void handleClientsBlockedOnLists(void); void popGenericCommand(client *c, int where); void signalListAsReady(redisDb *db, robj *key); +/* Stream data type. */ +stream *streamNew(void); + /* MULTI/EXEC/WATCH... */ void unwatchAllKeys(client *c); void initClientMultiState(client *c); @@ -1455,6 +1461,7 @@ robj *createIntsetObject(void); robj *createHashObject(void); robj *createZsetObject(void); robj *createZsetZiplistObject(void); +robj *createStreamObject(void); robj *createModuleObject(moduleType *mt, void *value); int getLongFromObjectOrReply(client *c, robj *o, long *target, const char *msg); int checkType(client *c, robj *o, int type); @@ -1992,6 +1999,8 @@ void pfdebugCommand(client *c); void latencyCommand(client *c); void moduleCommand(client *c); void securityWarningCommand(client *c); +void xaddCommand(client *c); +void xrangeCommand(client *c); #if defined(__GNUC__) void *calloc(size_t count, size_t size) __attribute__ ((deprecated)); diff --git a/src/stream.h b/src/stream.h new file mode 100644 index 00000000..065c328e --- /dev/null +++ b/src/stream.h @@ -0,0 +1,21 @@ +#ifndef STREAM_H +#define STREAM_H + +#include "rax.h" + +/* Stream item ID: a 128 bit number composed of a milliseconds time and + * a sequence counter. IDs generated in the same millisecond (or in a past + * millisecond if the clock jumped backward) will use the millisecond time + * of the latest generated ID and an incremented sequence. */ +typedef struct streamID { + uint64_t ms; /* Unix time in milliseconds. */ + uint64_t seq; /* Sequence number. 
*/ +} streamID; + +typedef struct stream { + rax *rax; /* The radix tree holding the stream. */ + uint64_t length; /* Number of elements inside this stream. */ + streamID last_id; /* Zero if there are yet no items. */ +} stream; + +#endif diff --git a/src/t_stream.c b/src/t_stream.c new file mode 100644 index 00000000..c64f5059 --- /dev/null +++ b/src/t_stream.c @@ -0,0 +1,376 @@ +/* + * Copyright (c) 2017, Salvatore Sanfilippo + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Redis nor the names of its contributors may be used + * to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* TODO: + * - After loading a stream, populate the last ID. + */ + +#include "server.h" +#include "listpack.h" +#include "endianconv.h" +#include "stream.h" + +#define STREAM_BYTES_PER_LISTPACK 4096 + +/* ----------------------------------------------------------------------- + * Low level stream encoding: a radix tree of listpacks. + * ----------------------------------------------------------------------- */ + +/* Create a new stream data structure. */ +stream *streamNew(void) { + stream *s = zmalloc(sizeof(*s)); + s->rax = raxNew(); + s->length = 0; + s->last_id.ms = 0; + s->last_id.seq = 0; + return s; +} + +/* Generate the next stream item ID given the previous one. If the current + * milliseconds Unix time is greater than the previous one, just use this + * as time part and start with sequence part of zero. Otherwise we use the + * previous time (and never go backward) and increment the sequence. */ +void streamNextID(streamID *last_id, streamID *new_id) { + uint64_t ms = mstime(); + if (ms > last_id->ms) { + new_id->ms = ms; + new_id->seq = 0; + } else { + new_id->ms = last_id->ms; + new_id->seq = last_id->seq+1; + } +} + +/* This is just a wrapper for lpAppend() to directly use a 64 bit integer + * instead of a string. */ +unsigned char *lpAppendInteger(unsigned char *lp, int64_t value) { + char buf[LONG_STR_SIZE]; + int slen = ll2string(buf,sizeof(buf),value); + return lpAppend(lp,(unsigned char*)buf,slen); +} + +/* This is a wrapper function for lpGet() to directly get an integer value + * from the listpack (that may store numbers as a string), converting + * the string if needed. */ +int64_t lpGetInteger(unsigned char *ele) { + int64_t v; + unsigned char *e = lpGet(ele,&v,NULL); + if (e == NULL) return v; + /* The following code path should never be used for how listpacks work: + * they should always be able to store an int64_t value in integer + * encoded form. However the implementation may change. 
*/ + int retval = string2ll((char*)e,v,&v); + serverAssert(retval != 0); + return v; +} + +/* Convert the specified stream entry ID as a 128 bit big endian number, so + * that the IDs can be sorted lexicographically. */ +void streamEncodeID(void *buf, streamID *id) { + uint64_t e[2]; + e[0] = htonu64(id->ms); + e[1] = htonu64(id->seq); + memcpy(buf,e,sizeof(e)); +} + +/* This is the reverse of streamEncodeID(): the decoded ID will be stored + * in the 'id' structure passed by reference. The buffer 'buf' must point + * to a 128 bit big-endian encoded ID. */ +void streamDecodeID(void *buf, streamID *id) { + uint64_t e[2]; + memcpy(e,buf,sizeof(e)); + id->ms = ntohu64(e[0]); + id->seq = ntohu64(e[1]); +} + +/* Adds a new item into the stream 's' having the specified number of + * field-value pairs as specified in 'numfields' and stored into 'argv'. + * Returns the new entry ID populating the 'added_id' structure. */ +void streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id) { + raxIterator ri; + raxStart(&ri,s->rax); + raxSeek(&ri,"$",NULL,0); + + size_t lp_bytes = 0; /* Total bytes in the tail listpack. */ + unsigned char *lp = NULL; /* Tail listpack pointer. */ + + /* Get a reference to the tail node listpack. */ + if (raxNext(&ri)) { + lp = ri.data; + lp_bytes = lpBytes(lp); + } + raxStop(&ri); + + /* Generate the new entry ID. */ + streamID id; + streamNextID(&s->last_id,&id); + + /* We have to add the key into the radix tree in lexicographic order, + * to do so we consider the ID as a single 128 bit number written in + * big endian, so that the most significant bytes are the first ones. */ + uint64_t rax_key[2]; /* Key in the radix tree containing the listpack.*/ + uint64_t entry_id[2]; /* Entry ID of the new item as 128 bit string. */ + streamEncodeID(entry_id,&id); + + /* Create a new listpack and radix tree node if needed. 
*/ + if (lp == NULL || lp_bytes > STREAM_BYTES_PER_LISTPACK) { + lp = lpNew(); + rax_key[0] = entry_id[0]; + rax_key[1] = entry_id[1]; + raxInsert(s->rax,(unsigned char*)&rax_key,sizeof(rax_key),lp,NULL); + } else { + serverAssert(ri.key_len == sizeof(rax_key)); + memcpy(rax_key,ri.key,sizeof(rax_key)); + } + + /* Populate the listpack with the new entry. */ + lp = lpAppend(lp,(unsigned char*)entry_id,sizeof(entry_id)); + lp = lpAppendInteger(lp,numfields); + for (int i = 0; i < numfields; i++) { + sds field = argv[i*2]->ptr, value = argv[i*2+1]->ptr; + lp = lpAppend(lp,(unsigned char*)field,sdslen(field)); + lp = lpAppend(lp,(unsigned char*)value,sdslen(value)); + } + + /* Insert back into the tree in order to update the listpack pointer. */ + raxInsert(s->rax,(unsigned char*)&rax_key,sizeof(rax_key),lp,NULL); + s->length++; + s->last_id = id; + if (added_id) *added_id = id; + raxShow(s->rax); +} + +/* Send the specified range to the client 'c'. The range the client will + * receive is between start and end inclusive, if 'count' is non zero, no more + * than 'count' elements are sent. The 'end' pointer can be NULL to mean that + * we want all the elements from 'start' till the end of the stream. */ +size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end, size_t count) { + void *arraylen_ptr = addDeferredMultiBulkLength(c); + size_t arraylen = 0; + + /* Seek the radix tree node that contains our start item. */ + uint64_t key[2]; + uint64_t end_key[2]; + streamEncodeID(key,start); + if (end) streamEncodeID(end_key,end); + raxIterator ri; + raxStart(&ri,s->rax); + + /* Seek the correct node in the radix tree. */ + if (start->ms || start->seq) { + raxSeek(&ri,"<=",(unsigned char*)key,sizeof(key)); + if (raxEOF(&ri)) raxSeek(&ri,">",(unsigned char*)key,sizeof(key)); + } else { + raxSeek(&ri,"^",NULL,0); + } + + /* For every radix tree node, iterate the corresponding listpack, + * returning elements when they are within range. 
*/ + while (raxNext(&ri)) { + serverAssert(ri.key_len == sizeof(key)); + unsigned char *lp = ri.data; + unsigned char *lp_ele = lpFirst(lp); + while(lp_ele) { + int64_t e_len; + unsigned char buf[LP_INTBUF_SIZE]; + unsigned char *e = lpGet(lp_ele,&e_len,buf); + serverAssert(e_len == sizeof(streamID)); + + /* Seek next field: number of elements. */ + lp_ele = lpNext(lp,lp_ele); + if (memcmp(e,key,sizeof(key)) >= 0) { /* If current >= start */ + if (end && memcmp(e,end_key,sizeof(key)) > 0) { + break; /* We are already out of range. */ + } + streamID thisid; + streamDecodeID(e,&thisid); + sds replyid = sdscatfmt(sdsempty(),"+%U.%U\r\n", + thisid.ms,thisid.seq); + + /* Emit this stream entry in the client output. */ + addReplyMultiBulkLen(c,2); + addReplySds(c,replyid); + int64_t numfields = lpGetInteger(lp_ele); + lp_ele = lpNext(lp,lp_ele); + addReplyMultiBulkLen(c,numfields*2); + for (int64_t i = 0; i < numfields; i++) { + /* Emit two items (key-value) per iteration. */ + for (int k = 0; k < 2; k++) { + e = lpGet(lp_ele,&e_len,buf); + addReplyBulkCBuffer(c,e,e_len); + lp_ele = lpNext(lp,lp_ele); + } + } + + arraylen++; + if (count && count == arraylen) break; + } else { + /* If we do not emit, we have to discard. */ + int64_t numfields = lpGetInteger(lp_ele); + lp_ele = lpNext(lp,lp_ele); + for (int64_t i = 0; i < numfields*2; i++) + lp_ele = lpNext(lp,lp_ele); + } + } + if (count && count == arraylen) break; + } + raxStop(&ri); + setDeferredMultiBulkLength(c,arraylen_ptr,arraylen); + return arraylen; +} + +/* ----------------------------------------------------------------------- + * Stream commands implementation + * ----------------------------------------------------------------------- */ + +/* Look the stream at 'key' and return the corresponding stream object. + * The function creates a key setting it to an empty stream if needed. 
*/ +robj *streamTypeLookupWriteOrCreate(client *c, robj *key) { + robj *o = lookupKeyWrite(c->db,key); + if (o == NULL) { + o = createStreamObject(); + dbAdd(c->db,key,o); + } else { + if (o->type != OBJ_STREAM) { + addReply(c,shared.wrongtypeerr); + return NULL; + } + } + return o; +} + +/* Helper function to convert a string to an unsigned long long value. + * The function attempts to use the faster string2ll() function inside + * Redis: if it fails, strtoull() is used instead. The function returns + * 1 if the conversion happened successfully or 0 if the number is + * invalid or out of range. */ +int string2ull(const char *s, unsigned long long *value) { + long long ll; + if (string2ll(s,strlen(s),&ll)) { + if (ll < 0) return 0; /* Negative values are out of range. */ + *value = ll; + return 1; + } + errno = 0; + *value = strtoull(s,NULL,10); + if (errno == EINVAL || errno == ERANGE) return 0; /* strtoull() failed. */ + return 1; /* Conversion done! */ +} + +/* Parse a stream ID in the format given by clients to Redis, that is + * ., and converts it into a streamID structure. If + * the specified ID is invalid C_ERR is returned and an error is reported + * to the client, otherwise C_OK is returned. The ID may be in incomplete + * form, just stating the milliseconds time part of the stream. In such a case + * the missing part is set according to the value of 'missing_seq' parameter. + * The IDs "-" and "+" specify respectively the minimum and maximum IDs + * that can be represented. */ +int streamParseIDOrReply(client *c, robj *o, streamID *id, uint64_t missing_seq) { + char buf[128]; + if (sdslen(o->ptr) > sizeof(buf)-1) goto invalid; + memcpy(buf,o->ptr,sdslen(o->ptr)+1); + + /* Handle the "-" and "+" special cases. */ + if (buf[0] == '-' && buf[1] == '\0') { + id->ms = 0; + id->seq = 0; + return C_OK; + } else if (buf[0] == '+' && buf[1] == '\0') { + id->ms = UINT64_MAX; + id->seq = UINT64_MAX; + return C_OK; + } + + /* Parse . form. 
*/ + char *dot = strchr(buf,'.'); + if (dot) *dot = '\0'; + uint64_t ms, seq; + if (string2ull(buf,&ms) == 0) goto invalid; + if (dot && string2ull(dot+1,&seq) == 0) goto invalid; + if (!dot) seq = missing_seq; + id->ms = ms; + id->seq = seq; + return C_OK; + +invalid: + addReplyError(c,"Invalid stream ID specified as stream command argument"); + return C_ERR; +} + +/* XADD key [field value] [field value] ... */ +void xaddCommand(client *c) { + if ((c->argc % 2) == 1) { + addReplyError(c,"wrong number of arguments for XADD"); + return; + } + + /* Lookup the stream at key. */ + robj *o; + stream *s; + if ((o = streamTypeLookupWriteOrCreate(c,c->argv[1])) == NULL) return; + s = o->ptr; + + /* Append using the low level function and return the ID. */ + streamID id; + streamAppendItem(s,c->argv+2,(c->argc-2)/2,&id); + sds reply = sdscatfmt(sdsempty(),"+%U.%U\r\n",id.ms,id.seq); + addReplySds(c,reply); + + signalModifiedKey(c->db,c->argv[1]); + notifyKeyspaceEvent(NOTIFY_HASH,"xadd",c->argv[1],c->db->id); + server.dirty++; +} + +/* XRANGE key start end [COUNT ] */ +void xrangeCommand(client *c) { + robj *o; + stream *s; + streamID startid, endid; + long long count = 0; + + if (streamParseIDOrReply(c,c->argv[2],&startid,0) == C_ERR) return; + if (streamParseIDOrReply(c,c->argv[3],&endid,UINT64_MAX) == C_ERR) return; + + /* Parse the COUNT option if any. */ + if (c->argc > 4) { + if (strcasecmp(c->argv[4]->ptr,"COUNT") == 0) { + if (getLongLongFromObjectOrReply(c,c->argv[5],&count,NULL) != C_OK) + return; + } else { + addReply(c,shared.syntaxerr); + return; + } + } + + /* Return the specified range to the user. 
*/ + if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL + || checkType(c,o,OBJ_STREAM)) return; + s = o->ptr; + streamReplyWithRange(c,s,&startid,&endid,count); +} From 100d43c1ac48e8e949bd622b302ba309ae498752 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Sep 2017 12:13:16 +0200 Subject: [PATCH 028/102] Streams: assign value of 6 to OBJ_STREAM + some refactoring. --- src/rdb.h | 5 +++-- src/server.h | 14 +++++++------- src/t_stream.c | 8 +++++++- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/rdb.h b/src/rdb.h index bf115045..ecb066fb 100644 --- a/src/rdb.h +++ b/src/rdb.h @@ -69,8 +69,9 @@ #define RDB_ENC_INT32 2 /* 32 bit signed integer */ #define RDB_ENC_LZF 3 /* string compressed with FASTLZ */ -/* Dup object types to RDB object types. Only reason is readability (are we - * dealing with RDB types or with in-memory object types?). */ +/* Map object types to RDB object types. Macros starting with OBJ_ are for + * memory storage and may change. Instead RDB types must be fixed because + * we store them on disk. */ #define RDB_TYPE_STRING 0 #define RDB_TYPE_LIST 1 #define RDB_TYPE_SET 2 diff --git a/src/server.h b/src/server.h index 38a76d00..1185d119 100644 --- a/src/server.h +++ b/src/server.h @@ -447,12 +447,11 @@ typedef long long mstime_t; /* millisecond time type. */ /* A redis object, that is a type able to hold a string / list / set */ /* The actual Redis Object */ -#define OBJ_STRING 0 -#define OBJ_LIST 1 -#define OBJ_SET 2 -#define OBJ_ZSET 3 -#define OBJ_HASH 4 -#define OBJ_STREAM 5 +#define OBJ_STRING 0 /* String object. */ +#define OBJ_LIST 1 /* List object. */ +#define OBJ_SET 2 /* Set object. */ +#define OBJ_ZSET 3 /* Sorted set object. */ +#define OBJ_HASH 4 /* Hash object. */ /* The "module" object type is a special one that signals that the object * is one directly managed by a Redis module. In this case the value points @@ -465,7 +464,8 @@ typedef long long mstime_t; /* millisecond time type. 
*/ * by a 64 bit module type ID, which has a 54 bits module-specific signature * in order to dispatch the loading to the right module, plus a 10 bits * encoding version. */ -#define OBJ_MODULE 5 +#define OBJ_MODULE 5 /* Module object. */ +#define OBJ_STREAM 6 /* Stream object. */ /* Extract encver / signature from a module type ID. */ #define REDISMODULE_TYPE_ENCVER_BITS 10 diff --git a/src/t_stream.c b/src/t_stream.c index c64f5059..dcf9fcce 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -149,7 +149,13 @@ void streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id) memcpy(rax_key,ri.key,sizeof(rax_key)); } - /* Populate the listpack with the new entry. */ + /* Populate the listpack with the new entry. We use the following + * encoding: + * + * +--------+----------+-------+-------+-/-+-------+-------+ + * |entry-id|num-fields|field-1|value-1|...|field-N|value-N| + * +--------+----------+-------+-------+-/-+-------+-------+ + */ lp = lpAppend(lp,(unsigned char*)entry_id,sizeof(entry_id)); lp = lpAppendInteger(lp,numfields); for (int i = 0; i < numfields; i++) { From 485014cc74d05436afc8257c5d7b05370410adc7 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Sep 2017 13:14:13 +0200 Subject: [PATCH 029/102] Streams: RDB saving. --- src/rax.c | 5 +++++ src/rax.h | 1 + src/rdb.c | 22 ++++++++++++++++++++++ src/stream.h | 1 + src/t_stream.c | 2 -- 5 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/rax.c b/src/rax.c index b4f5ae05..3ead27ed 100644 --- a/src/rax.c +++ b/src/rax.c @@ -1655,6 +1655,11 @@ int raxEOF(raxIterator *it) { return it->flags & RAX_ITER_EOF; } +/* Return the number of elements inside the radix tree. */ +uint64_t raxSize(rax *rax) { + return rax->numele; +} + /* ----------------------------- Introspection ------------------------------ */ /* This function is mostly used for debugging and learning purposes. 
diff --git a/src/rax.h b/src/rax.h index f6985c37..e22b6e69 100644 --- a/src/rax.h +++ b/src/rax.h @@ -157,5 +157,6 @@ int raxCompare(raxIterator *iter, const char *op, unsigned char *key, size_t key void raxStop(raxIterator *it); int raxEOF(raxIterator *it); void raxShow(rax *rax); +uint64_t raxSize(rax *rax); #endif diff --git a/src/rdb.c b/src/rdb.c index 19ba59ab..c79bfa8d 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -31,6 +31,7 @@ #include "lzf.h" /* LZF compression library */ #include "zipmap.h" #include "endianconv.h" +#include "stream.h" #include #include @@ -622,6 +623,8 @@ int rdbSaveObjectType(rio *rdb, robj *o) { return rdbSaveType(rdb,RDB_TYPE_HASH); else serverPanic("Unknown hash encoding"); + case OBJ_STREAM: + return rdbSaveType(rdb,RDB_TYPE_STREAM_LISTPACKS); case OBJ_MODULE: return rdbSaveType(rdb,RDB_TYPE_MODULE_2); default: @@ -762,7 +765,26 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) { } else { serverPanic("Unknown hash encoding"); } + } else if (o->type == OBJ_STREAM) { + /* Store how many listpacks we have inside the radix tree. */ + stream *s = o->ptr; + rax *rax = s->rax; + if ((n = rdbSaveLen(rdb,raxSize(rax))) == -1) return -1; + nwritten += n; + /* Serialize all the listpacks inside the radix tree as they are, + * when loading back, we'll use the first entry of each listpack + * to insert it back into the radix tree. */ + raxIterator ri; + raxStart(&ri,rax); + raxSeek(&ri,"^",NULL,0); + while (raxNext(&ri)) { + unsigned char *lp = ri.data; + size_t lp_bytes = lpBytes(lp); + if ((n = rdbSaveRawString(rdb,lp,lp_bytes)) == -1) return -1; + nwritten += n; + } + raxStop(&ri); } else if (o->type == OBJ_MODULE) { /* Save a module-specific value. */ RedisModuleIO io; diff --git a/src/stream.h b/src/stream.h index 065c328e..e78af5bc 100644 --- a/src/stream.h +++ b/src/stream.h @@ -2,6 +2,7 @@ #define STREAM_H #include "rax.h" +#include "listpack.h" /* Stream item ID: a 128 bit number composed of a milliseconds time and * a sequence counter. 
IDs generated in the same millisecond (or in a past diff --git a/src/t_stream.c b/src/t_stream.c index dcf9fcce..9ca001d7 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -32,7 +32,6 @@ */ #include "server.h" -#include "listpack.h" #include "endianconv.h" #include "stream.h" @@ -169,7 +168,6 @@ void streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id) s->length++; s->last_id = id; if (added_id) *added_id = id; - raxShow(s->rax); } /* Send the specified range to the client 'c'. The range the client will From edd70c1993b79d85bfc2812b0bf4bf4771ff40ed Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Sep 2017 16:24:11 +0200 Subject: [PATCH 030/102] Streams: RDB loading. RDB saving modified. After a few attempts it looked quite saner to just add the last item ID at the end of the serialized listpacks, instead of scanning the last listpack loaded from head to tail just to fetch it. It's a disk space VS CPU-and-simplicity tradeoff basically. --- src/rdb.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/rdb.c b/src/rdb.c index c79bfa8d..acc6ca87 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -785,6 +785,12 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) { nwritten += n; } raxStop(&ri); + + /* Save the last entry ID. */ + if ((n = rdbSaveLen(rdb,s->last_id.ms)) == -1) return -1; + nwritten += n; + if ((n = rdbSaveLen(rdb,s->last_id.seq)) == -1) return -1; + nwritten += n; } else if (o->type == OBJ_MODULE) { /* Save a module-specific value. 
*/ RedisModuleIO io; @@ -1431,6 +1437,40 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { rdbExitReportCorruptRDB("Unknown RDB encoding type %d",rdbtype); break; } + } else if (rdbtype == RDB_TYPE_STREAM_LISTPACKS) { + o = createStreamObject(); + stream *s = o->ptr; + uint64_t listpacks = rdbLoadLen(rdb,NULL); + + while(listpacks--) { + unsigned char *lp = + rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN,NULL); + if (lp == NULL) return NULL; + unsigned char *first = lpFirst(lp); + if (first == NULL) { + /* Serialized listpacks should never be free, since on + * deletion we should remove the radix tree key if the + * resulting listpack is emtpy. */ + rdbExitReportCorruptRDB("Empty listpack inside stream"); + } + + /* Get the ID of the first entry: we'll use it as key to add the + * listpack into the radix tree. */ + int64_t e_len; + unsigned char buf[LP_INTBUF_SIZE]; + unsigned char *e = lpGet(first,&e_len,buf); + if (e_len != sizeof(streamID)) { + rdbExitReportCorruptRDB("Listpack first entry is not the " + "size of a stream ID"); + } + int retval = raxInsert(s->rax,e,sizeof(streamID),lp,NULL); + if (!retval) + rdbExitReportCorruptRDB("Listpack re-added with existing key"); + } + + /* Load the last entry ID. */ + s->last_id.ms = rdbLoadLen(rdb,NULL); + s->last_id.seq = rdbLoadLen(rdb,NULL); } else if (rdbtype == RDB_TYPE_MODULE || rdbtype == RDB_TYPE_MODULE_2) { uint64_t moduleid = rdbLoadLen(rdb,NULL); moduleType *mt = moduleTypeLookupModuleByID(moduleid); From cd18f06e9c674646e17b35125358df6eb11954dc Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 6 Sep 2017 12:00:03 +0200 Subject: [PATCH 031/102] Streams: change listpack allocator to zmalloc. 
--- src/listpack_malloc.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/listpack_malloc.h b/src/listpack_malloc.h index a3a077fc..401ab6f7 100644 --- a/src/listpack_malloc.h +++ b/src/listpack_malloc.h @@ -38,7 +38,8 @@ #ifndef LISTPACK_ALLOC_H #define LISTPACK_ALLOC_H -#define lp_malloc malloc -#define lp_realloc realloc -#define lp_free free +#include "zmalloc.h" +#define lp_malloc zmalloc +#define lp_realloc zrealloc +#define lp_free zfree #endif From 98d184db12957d0558519022ad2205c3ac740a5a Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 6 Sep 2017 12:00:18 +0200 Subject: [PATCH 032/102] Streams: Save stream->length in RDB. --- src/rdb.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/rdb.c b/src/rdb.c index acc6ca87..5d15539c 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -786,6 +786,11 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) { } raxStop(&ri); + /* Save the number of elements inside the stream. We cannot obtain + * this easily later, since our macro nodes should be checked for + * number of items: not a great CPU / space tradeoff. */ + if ((n = rdbSaveLen(rdb,s->length)) == -1) return -1; + nwritten += n; /* Save the last entry ID. */ if ((n = rdbSaveLen(rdb,s->last_id.ms)) == -1) return -1; nwritten += n; @@ -1467,7 +1472,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { if (!retval) rdbExitReportCorruptRDB("Listpack re-added with existing key"); } - + /* Load total number of items inside the stream. */ + s->length = rdbLoadLen(rdb,NULL); /* Load the last entry ID. */ s->last_id.ms = rdbLoadLen(rdb,NULL); s->last_id.seq = rdbLoadLen(rdb,NULL); From ec9bbe96bf47ae1f104c51cc6078eb72ca43cef0 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 6 Sep 2017 12:03:17 +0200 Subject: [PATCH 033/102] Streams: XLEN command. 
--- src/server.c | 1 + src/server.h | 1 + src/t_stream.c | 9 +++++++++ 3 files changed, 11 insertions(+) diff --git a/src/server.c b/src/server.c index 2c3647db..f3338f56 100644 --- a/src/server.c +++ b/src/server.c @@ -304,6 +304,7 @@ struct redisCommand redisCommandTable[] = { {"pfdebug",pfdebugCommand,-3,"w",0,NULL,0,0,0,0,0}, {"xadd",xaddCommand,-4,"wmF",0,NULL,1,1,1,0,0}, {"xrange",xrangeCommand,-4,"r",0,NULL,1,1,1,0,0}, + {"xlen",xlenCommand,2,"rF",0,NULL,1,1,1,0,0}, {"post",securityWarningCommand,-1,"lt",0,NULL,0,0,0,0,0}, {"host:",securityWarningCommand,-1,"lt",0,NULL,0,0,0,0,0}, {"latency",latencyCommand,-2,"aslt",0,NULL,0,0,0,0,0} diff --git a/src/server.h b/src/server.h index 1185d119..c934d7f6 100644 --- a/src/server.h +++ b/src/server.h @@ -2001,6 +2001,7 @@ void moduleCommand(client *c); void securityWarningCommand(client *c); void xaddCommand(client *c); void xrangeCommand(client *c); +void xlenCommand(client *c); #if defined(__GNUC__) void *calloc(size_t count, size_t size) __attribute__ ((deprecated)); diff --git a/src/t_stream.c b/src/t_stream.c index 9ca001d7..3474d478 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -378,3 +378,12 @@ void xrangeCommand(client *c) { s = o->ptr; streamReplyWithRange(c,s,&startid,&endid,count); } + +/* XLEN */ +void xlenCommand(client *c) { + robj *o; + if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.czero)) == NULL + || checkType(c,o,OBJ_STREAM)) return; + stream *s = o->ptr; + addReplyLongLong(c,s->length); +} From 439120c62076718e8f7e7e602c623febaec6f04a Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 6 Sep 2017 13:11:47 +0200 Subject: [PATCH 034/102] Streams: implement stream object release. 
--- src/object.c | 5 +++++ src/rax.c | 18 +++++++++++++----- src/rax.h | 1 + src/server.h | 1 + src/t_stream.c | 5 +++++ 5 files changed, 25 insertions(+), 5 deletions(-) diff --git a/src/object.c b/src/object.c index 8eeb5c6c..b689edcf 100644 --- a/src/object.c +++ b/src/object.c @@ -310,6 +310,10 @@ void freeModuleObject(robj *o) { zfree(mv); } +void freeStreamObject(robj *o) { + freeStream(o->ptr); +} + void incrRefCount(robj *o) { if (o->refcount != OBJ_SHARED_REFCOUNT) o->refcount++; } @@ -323,6 +327,7 @@ void decrRefCount(robj *o) { case OBJ_ZSET: freeZsetObject(o); break; case OBJ_HASH: freeHashObject(o); break; case OBJ_MODULE: freeModuleObject(o); break; + case OBJ_STREAM: freeStreamObject(o); break; default: serverPanic("Unknown object type"); break; } zfree(o); diff --git a/src/rax.c b/src/rax.c index 3ead27ed..442e7bfe 100644 --- a/src/rax.c +++ b/src/rax.c @@ -1093,28 +1093,36 @@ int raxRemove(rax *rax, unsigned char *s, size_t len, void **old) { /* This is the core of raxFree(): performs a depth-first scan of the * tree and releases all the nodes found. */ -void raxRecursiveFree(rax *rax, raxNode *n) { +void raxRecursiveFree(rax *rax, raxNode *n, void (*free_callback)(void*)) { debugnode("free traversing",n); int numchildren = n->iscompr ? 1 : n->size; raxNode **cp = raxNodeLastChildPtr(n); while(numchildren--) { raxNode *child; memcpy(&child,cp,sizeof(child)); - raxRecursiveFree(rax,child); + raxRecursiveFree(rax,child,free_callback); cp--; } debugnode("free depth-first",n); + if (free_callback && n->iskey && !n->isnull) + free_callback(raxGetData(n)); rax_free(n); rax->numnodes--; } -/* Free a whole radix tree. */ -void raxFree(rax *rax) { - raxRecursiveFree(rax,rax->head); +/* Free a whole radix tree, calling the specified callback in order to + * free the auxiliary data. 
*/ +void raxFreeWithCallback(rax *rax, void (*free_callback)(void*)) { + raxRecursiveFree(rax,rax->head,free_callback); assert(rax->numnodes == 0); rax_free(rax); } +/* Free a whole radix tree. */ +void raxFree(rax *rax) { + raxFreeWithCallback(rax,NULL); +} + /* ------------------------------- Iterator --------------------------------- */ /* Initialize a Rax iterator. This call should be performed a single time diff --git a/src/rax.h b/src/rax.h index e22b6e69..b4e2fd91 100644 --- a/src/rax.h +++ b/src/rax.h @@ -148,6 +148,7 @@ int raxInsert(rax *rax, unsigned char *s, size_t len, void *data, void **old); int raxRemove(rax *rax, unsigned char *s, size_t len, void **old); void *raxFind(rax *rax, unsigned char *s, size_t len); void raxFree(rax *rax); +void raxFreeWithCallback(rax *rax, void (*free_callback)(void*)); void raxStart(raxIterator *it, rax *rt); int raxSeek(raxIterator *it, const char *op, unsigned char *ele, size_t len); int raxNext(raxIterator *it); diff --git a/src/server.h b/src/server.h index c934d7f6..8ea18341 100644 --- a/src/server.h +++ b/src/server.h @@ -1419,6 +1419,7 @@ void signalListAsReady(redisDb *db, robj *key); /* Stream data type. */ stream *streamNew(void); +void freeStream(stream *s); /* MULTI/EXEC/WATCH... */ void unwatchAllKeys(client *c); diff --git a/src/t_stream.c b/src/t_stream.c index 3474d478..52b0e105 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -51,6 +51,11 @@ stream *streamNew(void) { return s; } +/* Free a stream, including the listpacks stored inside the radix tree. */ +void freeStream(stream *s) { + raxFreeWithCallback(s->rax,(void(*)(void*))lpFree); +} + /* Generate the next stream item ID given the previous one. If the current * milliseconds Unix time is greater than the previous one, just use this * as time part and start with sequence part of zero. 
Otherwise we use the From 4a377cecd82e21307a887bb5f9fba55d79044bb8 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 6 Sep 2017 15:43:28 +0200 Subject: [PATCH 035/102] Streams: initial work to use blocking lists logic for streams XREAD. --- src/blocked.c | 208 +++++++++++++++++++++++++++++++++++++++++++++++ src/db.c | 9 +- src/networking.c | 2 +- src/server.c | 15 +++- src/server.h | 11 ++- src/t_list.c | 204 ---------------------------------------------- 6 files changed, 234 insertions(+), 215 deletions(-) diff --git a/src/blocked.c b/src/blocked.c index 54b26b71..acd3b948 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -65,6 +65,8 @@ #include "server.h" +int serveClientBlockedOnList(client *receiver, robj *key, robj *dstkey, redisDb *db, robj *value, int where); + /* Get a timeout value from an object and store it into 'timeout'. * The final timeout is always stored as milliseconds as a time where the * timeout will expire, however the parsing is performed according to @@ -193,3 +195,209 @@ void disconnectAllBlockedClients(void) { } } } + +/* This function should be called by Redis every time a single command, + * a MULTI/EXEC block, or a Lua script, terminated its execution after + * being called by a client. + * + * All the keys with at least one client blocked that received at least + * one new element via some PUSH/XADD operation are accumulated into + * the server.ready_keys list. This function will run the list and will + * serve clients accordingly. Note that the function will iterate again and + * again as a result of serving BRPOPLPUSH we can have new blocking clients + * to serve because of the PUSH side of BRPOPLPUSH. */ +void handleClientsBlockedOnKeys(void) { + while(listLength(server.ready_keys) != 0) { + list *l; + + /* Point server.ready_keys to a fresh list and save the current one + * locally. 
This way as we run the old list we are free to call + * signalKeyAsReady() that may push new elements in server.ready_keys + * when handling clients blocked into BRPOPLPUSH. */ + l = server.ready_keys; + server.ready_keys = listCreate(); + + while(listLength(l) != 0) { + listNode *ln = listFirst(l); + readyList *rl = ln->value; + + /* First of all remove this key from db->ready_keys so that + * we can safely call signalKeyAsReady() against this key. */ + dictDelete(rl->db->ready_keys,rl->key); + + /* If the key exists and it's a list, serve blocked clients + * with data. */ + robj *o = lookupKeyWrite(rl->db,rl->key); + if (o != NULL && o->type == OBJ_LIST) { + dictEntry *de; + + /* We serve clients in the same order they blocked for + * this key, from the first blocked to the last. */ + de = dictFind(rl->db->blocking_keys,rl->key); + if (de) { + list *clients = dictGetVal(de); + int numclients = listLength(clients); + + while(numclients--) { + listNode *clientnode = listFirst(clients); + client *receiver = clientnode->value; + robj *dstkey = receiver->bpop.target; + int where = (receiver->lastcmd && + receiver->lastcmd->proc == blpopCommand) ? + LIST_HEAD : LIST_TAIL; + robj *value = listTypePop(o,where); + + if (value) { + /* Protect receiver->bpop.target, that will be + * freed by the next unblockClient() + * call. */ + if (dstkey) incrRefCount(dstkey); + unblockClient(receiver); + + if (serveClientBlockedOnList(receiver, + rl->key,dstkey,rl->db,value, + where) == C_ERR) + { + /* If we failed serving the client we need + * to also undo the POP operation. */ + listTypePush(o,value,where); + } + + if (dstkey) decrRefCount(dstkey); + decrRefCount(value); + } else { + break; + } + } + } + + if (listTypeLength(o) == 0) { + dbDelete(rl->db,rl->key); + } + /* We don't call signalModifiedKey() as it was already called + * when an element was pushed on the list. */ + } + + /* Free this item. 
*/ + decrRefCount(rl->key); + zfree(rl); + listDelNode(l,ln); + } + listRelease(l); /* We have the new list on place at this point. */ + } +} + +/* This is how the current blocking POP works, we use BLPOP as example: + * - If the user calls BLPOP and the key exists and contains a non empty list + * then LPOP is called instead. So BLPOP is semantically the same as LPOP + * if blocking is not required. + * - If instead BLPOP is called and the key does not exists or the list is + * empty we need to block. In order to do so we remove the notification for + * new data to read in the client socket (so that we'll not serve new + * requests if the blocking request is not served). Also we put the client + * in a dictionary (db->blocking_keys) mapping keys to a list of clients + * blocking for this keys. + * - If a PUSH operation against a key with blocked clients waiting is + * performed, we mark this key as "ready", and after the current command, + * MULTI/EXEC block, or script, is executed, we serve all the clients waiting + * for this list, from the one that blocked first, to the last, accordingly + * to the number of elements we have in the ready list. + */ + +/* Set a client in blocking mode for the specified key, with the specified + * timeout */ +void blockForKeys(client *c, robj **keys, int numkeys, mstime_t timeout, robj *target) { + dictEntry *de; + list *l; + int j; + + c->bpop.timeout = timeout; + c->bpop.target = target; + + if (target != NULL) incrRefCount(target); + + for (j = 0; j < numkeys; j++) { + /* If the key already exists in the dict ignore it. 
*/ + if (dictAdd(c->bpop.keys,keys[j],NULL) != DICT_OK) continue; + incrRefCount(keys[j]); + + /* And in the other "side", to map keys -> clients */ + de = dictFind(c->db->blocking_keys,keys[j]); + if (de == NULL) { + int retval; + + /* For every key we take a list of clients blocked for it */ + l = listCreate(); + retval = dictAdd(c->db->blocking_keys,keys[j],l); + incrRefCount(keys[j]); + serverAssertWithInfo(c,keys[j],retval == DICT_OK); + } else { + l = dictGetVal(de); + } + listAddNodeTail(l,c); + } + blockClient(c,BLOCKED_LIST); +} + +/* Unblock a client that's waiting in a blocking operation such as BLPOP. + * You should never call this function directly, but unblockClient() instead. */ +void unblockClientWaitingData(client *c) { + dictEntry *de; + dictIterator *di; + list *l; + + serverAssertWithInfo(c,NULL,dictSize(c->bpop.keys) != 0); + di = dictGetIterator(c->bpop.keys); + /* The client may wait for multiple keys, so unblock it for every key. */ + while((de = dictNext(di)) != NULL) { + robj *key = dictGetKey(de); + + /* Remove this client from the list of clients waiting for this key. */ + l = dictFetchValue(c->db->blocking_keys,key); + serverAssertWithInfo(c,key,l != NULL); + listDelNode(l,listSearchKey(l,c)); + /* If the list is empty we need to remove it to avoid wasting memory */ + if (listLength(l) == 0) + dictDelete(c->db->blocking_keys,key); + } + dictReleaseIterator(di); + + /* Cleanup the client structure */ + dictEmpty(c->bpop.keys,NULL); + if (c->bpop.target) { + decrRefCount(c->bpop.target); + c->bpop.target = NULL; + } +} + +/* If the specified key has clients blocked waiting for list pushes, this + * function will put the key reference into the server.ready_keys list. + * Note that db->ready_keys is a hash table that allows us to avoid putting + * the same key again and again in the list in case of multiple pushes + * made by a script or in the context of MULTI/EXEC. 
+ * + * The list will be finally processed by handleClientsBlockedOnLists() */ +void signalKeyAsReady(redisDb *db, robj *key) { + readyList *rl; + + /* No clients blocking for this key? No need to queue it. */ + if (dictFind(db->blocking_keys,key) == NULL) return; + + /* Key was already signaled? No need to queue it again. */ + if (dictFind(db->ready_keys,key) != NULL) return; + + /* Ok, we need to queue this key into server.ready_keys. */ + rl = zmalloc(sizeof(*rl)); + rl->key = key; + rl->db = db; + incrRefCount(key); + listAddNodeTail(server.ready_keys,rl); + + /* We also add the key in the db->ready_keys dictionary in order + * to avoid adding it multiple times into a list with a simple O(1) + * check. */ + incrRefCount(key); + serverAssert(dictAdd(db->ready_keys,key,NULL) == DICT_OK); +} + + diff --git a/src/db.c b/src/db.c index 4d6999be..6682e573 100644 --- a/src/db.c +++ b/src/db.c @@ -169,9 +169,10 @@ void dbAdd(redisDb *db, robj *key, robj *val) { int retval = dictAdd(db->dict, copy, val); serverAssertWithInfo(NULL,key,retval == DICT_OK); - if (val->type == OBJ_LIST) signalListAsReady(db, key); + if (val->type == OBJ_LIST || val->type == OBJ_STREAM) + signalKeyAsReady(db, key); if (server.cluster_enabled) slotToKeyAdd(key); - } +} /* Overwrite an existing key with a new value. Incrementing the reference * count of the new value is up to the caller. 
@@ -951,8 +952,8 @@ void scanDatabaseForReadyLists(redisDb *db) { while((de = dictNext(di)) != NULL) { robj *key = dictGetKey(de); robj *value = lookupKey(db,key,LOOKUP_NOTOUCH); - if (value && value->type == OBJ_LIST) - signalListAsReady(db, key); + if (value && (value->type == OBJ_LIST || value->type == OBJ_STREAM)) + signalKeyAsReady(db, key); } dictReleaseIterator(di); } diff --git a/src/networking.c b/src/networking.c index aeaeca96..d672ec32 100644 --- a/src/networking.c +++ b/src/networking.c @@ -124,7 +124,7 @@ client *createClient(int fd) { listSetDupMethod(c->reply,dupClientReplyValue); c->btype = BLOCKED_NONE; c->bpop.timeout = 0; - c->bpop.keys = dictCreate(&objectKeyPointerValueDictType,NULL); + c->bpop.keys = dictCreate(&objectKeyHeapPointerValueDictType,NULL); c->bpop.target = NULL; c->bpop.numreplicas = 0; c->bpop.reploffset = 0; diff --git a/src/server.c b/src/server.c index f3338f56..56b2188e 100644 --- a/src/server.c +++ b/src/server.c @@ -550,10 +550,21 @@ dictType objectKeyPointerValueDictType = { NULL, /* key dup */ NULL, /* val dup */ dictEncObjKeyCompare, /* key compare */ - dictObjectDestructor, /* key destructor */ + dictObjectDestructor, /* key destructor */ NULL /* val destructor */ }; +/* Like objectKeyPointerValueDictType(), but values can be destroyed, if + * not NULL, calling zfree(). */ +dictType objectKeyHeapPointerValueDictType = { + dictEncObjHash, /* hash function */ + NULL, /* key dup */ + NULL, /* val dup */ + dictEncObjKeyCompare, /* key compare */ + dictObjectDestructor, /* key destructor */ + dictVanillaFree /* val destructor */ +}; + /* Set dictionary type. Keys are SDS strings, values are ot used. 
*/ dictType setDictType = { dictSdsHash, /* hash function */ @@ -2508,7 +2519,7 @@ int processCommand(client *c) { call(c,CMD_CALL_FULL); c->woff = server.master_repl_offset; if (listLength(server.ready_keys)) - handleClientsBlockedOnLists(); + handleClientsBlockedOnKeys(); } return C_OK; } diff --git a/src/server.h b/src/server.h index 8ea18341..8e50d030 100644 --- a/src/server.h +++ b/src/server.h @@ -256,6 +256,7 @@ typedef long long mstime_t; /* millisecond time type. */ #define BLOCKED_LIST 1 /* BLPOP & co. */ #define BLOCKED_WAIT 2 /* WAIT for synchronous replication. */ #define BLOCKED_MODULE 3 /* Blocked by a loadable module. */ +#define BLOCKED_STREAM 4 /* XREAD. */ /* Client request types */ #define PROTO_REQ_INLINE 1 @@ -641,9 +642,9 @@ typedef struct blockingState { mstime_t timeout; /* Blocking operation timeout. If UNIX current time * is > timeout then the operation timed out. */ - /* BLOCKED_LIST */ + /* BLOCKED_LIST and BLOCKED_STREAM */ dict *keys; /* The keys we are waiting to terminate a blocking - * operation such as BLPOP. Otherwise NULL. */ + * operation such as BLPOP or XREAD. Or NULL. */ robj *target; /* The key that should receive the element, * for BRPOPLPUSH. */ @@ -1291,6 +1292,7 @@ typedef struct { extern struct redisServer server; extern struct sharedObjectsStruct shared; extern dictType objectKeyPointerValueDictType; +extern dictType objectKeyHeapPointerValueDictType; extern dictType setDictType; extern dictType zsetDictType; extern dictType clusterNodesDictType; @@ -1413,9 +1415,7 @@ int listTypeEqual(listTypeEntry *entry, robj *o); void listTypeDelete(listTypeIterator *iter, listTypeEntry *entry); void listTypeConvert(robj *subject, int enc); void unblockClientWaitingData(client *c); -void handleClientsBlockedOnLists(void); void popGenericCommand(client *c, int where); -void signalListAsReady(redisDb *db, robj *key); /* Stream data type. 
*/ stream *streamNew(void); @@ -1798,6 +1798,9 @@ void unblockClient(client *c); void replyToBlockedClientTimedOut(client *c); int getTimeoutFromObjectOrReply(client *c, robj *object, mstime_t *timeout, int unit); void disconnectAllBlockedClients(void); +void handleClientsBlockedOnKeys(void); +void signalKeyAsReady(redisDb *db, robj *key); +void blockForKeys(client *c, robj **keys, int numkeys, mstime_t timeout, robj *target); /* expire.c -- Handling of expired keys */ void activeExpireCycle(int type); diff --git a/src/t_list.c b/src/t_list.c index a0a30998..c7eacb0e 100644 --- a/src/t_list.c +++ b/src/t_list.c @@ -603,119 +603,6 @@ void rpoplpushCommand(client *c) { * Blocking POP operations *----------------------------------------------------------------------------*/ -/* This is how the current blocking POP works, we use BLPOP as example: - * - If the user calls BLPOP and the key exists and contains a non empty list - * then LPOP is called instead. So BLPOP is semantically the same as LPOP - * if blocking is not required. - * - If instead BLPOP is called and the key does not exists or the list is - * empty we need to block. In order to do so we remove the notification for - * new data to read in the client socket (so that we'll not serve new - * requests if the blocking request is not served). Also we put the client - * in a dictionary (db->blocking_keys) mapping keys to a list of clients - * blocking for this keys. - * - If a PUSH operation against a key with blocked clients waiting is - * performed, we mark this key as "ready", and after the current command, - * MULTI/EXEC block, or script, is executed, we serve all the clients waiting - * for this list, from the one that blocked first, to the last, accordingly - * to the number of elements we have in the ready list. 
- */ - -/* Set a client in blocking mode for the specified key, with the specified - * timeout */ -void blockForKeys(client *c, robj **keys, int numkeys, mstime_t timeout, robj *target) { - dictEntry *de; - list *l; - int j; - - c->bpop.timeout = timeout; - c->bpop.target = target; - - if (target != NULL) incrRefCount(target); - - for (j = 0; j < numkeys; j++) { - /* If the key already exists in the dict ignore it. */ - if (dictAdd(c->bpop.keys,keys[j],NULL) != DICT_OK) continue; - incrRefCount(keys[j]); - - /* And in the other "side", to map keys -> clients */ - de = dictFind(c->db->blocking_keys,keys[j]); - if (de == NULL) { - int retval; - - /* For every key we take a list of clients blocked for it */ - l = listCreate(); - retval = dictAdd(c->db->blocking_keys,keys[j],l); - incrRefCount(keys[j]); - serverAssertWithInfo(c,keys[j],retval == DICT_OK); - } else { - l = dictGetVal(de); - } - listAddNodeTail(l,c); - } - blockClient(c,BLOCKED_LIST); -} - -/* Unblock a client that's waiting in a blocking operation such as BLPOP. - * You should never call this function directly, but unblockClient() instead. */ -void unblockClientWaitingData(client *c) { - dictEntry *de; - dictIterator *di; - list *l; - - serverAssertWithInfo(c,NULL,dictSize(c->bpop.keys) != 0); - di = dictGetIterator(c->bpop.keys); - /* The client may wait for multiple keys, so unblock it for every key. */ - while((de = dictNext(di)) != NULL) { - robj *key = dictGetKey(de); - - /* Remove this client from the list of clients waiting for this key. 
*/ - l = dictFetchValue(c->db->blocking_keys,key); - serverAssertWithInfo(c,key,l != NULL); - listDelNode(l,listSearchKey(l,c)); - /* If the list is empty we need to remove it to avoid wasting memory */ - if (listLength(l) == 0) - dictDelete(c->db->blocking_keys,key); - } - dictReleaseIterator(di); - - /* Cleanup the client structure */ - dictEmpty(c->bpop.keys,NULL); - if (c->bpop.target) { - decrRefCount(c->bpop.target); - c->bpop.target = NULL; - } -} - -/* If the specified key has clients blocked waiting for list pushes, this - * function will put the key reference into the server.ready_keys list. - * Note that db->ready_keys is a hash table that allows us to avoid putting - * the same key again and again in the list in case of multiple pushes - * made by a script or in the context of MULTI/EXEC. - * - * The list will be finally processed by handleClientsBlockedOnLists() */ -void signalListAsReady(redisDb *db, robj *key) { - readyList *rl; - - /* No clients blocking for this key? No need to queue it. */ - if (dictFind(db->blocking_keys,key) == NULL) return; - - /* Key was already signaled? No need to queue it again. */ - if (dictFind(db->ready_keys,key) != NULL) return; - - /* Ok, we need to queue this key into server.ready_keys. */ - rl = zmalloc(sizeof(*rl)); - rl->key = key; - rl->db = db; - incrRefCount(key); - listAddNodeTail(server.ready_keys,rl); - - /* We also add the key in the db->ready_keys dictionary in order - * to avoid adding it multiple times into a list with a simple O(1) - * check. */ - incrRefCount(key); - serverAssert(dictAdd(db->ready_keys,key,NULL) == DICT_OK); -} - /* This is a helper function for handleClientsBlockedOnLists(). 
It's work * is to serve a specific client (receiver) that is blocked on 'key' * in the context of the specified 'db', doing the following: @@ -785,97 +672,6 @@ int serveClientBlockedOnList(client *receiver, robj *key, robj *dstkey, redisDb return C_OK; } -/* This function should be called by Redis every time a single command, - * a MULTI/EXEC block, or a Lua script, terminated its execution after - * being called by a client. - * - * All the keys with at least one client blocked that received at least - * one new element via some PUSH operation are accumulated into - * the server.ready_keys list. This function will run the list and will - * serve clients accordingly. Note that the function will iterate again and - * again as a result of serving BRPOPLPUSH we can have new blocking clients - * to serve because of the PUSH side of BRPOPLPUSH. */ -void handleClientsBlockedOnLists(void) { - while(listLength(server.ready_keys) != 0) { - list *l; - - /* Point server.ready_keys to a fresh list and save the current one - * locally. This way as we run the old list we are free to call - * signalListAsReady() that may push new elements in server.ready_keys - * when handling clients blocked into BRPOPLPUSH. */ - l = server.ready_keys; - server.ready_keys = listCreate(); - - while(listLength(l) != 0) { - listNode *ln = listFirst(l); - readyList *rl = ln->value; - - /* First of all remove this key from db->ready_keys so that - * we can safely call signalListAsReady() against this key. */ - dictDelete(rl->db->ready_keys,rl->key); - - /* If the key exists and it's a list, serve blocked clients - * with data. */ - robj *o = lookupKeyWrite(rl->db,rl->key); - if (o != NULL && o->type == OBJ_LIST) { - dictEntry *de; - - /* We serve clients in the same order they blocked for - * this key, from the first blocked to the last. 
*/ - de = dictFind(rl->db->blocking_keys,rl->key); - if (de) { - list *clients = dictGetVal(de); - int numclients = listLength(clients); - - while(numclients--) { - listNode *clientnode = listFirst(clients); - client *receiver = clientnode->value; - robj *dstkey = receiver->bpop.target; - int where = (receiver->lastcmd && - receiver->lastcmd->proc == blpopCommand) ? - LIST_HEAD : LIST_TAIL; - robj *value = listTypePop(o,where); - - if (value) { - /* Protect receiver->bpop.target, that will be - * freed by the next unblockClient() - * call. */ - if (dstkey) incrRefCount(dstkey); - unblockClient(receiver); - - if (serveClientBlockedOnList(receiver, - rl->key,dstkey,rl->db,value, - where) == C_ERR) - { - /* If we failed serving the client we need - * to also undo the POP operation. */ - listTypePush(o,value,where); - } - - if (dstkey) decrRefCount(dstkey); - decrRefCount(value); - } else { - break; - } - } - } - - if (listTypeLength(o) == 0) { - dbDelete(rl->db,rl->key); - } - /* We don't call signalModifiedKey() as it was already called - * when an element was pushed on the list. */ - } - - /* Free this item. */ - decrRefCount(rl->key); - zfree(rl); - listDelNode(l,ln); - } - listRelease(l); /* We have the new list on place at this point. */ - } -} - /* Blocking RPOP/LPOP */ void blockingPopGenericCommand(client *c, int where) { robj *o; From f80dfbf464e2a2de00cb8de5ed064f302a7a7c82 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 6 Sep 2017 17:50:11 +0200 Subject: [PATCH 036/102] Streams: more internal preparation for blocking XREAD. --- src/blocked.c | 34 +++++++++++++++++++++++++--------- src/server.h | 2 +- src/t_list.c | 4 ++-- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/src/blocked.c b/src/blocked.c index acd3b948..74dab0c1 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -225,8 +225,7 @@ void handleClientsBlockedOnKeys(void) { * we can safely call signalKeyAsReady() against this key. 
*/ dictDelete(rl->db->ready_keys,rl->key); - /* If the key exists and it's a list, serve blocked clients - * with data. */ + /* Serve clients blocked on list key. */ robj *o = lookupKeyWrite(rl->db,rl->key); if (o != NULL && o->type == OBJ_LIST) { dictEntry *de; @@ -241,6 +240,8 @@ void handleClientsBlockedOnKeys(void) { while(numclients--) { listNode *clientnode = listFirst(clients); client *receiver = clientnode->value; + if (receiver->btype != BLOCKED_LIST) continue; + robj *dstkey = receiver->bpop.target; int where = (receiver->lastcmd && receiver->lastcmd->proc == blpopCommand) ? @@ -287,7 +288,8 @@ void handleClientsBlockedOnKeys(void) { } } -/* This is how the current blocking POP works, we use BLPOP as example: +/* This is how the current blocking lists/streams work, we use BLPOP as + * example, but the concept is the same for other list ops and XREAD. * - If the user calls BLPOP and the key exists and contains a non empty list * then LPOP is called instead. So BLPOP is semantically the same as LPOP * if blocking is not required. @@ -304,9 +306,15 @@ void handleClientsBlockedOnKeys(void) { * to the number of elements we have in the ready list. */ -/* Set a client in blocking mode for the specified key, with the specified - * timeout */ -void blockForKeys(client *c, robj **keys, int numkeys, mstime_t timeout, robj *target) { +/* Set a client in blocking mode for the specified key (list or stream), with + * the specified timeout. The 'type' argument is BLOCKED_LIST or BLOCKED_STREAM + * depending on the kind of operation we are waiting for an empty key in + * order to awake the client. The client is blocked for all the 'numkeys' + * keys as in the 'keys' argument. When we block for stream keys, we also + * provide an array of streamID structures: clients will be unblocked only + * when items with an ID greater or equal to the specified one is appended + * to the stream. 
*/ +void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeout, robj *target, streamID *ids) { dictEntry *de; list *l; int j; @@ -317,8 +325,16 @@ void blockForKeys(client *c, robj **keys, int numkeys, mstime_t timeout, robj *t if (target != NULL) incrRefCount(target); for (j = 0; j < numkeys; j++) { - /* If the key already exists in the dict ignore it. */ - if (dictAdd(c->bpop.keys,keys[j],NULL) != DICT_OK) continue; + /* The value associated with the key name in the bpop.keys dictionary + * is NULL for lists, or the stream ID for streams. */ + void *key_data = NULL; + if (btype == BLOCKED_STREAM) { + key_data = zmalloc(sizeof(streamID)); + memcpy(key_data,ids+j,sizeof(streamID)); + } + + /* If the key already exists in the dictionary ignore it. */ + if (dictAdd(c->bpop.keys,keys[j],key_data) != DICT_OK) continue; incrRefCount(keys[j]); /* And in the other "side", to map keys -> clients */ @@ -336,7 +352,7 @@ void blockForKeys(client *c, robj **keys, int numkeys, mstime_t timeout, robj *t } listAddNodeTail(l,c); } - blockClient(c,BLOCKED_LIST); + blockClient(c,btype); } /* Unblock a client that's waiting in a blocking operation such as BLPOP. 
diff --git a/src/server.h b/src/server.h index 8e50d030..2c69a94c 100644 --- a/src/server.h +++ b/src/server.h @@ -1800,7 +1800,7 @@ int getTimeoutFromObjectOrReply(client *c, robj *object, mstime_t *timeout, int void disconnectAllBlockedClients(void); void handleClientsBlockedOnKeys(void); void signalKeyAsReady(redisDb *db, robj *key); -void blockForKeys(client *c, robj **keys, int numkeys, mstime_t timeout, robj *target); +void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeout, robj *target, streamID *ids); /* expire.c -- Handling of expired keys */ void activeExpireCycle(int type); diff --git a/src/t_list.c b/src/t_list.c index c7eacb0e..c7e6aac0 100644 --- a/src/t_list.c +++ b/src/t_list.c @@ -726,7 +726,7 @@ void blockingPopGenericCommand(client *c, int where) { } /* If the list is empty or the key does not exists we must block */ - blockForKeys(c, c->argv + 1, c->argc - 2, timeout, NULL); + blockForKeys(c,BLOCKED_LIST,c->argv + 1,c->argc - 2,timeout,NULL,NULL); } void blpopCommand(client *c) { @@ -752,7 +752,7 @@ void brpoplpushCommand(client *c) { addReply(c, shared.nullbulk); } else { /* The list is empty and the client blocks. */ - blockForKeys(c, c->argv + 1, 1, timeout, c->argv[2]); + blockForKeys(c,BLOCKED_LIST,c->argv + 1,1,timeout,c->argv[2],NULL); } } else { if (key->type != OBJ_LIST) { From 4086dff477cc3d979d39c6c4ba9457575fc67d3e Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 7 Sep 2017 09:30:50 +0200 Subject: [PATCH 037/102] Streams: augment client.bpop with XREAD specific fields. 
--- src/blocked.c | 4 ++++ src/networking.c | 1 + src/server.h | 5 +++++ src/t_stream.c | 8 ++++++++ 4 files changed, 18 insertions(+) diff --git a/src/blocked.c b/src/blocked.c index 74dab0c1..376b343d 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -384,6 +384,10 @@ void unblockClientWaitingData(client *c) { decrRefCount(c->bpop.target); c->bpop.target = NULL; } + if (c->bpop.xread_group) { + decrRefCount(c->bpop.xread_group); + c->bpop.xread_group = NULL; + } } /* If the specified key has clients blocked waiting for list pushes, this diff --git a/src/networking.c b/src/networking.c index d672ec32..f0bdacfa 100644 --- a/src/networking.c +++ b/src/networking.c @@ -126,6 +126,7 @@ client *createClient(int fd) { c->bpop.timeout = 0; c->bpop.keys = dictCreate(&objectKeyHeapPointerValueDictType,NULL); c->bpop.target = NULL; + c->bpop.xread_group = NULL; c->bpop.numreplicas = 0; c->bpop.reploffset = 0; c->woff = 0; diff --git a/src/server.h b/src/server.h index 2c69a94c..34c5fb06 100644 --- a/src/server.h +++ b/src/server.h @@ -648,6 +648,11 @@ typedef struct blockingState { robj *target; /* The key that should receive the element, * for BRPOPLPUSH. */ + /* BLOCK_STREAM */ + size_t xread_count; /* XREAD COUNT option. */ + robj *xread_group; /* XREAD group name. */ + mstime_t xread_retry_time, xread_retry_ttl; + /* BLOCKED_WAIT */ int numreplicas; /* Number of replicas we are waiting for ACK. */ long long reploffset; /* Replication offset to reach. */ diff --git a/src/t_stream.c b/src/t_stream.c index 52b0e105..66c6cb89 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -392,3 +392,11 @@ void xlenCommand(client *c) { stream *s = o->ptr; addReplyLongLong(c,s->length); } + +/* XREAD [BLOCK ] [COUNT ] [GROUP ] + * [RETRY ] STREAMS key_1 ID_1 key_2 ID_2 ... 
+ * key_N ID_N */ +void xreadCommand(client *c) { +} + + From e65b4825f0216f526b71f41818a494b0853ce715 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 7 Sep 2017 16:48:20 +0200 Subject: [PATCH 038/102] Streams: XREAD arguments parsing. --- src/t_stream.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/src/t_stream.c b/src/t_stream.c index 66c6cb89..485ea29a 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -371,6 +371,7 @@ void xrangeCommand(client *c) { if (strcasecmp(c->argv[4]->ptr,"COUNT") == 0) { if (getLongLongFromObjectOrReply(c,c->argv[5],&count,NULL) != C_OK) return; + if (count < 0) count = 0; } else { addReply(c,shared.syntaxerr); return; @@ -397,6 +398,77 @@ void xlenCommand(client *c) { * [RETRY ] STREAMS key_1 ID_1 key_2 ID_2 ... * key_N ID_N */ void xreadCommand(client *c) { + long long block = 0; + long long count = 0; + int streams_count = 0; + int streams_argc = 0; + #define STREAMID_STATIC_VECTOR_LEN 8 + streamID static_ids[STREAMID_STATIC_VECTOR_LEN]; + streamID *ids = static_ids; + + /* Parse arguments. */ + for (int i = 1; i < c->argc; i++) { + int moreargs = i != c->argc-1; + char *o = c->argv[i]->ptr; + if (!strcasecmp(o,"BLOCK") && moreargs) { + i++; + if (getLongLongFromObjectOrReply(c,c->argv[i],&block,NULL) != C_OK) + return; + if (block < 0) block = 0; + } else if (!strcasecmp(o,"COUNT") && moreargs) { + i++; + if (getLongLongFromObjectOrReply(c,c->argv[i],&count,NULL) != C_OK) + return; + if (count < 0) count = 0; + } else if (!strcasecmp(o,"STREAMS") && moreargs) { + streams_argc = i+1; + streams_count = (c->argc-streams_argc); + if ((streams_count % 2) != 0) { + addReplyError(c,"Unbalanced XREAD list of streams: " + "for each stream key an ID or '$' must be " + "specified."); + return; + } + streams_count /= 2; /* We have two arguments for each stream. */ + break; + } else { + addReply(c,shared.syntaxerr); + return; + } + } + + /* STREAMS option is mandatory. 
*/ + if (streams_argc == 0) { + addReply(c,shared.syntaxerr); + return; + } + + /* Parse the IDs. */ + if (streams_count > STREAMID_STATIC_VECTOR_LEN) + ids = zmalloc(sizeof(streamID)*streams_count); + + /* Try to serve the client synchronously. */ + for (int i = streams_argc + streams_count; i < c->argc; i++) { + /* Specifying "$" as last-known-id means that the client wants to be + * served with just the messages that will arrive into the stream + * starting from now. */ + if (strcmp(c->argv[i]->ptr,"$") == 0) { + robj *o = lookupKeyRead(c->db,c->argv[i-streams_count]); + if (o) { + stream *s = o->ptr; + ids[i] = s->last_id; + } else { + ids[i].ms = 0; + ids[i].seq = 0; + } + continue; + } + if (streamParseIDOrReply(c,c->argv[i],ids+i,0) != C_OK) goto cleanup; + } + +cleanup: + /* Cleanup. */ + if (ids != static_ids) zfree(ids); } From fa61720d30b6c3088329fd5921fd8173bcd0a368 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 7 Sep 2017 17:45:34 +0200 Subject: [PATCH 039/102] Streams: XREAD, first draft. Handling of blocked clients still missing. --- src/t_stream.c | 56 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 10 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 485ea29a..0820a743 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -398,10 +398,10 @@ void xlenCommand(client *c) { * [RETRY ] STREAMS key_1 ID_1 key_2 ID_2 ... 
* key_N ID_N */ void xreadCommand(client *c) { - long long block = 0; + long long timeout = 0; long long count = 0; int streams_count = 0; - int streams_argc = 0; + int streams_arg = 0; #define STREAMID_STATIC_VECTOR_LEN 8 streamID static_ids[STREAMID_STATIC_VECTOR_LEN]; streamID *ids = static_ids; @@ -412,17 +412,17 @@ void xreadCommand(client *c) { char *o = c->argv[i]->ptr; if (!strcasecmp(o,"BLOCK") && moreargs) { i++; - if (getLongLongFromObjectOrReply(c,c->argv[i],&block,NULL) != C_OK) - return; - if (block < 0) block = 0; + if (getLongLongFromObjectOrReply(c,c->argv[i],&timeout,NULL) + != C_OK) return; + if (timeout < 0) timeout = 0; } else if (!strcasecmp(o,"COUNT") && moreargs) { i++; if (getLongLongFromObjectOrReply(c,c->argv[i],&count,NULL) != C_OK) return; if (count < 0) count = 0; } else if (!strcasecmp(o,"STREAMS") && moreargs) { - streams_argc = i+1; - streams_count = (c->argc-streams_argc); + streams_arg = i+1; + streams_count = (c->argc-streams_arg); if ((streams_count % 2) != 0) { addReplyError(c,"Unbalanced XREAD list of streams: " "for each stream key an ID or '$' must be " @@ -438,7 +438,7 @@ void xreadCommand(client *c) { } /* STREAMS option is mandatory. */ - if (streams_argc == 0) { + if (streams_arg == 0) { addReply(c,shared.syntaxerr); return; } @@ -447,8 +447,7 @@ void xreadCommand(client *c) { if (streams_count > STREAMID_STATIC_VECTOR_LEN) ids = zmalloc(sizeof(streamID)*streams_count); - /* Try to serve the client synchronously. */ - for (int i = streams_argc + streams_count; i < c->argc; i++) { + for (int i = streams_arg + streams_count; i < c->argc; i++) { /* Specifying "$" as last-known-id means that the client wants to be * served with just the messages that will arrive into the stream * starting from now. */ @@ -466,6 +465,43 @@ void xreadCommand(client *c) { if (streamParseIDOrReply(c,c->argv[i],ids+i,0) != C_OK) goto cleanup; } + /* Try to serve the client synchronously. 
*/ + for (int i = 0; i < streams_count; i++) { + robj *o = lookupKeyRead(c->db,c->argv[i+streams_arg]); + if (o == NULL) continue; + stream *s = o->ptr; + streamID *gt = ids+i; /* ID must be greater than this. */ + if (s->last_id.ms > gt->ms || + (s->last_id.ms == gt->ms && s->last_id.seq > gt->seq)) + { + /* streamReplyWithRange() handles the 'start' ID as inclusive, + * so start from the next ID, since we want only messages with + * IDs greater than start. */ + streamID start = *gt; + start.seq++; /* Can't overflow, it's an uint64_t */ + streamReplyWithRange(c,s,&start,NULL,count); + goto cleanup; + } + } + + /* Block if needed. */ + if (timeout) { + /* If we are inside a MULTI/EXEC and the list is empty the only thing + * we can do is treating it as a timeout (even with timeout 0). */ + if (c->flags & CLIENT_MULTI) { + addReply(c,shared.nullmultibulk); + goto cleanup; + } + blockForKeys(c, BLOCKED_STREAM, c->argv+streams_arg, streams_count, + timeout, NULL, ids); + goto cleanup; + } + + /* No BLOCK option, nor any stream we can serve. Reply as with a + * timeout happened. */ + addReply(c,shared.nullmultibulk); + /* Continue to cleanup... */ + cleanup: /* Cleanup. */ if (ids != static_ids) zfree(ids); From 110041825c3af14feff03f43d12e3683f64cdc48 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 8 Sep 2017 11:40:16 +0200 Subject: [PATCH 040/102] Streams: XREAD get-keys method. --- src/db.c | 28 ++++++++++++++++++++++++++++ src/server.c | 1 + src/server.h | 2 ++ 3 files changed, 31 insertions(+) diff --git a/src/db.c b/src/db.c index 6682e573..8b43d4b5 100644 --- a/src/db.c +++ b/src/db.c @@ -1363,6 +1363,34 @@ int *georadiusGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numk return keys; } +/* XREAD [BLOCK ] [COUNT ] [GROUP ] + * [RETRY ] STREAMS key_1 ID_1 key_2 ID_2 ... 
+ * key_N ID_N */ +int *xreadGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys) { + int i, num, *keys; + UNUSED(cmd); + + /* We need to seek the last argument that contains "STREAMS", because other + * arguments before may contain it (for example the group name). */ + int streams_pos = -1; + for (i = 1; i < argc; i++) { + char *arg = argv[i]->ptr; + if (!strcasecmp(arg, "streams")) streams_pos = i; + } + + /* Syntax error. */ + if (streams_pos == -1) { + *numkeys = 0; + return NULL; + } + + num = argc - streams_pos - 1; + keys = zmalloc(sizeof(int) * num); + for (i = streams_pos+1; i < argc; i++) keys[i-streams_pos-1] = i; + *numkeys = num; + return keys; +} + /* Slot to Key API. This is used by Redis Cluster in order to obtain in * a fast way a key that belongs to a specified hash slot. This is useful * while rehashing the cluster and in other conditions when we need to diff --git a/src/server.c b/src/server.c index 56b2188e..e1d9abef 100644 --- a/src/server.c +++ b/src/server.c @@ -305,6 +305,7 @@ struct redisCommand redisCommandTable[] = { {"xadd",xaddCommand,-4,"wmF",0,NULL,1,1,1,0,0}, {"xrange",xrangeCommand,-4,"r",0,NULL,1,1,1,0,0}, {"xlen",xlenCommand,2,"rF",0,NULL,1,1,1,0,0}, + {"xread",xreadCommand,-3,"rs",0,xreadGetKeys,1,1,1,0,0}, {"post",securityWarningCommand,-1,"lt",0,NULL,0,0,0,0,0}, {"host:",securityWarningCommand,-1,"lt",0,NULL,0,0,0,0,0}, {"latency",latencyCommand,-2,"aslt",0,NULL,0,0,0,0,0} diff --git a/src/server.h b/src/server.h index 34c5fb06..4b84486e 100644 --- a/src/server.h +++ b/src/server.h @@ -1767,6 +1767,7 @@ int *evalGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys); int *sortGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys); int *migrateGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys); int *georadiusGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys); +int *xreadGetKeys(struct redisCommand *cmd, robj **argv, int argc, int 
*numkeys); /* Cluster */ void clusterInit(void); @@ -2011,6 +2012,7 @@ void securityWarningCommand(client *c); void xaddCommand(client *c); void xrangeCommand(client *c); void xlenCommand(client *c); +void xreadCommand(client *c); #if defined(__GNUC__) void *calloc(size_t count, size_t size) __attribute__ ((deprecated)); From a7d898334afea5593a67a8633a0c8b7e7cb8ab62 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 8 Sep 2017 11:51:53 +0200 Subject: [PATCH 041/102] Streams: XREAD get-key method fixed. --- src/db.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/db.c b/src/db.c index 8b43d4b5..e422d4b8 100644 --- a/src/db.c +++ b/src/db.c @@ -1364,8 +1364,8 @@ int *georadiusGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numk } /* XREAD [BLOCK ] [COUNT ] [GROUP ] - * [RETRY ] STREAMS key_1 ID_1 key_2 ID_2 ... - * key_N ID_N */ + * [RETRY ] STREAMS key_1 key_2 ... key_N + * ID_1 ID_2 ... ID_N */ int *xreadGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys) { int i, num, *keys; UNUSED(cmd); @@ -1377,14 +1377,16 @@ int *xreadGetKeys(struct redisCommand *cmd, robj **argv, int argc, int *numkeys) char *arg = argv[i]->ptr; if (!strcasecmp(arg, "streams")) streams_pos = i; } + if (streams_pos != -1) num = argc - streams_pos - 1; /* Syntax error. */ - if (streams_pos == -1) { + if (streams_pos == -1 || num % 2 != 0) { *numkeys = 0; return NULL; } + num /= 2; /* We have half the keys as there are arguments because + there are also the IDs, one per key. */ - num = argc - streams_pos - 1; keys = zmalloc(sizeof(int) * num); for (i = streams_pos+1; i < argc; i++) keys[i-streams_pos-1] = i; *numkeys = num; From 6a1c92d52dd7f3928795de910dc848f71dae5b3c Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 8 Sep 2017 12:09:02 +0200 Subject: [PATCH 042/102] Streams: synchronous xread fixes and improvements. 
--- src/t_stream.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 0820a743..92c62077 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -395,8 +395,8 @@ void xlenCommand(client *c) { } /* XREAD [BLOCK ] [COUNT ] [GROUP ] - * [RETRY ] STREAMS key_1 ID_1 key_2 ID_2 ... - * key_N ID_N */ + * [RETRY ] STREAMS key_1 key_2 ... key_N + * ID_1 ID_2 ... ID_N */ void xreadCommand(client *c) { long long timeout = 0; long long count = 0; @@ -453,12 +453,13 @@ void xreadCommand(client *c) { * starting from now. */ if (strcmp(c->argv[i]->ptr,"$") == 0) { robj *o = lookupKeyRead(c->db,c->argv[i-streams_count]); + int id_idx = i - streams_arg - streams_count; if (o) { stream *s = o->ptr; - ids[i] = s->last_id; + ids[id_idx] = s->last_id; } else { - ids[i].ms = 0; - ids[i].seq = 0; + ids[id_idx].ms = 0; + ids[id_idx].seq = 0; } continue; } @@ -466,24 +467,38 @@ void xreadCommand(client *c) { } /* Try to serve the client synchronously. */ + size_t arraylen = 0; + void *arraylen_ptr = NULL; for (int i = 0; i < streams_count; i++) { - robj *o = lookupKeyRead(c->db,c->argv[i+streams_arg]); + robj *o = lookupKeyRead(c->db,c->argv[streams_arg+i]); if (o == NULL) continue; stream *s = o->ptr; streamID *gt = ids+i; /* ID must be greater than this. */ if (s->last_id.ms > gt->ms || (s->last_id.ms == gt->ms && s->last_id.seq > gt->seq)) { + arraylen++; + if (arraylen == 1) arraylen_ptr = addDeferredMultiBulkLength(c); /* streamReplyWithRange() handles the 'start' ID as inclusive, * so start from the next ID, since we want only messages with * IDs greater than start. */ streamID start = *gt; start.seq++; /* Can't overflow, it's an uint64_t */ + + /* Emit the two elements sub-array consisting of the name + * of the stream and the data we extracted from it. 
*/ + addReplyMultiBulkLen(c,2); + addReplyBulk(c,c->argv[i+streams_arg]); streamReplyWithRange(c,s,&start,NULL,count); - goto cleanup; } } + /* We replied synchronously! Set the top array len and return to caller. */ + if (arraylen) { + setDeferredMultiBulkLength(c,arraylen_ptr,arraylen); + goto cleanup; + } + /* Block if needed. */ if (timeout) { /* If we are inside a MULTI/EXEC and the list is empty the only thing From 0adb43b68febc4dfb2acc6818de36504a63162e4 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 8 Sep 2017 12:25:06 +0200 Subject: [PATCH 043/102] Streams: XREAD ability to block fixed. --- src/blocked.c | 4 ++-- src/t_stream.c | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/blocked.c b/src/blocked.c index 376b343d..fccce35d 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -134,7 +134,7 @@ void processUnblockedClients(void) { /* Unblock a client calling the right function depending on the kind * of operation the client is blocking for. */ void unblockClient(client *c) { - if (c->btype == BLOCKED_LIST) { + if (c->btype == BLOCKED_LIST || c->btype == BLOCKED_STREAM) { unblockClientWaitingData(c); } else if (c->btype == BLOCKED_WAIT) { unblockClientWaitingReplicas(c); @@ -160,7 +160,7 @@ void unblockClient(client *c) { * send it a reply of some kind. After this function is called, * unblockClient() will be called with the same client as argument. 
*/ void replyToBlockedClientTimedOut(client *c) { - if (c->btype == BLOCKED_LIST) { + if (c->btype == BLOCKED_LIST || c->btype == BLOCKED_STREAM) { addReply(c,shared.nullmultibulk); } else if (c->btype == BLOCKED_WAIT) { addReplyLongLong(c,replicationCountAcksByOffset(c->bpop.reploffset)); diff --git a/src/t_stream.c b/src/t_stream.c index 92c62077..0358e644 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -412,9 +412,8 @@ void xreadCommand(client *c) { char *o = c->argv[i]->ptr; if (!strcasecmp(o,"BLOCK") && moreargs) { i++; - if (getLongLongFromObjectOrReply(c,c->argv[i],&timeout,NULL) - != C_OK) return; - if (timeout < 0) timeout = 0; + if (getTimeoutFromObjectOrReply(c,c->argv[i],&timeout, + UNIT_MILLISECONDS) != C_OK) return; } else if (!strcasecmp(o,"COUNT") && moreargs) { i++; if (getLongLongFromObjectOrReply(c,c->argv[i],&count,NULL) != C_OK) From 2cacdcd6f8ee0af32618ceff2d303acaa61645ab Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 8 Sep 2017 16:57:32 +0200 Subject: [PATCH 044/102] Streams: XREAD related code to serve blocked clients. --- src/blocked.c | 50 +++++++++++++++++++++++++++++++++++++++++++++++++- src/server.h | 1 + 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/src/blocked.c b/src/blocked.c index fccce35d..84d74f24 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -240,7 +240,14 @@ void handleClientsBlockedOnKeys(void) { while(numclients--) { listNode *clientnode = listFirst(clients); client *receiver = clientnode->value; - if (receiver->btype != BLOCKED_LIST) continue; + + if (receiver->btype != BLOCKED_LIST) { + /* Put on the tail, so that at the next call + * we'll not run into it again. */ + listDelNode(clients,clientnode); + listAddNodeTail(clients,receiver); + continue; + } robj *dstkey = receiver->bpop.target; int where = (receiver->lastcmd && @@ -279,6 +286,47 @@ void handleClientsBlockedOnKeys(void) { * when an element was pushed on the list. */ } + /* Serve clients blocked on stream key. 
*/ + else if (o != NULL && o->type == OBJ_STREAM) { + dictEntry *de = dictFind(rl->db->blocking_keys,rl->key); + stream *s = o->ptr; + + /* We need to provide the new data arrived on the stream + * to all the clients that are waiting for an offset smaller + * than the current top item. */ + if (de) { + list *clients = dictGetVal(de); + listNode *ln; + listIter li; + listRewind(clients,&li); + + while((ln = listNext(&li))) { + client *receiver = listNodeValue(ln); + if (receiver->btype != BLOCKED_STREAM) continue; + streamID *gt = dictFetchValue(receiver->bpop.keys, + rl->key); + if (s->last_id.ms > gt->ms || + (s->last_id.ms == gt->ms && + s->last_id.seq > gt->seq)) + { + unblockClient(receiver); + streamID start = *gt; + start.seq++; /* Can't overflow, it's an uint64_t */ + + /* Emit the two elements sub-array consisting of + * the name of the stream and the data we + * extracted from it. Wrapped in a single-item + * array, since we have just one key. */ + addReplyMultiBulkLen(receiver,1); + addReplyMultiBulkLen(receiver,2); + addReplyBulk(receiver,rl->key); + streamReplyWithRange(receiver,s,&start,NULL, + receiver->bpop.xread_count); + } + } + } + } + /* Free this item. */ decrRefCount(rl->key); zfree(rl); diff --git a/src/server.h b/src/server.h index 4b84486e..8fa7380e 100644 --- a/src/server.h +++ b/src/server.h @@ -1425,6 +1425,7 @@ void popGenericCommand(client *c, int where); /* Stream data type. */ stream *streamNew(void); void freeStream(stream *s); +size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end, size_t count); /* MULTI/EXEC/WATCH... */ void unwatchAllKeys(client *c); From b5be5093fecd02f4ee537af5ad473899bb6d7a61 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 8 Sep 2017 20:48:28 +0200 Subject: [PATCH 045/102] Streams: fix XREAD timeout handling, zero is valid. 
--- src/t_stream.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 0358e644..afa8224c 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -398,7 +398,7 @@ void xlenCommand(client *c) { * [RETRY ] STREAMS key_1 key_2 ... key_N * ID_1 ID_2 ... ID_N */ void xreadCommand(client *c) { - long long timeout = 0; + long long timeout = -1; /* -1 means, no BLOCK argument given. */ long long count = 0; int streams_count = 0; int streams_arg = 0; @@ -499,7 +499,7 @@ void xreadCommand(client *c) { } /* Block if needed. */ - if (timeout) { + if (timeout != -1) { /* If we are inside a MULTI/EXEC and the list is empty the only thing * we can do is treating it as a timeout (even with timeout 0). */ if (c->flags & CLIENT_MULTI) { From 6468cb2e825cf8258654f83e82324332e9879745 Mon Sep 17 00:00:00 2001 From: antirez Date: Sat, 9 Sep 2017 11:10:59 +0200 Subject: [PATCH 046/102] Streams: fix XREAD ready-key signaling. With lists we need to signal only on key creation, but streams can provide data to clients listening at every new item added. To make this slightly more efficient we now track different classes of blocked clients to avoid signaling keys when there is nobody listening. A typical case is when the stream is used as a time series DB and accessed only by range with XRANGE. 
--- src/blocked.c | 6 ++++-- src/db.c | 3 +-- src/server.c | 6 ++++-- src/server.h | 4 +++- src/t_stream.c | 2 ++ 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/blocked.c b/src/blocked.c index 84d74f24..3cf661aa 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -102,7 +102,8 @@ int getTimeoutFromObjectOrReply(client *c, robj *object, mstime_t *timeout, int void blockClient(client *c, int btype) { c->flags |= CLIENT_BLOCKED; c->btype = btype; - server.bpop_blocked_clients++; + server.blocked_clients++; + server.blocked_clients_by_type[btype]++; } /* This function is called in the beforeSleep() function of the event loop @@ -145,9 +146,10 @@ void unblockClient(client *c) { } /* Clear the flags, and put the client in the unblocked list so that * we'll process new commands in its query buffer ASAP. */ + server.blocked_clients--; + server.blocked_clients_by_type[c->btype]--; c->flags &= ~CLIENT_BLOCKED; c->btype = BLOCKED_NONE; - server.bpop_blocked_clients--; /* The client may already be into the unblocked list because of a previous * blocking operation, don't add back it into the list multiple times. 
*/ if (!(c->flags & CLIENT_UNBLOCKED)) { diff --git a/src/db.c b/src/db.c index e422d4b8..74c2be62 100644 --- a/src/db.c +++ b/src/db.c @@ -169,8 +169,7 @@ void dbAdd(redisDb *db, robj *key, robj *val) { int retval = dictAdd(db->dict, copy, val); serverAssertWithInfo(NULL,key,retval == DICT_OK); - if (val->type == OBJ_LIST || val->type == OBJ_STREAM) - signalKeyAsReady(db, key); + if (val->type == OBJ_LIST) signalKeyAsReady(db, key); if (server.cluster_enabled) slotToKeyAdd(key); } diff --git a/src/server.c b/src/server.c index e1d9abef..38f16179 100644 --- a/src/server.c +++ b/src/server.c @@ -1426,7 +1426,9 @@ void initServerConfig(void) { server.active_defrag_running = 0; server.notify_keyspace_events = 0; server.maxclients = CONFIG_DEFAULT_MAX_CLIENTS; - server.bpop_blocked_clients = 0; + server.blocked_clients = 0; + memset(server.blocked_clients_by_type,0, + sizeof(server.blocked_clients_by_type)); server.maxmemory = CONFIG_DEFAULT_MAXMEMORY; server.maxmemory_policy = CONFIG_DEFAULT_MAXMEMORY_POLICY; server.maxmemory_samples = CONFIG_DEFAULT_MAXMEMORY_SAMPLES; @@ -2929,7 +2931,7 @@ sds genRedisInfoString(char *section) { "blocked_clients:%d\r\n", listLength(server.clients)-listLength(server.slaves), lol, bib, - server.bpop_blocked_clients); + server.blocked_clients); } /* Memory */ diff --git a/src/server.h b/src/server.h index 8fa7380e..2d98b6f1 100644 --- a/src/server.h +++ b/src/server.h @@ -257,6 +257,7 @@ typedef long long mstime_t; /* millisecond time type. */ #define BLOCKED_WAIT 2 /* WAIT for synchronous replication. */ #define BLOCKED_MODULE 3 /* Blocked by a loadable module. */ #define BLOCKED_STREAM 4 /* XREAD. */ +#define BLOCKED_NUM 5 /* Number of blocked states. */ /* Client request types */ #define PROTO_REQ_INLINE 1 @@ -1130,7 +1131,8 @@ struct redisServer { int lfu_log_factor; /* LFU logarithmic counter factor. */ int lfu_decay_time; /* LFU counter decay factor. 
*/ /* Blocked clients */ - unsigned int bpop_blocked_clients; /* Number of clients blocked by lists */ + unsigned int blocked_clients; /* # of clients executing a blocking cmd.*/ + unsigned int blocked_clients_by_type[BLOCKED_NUM]; list *unblocked_clients; /* list of clients to unblock before next loop */ list *ready_keys; /* List of readyList structures for BLPOP & co */ /* Sort parameters - qsort_r() is only available under BSD so we diff --git a/src/t_stream.c b/src/t_stream.c index afa8224c..c47c5dde 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -354,6 +354,8 @@ void xaddCommand(client *c) { signalModifiedKey(c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_HASH,"xadd",c->argv[1],c->db->id); server.dirty++; + if (server.blocked_clients_by_type[BLOCKED_STREAM]) + signalKeyAsReady(c->db, c->argv[1]); } /* XRANGE key start end [COUNT ] */ From c128190026efd36e8b472d8874f7f54c79ba3e06 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 11 Sep 2017 11:06:53 +0200 Subject: [PATCH 047/102] Streams: fix handleClientsBlockedOnKeys() access to invalid ID. --- src/blocked.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/blocked.c b/src/blocked.c index 3cf661aa..519a402c 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -311,9 +311,12 @@ void handleClientsBlockedOnKeys(void) { (s->last_id.ms == gt->ms && s->last_id.seq > gt->seq)) { - unblockClient(receiver); streamID start = *gt; start.seq++; /* Can't overflow, it's an uint64_t */ + /* Note that after we unblock the client, 'gt' + * is no longer valid, so we must do it after + * we copied the ID into the 'start' variable. */ + unblockClient(receiver); /* Emit the two elements sub-array consisting of * the name of the stream and the data we From db89f7474d3f4c784bfd8757b2bd3321e3efd9a1 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 11 Sep 2017 11:20:36 +0200 Subject: [PATCH 048/102] Streams: When XREAD blocks without COUNT, set a default one. 
A client may lose a lot of time between invocations of blocking XREAD, for example because it is processing the messages or for any other cause. When it returns back, it may provide a low enough message ID that the server will block to send an unreasonable number of messages in a single call. For this reason we set a COUNT when the client is blocked with XREAD calls, even if no COUNT is given. This is arbitrarily set to 1000 because it's enough to avoid slowing down the reception of many messages, but low enough to avoid to block. --- src/t_stream.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/t_stream.c b/src/t_stream.c index c47c5dde..1836ae73 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -399,6 +399,7 @@ void xlenCommand(client *c) { /* XREAD [BLOCK ] [COUNT ] [GROUP ] * [RETRY ] STREAMS key_1 key_2 ... key_N * ID_1 ID_2 ... ID_N */ +#define XREAD_BLOCKED_DEFAULT_COUNT 1000 void xreadCommand(client *c) { long long timeout = -1; /* -1 means, no BLOCK argument given. */ long long count = 0; @@ -510,6 +511,11 @@ void xreadCommand(client *c) { } blockForKeys(c, BLOCKED_STREAM, c->argv+streams_arg, streams_count, timeout, NULL, ids); + /* If no COUNT is given and we block, set a relatively small count: + * in case the ID provided is too low, we do not want the server to + * block just to serve this client a huge stream of messages. */ + c->bpop.xread_count = count ? count : XREAD_BLOCKED_DEFAULT_COUNT; + c->bpop.xread_group = NULL; /* Not used for now. */ goto cleanup; } From 19b06935d59a39021330670776eb6b79002599c8 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 11 Sep 2017 18:02:57 +0200 Subject: [PATCH 049/102] Streams: fix XADD API and keyspace notifications. XADD was suboptimal in the first incarnation of the command, not being able to accept an ID (very useufl for replication), nor options for having capped streams. The keyspace notification for streams was not implemented. 
--- src/notify.c | 2 ++ src/server.c | 2 +- src/server.h | 3 +- src/t_stream.c | 80 +++++++++++++++++++++++++++++++++++++++++++------- 4 files changed, 75 insertions(+), 12 deletions(-) diff --git a/src/notify.c b/src/notify.c index 94a1f2e7..9bbeb142 100644 --- a/src/notify.c +++ b/src/notify.c @@ -54,6 +54,7 @@ int keyspaceEventsStringToFlags(char *classes) { case 'e': flags |= NOTIFY_EVICTED; break; case 'K': flags |= NOTIFY_KEYSPACE; break; case 'E': flags |= NOTIFY_KEYEVENT; break; + case 't': flags |= NOTIFY_STREAM; break; default: return -1; } } @@ -79,6 +80,7 @@ sds keyspaceEventsFlagsToString(int flags) { if (flags & NOTIFY_ZSET) res = sdscatlen(res,"z",1); if (flags & NOTIFY_EXPIRED) res = sdscatlen(res,"x",1); if (flags & NOTIFY_EVICTED) res = sdscatlen(res,"e",1); + if (flags & NOTIFY_STREAM) res = sdscatlen(res,"t",1); } if (flags & NOTIFY_KEYSPACE) res = sdscatlen(res,"K",1); if (flags & NOTIFY_KEYEVENT) res = sdscatlen(res,"E",1); diff --git a/src/server.c b/src/server.c index 38f16179..f1fd06ca 100644 --- a/src/server.c +++ b/src/server.c @@ -302,7 +302,7 @@ struct redisCommand redisCommandTable[] = { {"pfcount",pfcountCommand,-2,"r",0,NULL,1,-1,1,0,0}, {"pfmerge",pfmergeCommand,-2,"wm",0,NULL,1,-1,1,0,0}, {"pfdebug",pfdebugCommand,-3,"w",0,NULL,0,0,0,0,0}, - {"xadd",xaddCommand,-4,"wmF",0,NULL,1,1,1,0,0}, + {"xadd",xaddCommand,-5,"wmF",0,NULL,1,1,1,0,0}, {"xrange",xrangeCommand,-4,"r",0,NULL,1,1,1,0,0}, {"xlen",xlenCommand,2,"rF",0,NULL,1,1,1,0,0}, {"xread",xreadCommand,-3,"rs",0,xreadGetKeys,1,1,1,0,0}, diff --git a/src/server.h b/src/server.h index 2d98b6f1..37df429b 100644 --- a/src/server.h +++ b/src/server.h @@ -427,7 +427,8 @@ typedef long long mstime_t; /* millisecond time type. 
*/ #define NOTIFY_ZSET (1<<7) /* z */ #define NOTIFY_EXPIRED (1<<8) /* x */ #define NOTIFY_EVICTED (1<<9) /* e */ -#define NOTIFY_ALL (NOTIFY_GENERIC | NOTIFY_STRING | NOTIFY_LIST | NOTIFY_SET | NOTIFY_HASH | NOTIFY_ZSET | NOTIFY_EXPIRED | NOTIFY_EVICTED) /* A */ +#define NOTIFY_STREAM (1<<10) /* t */ +#define NOTIFY_ALL (NOTIFY_GENERIC | NOTIFY_STRING | NOTIFY_LIST | NOTIFY_SET | NOTIFY_HASH | NOTIFY_ZSET | NOTIFY_EXPIRED | NOTIFY_EVICTED | NOTIFY_STREAM) /* A flag */ /* Get the first bind addr or NULL */ #define NET_FIRST_BIND_ADDR (server.bindaddr_count ? server.bindaddr[0] : NULL) diff --git a/src/t_stream.c b/src/t_stream.c index 1836ae73..0921a54b 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -115,8 +115,24 @@ void streamDecodeID(void *buf, streamID *id) { /* Adds a new item into the stream 's' having the specified number of * field-value pairs as specified in 'numfields' and stored into 'argv'. - * Returns the new entry ID populating the 'added_id' structure. */ -void streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id) { + * Returns the new entry ID populating the 'added_id' structure. + * + * If 'use_id' is not NULL, the ID is not auto-generated by the function, + * but instead the passed ID is uesd to add the new entry. In this case + * adding the entry may fail as specified later in this comment. + * + * The function returns C_OK if the item was added, this is always true + * if the ID was generated by the function. However the function may return + * C_ERR if an ID was given via 'use_id', but adding it failed since the + * current top ID is greater or equal. */ +int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, streamID *use_id) { + /* If an ID was given, check that it's greater than the last entry ID + * or return an error. */ + if (use_id && (use_id->ms < s->last_id.ms || + (use_id->ms == s->last_id.ms && + use_id->seq <= s->last_id.seq))) return C_ERR; + + /* Add the new entry. 
*/ raxIterator ri; raxStart(&ri,s->rax); raxSeek(&ri,"$",NULL,0); @@ -133,7 +149,10 @@ void streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id) /* Generate the new entry ID. */ streamID id; - streamNextID(&s->last_id,&id); + if (use_id) + id = *use_id; + else + streamNextID(&s->last_id,&id); /* We have to add the key into the radix tree in lexicographic order, * to do so we consider the ID as a single 128 bit number written in @@ -173,6 +192,7 @@ void streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id) s->length++; s->last_id = id; if (added_id) *added_id = id; + return C_OK; } /* Send the specified range to the client 'c'. The range the client will @@ -299,7 +319,9 @@ int string2ull(const char *s, unsigned long long *value) { * form, just stating the milliseconds time part of the stream. In such a case * the missing part is set according to the value of 'missing_seq' parameter. * The IDs "-" and "+" specify respectively the minimum and maximum IDs - * that can be represented. */ + * that can be represented. + * + * If 'c' is set to NULL, no reply is sent to the client. */ int streamParseIDOrReply(client *c, robj *o, streamID *id, uint64_t missing_seq) { char buf[128]; if (sdslen(o->ptr) > sizeof(buf)-1) goto invalid; @@ -328,13 +350,45 @@ int streamParseIDOrReply(client *c, robj *o, streamID *id, uint64_t missing_seq) return C_OK; invalid: - addReplyError(c,"Invalid stream ID specified as stream command argument"); + if (c) addReplyError(c,"Invalid stream ID specified as stream " + "command argument"); return C_ERR; } -/* XADD key [field value] [field value] ... */ +/* XADD key [MAXLEN ] [field value] [field value] ... */ void xaddCommand(client *c) { - if ((c->argc % 2) == 1) { + streamID id; + int id_given = 0; /* Was an ID different than "*" specified? */ + + /* Parse options. */ + int i = 2; /* This is the first argument position where we could + find an option, or the ID. 
*/ + for (; i < c->argc; i++) { + int moreargs = i != c->argc-1; + char *opt = c->argv[i]->ptr; + if (opt[0] == '*' && opt[1] == '\0') { + /* This is just a fast path for the common case of auto-ID + * creation. */ + break; + } else if (!strcasecmp(opt,"maxlen") && moreargs) { + addReplyError(c,"Sorry, MAXLEN is still not implemented"); + i++; + return; + } else { + /* If we are here is a syntax error or a valid ID. */ + if (streamParseIDOrReply(NULL,c->argv[i],&id,0) == C_OK) { + id_given = 1; + break; + } else { + addReply(c,shared.syntaxerr); + return; + } + } + } + int field_pos = i+1; + + /* Check arity. */ + if ((c->argc - field_pos) < 2 || (c->argc-field_pos % 2) == 1) { addReplyError(c,"wrong number of arguments for XADD"); return; } @@ -346,13 +400,19 @@ void xaddCommand(client *c) { s = o->ptr; /* Append using the low level function and return the ID. */ - streamID id; - streamAppendItem(s,c->argv+2,(c->argc-2)/2,&id); + if (streamAppendItem(s,c->argv+field_pos,(c->argc-field_pos)/2, + &id, id_given ? &id : NULL) + == C_ERR) + { + addReplyError(c,"The ID specified in XADD is smaller than the " + "target stream top item"); + return; + } sds reply = sdscatfmt(sdsempty(),"+%U.%U\r\n",id.ms,id.seq); addReplySds(c,reply); signalModifiedKey(c->db,c->argv[1]); - notifyKeyspaceEvent(NOTIFY_HASH,"xadd",c->argv[1],c->db->id); + notifyKeyspaceEvent(NOTIFY_STREAM,"xadd",c->argv[1],c->db->id); server.dirty++; if (server.blocked_clients_by_type[BLOCKED_STREAM]) signalKeyAsReady(c->db, c->argv[1]); From 3a0b78bc52e30a8599203ef501ea548b96f08b89 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Sep 2017 10:48:27 +0200 Subject: [PATCH 050/102] Streams: rewrite XADD ID argument for AOF/slaves. 
--- src/t_stream.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/t_stream.c b/src/t_stream.c index 0921a54b..84e0541e 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -414,6 +414,16 @@ void xaddCommand(client *c) { signalModifiedKey(c->db,c->argv[1]); notifyKeyspaceEvent(NOTIFY_STREAM,"xadd",c->argv[1],c->db->id); server.dirty++; + + /* Let's rewrite the ID argument with the one actually generated for + * AOF/replication propagation. */ + robj *idarg = createObject(OBJ_STRING, + sdscatfmt(sdsempty(),"%U.%U",id.ms,id.seq)); + rewriteClientCommandArgument(c,i,idarg); + decrRefCount(idarg); + + /* We need to signal to blocked clients that there is new data on this + * stream. */ if (server.blocked_clients_by_type[BLOCKED_STREAM]) signalKeyAsReady(c->db, c->argv[1]); } From 94af55c5ea55890ed5e3afdb9bb802d88d61eac7 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Sep 2017 10:54:20 +0200 Subject: [PATCH 051/102] Streams: fix memory leak in freeStream(). --- src/t_stream.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/t_stream.c b/src/t_stream.c index 84e0541e..3b0072ec 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -54,6 +54,7 @@ stream *streamNew(void) { /* Free a stream, including the listpacks stored inside the radix tree. */ void freeStream(stream *s) { raxFreeWithCallback(s->rax,(void(*)(void*))lpFree); + zfree(s); } /* Generate the next stream item ID given the previous one. If the current From 1a603e1a87d88500a9723460dcf60c9b57dc99d7 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 12 Sep 2017 16:19:26 +0200 Subject: [PATCH 052/102] Streams: fix bug in XREAD last received ID processing. 
--- src/t_stream.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 3b0072ec..a8230109 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -524,9 +524,9 @@ void xreadCommand(client *c) { /* Specifying "$" as last-known-id means that the client wants to be * served with just the messages that will arrive into the stream * starting from now. */ + int id_idx = i - streams_arg - streams_count; if (strcmp(c->argv[i]->ptr,"$") == 0) { robj *o = lookupKeyRead(c->db,c->argv[i-streams_count]); - int id_idx = i - streams_arg - streams_count; if (o) { stream *s = o->ptr; ids[id_idx] = s->last_id; @@ -536,7 +536,8 @@ void xreadCommand(client *c) { } continue; } - if (streamParseIDOrReply(c,c->argv[i],ids+i,0) != C_OK) goto cleanup; + if (streamParseIDOrReply(c,c->argv[i],ids+id_idx,0) != C_OK) + goto cleanup; } /* Try to serve the client synchronously. */ From b1ec3336337eabbd28b90832cebcaac4ffb69410 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 13 Sep 2017 18:05:34 +0200 Subject: [PATCH 053/102] Streams: stream iteration refactoring, WIP 1. --- src/t_stream.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/src/t_stream.c b/src/t_stream.c index a8230109..a1d3f8a1 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -196,6 +196,120 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, return C_OK; } +/* We define an iterator to iterate stream items in an abstract way, without + * caring about the radix tree + listpack representation. Technically speaking + * the iterator is only used inside streamReplyWithRange(), so could just + * be implemented inside the function, but practically there is the AOF + * rewriting code that also needs to iterate the stream to emit the XADD + * commands. */ +typedef struct streamIterator { + uint64_t start_key[2]; /* Start key as 128 bit big endian. 
*/ + uint64_t end_key[2]; /* End key as 128 bit big endian. */ + raxIterator ri; /* Rax iterator. */ + unsigned char *lp; /* Current listpack. */ + unsigned char *lp_ele; /* Current listpack cursor. */ +} streamIterator; + +/* Initialize the stream iterator, so that we can call iterating functions + * to get the next items. This requires a corresponding streamIteratorStop() + * at the end. + * + * Once the iterator is initalized, we iterate like this: + * + * streamIterator myiterator; + * streamIteratorStart(&myiterator,...); + * size_t numfields; + * while(streamIteratorGetID(&myitereator,&ID,&numfields)) { + * while(numfields--) { + * unsigned char *key, *value; + * size_t key_len, value_len; + * streamIteratorGetField(&myiterator,&key,&value,&key_len,&value_len); + * + * ... do what you want with key and value ... + * } + * } + * streamIteratorStop(&myiterator); */ +void streamIteratorStart(streamIterator *si, stream *s, streamID *start, streamID *end) { + /* Intialize the iterator and translates the iteration start/stop + * elements into a 128 big big-endian number. */ + streamEncodeID(si->start_key,start); + if (end) { + streamEncodeID(si->end_key,end); + } else { + /* We assume that UINT64_MAX is the same in little and big + * endian, that is, all bits set. */ + si->end_key[0] = UINT64_MAX; + si->end_key[0] = UINT64_MAX; + } + raxStart(&si->ri,s->rax); + + /* Seek the correct node in the radix tree. */ + if (start->ms || start->seq) { + raxSeek(&si->ri,"<=",(unsigned char*)si->start_key, + sizeof(si->start_key)); + if (raxEOF(&si->ri)) + raxSeek(&si->ri,">",(unsigned char*)si->start_key, + sizeof(si->start_key)); + } else { + raxSeek(&si->ri,"^",NULL,0); + } + si->lp = NULL; /* There is no current listpack right now. */ + si->lp_ele = NULL; /* Current listpack cursor. */ +} + +/* Return 1 and store the current item ID at 'id' if there are still + * elements within the iteration range, otherwise return 0 in order to + * signal the iteration terminated. 
*/ +int streamIteratorGetID(streamIterator *si, streamID *id, size_t *numfields) { + while(1) { /* Will stop when element > stop_key or end of radix tree. */ + /* If the current listpack is set to NULL, this is the start of the + * iteration or the previous listpack was completely iterated. + * Go to the next node. */ + if (si->lp == NULL || si->lp_ele == NULL) { + if (!raxNext(&si->ri)) return 0; + serverAssert(si->ri.key_len == sizeof(streamID)); + si->lp = si->ri.data; + si->lp_ele = lpFirst(si->lp); + } + + /* For every radix tree node, iterate the corresponding listpack, + * returning elements when they are within range. */ + while(si->lp_ele) { + int64_t e_len; + unsigned char buf[LP_INTBUF_SIZE]; + unsigned char *e = lpGet(si->lp_ele,&e_len,buf); + serverAssert(e_len == sizeof(streamID)); + + /* Go to next field: number of elements. */ + si->lp_ele = lpNext(si->lp,si->lp_ele); + + /* If current >= start */ + if (memcmp(e,si->start_key,sizeof(streamID)) >= 0) { + if (memcmp(e,si->end_key,sizeof(streamID)) > 0) + return 0; /* We are already out of range. */ + streamDecodeID(e,id); + *numfields = lpGetInteger(si->lp_ele); + return 1; /* Valid item returned. */ + } else { + /* If we do not emit, we have to discard. */ + int64_t numfields = lpGetInteger(si->lp_ele); + si->lp_ele = lpNext(si->lp,si->lp_ele); + for (int64_t i = 0; i < numfields*2; i++) + si->lp_ele = lpNext(si->lp,si->lp_ele); + } + } + + /* End of listpack reached. Try the next radix tree node. */ + } +} + +/* Stop the stream iterator. The only cleanup we need is to free the rax + * itereator, since the stream iterator itself is supposed to be stack + * allocated. */ +void streamIteratorStop(streamIterator *si) { + raxStop(&si->ri); +} + /* Send the specified range to the client 'c'. The range the client will * receive is between start and end inclusive, if 'count' is non zero, no more * than 'count' elemnets are sent. 
The 'end' pointer can be NULL to mean that From a58733cacf45259b3e18fc68724950aceb86831d Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 14 Sep 2017 14:23:27 +0200 Subject: [PATCH 054/102] Streams: stream iteration refactoring, WIP 2. --- src/t_stream.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/t_stream.c b/src/t_stream.c index a1d3f8a1..03860b8e 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -208,6 +208,11 @@ typedef struct streamIterator { raxIterator ri; /* Rax iterator. */ unsigned char *lp; /* Current listpack. */ unsigned char *lp_ele; /* Current listpack cursor. */ + /* Buffers used to hold the string of lpGet() when the element is + * integer encoded, so that there is no string representation of the + * element inside the listpack itself. */ + unsigned char field_buf[LP_INTBUF_SIZE]; + unsigned char value_buf[LP_INTBUF_SIZE]; } streamIterator; /* Initialize the stream iterator, so that we can call iterating functions @@ -289,6 +294,7 @@ int streamIteratorGetID(streamIterator *si, streamID *id, size_t *numfields) { return 0; /* We are already out of range. */ streamDecodeID(e,id); *numfields = lpGetInteger(si->lp_ele); + si->lp_ele = lpNext(si->lp,si->lp_ele); return 1; /* Valid item returned. */ } else { /* If we do not emit, we have to discard. */ @@ -303,6 +309,19 @@ int streamIteratorGetID(streamIterator *si, streamID *id, size_t *numfields) { } } +/* Get the field and value of the current item we are iterating. This should + * be called immediately after streamIteratorGetID(), and for each field + * according to the number of fields returned by streamIteratorGetID(). + * The function populates the field and value pointers and the corresponding + * lengths by reference, that are valid until the next iterator call, assuming + * no one touches the stream meanwhile. 
*/ +void streamIteratorGetField(streamIterator *si, unsigned char **fieldptr, unsigned char **valueptr, int64_t *fieldlen, int64_t *valuelen) { + *fieldptr = lpGet(si->lp_ele,fieldlen,si->field_buf); + si->lp_ele = lpNext(si->lp,si->lp_ele); + *valueptr = lpGet(si->lp_ele,valuelen,si->value_buf); + si->lp_ele = lpNext(si->lp,si->lp_ele); +} + /* Stop the stream iterator. The only cleanup we need is to free the rax * itereator, since the stream iterator itself is supposed to be stack * allocated. */ From 9ed40f0fc35ce0050bd362a35c4f3f7fea189fb5 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 14 Sep 2017 14:46:31 +0200 Subject: [PATCH 055/102] Streams: implement streamReplyWithRange() in terms of the iterator. --- src/t_stream.c | 87 +++++++++++++------------------------------------- 1 file changed, 22 insertions(+), 65 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 03860b8e..de9561a5 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -223,7 +223,7 @@ typedef struct streamIterator { * * streamIterator myiterator; * streamIteratorStart(&myiterator,...); - * size_t numfields; + * int64_t numfields; * while(streamIteratorGetID(&myitereator,&ID,&numfields)) { * while(numfields--) { * unsigned char *key, *value; @@ -265,7 +265,7 @@ void streamIteratorStart(streamIterator *si, stream *s, streamID *start, streamI /* Return 1 and store the current item ID at 'id' if there are still * elements within the iteration range, otherwise return 0 in order to * signal the iteration terminated. */ -int streamIteratorGetID(streamIterator *si, streamID *id, size_t *numfields) { +int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields) { while(1) { /* Will stop when element > stop_key or end of radix tree. */ /* If the current listpack is set to NULL, this is the start of the * iteration or the previous listpack was completely iterated. 
@@ -336,74 +336,31 @@ void streamIteratorStop(streamIterator *si) { size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end, size_t count) { void *arraylen_ptr = addDeferredMultiBulkLength(c); size_t arraylen = 0; + streamIterator si; + int64_t numfields; + streamID id; - /* Seek the radix tree node that contains our start item. */ - uint64_t key[2]; - uint64_t end_key[2]; - streamEncodeID(key,start); - if (end) streamEncodeID(end_key,end); - raxIterator ri; - raxStart(&ri,s->rax); + streamIteratorStart(&si,s,start,end); + while(streamIteratorGetID(&si,&id,&numfields)) { + /* Emit a two elements array for each item. The first is + * the ID, the second is an array of field-value pairs. */ + sds replyid = sdscatfmt(sdsempty(),"+%U.%U\r\n",id.ms,id.seq); + addReplyMultiBulkLen(c,2); + addReplySds(c,replyid); + addReplyMultiBulkLen(c,numfields*2); - /* Seek the correct node in the radix tree. */ - if (start->ms || start->seq) { - raxSeek(&ri,"<=",(unsigned char*)key,sizeof(key)); - if (raxEOF(&ri)) raxSeek(&ri,">",(unsigned char*)key,sizeof(key)); - } else { - raxSeek(&ri,"^",NULL,0); - } - - /* For every radix tree node, iterate the corresponding listpack, - * returning elmeents when they are within range. */ - while (raxNext(&ri)) { - serverAssert(ri.key_len == sizeof(key)); - unsigned char *lp = ri.data; - unsigned char *lp_ele = lpFirst(lp); - while(lp_ele) { - int64_t e_len; - unsigned char buf[LP_INTBUF_SIZE]; - unsigned char *e = lpGet(lp_ele,&e_len,buf); - serverAssert(e_len == sizeof(streamID)); - - /* Seek next field: number of elements. */ - lp_ele = lpNext(lp,lp_ele); - if (memcmp(e,key,sizeof(key)) >= 0) { /* If current >= start */ - if (end && memcmp(e,end_key,sizeof(key)) > 0) { - break; /* We are already out of range. */ - } - streamID thisid; - streamDecodeID(e,&thisid); - sds replyid = sdscatfmt(sdsempty(),"+%U.%U\r\n", - thisid.ms,thisid.seq); - - /* Emit this stream entry in the client output. 
*/ - addReplyMultiBulkLen(c,2); - addReplySds(c,replyid); - int64_t numfields = lpGetInteger(lp_ele); - lp_ele = lpNext(lp,lp_ele); - addReplyMultiBulkLen(c,numfields*2); - for (int64_t i = 0; i < numfields; i++) { - /* Emit two items (key-value) per iteration. */ - for (int k = 0; k < 2; k++) { - e = lpGet(lp_ele,&e_len,buf); - addReplyBulkCBuffer(c,e,e_len); - lp_ele = lpNext(lp,lp_ele); - } - } - - arraylen++; - if (count && count == arraylen) break; - } else { - /* If we do not emit, we have to discard. */ - int64_t numfields = lpGetInteger(lp_ele); - lp_ele = lpNext(lp,lp_ele); - for (int64_t i = 0; i < numfields*2; i++) - lp_ele = lpNext(lp,lp_ele); - } + /* Emit the field-value pairs. */ + while(numfields--) { + unsigned char *key, *value; + int64_t key_len, value_len; + streamIteratorGetField(&si,&key,&value,&key_len,&value_len); + addReplyBulkCBuffer(c,key,key_len); + addReplyBulkCBuffer(c,value,value_len); } + arraylen++; if (count && count == arraylen) break; } - raxStop(&ri); + streamIteratorStop(&si); setDeferredMultiBulkLength(c,arraylen_ptr,arraylen); return arraylen; } From 01ea018c4080e24b00d36e1cbf36c4d98b82ff40 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 15 Sep 2017 12:17:25 +0200 Subject: [PATCH 056/102] Streams: export iteration API. --- src/server.h | 5 ----- src/stream.h | 31 +++++++++++++++++++++++++++++++ src/t_stream.c | 19 ------------------- 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/src/server.h b/src/server.h index 37df429b..bc572b1e 100644 --- a/src/server.h +++ b/src/server.h @@ -1425,11 +1425,6 @@ void listTypeConvert(robj *subject, int enc); void unblockClientWaitingData(client *c); void popGenericCommand(client *c, int where); -/* Stream data type. */ -stream *streamNew(void); -void freeStream(stream *s); -size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end, size_t count); - /* MULTI/EXEC/WATCH... 
*/ void unwatchAllKeys(client *c); void initClientMultiState(client *c); diff --git a/src/stream.h b/src/stream.h index e78af5bc..e3800932 100644 --- a/src/stream.h +++ b/src/stream.h @@ -19,4 +19,35 @@ typedef struct stream { streamID last_id; /* Zero if there are yet no items. */ } stream; +/* We define an iterator to iterate stream items in an abstract way, without + * caring about the radix tree + listpack representation. Technically speaking + * the iterator is only used inside streamReplyWithRange(), so could just + * be implemented inside the function, but practically there is the AOF + * rewriting code that also needs to iterate the stream to emit the XADD + * commands. */ +typedef struct streamIterator { + uint64_t start_key[2]; /* Start key as 128 bit big endian. */ + uint64_t end_key[2]; /* End key as 128 bit big endian. */ + raxIterator ri; /* Rax iterator. */ + unsigned char *lp; /* Current listpack. */ + unsigned char *lp_ele; /* Current listpack cursor. */ + /* Buffers used to hold the string of lpGet() when the element is + * integer encoded, so that there is no string representation of the + * element inside the listpack itself. */ + unsigned char field_buf[LP_INTBUF_SIZE]; + unsigned char value_buf[LP_INTBUF_SIZE]; +} streamIterator; + +/* Prototypes of exported APIs. 
*/ + +struct client; + +stream *streamNew(void); +void freeStream(stream *s); +size_t streamReplyWithRange(struct client *c, stream *s, streamID *start, streamID *end, size_t count); +void streamIteratorStart(streamIterator *si, stream *s, streamID *start, streamID *end); +int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields); +void streamIteratorGetField(streamIterator *si, unsigned char **fieldptr, unsigned char **valueptr, int64_t *fieldlen, int64_t *valuelen); +void streamIteratorStop(streamIterator *si); + #endif diff --git a/src/t_stream.c b/src/t_stream.c index de9561a5..3144adc7 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -196,25 +196,6 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, return C_OK; } -/* We define an iterator to iterate stream items in an abstract way, without - * caring about the radix tree + listpack representation. Technically speaking - * the iterator is only used inside streamReplyWithRange(), so could just - * be implemented inside the function, but practically there is the AOF - * rewriting code that also needs to iterate the stream to emit the XADD - * commands. */ -typedef struct streamIterator { - uint64_t start_key[2]; /* Start key as 128 bit big endian. */ - uint64_t end_key[2]; /* End key as 128 bit big endian. */ - raxIterator ri; /* Rax iterator. */ - unsigned char *lp; /* Current listpack. */ - unsigned char *lp_ele; /* Current listpack cursor. */ - /* Buffers used to hold the string of lpGet() when the element is - * integer encoded, so that there is no string representation of the - * element inside the listpack itself. */ - unsigned char field_buf[LP_INTBUF_SIZE]; - unsigned char value_buf[LP_INTBUF_SIZE]; -} streamIterator; - /* Initialize the stream iterator, so that we can call iterating functions * to get the next items. This requires a corresponding streamIteratorStop() * at the end. 
From 26d4f8e3ec74811076e8a71cd384ea89b10e0c13 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 15 Sep 2017 12:37:04 +0200 Subject: [PATCH 057/102] Streams: AOF rewriting + minor iterator improvements. --- src/aof.c | 33 +++++++++++++++++++++++++++++++++ src/t_stream.c | 14 +++++++++----- 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/src/aof.c b/src/aof.c index 0593b270..5fbfdd69 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1031,6 +1031,37 @@ int rewriteHashObject(rio *r, robj *key, robj *o) { return 1; } +/* Emit the commands needed to rebuild a stream object. + * The function returns 0 on error, 1 on success. */ +int rewriteStreamObject(rio *r, robj *key, robj *o) { + streamIterator si; + streamIteratorStart(&si,o->ptr,NULL,NULL); + streamID id; + int64_t numfields; + + while(streamIteratorGetID(&si,&id,&numfields)) { + /* Emit a two elements array for each item. The first is + * the ID, the second is an array of field-value pairs. */ + + /* Emit the XADD ...fields... command. */ + if (rioWriteBulkCount(r,'*',3+numfields*2) == 0) return 0; + if (rioWriteBulkString(r,"XADD",4) == 0) return 0; + if (rioWriteBulkObject(r,key) == 0) return 0; + sds replyid = sdscatfmt(sdsempty(),"%U.%U",id.ms,id.seq); + if (rioWriteBulkString(r,replyid,sdslen(replyid)) == 0) return 0; + sdsfree(replyid); + while(numfields--) { + unsigned char *field, *value; + int64_t field_len, value_len; + streamIteratorGetField(&si,&field,&value,&field_len,&value_len); + if (rioWriteBulkString(r,(char*)field,field_len) == 0) return 0; + if (rioWriteBulkString(r,(char*)value,value_len) == 0) return 0; + } + } + streamIteratorStop(&si); + return 1; +} + /* Call the module type callback in order to rewrite a data type * that is exported by a module and is not handled by Redis itself. * The function returns 0 on error, 1 on success. 
*/ @@ -1111,6 +1142,8 @@ int rewriteAppendOnlyFileRio(rio *aof) { if (rewriteSortedSetObject(aof,&key,o) == 0) goto werr; } else if (o->type == OBJ_HASH) { if (rewriteHashObject(aof,&key,o) == 0) goto werr; + } else if (o->type == OBJ_STREAM) { + if (rewriteStreamObject(aof,&key,o) == 0) goto werr; } else if (o->type == OBJ_MODULE) { if (rewriteModuleObject(aof,&key,o) == 0) goto werr; } else { diff --git a/src/t_stream.c b/src/t_stream.c index 3144adc7..76005008 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -218,19 +218,23 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, void streamIteratorStart(streamIterator *si, stream *s, streamID *start, streamID *end) { /* Intialize the iterator and translates the iteration start/stop * elements into a 128 big big-endian number. */ - streamEncodeID(si->start_key,start); + if (start) { + streamEncodeID(si->start_key,start); + } else { + si->start_key[0] = 0; + si->start_key[0] = 0; + } + if (end) { streamEncodeID(si->end_key,end); } else { - /* We assume that UINT64_MAX is the same in little and big - * endian, that is, all bits set. */ si->end_key[0] = UINT64_MAX; si->end_key[0] = UINT64_MAX; } - raxStart(&si->ri,s->rax); /* Seek the correct node in the radix tree. */ - if (start->ms || start->seq) { + raxStart(&si->ri,s->rax); + if (start && (start->ms || start->seq)) { raxSeek(&si->ri,"<=",(unsigned char*)si->start_key, sizeof(si->start_key)); if (raxEOF(&si->ri)) From 7a41b402c1a5caae30dcd234d3d01c7f76ca5757 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 15 Sep 2017 15:54:18 +0200 Subject: [PATCH 058/102] Streams: basic XADD tests. 
--- tests/test_helper.tcl | 1 + tests/unit/type/stream.tcl | 42 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 tests/unit/type/stream.tcl diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 41c86780..7def9a7f 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -26,6 +26,7 @@ set ::all_tests { unit/type/set unit/type/zset unit/type/hash + unit/type/stream unit/sort unit/expire unit/other diff --git a/tests/unit/type/stream.tcl b/tests/unit/type/stream.tcl new file mode 100644 index 00000000..a668ddf4 --- /dev/null +++ b/tests/unit/type/stream.tcl @@ -0,0 +1,42 @@ +# return value is like strcmp() and similar. +proc streamCompareID {a b} { + if {$a == $b} {return 0} + lassign [split $a .] a_ms a_seq + lassign [split $b .] b_ms b_seq + if {$a_ms > $b_ms} {return 1} + if {$a_ms < $b_ms} {return -1} + # Same ms case, compare seq. + if {$a_seq > $b_seq} {return 1} + if {$a_seq < $b_seq} {return -1} +} + +start_server { + tags {"stream"} +} { + test {XADD can add entries into a stream that XRANGE can fetch} { + r XADD mystream * item 1 value a + r XADD mystream * item 2 value b + assert_equal 2 [r XLEN mystream] + set items [r XRANGE mystream - +] + assert_equal [lindex $items 0 1] {item 1 value a} + assert_equal [lindex $items 1 1] {item 2 value b} + } + + test {XADD IDs are incremental} { + set id1 [r XADD mystream * item 1 value a] + set id2 [r XADD mystream * item 2 value b] + set id3 [r XADD mystream * item 3 value c] + assert {[streamCompareID $id1 $id2] == -1} + assert {[streamCompareID $id2 $id3] == -1} + } + + test {XADD IDs are incremental when ms is the same as well} { + r multi + r XADD mystream * item 1 value a + r XADD mystream * item 2 value b + r XADD mystream * item 3 value c + lassign [r exec] id1 id2 id3 + assert {[streamCompareID $id1 $id2] == -1} + assert {[streamCompareID $id2 $id3] == -1} + } +} From fa707ca15477c5d5871668a7b2ca2040b992df48 Mon Sep 17 00:00:00 2001 From: antirez 
Date: Fri, 15 Sep 2017 16:56:18 +0200 Subject: [PATCH 059/102] Streams: more advanced XADD and XRANGE tests. --- tests/unit/type/stream.tcl | 43 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/unit/type/stream.tcl b/tests/unit/type/stream.tcl index a668ddf4..35de5c1f 100644 --- a/tests/unit/type/stream.tcl +++ b/tests/unit/type/stream.tcl @@ -10,6 +10,15 @@ proc streamCompareID {a b} { if {$a_seq < $b_seq} {return -1} } +# return the ID immediately greater than the specified one. +# Note that this function does not care to handle 'seq' overflow +# since it's a 64 bit value. +proc streamNextID {id} { + lassign [split $id .] ms seq + incr seq + join [list $ms $seq] . +} + start_server { tags {"stream"} } { @@ -39,4 +48,38 @@ start_server { assert {[streamCompareID $id1 $id2] == -1} assert {[streamCompareID $id2 $id3] == -1} } + + test {XADD mass insertion and XLEN} { + r DEL mystream + r multi + for {set j 0} {$j < 10000} {incr j} { + r XADD mystream * item $j + } + r exec + + set items [r XRANGE mystream - +] + for {set j 0} {$j < 10000} {incr j} { + assert {[lindex $items $j 1] eq [list item $j]} + } + assert {[r xlen mystream] == $j} + } + + test {XRANGE COUNT works as expected} { + assert {[llength [r xrange mystream - + COUNT 10]] == 10} + } + + test {XRANGE can be used to iterate the whole stream} { + set last_id "-" + set j 0 + while 1 { + set elements [r xrange mystream $last_id + COUNT 100] + if {[llength $elements] == 0} break + foreach e $elements { + assert {[lindex $e 1] eq [list item $j]} + incr j; + } + set last_id [streamNextID [lindex $elements end 0]] + } + assert {$j == 10000} + } } From eb1230c9990562da739f15913a795cf0d8f2f5af Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 18 Sep 2017 10:48:27 +0200 Subject: [PATCH 060/102] Streams: XRANGE fuzz testing. 
--- tests/unit/type/stream.tcl | 53 +++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/tests/unit/type/stream.tcl b/tests/unit/type/stream.tcl index 35de5c1f..8a94aa5e 100644 --- a/tests/unit/type/stream.tcl +++ b/tests/unit/type/stream.tcl @@ -1,6 +1,6 @@ # return value is like strcmp() and similar. proc streamCompareID {a b} { - if {$a == $b} {return 0} + if {$a eq $b} {return 0} lassign [split $a .] a_ms a_seq lassign [split $b .] b_ms b_seq if {$a_ms > $b_ms} {return 1} @@ -19,6 +19,36 @@ proc streamNextID {id} { join [list $ms $seq] . } +# Generate a random stream entry ID with the ms part between min and max +# and a low sequence number (0 - 999 range), in order to stress test +# XRANGE against a Tcl implementation implementing the same concept +# with Tcl-only code in a linear array. +proc streamRandomID {min_id max_id} { + lassign [split $min_id .] min_ms min_seq + lassign [split $max_id .] max_ms max_seq + set delta [expr {$max_ms-$min_ms+1}] + set ms [expr {$min_ms+[randomInt $delta]}] + set seq [randomInt 1000] + return $ms.$seq +} + +# Tcl-side implementation of XRANGE to perform fuzz testing in the Redis +# XRANGE implementation. +proc streamSimulateXRANGE {items start end} { + set res {} + foreach i $items { + set this_id [lindex $i 0] + if {[streamCompareID $this_id $start] >= 0} { + if {[streamCompareID $this_id $end] <= 0} { + lappend res $i + } + } + } + return $res +} + +set content {} ;# Will be populated with Tcl side copy of the stream content. 
+ start_server { tags {"stream"} } { @@ -82,4 +112,25 @@ start_server { } assert {$j == 10000} } + + test {XRANGE fuzzing} { + # puts $items + set low_id [lindex $items 0 0] + set high_id [lindex $items end 0] + for {set j 0} {$j < 100} {incr j} { + set start [streamRandomID $low_id $high_id] + set end [streamRandomID $low_id $high_id] + set range [r xrange mystream $start $end] + set tcl_range [streamSimulateXRANGE $items $start $end] + if {$range ne $tcl_range} { + puts "*** WARNING *** - XRANGE fuzzing mismatch: $start - $end" + puts "---" + puts "XRANGE: '$range'" + puts "---" + puts "TCL: '$tcl_range'" + puts "---" + fail "XRANGE fuzzing failed, check logs for details" + } + } + } } From ae9065d8080cff60efb97676de5f3bfb26c74285 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Sep 2017 16:49:56 +0200 Subject: [PATCH 061/102] Streams: tests for blocking and non-blocking XREAD. --- tests/unit/type/stream.tcl | 90 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/tests/unit/type/stream.tcl b/tests/unit/type/stream.tcl index 8a94aa5e..dbed9f0a 100644 --- a/tests/unit/type/stream.tcl +++ b/tests/unit/type/stream.tcl @@ -113,6 +113,96 @@ start_server { assert {$j == 10000} } + test {XREAD with non empty stream} { + set res [r XREAD COUNT 1 STREAMS mystream 0.0] + assert {[lindex $res 0 1 0 1] eq {item 0}} + } + + test {Non blocking XREAD with empty streams} { + set res [r XREAD STREAMS s1 s2 0.0 0.0] + assert {$res eq {}} + } + + test {XREAD with non empty second stream} { + set res [r XREAD COUNT 1 STREAMS nostream mystream 0.0 0.0] + assert {[lindex $res 0 0] eq {mystream}} + assert {[lindex $res 0 1 0 1] eq {item 0}} + } + + test {Blocking XREAD waiting new data} { + r XADD s2 * old abcd1234 + set rd [redis_deferring_client] + $rd XREAD BLOCK 20000 STREAMS s1 s2 s3 $ $ $ + r XADD s2 * new abcd1234 + set res [$rd read] + assert {[lindex $res 0 0] eq {s2}} + assert {[lindex $res 0 1 0 1] eq {new abcd1234}} + } + + test 
{Blocking XREAD waiting old data} { + set rd [redis_deferring_client] + $rd XREAD BLOCK 20000 STREAMS s1 s2 s3 $ 0.0 $ + r XADD s2 * foo abcd1234 + set res [$rd read] + assert {[lindex $res 0 0] eq {s2}} + assert {[lindex $res 0 1 0 1] eq {old abcd1234}} + } + + test "XREAD: XADD + DEL should not awake client" { + set rd [redis_deferring_client] + r del s1 + $rd XREAD BLOCK 20000 STREAMS s1 $ + r multi + r XADD s1 * old abcd1234 + r DEL s1 + r exec + r XADD s1 * new abcd1234 + set res [$rd read] + assert {[lindex $res 0 0] eq {s1}} + assert {[lindex $res 0 1 0 1] eq {new abcd1234}} + } + + test "XREAD: XADD + DEL + LPUSH should not awake client" { + set rd [redis_deferring_client] + r del s1 + $rd XREAD BLOCK 20000 STREAMS s1 $ + r multi + r XADD s1 * old abcd1234 + r DEL s1 + r LPUSH s1 foo bar + r exec + r DEL s1 + r XADD s1 * new abcd1234 + set res [$rd read] + assert {[lindex $res 0 0] eq {s1}} + assert {[lindex $res 0 1 0 1] eq {new abcd1234}} + } + + test {XREAD with same stream name multiple times should work} { + r XADD s2 * old abcd1234 + set rd [redis_deferring_client] + $rd XREAD BLOCK 20000 STREAMS s2 s2 s2 $ $ $ + r XADD s2 * new abcd1234 + set res [$rd read] + assert {[lindex $res 0 0] eq {s2}} + assert {[lindex $res 0 1 0 1] eq {new abcd1234}} + } + + test {XREAD + multiple XADD inside transaction} { + r XADD s2 * old abcd1234 + set rd [redis_deferring_client] + $rd XREAD BLOCK 20000 STREAMS s2 s2 s2 $ $ $ + r MULTI + r XADD s2 * field one + r XADD s2 * field two + r XADD s2 * field three + r EXEC + set res [$rd read] + assert {[lindex $res 0 0] eq {s2}} + assert {[lindex $res 0 1 0 1] eq {field one}} + assert {[lindex $res 0 1 1 1] eq {field two}} + } + test {XRANGE fuzzing} { # puts $items set low_id [lindex $items 0 0] From 8f00cf85a7ee93da987e8d0b899ada33f2a88505 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 19 Sep 2017 16:57:37 +0200 Subject: [PATCH 062/102] Streams: fixed memory leaks when blocking again for same stream. 
blockForKeys() was not freeing the allocation holding the ID when the key was already found busy. Fortunately the unit test checked explicitly for blocking multiple times for the same key (copying a regression in the blocking lists tests), so the bug was detected by the Redis test leak checker. --- src/blocked.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/blocked.c b/src/blocked.c index 519a402c..734e6ffd 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -387,7 +387,10 @@ void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeo } /* If the key already exists in the dictionary ignore it. */ - if (dictAdd(c->bpop.keys,keys[j],key_data) != DICT_OK) continue; + if (dictAdd(c->bpop.keys,keys[j],key_data) != DICT_OK) { + zfree(key_data); + continue; + } incrRefCount(keys[j]); /* And in the other "side", to map keys -> clients */ From 3f2d7e277e1754a5a421948bfbd45b5725a05148 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 27 Sep 2017 17:41:32 +0200 Subject: [PATCH 063/102] Streams: items compression implemented. The approach used is to set a fixed header at the start of every listpack blob (that contains many entries). The header contains a "master" ID and fields, that are initially just obtained from the first entry inserted in the listpack, so that the first enty is always well compressed. Later every new entry is checked against these fields, and if it matches, the SAMEFIELD flag is set in the entry so that we know to just use the master entry flags. The IDs are always delta-encoded against the first entry. This approach avoids cascading effects in which entries are encoded depending on the previous entries, in order to avoid complexity and rewritings of the data when data is removed in the middle (which is a planned feature). 
--- src/stream.h | 5 ++ src/t_stream.c | 171 +++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 142 insertions(+), 34 deletions(-) diff --git a/src/stream.h b/src/stream.h index e3800932..df29e9e7 100644 --- a/src/stream.h +++ b/src/stream.h @@ -26,6 +26,11 @@ typedef struct stream { * rewriting code that also needs to iterate the stream to emit the XADD * commands. */ typedef struct streamIterator { + streamID master_id; /* ID of the master entry at listpack head. */ + uint64_t master_fields_count; /* Master entries # of fields. */ + unsigned char *master_fields_start; /* Master entries start in listapck. */ + unsigned char *master_fields_ptr; /* Master field to emit next. */ + int entry_flags; /* Flags of entry we are emitting. */ uint64_t start_key[2]; /* Start key as 128 bit big endian. */ uint64_t end_key[2]; /* End key as 128 bit big endian. */ raxIterator ri; /* Rax iterator. */ diff --git a/src/t_stream.c b/src/t_stream.c index 76005008..5250c36b 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -27,16 +27,19 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/* TODO: - * - After loading a stream, populate the last ID. - */ - #include "server.h" #include "endianconv.h" #include "stream.h" #define STREAM_BYTES_PER_LISTPACK 4096 +/* Every stream item inside the listpack, has a flags field that is used to + * mark the entry as deleted, or having the same field as the "master" + * entry at the start of the listpack> */ +#define STREAM_ITEM_FLAG_NONE 0 /* No special flags. */ +#define STREAM_ITEM_FLAG_DELETED (1<<0) /* Entry is delted. Skip it. */ +#define STREAM_ITEM_FLAG_SAMEFIELDS (1<<1) /* Same fields as master entry. */ + /* ----------------------------------------------------------------------- * Low level stream encoding: a radix tree of listpacks. 
* ----------------------------------------------------------------------- */ @@ -95,6 +98,19 @@ int64_t lpGetInteger(unsigned char *ele) { return v; } +/* Debugging function to log the full content of a listpack. Useful + * for development and debugging. */ +void streamLogListpackContent(unsigned char *lp) { + unsigned char *p = lpFirst(lp); + while(p) { + unsigned char buf[LP_INTBUF_SIZE]; + int64_t v; + unsigned char *ele = lpGet(p,&v,buf); + serverLog(LL_WARNING,"- [%d] '%.*s'", (int)v, (int)v, ele); + p = lpNext(lp,p); + } +} + /* Convert the specified stream entry ID as a 128 bit big endian number, so * that the IDs can be sorted lexicographically. */ void streamEncodeID(void *buf, streamID *id) { @@ -159,32 +175,82 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, * to do so we consider the ID as a single 128 bit number written in * big endian, so that the most significant bytes are the first ones. */ uint64_t rax_key[2]; /* Key in the radix tree containing the listpack.*/ - uint64_t entry_id[2]; /* Entry ID of the new item as 128 bit string. */ - streamEncodeID(entry_id,&id); + streamID master_id; /* ID of the master entry in the listpack. */ /* Create a new listpack and radix tree node if needed. */ + int flags = STREAM_ITEM_FLAG_NONE; if (lp == NULL || lp_bytes > STREAM_BYTES_PER_LISTPACK) { + master_id = id; + streamEncodeID(rax_key,&id); + /* Create the listpack having the master entry ID and fields. */ lp = lpNew(); - rax_key[0] = entry_id[0]; - rax_key[1] = entry_id[1]; + lp = lpAppend(lp,(unsigned char*)rax_key,sizeof(rax_key)); + lp = lpAppendInteger(lp,numfields); + for (int i = 0; i < numfields; i++) { + sds field = argv[i*2]->ptr; + lp = lpAppend(lp,(unsigned char*)field,sdslen(field)); + } raxInsert(s->rax,(unsigned char*)&rax_key,sizeof(rax_key),lp,NULL); + /* The first entry we insert, has obviously the same fields of the + * master entry. 
*/ + flags |= STREAM_ITEM_FLAG_SAMEFIELDS; } else { serverAssert(ri.key_len == sizeof(rax_key)); memcpy(rax_key,ri.key,sizeof(rax_key)); + + /* Read the master entry ID. */ + int64_t e_len; + unsigned char *lp_ele = lpFirst(lp); + unsigned char buf[LP_INTBUF_SIZE]; + unsigned char *e = lpGet(lp_ele,&e_len,buf); + serverAssert(e_len == sizeof(streamID)); + streamDecodeID(e,&master_id); + lp_ele = lpNext(lp,lp_ele); + + /* Check if the entry we are adding, have the same fields + * as the master entry. */ + int master_fields_count = lpGetInteger(lp_ele); + lp_ele = lpNext(lp,lp_ele); + if (numfields == master_fields_count) { + int i; + for (i = 0; i < master_fields_count; i++) { + sds field = argv[i*2]->ptr; + unsigned char *e = lpGet(lp_ele,&e_len,buf); + /* Stop if there is a mismatch. */ + if (sdslen(field) != (size_t)e_len || + memcmp(e,field,e_len) != 0) break; + lp_ele = lpNext(lp,lp_ele); + } + if (i == master_fields_count) flags |= STREAM_ITEM_FLAG_SAMEFIELDS; + } } /* Populate the listpack with the new entry. We use the following * encoding: * - * +--------+----------+-------+-------+-/-+-------+-------+ - * |entry-id|num-fields|field-1|value-1|...|field-N|value-N| - * +--------+----------+-------+-------+-/-+-------+-------+ + * +-----+--------+----------+-------+-------+-/-+-------+-------+ + * |flags|entry-id|num-fields|field-1|value-1|...|field-N|value-N| + * +-----+--------+----------+-------+-------+-/-+-------+-------+ + * + * However if the SAMEFIELD flag is set, we have just to populate + * the entry with the values, so it becomes: + * + * +-----+--------+-------+-/-+-------+ + * |flags|entry-id|value-1|...|value-N| + * +-----+--------+-------+-/-+-------+ + * + * The entry-id field is actually two separated fields: the ms + * and seq difference compared to the master entry. 
*/ - lp = lpAppend(lp,(unsigned char*)entry_id,sizeof(entry_id)); - lp = lpAppendInteger(lp,numfields); + lp = lpAppendInteger(lp,flags); + lp = lpAppendInteger(lp,id.ms - master_id.ms); + lp = lpAppendInteger(lp,id.seq - master_id.seq); + if (!(flags & STREAM_ITEM_FLAG_SAMEFIELDS)) + lp = lpAppendInteger(lp,numfields); for (int i = 0; i < numfields; i++) { sds field = argv[i*2]->ptr, value = argv[i*2+1]->ptr; - lp = lpAppend(lp,(unsigned char*)field,sdslen(field)); + if (!(flags & STREAM_ITEM_FLAG_SAMEFIELDS)) + lp = lpAppend(lp,(unsigned char*)field,sdslen(field)); lp = lpAppend(lp,(unsigned char*)value,sdslen(value)); } @@ -259,35 +325,67 @@ int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields) { if (!raxNext(&si->ri)) return 0; serverAssert(si->ri.key_len == sizeof(streamID)); si->lp = si->ri.data; - si->lp_ele = lpFirst(si->lp); + si->lp_ele = lpFirst(si->lp); /* Seek the master ID. */ + /* Get the master ID. */ + int64_t e_len; + unsigned char buf[LP_INTBUF_SIZE]; + unsigned char *e = lpGet(si->lp_ele,&e_len,buf); + serverAssert(e_len == sizeof(streamID)); + streamDecodeID(e,&si->master_id); + si->lp_ele = lpNext(si->lp,si->lp_ele); /* Seek fields count. */ + /* Get the master fields count. */ + si->master_fields_count = lpGetInteger(si->lp_ele); + si->lp_ele = lpNext(si->lp,si->lp_ele); /* Seek first field. */ + si->master_fields_start = si->lp_ele; + /* Skip master fileds to seek the first entry. */ + for (uint64_t i = 0; i < si->master_fields_count; i++) + si->lp_ele = lpNext(si->lp,si->lp_ele); } /* For every radix tree node, iterate the corresponding listpack, * returning elements when they are within range. */ while(si->lp_ele) { - int64_t e_len; - unsigned char buf[LP_INTBUF_SIZE]; - unsigned char *e = lpGet(si->lp_ele,&e_len,buf); - serverAssert(e_len == sizeof(streamID)); + /* Get the flags entry. */ + int flags = lpGetInteger(si->lp_ele); + si->lp_ele = lpNext(si->lp,si->lp_ele); /* Seek ID. 
*/ - /* Go to next field: number of elements. */ + /* Get the ID: it is encoded as difference between the master + * ID and this entry ID. */ + *id = si->master_id; + id->ms += lpGetInteger(si->lp_ele); si->lp_ele = lpNext(si->lp,si->lp_ele); + id->seq += lpGetInteger(si->lp_ele); + si->lp_ele = lpNext(si->lp,si->lp_ele); + unsigned char buf[sizeof(streamID)]; + streamEncodeID(buf,id); - /* If current >= start */ - if (memcmp(e,si->start_key,sizeof(streamID)) >= 0) { - if (memcmp(e,si->end_key,sizeof(streamID)) > 0) - return 0; /* We are already out of range. */ - streamDecodeID(e,id); + /* The number of entries is here or not depending on the + * flags. */ + if (flags & STREAM_ITEM_FLAG_SAMEFIELDS) { + *numfields = si->master_fields_count; + } else { *numfields = lpGetInteger(si->lp_ele); si->lp_ele = lpNext(si->lp,si->lp_ele); - return 1; /* Valid item returned. */ - } else { - /* If we do not emit, we have to discard. */ - int64_t numfields = lpGetInteger(si->lp_ele); - si->lp_ele = lpNext(si->lp,si->lp_ele); - for (int64_t i = 0; i < numfields*2; i++) - si->lp_ele = lpNext(si->lp,si->lp_ele); } + + /* If current >= start, and the entry is not marked as + * deleted, emit it. */ + if (memcmp(buf,si->start_key,sizeof(streamID)) >= 0 && + !(flags & STREAM_ITEM_FLAG_DELETED)) + { + if (memcmp(buf,si->end_key,sizeof(streamID)) > 0) + return 0; /* We are already out of range. */ + si->entry_flags = flags; + if (flags & STREAM_ITEM_FLAG_SAMEFIELDS) + si->master_fields_ptr = si->master_fields_start; + return 1; /* Valid item returned. */ + } + + /* If we do not emit, we have to discard. */ + int to_discard = (flags & STREAM_ITEM_FLAG_SAMEFIELDS) ? + *numfields : *numfields*2; + for (int64_t i = 0; i < to_discard; i++) + si->lp_ele = lpNext(si->lp,si->lp_ele); } /* End of listpack reached. Try the next radix tree node. 
*/ @@ -301,8 +399,13 @@ int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields) { * lengths by reference, that are valid until the next iterator call, assuming * no one touches the stream meanwhile. */ void streamIteratorGetField(streamIterator *si, unsigned char **fieldptr, unsigned char **valueptr, int64_t *fieldlen, int64_t *valuelen) { - *fieldptr = lpGet(si->lp_ele,fieldlen,si->field_buf); - si->lp_ele = lpNext(si->lp,si->lp_ele); + if (si->entry_flags & STREAM_ITEM_FLAG_SAMEFIELDS) { + *fieldptr = lpGet(si->master_fields_ptr,fieldlen,si->field_buf); + si->master_fields_ptr = lpNext(si->lp,si->master_fields_ptr); + } else { + *fieldptr = lpGet(si->lp_ele,fieldlen,si->field_buf); + si->lp_ele = lpNext(si->lp,si->lp_ele); + } *valueptr = lpGet(si->lp_ele,valuelen,si->value_buf); si->lp_ele = lpNext(si->lp,si->lp_ele); } From 7d0d9693c19b92c8ea60cef8ebaba4a456bd0f73 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 27 Sep 2017 23:04:31 +0200 Subject: [PATCH 064/102] Streams: modify tests to stress compression. --- tests/unit/type/stream.tcl | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/unit/type/stream.tcl b/tests/unit/type/stream.tcl index dbed9f0a..c63ed8a2 100644 --- a/tests/unit/type/stream.tcl +++ b/tests/unit/type/stream.tcl @@ -83,13 +83,19 @@ start_server { r DEL mystream r multi for {set j 0} {$j < 10000} {incr j} { - r XADD mystream * item $j + # From time to time insert a field with a different set + # of fields in order to stress the stream compression code. 
+ if {rand() < 0.9} { + r XADD mystream * item $j + } else { + r XADD mystream * item $j otherfield foo + } } r exec set items [r XRANGE mystream - +] for {set j 0} {$j < 10000} {incr j} { - assert {[lindex $items $j 1] eq [list item $j]} + assert {[lrange [lindex $items $j 1] 0 1] eq [list item $j]} } assert {[r xlen mystream] == $j} } @@ -105,7 +111,7 @@ start_server { set elements [r xrange mystream $last_id + COUNT 100] if {[llength $elements] == 0} break foreach e $elements { - assert {[lindex $e 1] eq [list item $j]} + assert {[lrange [lindex $e 1] 0 1] eq [list item $j]} incr j; } set last_id [streamNextID [lindex $elements end 0]] @@ -115,7 +121,7 @@ start_server { test {XREAD with non empty stream} { set res [r XREAD COUNT 1 STREAMS mystream 0.0] - assert {[lindex $res 0 1 0 1] eq {item 0}} + assert {[lrange [lindex $res 0 1 0 1] 0 1] eq {item 0}} } test {Non blocking XREAD with empty streams} { @@ -204,7 +210,6 @@ start_server { } test {XRANGE fuzzing} { - # puts $items set low_id [lindex $items 0 0] set high_id [lindex $items end 0] for {set j 0} {$j < 100} {incr j} { From cea421a0211ac75e27513a8c7371042da6c8f4d3 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 27 Sep 2017 23:12:43 +0200 Subject: [PATCH 065/102] Streams: specify better how the master enty works. --- src/t_stream.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/t_stream.c b/src/t_stream.c index 5250c36b..bfc6e4c9 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -177,7 +177,29 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, uint64_t rax_key[2]; /* Key in the radix tree containing the listpack.*/ streamID master_id; /* ID of the master entry in the listpack. */ - /* Create a new listpack and radix tree node if needed. */ + /* Create a new listpack and radix tree node if needed. Note that when + * a new listpack is created, we populate it with a "master entry". 
This + * is just an ID and a set of fields that is taken as refernce in order + * to compress the stream entries that we'll add inside the listpack. + * + * Note that while we use the first added entry ID and fields to create + * the master entry, the first added entry is NOT represented in the master + * entry, which is a stand alone object. But of course, the first entry + * will compress well because it's used as reference. + * + * The master entry is composed of just: an ID and a set of fields, like: + * + * +------------+------------+---------+---------+--/--+---------+ + * | 128 bit ID | num-fields | field_1 | field_2 | ... | field_N | + * +------------+------------+---------+---------+--/--+---------+ + * + * The real entries will be encoded with an ID that is just the + * millisecond and sequence difference compared to the master entry + * (delta encoding), and if the fields of the entry are the same as + * the master enty fields, the entry flags will specify this fact + * and the entry fields and number of fields will be omitted (see later + * in the code of this function). */ + int flags = STREAM_ITEM_FLAG_NONE; if (lp == NULL || lp_bytes > STREAM_BYTES_PER_LISTPACK) { master_id = id; @@ -221,6 +243,8 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, memcmp(e,field,e_len) != 0) break; lp_ele = lpNext(lp,lp_ele); } + /* All fields are the same! We can compress the field names + * setting a single bit in the flags. */ if (i == master_fields_count) flags |= STREAM_ITEM_FLAG_SAMEFIELDS; } } From f24d3a7de05d213f702621186f31a4c227f366c6 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 28 Sep 2017 16:55:46 +0200 Subject: [PATCH 066/102] Streams: delta encode IDs based on key. Add count + deleted fields. 
We used to have the master ID stored at the start of the listpack, however using the key directly makes more sense in order to create a space efficient representation: anyway the key at the radix tree is very unlikely to change because of how the stream is implemented. Moreover on nodes merging, to rewrite the merged listpacks is anyway the most sensible operation, and we can use the iterator and the append-to-stream function in order to avoid re-implementing the code needed for merging. This commit also adds two items at the start of the listpack: the number of valid items inside the listpack, and the number of items marked as deleted. This means that there is no need to scan a listpack in order to understand if it's a good candidate for garbage collection, if the ration between valid/deleted items triggers the GC. --- src/rdb.c | 28 ++++++++++++-------- src/t_stream.c | 72 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 61 insertions(+), 39 deletions(-) diff --git a/src/rdb.c b/src/rdb.c index 5d15539c..17a93275 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -781,6 +781,8 @@ ssize_t rdbSaveObject(rio *rdb, robj *o) { while (raxNext(&ri)) { unsigned char *lp = ri.data; size_t lp_bytes = lpBytes(lp); + if ((n = rdbSaveRawString(rdb,ri.key,ri.key_len)) == -1) return -1; + nwritten += n; if ((n = rdbSaveRawString(rdb,lp,lp_bytes)) == -1) return -1; nwritten += n; } @@ -1448,27 +1450,31 @@ robj *rdbLoadObject(int rdbtype, rio *rdb) { uint64_t listpacks = rdbLoadLen(rdb,NULL); while(listpacks--) { + /* Get the master ID, the one we'll use as key of the radix tree + * node: the entries inside the listpack itself are delta-encoded + * relatively to this ID. */ + sds nodekey = rdbGenericLoadStringObject(rdb,RDB_LOAD_SDS,NULL); + if (sdslen(nodekey) != sizeof(streamID)) { + rdbExitReportCorruptRDB("Stream node key entry is not the " + "size of a stream ID"); + } + + /* Load the listpack. 
*/ unsigned char *lp = rdbGenericLoadStringObject(rdb,RDB_LOAD_PLAIN,NULL); if (lp == NULL) return NULL; unsigned char *first = lpFirst(lp); if (first == NULL) { - /* Serialized listpacks should never be free, since on + /* Serialized listpacks should never be empty, since on * deletion we should remove the radix tree key if the * resulting listpack is emtpy. */ rdbExitReportCorruptRDB("Empty listpack inside stream"); } - /* Get the ID of the first entry: we'll use it as key to add the - * listpack into the radix tree. */ - int64_t e_len; - unsigned char buf[LP_INTBUF_SIZE]; - unsigned char *e = lpGet(first,&e_len,buf); - if (e_len != sizeof(streamID)) { - rdbExitReportCorruptRDB("Listpack first entry is not the " - "size of a stream ID"); - } - int retval = raxInsert(s->rax,e,sizeof(streamID),lp,NULL); + /* Insert the key in the radix tree. */ + int retval = raxInsert(s->rax, + (unsigned char*)nodekey,sizeof(streamID),lp,NULL); + sdsfree(nodekey); if (!retval) rdbExitReportCorruptRDB("Listpack re-added with existing key"); } diff --git a/src/t_stream.c b/src/t_stream.c index bfc6e4c9..00d07ac5 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -83,6 +83,16 @@ unsigned char *lpAppendInteger(unsigned char *lp, int64_t value) { return lpAppend(lp,(unsigned char*)buf,slen); } +/* This is just a wrapper for lpReplace() to directly use a 64 bit integer + * instead of a string to replace the current element. The function returns + * the new listpack as return value, and also updates the current cursor + * by updating '*pos'. */ +unsigned char *lpReplaceInteger(unsigned char *lp, unsigned char **pos, int64_t value) { + char buf[LONG_STR_SIZE]; + int slen = ll2string(buf,sizeof(buf),value); + return lpInsert(lp, (unsigned char*)buf, slen, *pos, LP_REPLACE, pos); +} + /* This is a wrapper function for lpGet() to directly get an integer value * from the listpack (that may store numbers as a string), converting * the string if needed. 
*/ @@ -179,26 +189,31 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, /* Create a new listpack and radix tree node if needed. Note that when * a new listpack is created, we populate it with a "master entry". This - * is just an ID and a set of fields that is taken as refernce in order - * to compress the stream entries that we'll add inside the listpack. + * is just a set of fields that is taken as refernce in order to compress + * the stream entries that we'll add inside the listpack. * - * Note that while we use the first added entry ID and fields to create + * Note that while we use the first added entry fields to create * the master entry, the first added entry is NOT represented in the master * entry, which is a stand alone object. But of course, the first entry * will compress well because it's used as reference. * - * The master entry is composed of just: an ID and a set of fields, like: + * The master entry is composed like in the following example: * - * +------------+------------+---------+---------+--/--+---------+ - * | 128 bit ID | num-fields | field_1 | field_2 | ... | field_N | - * +------------+------------+---------+---------+--/--+---------+ + * +-------+---------+------------+---------+--/--+---------+---------+ + * | count | deleted | num-fields | field_1 | field_2 | ... | field_N | + * +-------+---------+------------+---------+--/--+---------+---------+ + * + * count and deleted just represent respectively the total number of + * entires inside the listpack that are valid, and marked as deleted + * (delted flag in the entry flags set). So the total number of items + * actually inside the listpack (both deleted and not) is count+deleted. 
* * The real entries will be encoded with an ID that is just the - * millisecond and sequence difference compared to the master entry - * (delta encoding), and if the fields of the entry are the same as - * the master enty fields, the entry flags will specify this fact - * and the entry fields and number of fields will be omitted (see later - * in the code of this function). */ + * millisecond and sequence difference compared to the key stored at + * the radix tree node containing the listpack (delta encoding), and + * if the fields of the entry are the same as the master enty fields, the + * entry flags will specify this fact and the entry fields and number + * of fields will be omitted (see later in the code of this function). */ int flags = STREAM_ITEM_FLAG_NONE; if (lp == NULL || lp_bytes > STREAM_BYTES_PER_LISTPACK) { @@ -206,7 +221,8 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, streamEncodeID(rax_key,&id); /* Create the listpack having the master entry ID and fields. */ lp = lpNew(); - lp = lpAppend(lp,(unsigned char*)rax_key,sizeof(rax_key)); + lp = lpAppendInteger(lp,1); /* One item, the one we are adding. */ + lp = lpAppendInteger(lp,0); /* Zero deleted so far. */ lp = lpAppendInteger(lp,numfields); for (int i = 0; i < numfields; i++) { sds field = argv[i*2]->ptr; @@ -220,14 +236,15 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, serverAssert(ri.key_len == sizeof(rax_key)); memcpy(rax_key,ri.key,sizeof(rax_key)); - /* Read the master entry ID. */ - int64_t e_len; + /* Read the master ID from the radix tree key. */ + streamDecodeID(rax_key,&master_id); unsigned char *lp_ele = lpFirst(lp); - unsigned char buf[LP_INTBUF_SIZE]; - unsigned char *e = lpGet(lp_ele,&e_len,buf); - serverAssert(e_len == sizeof(streamID)); - streamDecodeID(e,&master_id); - lp_ele = lpNext(lp,lp_ele); + + /* Update count and skip the deleted fields. 
*/ + int64_t count = lpGetInteger(lp_ele); + lp = lpReplaceInteger(lp,&lp_ele,count+1); + lp_ele = lpNext(lp,lp_ele); /* seek delted. */ + lp_ele = lpNext(lp,lp_ele); /* seek master entry num fields. */ /* Check if the entry we are adding, have the same fields * as the master entry. */ @@ -237,6 +254,8 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, int i; for (i = 0; i < master_fields_count; i++) { sds field = argv[i*2]->ptr; + int64_t e_len; + unsigned char buf[LP_INTBUF_SIZE]; unsigned char *e = lpGet(lp_ele,&e_len,buf); /* Stop if there is a mismatch. */ if (sdslen(field) != (size_t)e_len || @@ -348,16 +367,13 @@ int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields) { if (si->lp == NULL || si->lp_ele == NULL) { if (!raxNext(&si->ri)) return 0; serverAssert(si->ri.key_len == sizeof(streamID)); - si->lp = si->ri.data; - si->lp_ele = lpFirst(si->lp); /* Seek the master ID. */ /* Get the master ID. */ - int64_t e_len; - unsigned char buf[LP_INTBUF_SIZE]; - unsigned char *e = lpGet(si->lp_ele,&e_len,buf); - serverAssert(e_len == sizeof(streamID)); - streamDecodeID(e,&si->master_id); - si->lp_ele = lpNext(si->lp,si->lp_ele); /* Seek fields count. */ + streamDecodeID(si->ri.key,&si->master_id); /* Get the master fields count. */ + si->lp = si->ri.data; + si->lp_ele = lpFirst(si->lp); /* Seek items count */ + si->lp_ele = lpNext(si->lp,si->lp_ele); /* Seek deleted count. */ + si->lp_ele = lpNext(si->lp,si->lp_ele); /* Seek num fields. */ si->master_fields_count = lpGetInteger(si->lp_ele); si->lp_ele = lpNext(si->lp,si->lp_ele); /* Seek first field. */ si->master_fields_start = si->lp_ele; From 0c00fd7834215ec311a91def5d6737892222b9c3 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 28 Sep 2017 18:12:15 +0200 Subject: [PATCH 067/102] Streams: reduce listpack max size to 2k to speedup range queries. Listpack max size is a tradeoff between space and time. 
A 2k max entry puts the memory usage approximately at a similar order of magnitude (5 million entries went from 96 to 120 MB), but the range queries speed doubled (because there are half entries to scan in the average case). Lower values could be considered, or maybe this parameter should be made tunable. --- src/t_stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/t_stream.c b/src/t_stream.c index 00d07ac5..956a9af1 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -31,7 +31,7 @@ #include "endianconv.h" #include "stream.h" -#define STREAM_BYTES_PER_LISTPACK 4096 +#define STREAM_BYTES_PER_LISTPACK 2048 /* Every stream item inside the listpack, has a flags field that is used to * mark the entry as deleted, or having the same field as the "master" From 0540803288dd137c0a0f3fc345165c6a87f0957e Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 29 Sep 2017 12:40:29 +0200 Subject: [PATCH 068/102] Streams: XADD MAXLEN implementation. The core of this change is the implementation of stream trimming, and the resulting MAXLEN option of XADD as a trivial result of having trimming functionalities. MAXLEN already works but in order to be more efficient listpack GC should be implemented, currently marked as a TODO item inside the comments. --- src/t_stream.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 129 insertions(+), 3 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 956a9af1..a7505d15 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -305,6 +305,107 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, return C_OK; } +/* Trim the stream 's' to have no more than maxlen elements, and return the + * number of elements removed from the stream. The 'approx' option, if non-zero, + * specifies that the trimming must be performed in a approximated way in + * order to maximize performances. 
This means that the stream may contain + * more elements than 'maxlen', and elements are only removed if we can remove + * a *whole* node of the radix tree. The elements are removed from the head + * of the stream (older elements). + * + * The function may return zero if: + * + * 1) The stream is already shorter or equal to the specified max length. + * 2) The 'approx' option is true and the head node had not enough elements + * to be deleted, leaving the stream with a number of elements >= maxlen. + */ +int64_t streamTrimByLength(stream *s, size_t maxlen, int approx) { + if (s->length <= maxlen) return 0; + + raxIterator ri; + raxStart(&ri,s->rax); + raxSeek(&ri,"^",NULL,0); + + int64_t deleted = 0; + while(s->length > maxlen && raxNext(&ri)) { + unsigned char *lp = ri.data, *p = lpFirst(lp); + int64_t entries = lpGetInteger(p); + + /* Check if we can remove the whole node, and still have at + * least maxlen elements. */ + if (s->length - entries >= maxlen) { + raxRemove(s->rax,ri.key,ri.key_len,NULL); + raxSeek(&ri,">=",ri.key,ri.key_len); + s->length -= entries; + deleted += entries; + continue; + } + + /* If we cannot remove a whole element, and approx is true, + * stop here. */ + if (approx) break; + + /* Otherwise, we have to mark single entries inside the listpack + * as deleted. We start by updating the entries/deleted counters. */ + int64_t to_delete = s->length - maxlen; + serverAssert(to_delete < entries); + lp = lpReplaceInteger(lp,&p,entries-to_delete); + p = lpNext(lp,p); /* Seek deleted field. */ + int64_t deleted = lpGetInteger(p); + lp = lpReplaceInteger(lp,&p,deleted+to_delete); + p = lpNext(lp,p); /* Seek num-of-fields in the master entry. */ + + /* Skip all the master fields. */ + int64_t master_fields_count = lpGetInteger(p); + p = lpNext(lp,p); /* Seek the first field. */ + for (int64_t j = 0; j < master_fields_count; j++) + p = lpNext(lp,p); /* Skip all master fields. */ + + /* 'p' is now pointing to the first entry inside the listpack. 
+ * We have to run entry after entry, marking entries as deleted + * if they are already not deleted. */ + while(p) { + int flags = lpGetInteger(p); + int to_skip; + + /* Mark the entry as deleted. */ + if (!(flags & STREAM_ITEM_FLAG_DELETED)) { + flags |= STREAM_ITEM_FLAG_DELETED; + lp = lpReplaceInteger(lp,&p,flags); + deleted++; + s->length--; + if (s->length <= maxlen) break; /* Enough entries deleted. */ + } + + p = lpNext(lp,p); /* Skip ID ms delta. */ + p = lpNext(lp,p); /* Skip ID seq delta. */ + p = lpNext(lp,p); /* Seek num-fields or values (if compressed). */ + if (flags & STREAM_ITEM_FLAG_SAMEFIELDS) { + to_skip = master_fields_count; + } else { + to_skip = lpGetInteger(p); p = lpNext(lp,p); + to_skip = 1+(to_skip*2); + } + + while(to_skip--) p = lpNext(lp,p); /* Skip the whole entry. */ + } + + /* Here we should perform garbage collection in case at this point + * there are too many entries deleted inside the listpack. */ + entries -= to_delete; + deleted += to_delete; + if (entries + deleted > 10 && deleted > entries/2) { + /* TODO: perform a garbage collection. */ + } + + break; /* If we are here, there was enough to delete in the current + node, so no need to go to the next node. */ + } + + raxStop(&ri); + return deleted; +} + /* Initialize the stream iterator, so that we can call iterating functions * to get the next items. This requires a corresponding streamIteratorStop() * at the end. @@ -578,21 +679,32 @@ invalid: void xaddCommand(client *c) { streamID id; int id_given = 0; /* Was an ID different than "*" specified? */ + long long maxlen = 0; /* 0 means no maximum length. */ + int approx_maxlen = 0; /* If 1 only delete whole radix tree nodes, so + the maxium length is not applied verbatim. */ + int maxlen_arg_idx = 0; /* Index of the count in MAXLEN, for rewriting. */ /* Parse options. */ int i = 2; /* This is the first argument position where we could find an option, or the ID. 
*/ for (; i < c->argc; i++) { - int moreargs = i != c->argc-1; + int moreargs = (c->argc-1) - i; /* Number of additional arguments. */ char *opt = c->argv[i]->ptr; if (opt[0] == '*' && opt[1] == '\0') { /* This is just a fast path for the common case of auto-ID * creation. */ break; } else if (!strcasecmp(opt,"maxlen") && moreargs) { - addReplyError(c,"Sorry, MAXLEN is still not implemented"); + char *next = c->argv[i+1]->ptr; + /* Check for the form MAXLEN ~ . */ + if (moreargs >= 2 && next[0] == '~' && next[1] == '\0') { + approx_maxlen = 1; + i++; + } + if (getLongLongFromObjectOrReply(c,c->argv[i+1],&maxlen,NULL) + != C_OK) return; i++; - return; + maxlen_arg_idx = i; } else { /* If we are here is a syntax error or a valid ID. */ if (streamParseIDOrReply(NULL,c->argv[i],&id,0) == C_OK) { @@ -634,6 +746,20 @@ void xaddCommand(client *c) { notifyKeyspaceEvent(NOTIFY_STREAM,"xadd",c->argv[1],c->db->id); server.dirty++; + /* Remove older elements if MAXLEN was specified. */ + if (maxlen) { + if (!streamTrimByLength(s,maxlen,approx_maxlen)) { + /* If no trimming was performed, for instance because approximated + * trimming length was specified, rewrite the MAXLEN argument + * as zero, so that the command is propagated without trimming. */ + robj *zeroobj = createStringObjectFromLongLong(0); + rewriteClientCommandArgument(c,maxlen_arg_idx,zeroobj); + decrRefCount(zeroobj); + } else { + notifyKeyspaceEvent(NOTIFY_STREAM,"xtrim",c->argv[1],c->db->id); + } + } + /* Let's rewrite the ID argument with the one actually generated for * AOF/replication propagation. */ robj *idarg = createObject(OBJ_STRING, From 0248a6b125df10ab74de3db82bc348cc4a6dce63 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 29 Sep 2017 16:16:19 +0200 Subject: [PATCH 069/102] Streams: fix streamTrimByLength() standalone items skipping. 
--- src/t_stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/t_stream.c b/src/t_stream.c index a7505d15..4365aa47 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -383,7 +383,7 @@ int64_t streamTrimByLength(stream *s, size_t maxlen, int approx) { if (flags & STREAM_ITEM_FLAG_SAMEFIELDS) { to_skip = master_fields_count; } else { - to_skip = lpGetInteger(p); p = lpNext(lp,p); + to_skip = lpGetInteger(p); to_skip = 1+(to_skip*2); } From e53c90308b2940fa548a26583099e7a640d7abc9 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 29 Sep 2017 16:17:46 +0200 Subject: [PATCH 070/102] Streams: add XADD + MAXLEN test. --- tests/unit/type/stream.tcl | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/unit/type/stream.tcl b/tests/unit/type/stream.tcl index c63ed8a2..e9f187ae 100644 --- a/tests/unit/type/stream.tcl +++ b/tests/unit/type/stream.tcl @@ -79,6 +79,23 @@ start_server { assert {[streamCompareID $id2 $id3] == -1} } + test {XADD with MAXLEN option} { + r DEL mystream + for {set j 0} {$j < 1000} {incr j} { + if {rand() < 0.9} { + r XADD mystream MAXLEN 5 * xitem $j + } else { + r XADD mystream MAXLEN 5 * yitem $j + } + } + set res [r xrange mystream - +] + set expected 995 + foreach r $res { + assert {[lindex $r 1 1] == $expected} + incr expected + } + } + test {XADD mass insertion and XLEN} { r DEL mystream r multi From a4e6aae6b82a5118d128110f9f1c0ab0f979d5c2 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 29 Sep 2017 16:18:59 +0200 Subject: [PATCH 071/102] Streams: fix memory leak in streamTrimByLength(). --- src/t_stream.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/t_stream.c b/src/t_stream.c index 4365aa47..03fa5300 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -334,6 +334,7 @@ int64_t streamTrimByLength(stream *s, size_t maxlen, int approx) { /* Check if we can remove the whole node, and still have at * least maxlen elements. 
*/ if (s->length - entries >= maxlen) { + lpFree(lp); raxRemove(s->rax,ri.key,ri.key_len,NULL); raxSeek(&ri,">=",ri.key,ri.key_len); s->length -= entries; From 50595a58898474acf12e33137a83d4201b4b2d29 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 29 Sep 2017 18:00:40 +0200 Subject: [PATCH 072/102] Streams: fix XADD + MAXLEN propagation due to var shadowing. Clang should be more prone to return warnings by default when there is same-var-name shadowing. GCC does this and can avoid bugs like that. --- src/t_stream.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 03fa5300..55d06dd7 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -352,8 +352,8 @@ int64_t streamTrimByLength(stream *s, size_t maxlen, int approx) { serverAssert(to_delete < entries); lp = lpReplaceInteger(lp,&p,entries-to_delete); p = lpNext(lp,p); /* Seek deleted field. */ - int64_t deleted = lpGetInteger(p); - lp = lpReplaceInteger(lp,&p,deleted+to_delete); + int64_t marked_deleted = lpGetInteger(p); + lp = lpReplaceInteger(lp,&p,marked_deleted+to_delete); p = lpNext(lp,p); /* Seek num-of-fields in the master entry. */ /* Skip all the master fields. */ @@ -394,8 +394,8 @@ int64_t streamTrimByLength(stream *s, size_t maxlen, int approx) { /* Here we should perform garbage collection in case at this point * there are too many entries deleted inside the listpack. */ entries -= to_delete; - deleted += to_delete; - if (entries + deleted > 10 && deleted > entries/2) { + marked_deleted += to_delete; + if (entries + marked_deleted > 10 && marked_deleted > entries/2) { /* TODO: perform a garbage collection. */ } From 5082ec6419e58b59ac5f911c353276bf1340a9fd Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 3 Oct 2017 11:42:08 +0200 Subject: [PATCH 073/102] Streams: move ID ms/seq separator from '.' to '-' After checking with the community via Twitter (here: https://twitter.com/antirez/status/915130876861788161) the verdict was to use ":". 
However I later realized, after users lamented the fact that it's hard to copy IDs just with double click, that this was the reason why I moved to "." in the first instance. Fortunately "-", that was the other option with most votes, also gets selected with double click on most terminal applications on Linux and MacOS. So my reasoning was: 1) We can't retain "." because it's actually confusing to newcomers, it looks like a floating number, people may be tricked into thinking they can order IDs numerically as floats. 2) Moving to a double-click-to-select format is much better. People will work with such IDs for long time when coding / debugging. Why making now a choice that will impact this for the next years? The only other viable option was "-", and that's what I did. Thanks. --- src/t_stream.c | 8 ++++---- tests/unit/type/stream.tcl | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 55d06dd7..7838b92b 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -574,7 +574,7 @@ size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end while(streamIteratorGetID(&si,&id,&numfields)) { /* Emit a two elements array for each item. The first is * the ID, the second is an array of field-value pairs. */ - sds replyid = sdscatfmt(sdsempty(),"+%U.%U\r\n",id.ms,id.seq); + sds replyid = sdscatfmt(sdsempty(),"+%U-%U\r\n",id.ms,id.seq); addReplyMultiBulkLen(c,2); addReplySds(c,replyid); addReplyMultiBulkLen(c,numfields*2); @@ -660,7 +660,7 @@ int streamParseIDOrReply(client *c, robj *o, streamID *id, uint64_t missing_seq) } /* Parse . form. 
*/ - char *dot = strchr(buf,'.'); + char *dot = strchr(buf,'-'); if (dot) *dot = '\0'; uint64_t ms, seq; if (string2ull(buf,&ms) == 0) goto invalid; @@ -740,7 +740,7 @@ void xaddCommand(client *c) { "target stream top item"); return; } - sds reply = sdscatfmt(sdsempty(),"+%U.%U\r\n",id.ms,id.seq); + sds reply = sdscatfmt(sdsempty(),"+%U-%U\r\n",id.ms,id.seq); addReplySds(c,reply); signalModifiedKey(c->db,c->argv[1]); @@ -764,7 +764,7 @@ void xaddCommand(client *c) { /* Let's rewrite the ID argument with the one actually generated for * AOF/replication propagation. */ robj *idarg = createObject(OBJ_STRING, - sdscatfmt(sdsempty(),"%U.%U",id.ms,id.seq)); + sdscatfmt(sdsempty(),"%U-%U",id.ms,id.seq)); rewriteClientCommandArgument(c,i,idarg); decrRefCount(idarg); diff --git a/tests/unit/type/stream.tcl b/tests/unit/type/stream.tcl index e9f187ae..06f31e08 100644 --- a/tests/unit/type/stream.tcl +++ b/tests/unit/type/stream.tcl @@ -1,8 +1,8 @@ # return value is like strcmp() and similar. proc streamCompareID {a b} { if {$a eq $b} {return 0} - lassign [split $a .] a_ms a_seq - lassign [split $b .] b_ms b_seq + lassign [split $a -] a_ms a_seq + lassign [split $b -] b_ms b_seq if {$a_ms > $b_ms} {return 1} if {$a_ms < $b_ms} {return -1} # Same ms case, compare seq. @@ -14,9 +14,9 @@ proc streamCompareID {a b} { # Note that this function does not care to handle 'seq' overflow # since it's a 64 bit value. proc streamNextID {id} { - lassign [split $id .] ms seq + lassign [split $id -] ms seq incr seq - join [list $ms $seq] . + join [list $ms $seq] - } # Generate a random stream entry ID with the ms part between min and max @@ -24,12 +24,12 @@ proc streamNextID {id} { # XRANGE against a Tcl implementation implementing the same concept # with Tcl-only code in a linear array. proc streamRandomID {min_id max_id} { - lassign [split $min_id .] min_ms min_seq - lassign [split $max_id .] 
max_ms max_seq + lassign [split $min_id -] min_ms min_seq + lassign [split $max_id -] max_ms max_seq set delta [expr {$max_ms-$min_ms+1}] set ms [expr {$min_ms+[randomInt $delta]}] set seq [randomInt 1000] - return $ms.$seq + return $ms-$seq } # Tcl-side implementation of XRANGE to perform fuzz testing in the Redis From 1898c50573d5c9162cb579b3495a694c18e31739 Mon Sep 17 00:00:00 2001 From: antirez Date: Sat, 4 Nov 2017 18:05:46 +0100 Subject: [PATCH 074/102] Streams: fix XREAD test broken after previous tests improvements. 10% of times the data is not just "item 0" but there is also the "otherfield" part. Use [lrange] to avoid the issue. This commit fixes #4416. --- tests/unit/type/stream.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/type/stream.tcl b/tests/unit/type/stream.tcl index 06f31e08..5c58e7fb 100644 --- a/tests/unit/type/stream.tcl +++ b/tests/unit/type/stream.tcl @@ -149,7 +149,7 @@ start_server { test {XREAD with non empty second stream} { set res [r XREAD COUNT 1 STREAMS nostream mystream 0.0 0.0] assert {[lindex $res 0 0] eq {mystream}} - assert {[lindex $res 0 1 0 1] eq {item 0}} + assert {[lrange [lindex $res 0 1 0 1] 0 1] eq {item 0}} } test {Blocking XREAD waiting new data} { From 671b1f6a9dd517a51b02af49a226d3d59e2e2724 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 8 Nov 2017 22:57:10 +0100 Subject: [PATCH 075/102] Streams: fix TYPE for stream type. 
--- src/db.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/db.c b/src/db.c index 74c2be62..0ded5558 100644 --- a/src/db.c +++ b/src/db.c @@ -798,6 +798,7 @@ void typeCommand(client *c) { case OBJ_SET: type = "set"; break; case OBJ_ZSET: type = "zset"; break; case OBJ_HASH: type = "hash"; break; + case OBJ_STREAM: type = "stream"; break; case OBJ_MODULE: { moduleValue *mv = o->ptr; type = mv->type->name; From abab0b7817e48cb1ab0aaec6fff35890000396c5 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 8 Nov 2017 22:59:15 +0100 Subject: [PATCH 076/102] Streams: fix redis-cli to understand the stream type. --- src/redis-cli.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/redis-cli.c b/src/redis-cli.c index 4ad32578..1f80bc61 100644 --- a/src/redis-cli.c +++ b/src/redis-cli.c @@ -2074,7 +2074,8 @@ static void pipeMode(void) { #define TYPE_SET 2 #define TYPE_HASH 3 #define TYPE_ZSET 4 -#define TYPE_NONE 5 +#define TYPE_STREAM 5 +#define TYPE_NONE 6 static redisReply *sendScan(unsigned long long *it) { redisReply *reply = redisCommand(context, "SCAN %llu", *it); @@ -2133,6 +2134,8 @@ static int toIntType(char *key, char *type) { return TYPE_HASH; } else if(!strcmp(type, "zset")) { return TYPE_ZSET; + } else if(!strcmp(type, "stream")) { + return TYPE_STREAM; } else if(!strcmp(type, "none")) { return TYPE_NONE; } else { @@ -2221,7 +2224,7 @@ static void findBigKeys(void) { unsigned long long biggest[5] = {0}, counts[5] = {0}, totalsize[5] = {0}; unsigned long long sampled = 0, total_keys, totlen=0, *sizes=NULL, it=0; sds maxkeys[5] = {0}; - char *typename[] = {"string","list","set","hash","zset"}; + char *typename[] = {"string","list","set","hash","zset","stream"}; char *typeunit[] = {"bytes","items","members","fields","members"}; redisReply *reply, *keys; unsigned int arrsize=0, i; From 020fe26bd6300a153dc6a02e3411f65af7310118 Mon Sep 17 00:00:00 2001 From: antirez Date: Thu, 9 Nov 2017 12:04:26 +0100 Subject: [PATCH 077/102] Streams: 
fix COUNT parsing, issue #4433. --- src/t_stream.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/t_stream.c b/src/t_stream.c index 7838b92b..61b229a5 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -785,7 +785,7 @@ void xrangeCommand(client *c) { if (streamParseIDOrReply(c,c->argv[3],&endid,UINT64_MAX) == C_ERR) return; /* Parse the COUNT option if any. */ - if (c->argc > 4) { + if (c->argc > 5) { if (strcasecmp(c->argv[4]->ptr,"COUNT") == 0) { if (getLongLongFromObjectOrReply(c,c->argv[5],&count,NULL) != C_OK) return; From 0381931b4c6acbbd1cbbaa6814defaea9fd33847 Mon Sep 17 00:00:00 2001 From: antirez Date: Wed, 15 Nov 2017 12:48:32 +0100 Subject: [PATCH 078/102] Streams: Update listpack to fix 32bit strings encoding error. Note that streams produced by XADD in previous broken versions having elements with 4096 bytes or more will be permanently broken and must be created again from scratch. Fix #4428 Fix #4349 --- src/listpack.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/listpack.c b/src/listpack.c index e2702b65..6db4086e 100644 --- a/src/listpack.c +++ b/src/listpack.c @@ -283,7 +283,7 @@ int lpEncodeGetType(unsigned char *ele, uint32_t size, unsigned char *intenc, ui } else { if (size < 64) *enclen = 1+size; else if (size < 4096) *enclen = 2+size; - else *enclen = 4+size; + else *enclen = 5+size; return LP_ENCODING_STRING; } } @@ -363,7 +363,7 @@ void lpEncodeString(unsigned char *buf, unsigned char *s, uint32_t len) { buf[2] = (len >> 8) & 0xff; buf[3] = (len >> 16) & 0xff; buf[4] = (len >> 24) & 0xff; - memcpy(buf+4,s,len); + memcpy(buf+5,s,len); } } From 3c5d773f82eede4497cb3695d2cd32eec3e10382 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 17 Nov 2017 10:16:30 +0100 Subject: [PATCH 079/102] Streams: augment stream entries to allow backward scanning. 
--- src/t_stream.c | 47 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 61b229a5..14eba44c 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -199,9 +199,9 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, * * The master entry is composed like in the following example: * - * +-------+---------+------------+---------+--/--+---------+---------+ - * | count | deleted | num-fields | field_1 | field_2 | ... | field_N | - * +-------+---------+------------+---------+--/--+---------+---------+ + * +-------+---------+------------+---------+--/--+---------+---------+-+ + * | count | deleted | num-fields | field_1 | field_2 | ... | field_N |0| + * +-------+---------+------------+---------+--/--+---------+---------+-+ * * count and deleted just represent respectively the total number of * entires inside the listpack that are valid, and marked as deleted @@ -213,7 +213,11 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, * the radix tree node containing the listpack (delta encoding), and * if the fields of the entry are the same as the master enty fields, the * entry flags will specify this fact and the entry fields and number - * of fields will be omitted (see later in the code of this function). */ + * of fields will be omitted (see later in the code of this function). + * + * The "0" entry at the end is the same as the 'lp-count' entry in the + * regular stream entries (see below), and marks the fact that there are + * no more entires, when we scan the stream from right to left. 
*/ int flags = STREAM_ITEM_FLAG_NONE; if (lp == NULL || lp_bytes > STREAM_BYTES_PER_LISTPACK) { @@ -228,6 +232,7 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, sds field = argv[i*2]->ptr; lp = lpAppend(lp,(unsigned char*)field,sdslen(field)); } + lp = lpAppendInteger(lp,0); /* Master entry zero terminator. */ raxInsert(s->rax,(unsigned char*)&rax_key,sizeof(rax_key),lp,NULL); /* The first entry we insert, has obviously the same fields of the * master entry. */ @@ -271,20 +276,25 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, /* Populate the listpack with the new entry. We use the following * encoding: * - * +-----+--------+----------+-------+-------+-/-+-------+-------+ - * |flags|entry-id|num-fields|field-1|value-1|...|field-N|value-N| - * +-----+--------+----------+-------+-------+-/-+-------+-------+ + * +-----+--------+----------+-------+-------+-/-+-------+-------+--------+ + * |flags|entry-id|num-fields|field-1|value-1|...|field-N|value-N|lp-count| + * +-----+--------+----------+-------+-------+-/-+-------+-------+--------+ * * However if the SAMEFIELD flag is set, we have just to populate * the entry with the values, so it becomes: * - * +-----+--------+-------+-/-+-------+ - * |flags|entry-id|value-1|...|value-N| - * +-----+--------+-------+-/-+-------+ + * +-----+--------+-------+-/-+-------+--------+ + * |flags|entry-id|value-1|...|value-N|lp-count| + * +-----+--------+-------+-/-+-------+--------+ * * The entry-id field is actually two separated fields: the ms * and seq difference compared to the master entry. - */ + * + * The lp-count field is a number that states the number of listpack pieces + * that compose the entry, so that it's possible to travel the entry + * in reverse order: we can just start from the end of the listpack, read + * the entry, and jump back N times to seek the "flags" field to read + * the stream full entry. 
*/ lp = lpAppendInteger(lp,flags); lp = lpAppendInteger(lp,id.ms - master_id.ms); lp = lpAppendInteger(lp,id.seq - master_id.seq); @@ -296,6 +306,11 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, lp = lpAppend(lp,(unsigned char*)field,sdslen(field)); lp = lpAppend(lp,(unsigned char*)value,sdslen(value)); } + /* Compute and store the lp-count field. */ + int lp_count = numfields; + if (!(flags & STREAM_ITEM_FLAG_SAMEFIELDS)) lp_count *= 2; + lp_count += 3; /* Add the 3 fixed fileds flags + ms-diff + seq-diff. */ + lp = lpAppendInteger(lp,lp_count); /* Insert back into the tree in order to update the listpack pointer. */ raxInsert(s->rax,(unsigned char*)&rax_key,sizeof(rax_key),lp,NULL); @@ -361,6 +376,7 @@ int64_t streamTrimByLength(stream *s, size_t maxlen, int approx) { p = lpNext(lp,p); /* Seek the first field. */ for (int64_t j = 0; j < master_fields_count; j++) p = lpNext(lp,p); /* Skip all master fields. */ + p = lpNext(lp,p); /* Skip the zero master entry terminator. */ /* 'p' is now pointing to the first entry inside the listpack. * We have to run entry after entry, marking entries as deleted @@ -389,6 +405,7 @@ int64_t streamTrimByLength(stream *s, size_t maxlen, int approx) { } while(to_skip--) p = lpNext(lp,p); /* Skip the whole entry. */ + p = lpNext(lp,p); /* Skip the final lp-count field. */ } /* Here we should perform garbage collection in case at this point @@ -482,11 +499,17 @@ int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields) { /* Skip master fileds to seek the first entry. */ for (uint64_t i = 0; i < si->master_fields_count; i++) si->lp_ele = lpNext(si->lp,si->lp_ele); + /* We are now pointing the zero term of the master entry. */ } /* For every radix tree node, iterate the corresponding listpack, * returning elements when they are within range. */ - while(si->lp_ele) { + while(1) { + /* Skip the previous entry lp-count field, or in case of the + * master entry, the zero term field. 
*/ + si->lp_ele = lpNext(si->lp,si->lp_ele); + if (si->lp_ele == NULL) break; + /* Get the flags entry. */ int flags = lpGetInteger(si->lp_ele); si->lp_ele = lpNext(si->lp,si->lp_ele); /* Seek ID. */ From ee3490ec481c7f1ef89fe685b03c2b5f171d335b Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 17 Nov 2017 13:24:20 +0100 Subject: [PATCH 080/102] Streams: state machine for reverse iteration WIP 1. --- src/aof.c | 2 +- src/blocked.c | 2 +- src/stream.h | 7 +-- src/t_stream.c | 129 +++++++++++++++++++++++++++++++++++-------------- 4 files changed, 98 insertions(+), 42 deletions(-) diff --git a/src/aof.c b/src/aof.c index 5fbfdd69..79962fd0 100644 --- a/src/aof.c +++ b/src/aof.c @@ -1035,7 +1035,7 @@ int rewriteHashObject(rio *r, robj *key, robj *o) { * The function returns 0 on error, 1 on success. */ int rewriteStreamObject(rio *r, robj *key, robj *o) { streamIterator si; - streamIteratorStart(&si,o->ptr,NULL,NULL); + streamIteratorStart(&si,o->ptr,NULL,NULL,0); streamID id; int64_t numfields; diff --git a/src/blocked.c b/src/blocked.c index 734e6ffd..f438c335 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -326,7 +326,7 @@ void handleClientsBlockedOnKeys(void) { addReplyMultiBulkLen(receiver,2); addReplyBulk(receiver,rl->key); streamReplyWithRange(receiver,s,&start,NULL, - receiver->bpop.xread_count); + receiver->bpop.xread_count,0); } } } diff --git a/src/stream.h b/src/stream.h index df29e9e7..214b6d9a 100644 --- a/src/stream.h +++ b/src/stream.h @@ -28,9 +28,10 @@ typedef struct stream { typedef struct streamIterator { streamID master_id; /* ID of the master entry at listpack head. */ uint64_t master_fields_count; /* Master entries # of fields. */ - unsigned char *master_fields_start; /* Master entries start in listapck. */ + unsigned char *master_fields_start; /* Master entries start in listpack. */ unsigned char *master_fields_ptr; /* Master field to emit next. */ int entry_flags; /* Flags of entry we are emitting. 
*/ + int rev; /* True if iterating end to start (reverse). */ uint64_t start_key[2]; /* Start key as 128 bit big endian. */ uint64_t end_key[2]; /* End key as 128 bit big endian. */ raxIterator ri; /* Rax iterator. */ @@ -49,8 +50,8 @@ struct client; stream *streamNew(void); void freeStream(stream *s); -size_t streamReplyWithRange(struct client *c, stream *s, streamID *start, streamID *end, size_t count); -void streamIteratorStart(streamIterator *si, stream *s, streamID *start, streamID *end); +size_t streamReplyWithRange(struct client *c, stream *s, streamID *start, streamID *end, size_t count, int rev); +void streamIteratorStart(streamIterator *si, stream *s, streamID *start, streamID *end, int rev); int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields); void streamIteratorGetField(streamIterator *si, unsigned char **fieldptr, unsigned char **valueptr, int64_t *fieldlen, int64_t *valuelen); void streamIteratorStop(streamIterator *si); diff --git a/src/t_stream.c b/src/t_stream.c index 14eba44c..945fc28c 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -426,7 +426,9 @@ int64_t streamTrimByLength(stream *s, size_t maxlen, int approx) { /* Initialize the stream iterator, so that we can call iterating functions * to get the next items. This requires a corresponding streamIteratorStop() - * at the end. + * at the end. The 'rev' parameter controls the direction. If it's zero the + * iteration is from the start to the end element (inclusive), otherwise + * if rev is non-zero, the iteration is reversed. 
* * Once the iterator is initalized, we iterate like this: * @@ -443,7 +445,7 @@ int64_t streamTrimByLength(stream *s, size_t maxlen, int approx) { * } * } * streamIteratorStop(&myiterator); */ -void streamIteratorStart(streamIterator *si, stream *s, streamID *start, streamID *end) { +void streamIteratorStart(streamIterator *si, stream *s, streamID *start, streamID *end, int rev) { /* Intialize the iterator and translates the iteration start/stop * elements into a 128 big big-endian number. */ if (start) { @@ -462,17 +464,26 @@ void streamIteratorStart(streamIterator *si, stream *s, streamID *start, streamI /* Seek the correct node in the radix tree. */ raxStart(&si->ri,s->rax); - if (start && (start->ms || start->seq)) { - raxSeek(&si->ri,"<=",(unsigned char*)si->start_key, - sizeof(si->start_key)); - if (raxEOF(&si->ri)) - raxSeek(&si->ri,">",(unsigned char*)si->start_key, + if (!rev) { + if (start && (start->ms || start->seq)) { + raxSeek(&si->ri,"<=",(unsigned char*)si->start_key, sizeof(si->start_key)); + if (raxEOF(&si->ri)) raxSeek(&si->ri,"^",NULL,0); + } else { + raxSeek(&si->ri,"^",NULL,0); + } } else { - raxSeek(&si->ri,"^",NULL,0); + if (end && (end->ms || end->seq)) { + raxSeek(&si->ri,"<=",(unsigned char*)si->end_key, + sizeof(si->end_key)); + if (raxEOF(&si->ri)) raxSeek(&si->ri,"$",NULL,0); + } else { + raxSeek(&si->ri,"$",NULL,0); + } } si->lp = NULL; /* There is no current listpack right now. */ si->lp_ele = NULL; /* Current listpack cursor. */ + si->rev = rev; /* Direction, if non-zero reversed, from end to start. */ } /* Return 1 and store the current item ID at 'id' if there are still @@ -484,7 +495,8 @@ int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields) { * iteration or the previous listpack was completely iterated. * Go to the next node. 
*/ if (si->lp == NULL || si->lp_ele == NULL) { - if (!raxNext(&si->ri)) return 0; + if (!si->rev && !raxNext(&si->ri)) return 0; + else if (si->rev && !raxPrev(&si->ri)) return 0; serverAssert(si->ri.key_len == sizeof(streamID)); /* Get the master ID. */ streamDecodeID(si->ri.key,&si->master_id); @@ -499,16 +511,38 @@ int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields) { /* Skip master fileds to seek the first entry. */ for (uint64_t i = 0; i < si->master_fields_count; i++) si->lp_ele = lpNext(si->lp,si->lp_ele); - /* We are now pointing the zero term of the master entry. */ + /* We are now pointing the zero term of the master entry. If + * we are iterating in reverse order, we need to seek the + * end of the listpack. */ + if (si->rev) si->lp_ele = lpLast(si->lp); + } else if (si->rev) { + /* If we are itereating in the reverse order, and this is not + * the first entry emitted for this listpack, then we already + * emitted the current entry, and have to go back to the previous + * one. */ + int lp_count = lpGetInteger(si->lp_ele); + while(lp_count--) si->lp_ele = lpPrev(si->lp,si->lp_ele); + /* Seek lp-count of prev entry. */ + si->lp_ele = lpPrev(si->lp,si->lp_ele); } /* For every radix tree node, iterate the corresponding listpack, * returning elements when they are within range. */ while(1) { - /* Skip the previous entry lp-count field, or in case of the - * master entry, the zero term field. */ - si->lp_ele = lpNext(si->lp,si->lp_ele); - if (si->lp_ele == NULL) break; + if (!si->rev) { + /* If we are going forward, skip the previous entry + * lp-count field (or in case of the master entry, the zero + * term field) */ + si->lp_ele = lpNext(si->lp,si->lp_ele); + if (si->lp_ele == NULL) break; + } else { + /* If we are going backward, read the number of elements this + * entry is composed of, and jump backward N times to seek + * its start. 
*/ + int lp_count = lpGetInteger(si->lp_ele); + if (lp_count == 0) break; /* We reached the master entry. */ + while(lp_count--) si->lp_ele = lpPrev(si->lp,si->lp_ele); + } /* Get the flags entry. */ int flags = lpGetInteger(si->lp_ele); @@ -535,15 +569,28 @@ int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields) { /* If current >= start, and the entry is not marked as * deleted, emit it. */ - if (memcmp(buf,si->start_key,sizeof(streamID)) >= 0 && - !(flags & STREAM_ITEM_FLAG_DELETED)) - { - if (memcmp(buf,si->end_key,sizeof(streamID)) > 0) - return 0; /* We are already out of range. */ - si->entry_flags = flags; - if (flags & STREAM_ITEM_FLAG_SAMEFIELDS) - si->master_fields_ptr = si->master_fields_start; - return 1; /* Valid item returned. */ + if (!si->rev) { + if (memcmp(buf,si->start_key,sizeof(streamID)) >= 0 && + !(flags & STREAM_ITEM_FLAG_DELETED)) + { + if (memcmp(buf,si->end_key,sizeof(streamID)) > 0) + return 0; /* We are already out of range. */ + si->entry_flags = flags; + if (flags & STREAM_ITEM_FLAG_SAMEFIELDS) + si->master_fields_ptr = si->master_fields_start; + return 1; /* Valid item returned. */ + } + } else { + if (memcmp(buf,si->end_key,sizeof(streamID)) <= 0 && + !(flags & STREAM_ITEM_FLAG_DELETED)) + { + if (memcmp(buf,si->start_key,sizeof(streamID)) < 0) + return 0; /* We are already out of range. */ + si->entry_flags = flags; + if (flags & STREAM_ITEM_FLAG_SAMEFIELDS) + si->master_fields_ptr = si->master_fields_start; + return 1; /* Valid item returned. */ + } } /* If we do not emit, we have to discard. */ @@ -553,7 +600,7 @@ int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields) { si->lp_ele = lpNext(si->lp,si->lp_ele); } - /* End of listpack reached. Try the next radix tree node. */ + /* End of listpack reached. Try the next/prev radix tree node. */ } } @@ -585,15 +632,16 @@ void streamIteratorStop(streamIterator *si) { /* Send the specified range to the client 'c'. 
The range the client will * receive is between start and end inclusive, if 'count' is non zero, no more * than 'count' elemnets are sent. The 'end' pointer can be NULL to mean that - * we want all the elements from 'start' till the end of the stream. */ -size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end, size_t count) { + * we want all the elements from 'start' till the end of the stream. If 'rev' + * is non zero, elements are produced in reversed order from end to start. */ +size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end, size_t count, int rev) { void *arraylen_ptr = addDeferredMultiBulkLength(c); size_t arraylen = 0; streamIterator si; int64_t numfields; streamID id; - streamIteratorStart(&si,s,start,end); + streamIteratorStart(&si,s,start,end,rev); while(streamIteratorGetID(&si,&id,&numfields)) { /* Emit a two elements array for each item. The first is * the ID, the second is an array of field-value pairs. */ @@ -797,25 +845,32 @@ void xaddCommand(client *c) { signalKeyAsReady(c->db, c->argv[1]); } -/* XRANGE key start end [COUNT ] */ +/* XRANGE key start end [COUNT ] [REV] */ void xrangeCommand(client *c) { robj *o; stream *s; streamID startid, endid; long long count = 0; + int rev = 0; if (streamParseIDOrReply(c,c->argv[2],&startid,0) == C_ERR) return; if (streamParseIDOrReply(c,c->argv[3],&endid,UINT64_MAX) == C_ERR) return; /* Parse the COUNT option if any. */ - if (c->argc > 5) { - if (strcasecmp(c->argv[4]->ptr,"COUNT") == 0) { - if (getLongLongFromObjectOrReply(c,c->argv[5],&count,NULL) != C_OK) + if (c->argc > 4) { + for (int j = 4; j < c->argc; j++) { + int additional = c->argc-j-1; + if (strcasecmp(c->argv[j]->ptr,"COUNT") == 0 && additional >= 1) { + if (getLongLongFromObjectOrReply(c,c->argv[j+1],&count,NULL) + != C_OK) return; + if (count < 0) count = 0; + j++; /* Consume additional arg. 
*/ + } else if (strcasecmp(c->argv[j]->ptr,"REV") == 0) { + rev = 1; + } else { + addReply(c,shared.syntaxerr); return; - if (count < 0) count = 0; - } else { - addReply(c,shared.syntaxerr); - return; + } } } @@ -823,7 +878,7 @@ void xrangeCommand(client *c) { if ((o = lookupKeyReadOrReply(c,c->argv[1],shared.emptymultibulk)) == NULL || checkType(c,o,OBJ_STREAM)) return; s = o->ptr; - streamReplyWithRange(c,s,&startid,&endid,count); + streamReplyWithRange(c,s,&startid,&endid,count,rev); } /* XLEN */ @@ -931,7 +986,7 @@ void xreadCommand(client *c) { * of the stream and the data we extracted from it. */ addReplyMultiBulkLen(c,2); addReplyBulk(c,c->argv[i+streams_arg]); - streamReplyWithRange(c,s,&start,NULL,count); + streamReplyWithRange(c,s,&start,NULL,count,0); } } From 6919280cc5eb9a35887ddaa528053380d584327a Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 17 Nov 2017 13:47:50 +0100 Subject: [PATCH 081/102] Streams: fix reverse iteration next node jumping. --- src/t_stream.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/t_stream.c b/src/t_stream.c index 945fc28c..f64824c9 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -540,7 +540,11 @@ int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields) { * entry is composed of, and jump backward N times to seek * its start. */ int lp_count = lpGetInteger(si->lp_ele); - if (lp_count == 0) break; /* We reached the master entry. */ + if (lp_count == 0) { /* We reached the master entry. */ + si->lp = NULL; + si->lp_ele = NULL; + break; + } while(lp_count--) si->lp_ele = lpPrev(si->lp,si->lp_ele); } From 9dc79c039a16674458a39c8bdfbcfe049f3fae77 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 17 Nov 2017 16:02:11 +0100 Subject: [PATCH 082/102] Streams: fix reverse iterator discarding of items out of range. 
--- src/t_stream.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index f64824c9..efb01ef6 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -597,11 +597,18 @@ int streamIteratorGetID(streamIterator *si, streamID *id, int64_t *numfields) { } } - /* If we do not emit, we have to discard. */ - int to_discard = (flags & STREAM_ITEM_FLAG_SAMEFIELDS) ? - *numfields : *numfields*2; - for (int64_t i = 0; i < to_discard; i++) - si->lp_ele = lpNext(si->lp,si->lp_ele); + /* If we do not emit, we have to discard if we are going + * forward, or seek the previous entry if we are going + * backward. */ + if (!si->rev) { + int to_discard = (flags & STREAM_ITEM_FLAG_SAMEFIELDS) ? + *numfields : *numfields*2; + for (int64_t i = 0; i < to_discard; i++) + si->lp_ele = lpNext(si->lp,si->lp_ele); + } else { + int prev_times = 4; /* flag + id ms/seq diff + numfields. */ + while(prev_times--) si->lp_ele = lpPrev(si->lp,si->lp_ele); + } } /* End of listpack reached. Try the next/prev radix tree node. */ From 9bb18e54380250f3fb931028952379c3ab2dec29 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 20 Nov 2017 11:25:05 +0100 Subject: [PATCH 083/102] Streams: XRANGE REV option -> XREVRANGE command. 
--- src/server.c | 1 + src/server.h | 1 + src/t_stream.c | 23 ++++++++++++++++------- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/src/server.c b/src/server.c index f1fd06ca..af19b5a3 100644 --- a/src/server.c +++ b/src/server.c @@ -304,6 +304,7 @@ struct redisCommand redisCommandTable[] = { {"pfdebug",pfdebugCommand,-3,"w",0,NULL,0,0,0,0,0}, {"xadd",xaddCommand,-5,"wmF",0,NULL,1,1,1,0,0}, {"xrange",xrangeCommand,-4,"r",0,NULL,1,1,1,0,0}, + {"xrevrange",xrevrangeCommand,-4,"r",0,NULL,1,1,1,0,0}, {"xlen",xlenCommand,2,"rF",0,NULL,1,1,1,0,0}, {"xread",xreadCommand,-3,"rs",0,xreadGetKeys,1,1,1,0,0}, {"post",securityWarningCommand,-1,"lt",0,NULL,0,0,0,0,0}, diff --git a/src/server.h b/src/server.h index bc572b1e..d65fd0c5 100644 --- a/src/server.h +++ b/src/server.h @@ -2010,6 +2010,7 @@ void moduleCommand(client *c); void securityWarningCommand(client *c); void xaddCommand(client *c); void xrangeCommand(client *c); +void xrevrangeCommand(client *c); void xlenCommand(client *c); void xreadCommand(client *c); diff --git a/src/t_stream.c b/src/t_stream.c index efb01ef6..837a812a 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -856,16 +856,17 @@ void xaddCommand(client *c) { signalKeyAsReady(c->db, c->argv[1]); } -/* XRANGE key start end [COUNT ] [REV] */ -void xrangeCommand(client *c) { +/* XRANGE/XREVRANGE actual implementation. */ +void xrangeGenericCommand(client *c, int rev) { robj *o; stream *s; streamID startid, endid; long long count = 0; - int rev = 0; + robj *startarg = rev ? c->argv[3] : c->argv[2]; + robj *endarg = rev ? c->argv[2] : c->argv[3]; - if (streamParseIDOrReply(c,c->argv[2],&startid,0) == C_ERR) return; - if (streamParseIDOrReply(c,c->argv[3],&endid,UINT64_MAX) == C_ERR) return; + if (streamParseIDOrReply(c,startarg,&startid,0) == C_ERR) return; + if (streamParseIDOrReply(c,endarg,&endid,UINT64_MAX) == C_ERR) return; /* Parse the COUNT option if any. 
*/ if (c->argc > 4) { @@ -876,8 +877,6 @@ void xrangeCommand(client *c) { != C_OK) return; if (count < 0) count = 0; j++; /* Consume additional arg. */ - } else if (strcasecmp(c->argv[j]->ptr,"REV") == 0) { - rev = 1; } else { addReply(c,shared.syntaxerr); return; @@ -892,6 +891,16 @@ void xrangeCommand(client *c) { streamReplyWithRange(c,s,&startid,&endid,count,rev); } +/* XRANGE key start end [COUNT ] */ +void xrangeCommand(client *c) { + xrangeGenericCommand(c,0); +} + +/* XREVRANGE key end start [COUNT ] */ +void xrevrangeCommand(client *c) { + xrangeGenericCommand(c,1); +} + /* XLEN */ void xlenCommand(client *c) { robj *o; From 115d076d655d265ef534bb0782da8a2ba0d877ac Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 21 Nov 2017 22:21:37 +0100 Subject: [PATCH 084/102] Streams: fix lp-count field for non-same-fields entries. --- src/t_stream.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 837a812a..213a46bb 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -308,8 +308,12 @@ int streamAppendItem(stream *s, robj **argv, int numfields, streamID *added_id, } /* Compute and store the lp-count field. */ int lp_count = numfields; - if (!(flags & STREAM_ITEM_FLAG_SAMEFIELDS)) lp_count *= 2; - lp_count += 3; /* Add the 3 fixed fileds flags + ms-diff + seq-diff. */ + lp_count += 3; /* Add the 3 fixed fields flags + ms-diff + seq-diff. */ + if (!(flags & STREAM_ITEM_FLAG_SAMEFIELDS)) { + /* If the item is not compressed, it also has the fields other than + * the values, and an additional num-fileds field. */ + lp_count += numfields+1; + } lp = lpAppendInteger(lp,lp_count); /* Insert back into the tree in order to update the listpack pointer. */ From 45fe1f5e0019c6745e531da3ea51eee609f15ec3 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 21 Nov 2017 22:22:05 +0100 Subject: [PATCH 085/102] Streams: add some initial test for XREVRANGE. 
--- tests/unit/type/stream.tcl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/unit/type/stream.tcl b/tests/unit/type/stream.tcl index 5c58e7fb..d7b5ca2a 100644 --- a/tests/unit/type/stream.tcl +++ b/tests/unit/type/stream.tcl @@ -121,6 +121,10 @@ start_server { assert {[llength [r xrange mystream - + COUNT 10]] == 10} } + test {XREVRANGE COUNT works as expected} { + assert {[llength [r xrevrange mystream + - COUNT 10]] == 10} + } + test {XRANGE can be used to iterate the whole stream} { set last_id "-" set j 0 @@ -136,6 +140,10 @@ start_server { assert {$j == 10000} } + test {XREVRANGE returns the reverse of XRANGE} { + assert {[r xrange mystream - +] == [lreverse [r xrevrange mystream + -]]} + } + test {XREAD with non empty stream} { set res [r XREAD COUNT 1 STREAMS mystream 0.0] assert {[lrange [lindex $res 0 1 0 1] 0 1] eq {item 0}} From f42df6f43a59fa92a82e651d1c5858bc5e72c3ef Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 1 Dec 2017 12:50:18 +0100 Subject: [PATCH 086/102] Streams: add code to compute the stream memory usage. It's a bit of black magic without actually tracking it inside rax.c, however Redis usage of the radix tree for the stream data structure is quite consistent, so a few magic constants apparently are producing results that make sense. --- src/object.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/src/object.c b/src/object.c index b689edcf..28df5790 100644 --- a/src/object.c +++ b/src/object.c @@ -800,6 +800,49 @@ size_t objectComputeSize(robj *o, size_t sample_size) { } else { serverPanic("Unknown hash encoding"); } + } else if (o->type == OBJ_STREAM) { + stream *s = o->ptr; + /* Note: to guess the size of the radix tree is not trivial, so we + * approximate it considering 64 bytes of data overhead for each + * key (the ID), and then adding the number of bare nodes, plus some + * overhead due by the data and child pointers. 
This secret recipe + * was obtained by checking the average radix tree created by real + * workloads, and then adjusting the constants to get numbers that + * more or less match the real memory usage. + * + * Actually the number of nodes and keys may be different depending + * on the insertion speed and thus the ability of the radix tree + * to compress prefixes. */ + asize = sizeof(*o); + asize += s->rax->numele * 64; + asize += s->rax->numnodes * sizeof(raxNode); + asize += s->rax->numnodes * 32*7; /* Add a few child pointers... */ + + /* Now we have to add the listpacks. The last listpack is often non + * complete, so we estimate the size of the first N listpacks, and + * use the average to compute the size of the first N-1 listpacks, and + * finally add the real size of the last node. */ + raxIterator ri; + raxStart(&ri,s->rax); + raxSeek(&ri,"^",NULL,0); + size_t lpsize = 0, samples = 0; + while(samples < sample_size && raxNext(&ri)) { + unsigned char *lp = ri.data; + lpsize += lpBytes(lp); + samples++; + } + if (s->rax->numele <= samples) { + asize += lpsize; + } else { + if (samples) lpsize /= samples; /* Compute the average. */ + asize += lpsize * (s->rax->numele-1); + /* No need to check if seek succeeded, we enter this branch only + * if there are a few elements in the radix tree. */ + raxSeek(&ri,"$",NULL,0); + raxNext(&ri); + asize += lpBytes(ri.data); + } + raxStop(&ri); } else if (o->type == OBJ_MODULE) { moduleValue *mv = o->ptr; moduleType *mt = mv->type; From 8ac76be5f2d44341a1c7f67645c753aafbde7804 Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 1 Dec 2017 15:04:05 +0100 Subject: [PATCH 087/102] Streams: DEBUG DIGEST support. 
--- src/debug.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/debug.c b/src/debug.c index 5c3fd347..5b08475a 100644 --- a/src/debug.c +++ b/src/debug.c @@ -239,6 +239,27 @@ void computeDatasetDigest(unsigned char *final) { xorDigest(digest,eledigest,20); } hashTypeReleaseIterator(hi); + } else if (o->type == OBJ_STREAM) { + streamIterator si; + streamIteratorStart(&si,o->ptr,NULL,NULL,0); + streamID id; + int64_t numfields; + + while(streamIteratorGetID(&si,&id,&numfields)) { + sds itemid = sdscatfmt(sdsempty(),"%U.%U",id.ms,id.seq); + mixDigest(digest,itemid,sdslen(itemid)); + sdsfree(itemid); + + while(numfields--) { + unsigned char *field, *value; + int64_t field_len, value_len; + streamIteratorGetField(&si,&field,&value, + &field_len,&value_len); + mixDigest(digest,field,field_len); + mixDigest(digest,value,value_len); + } + } + streamIteratorStop(&si); } else if (o->type == OBJ_MODULE) { RedisModuleDigest md; moduleValue *mv = o->ptr; From 65a9740fa880a8e5b4640037a3670f8a2f33080b Mon Sep 17 00:00:00 2001 From: antirez Date: Fri, 1 Dec 2017 16:01:10 +0100 Subject: [PATCH 088/102] Fix loading of RDB files lua AUX fields when the script is defined. In the case of slaves loading the RDB from master, or in other similar cases, the script is already defined, and the function registering the script should not fail in the assert() call. --- src/rdb.c | 2 +- src/scripting.c | 12 +++++++++--- src/server.h | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/rdb.c b/src/rdb.c index 17a93275..19f25499 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1679,7 +1679,7 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) { if (rsi) rsi->repl_offset = strtoll(auxval->ptr,NULL,10); } else if (!strcasecmp(auxkey->ptr,"lua")) { /* Load the script back in memory. 
*/ - if (luaCreateFunction(NULL,server.lua,NULL,auxval) == C_ERR) { + if (luaCreateFunction(NULL,server.lua,NULL,auxval,1) == C_ERR) { rdbExitReportCorruptRDB( "Can't load Lua script from RDB file! " "BODY: %s", auxval->ptr); diff --git a/src/scripting.c b/src/scripting.c index 848629e2..ea167365 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -1151,13 +1151,16 @@ int redis_math_randomseed (lua_State *L) { * on the fly doing the SHA1 of the body, this means that passing the funcname * is just an optimization in case it's already at hand. * + * if 'allow_dup' is true, the function can be called with a script already + * in memory without crashing in assert(). In this case C_OK is returned. + * * The function increments the reference count of the 'body' object as a * side effect of a successful call. * * On success C_OK is returned, and nothing is left on the Lua stack. * On error C_ERR is returned and an appropriate error is set in the * client context. */ -int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body) { +int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body, int allow_dup) { sds funcdef = sdsempty(); char fname[43]; @@ -1168,6 +1171,9 @@ int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body) { funcname = fname; } + if (allow_dup && dictFind(server.lua_scripts,funcname+2) != NULL) + return C_OK; + funcdef = sdscat(funcdef,"function "); funcdef = sdscatlen(funcdef,funcname,42); funcdef = sdscatlen(funcdef,"() ",3); @@ -1302,7 +1308,7 @@ void evalGenericCommand(client *c, int evalsha) { addReply(c, shared.noscripterr); return; } - if (luaCreateFunction(c,lua,funcname,c->argv[1]) == C_ERR) { + if (luaCreateFunction(c,lua,funcname,c->argv[1],0) == C_ERR) { lua_pop(lua,1); /* remove the error handler from the stack. */ /* The error is sent to the client by luaCreateFunction() * itself when it returns C_ERR. 
*/ @@ -1474,7 +1480,7 @@ void scriptCommand(client *c) { sha1hex(funcname+2,c->argv[2]->ptr,sdslen(c->argv[2]->ptr)); sha = sdsnewlen(funcname+2,40); if (dictFind(server.lua_scripts,sha) == NULL) { - if (luaCreateFunction(c,server.lua,funcname,c->argv[2]) + if (luaCreateFunction(c,server.lua,funcname,c->argv[2],0) == C_ERR) { sdsfree(sha); return; diff --git a/src/server.h b/src/server.h index d65fd0c5..498a0550 100644 --- a/src/server.h +++ b/src/server.h @@ -1794,7 +1794,7 @@ void scriptingInit(int setup); int ldbRemoveChild(pid_t pid); void ldbKillForkedSessions(void); int ldbPendingChildren(void); -int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body); +int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body, int allow_dup); /* Blocked clients */ void processUnblockedClients(void); From 6a1bf07a46ce7d8cb3d9443abedd05dfa2f8096a Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 4 Dec 2017 10:24:52 +0100 Subject: [PATCH 089/102] DEBUG change-repl-id implemented. With PSYNC2 to force a full SYNC in tests is hard. With this new DEBUG subcommand we just need to call it and then CLIENT KILL TYPE master in the slave. --- src/debug.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/debug.c b/src/debug.c index 5b08475a..9e084f9d 100644 --- a/src/debug.c +++ b/src/debug.c @@ -329,6 +329,8 @@ void debugCommand(client *c) { "structsize -- Return the size of different Redis core C structures."); blen++; addReplyStatus(c, "htstats -- Return hash table statistics of the specified Redis database."); + blen++; addReplyStatus(c, + "change-repl-id -- Change the replication IDs of the instance. 
Dangerous, should be used only for testing the replication subsystem."); setDeferredMultiBulkLength(c,blenp,blen); } else if (!strcasecmp(c->argv[1]->ptr,"segfault")) { *((char*)-1) = 'x'; @@ -570,6 +572,11 @@ void debugCommand(client *c) { stats = sdscat(stats,buf); addReplyBulkSds(c,stats); + } else if (!strcasecmp(c->argv[1]->ptr,"change-repl-id") && c->argc == 2) { + serverLog(LL_WARNING,"Changing replication IDs after receiving DEBUG change-repl-id"); + changeReplicationId(); + clearReplicationId2(); + addReply(c,shared.ok); } else { addReplyErrorFormat(c, "Unknown DEBUG subcommand or wrong number of arguments for '%s'", (char*)c->argv[1]->ptr); From 6f0b19bc5b0f89d7d9d89e84de1f4c9a859df59c Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 4 Dec 2017 10:26:02 +0100 Subject: [PATCH 090/102] Regression test for #4505 (Lua AUX field loading). --- tests/integration/replication-3.tcl | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tests/integration/replication-3.tcl b/tests/integration/replication-3.tcl index 50dcb9a9..580be760 100644 --- a/tests/integration/replication-3.tcl +++ b/tests/integration/replication-3.tcl @@ -100,7 +100,6 @@ start_server {tags {"repl"}} { close $fd puts "Master - Slave inconsistency" puts "Run diff -u against /tmp/repldump*.txt for more info" - } set old_digest [r debug digest] @@ -109,5 +108,27 @@ start_server {tags {"repl"}} { set new_digest [r debug digest] assert {$old_digest eq $new_digest} } + + test {SLAVE can reload "lua" AUX RDB fields of duplicated scripts} { + # Force a Slave full resynchronization + r debug change-repl-id + r -1 client kill type master + + # Check that after a full resync the slave can still load + # correctly the RDB file: such file will contain "lua" AUX + # sections with scripts already in the memory of the master. + + wait_for_condition 50 100 { + [s -1 master_link_status] eq {up} + } else { + fail "Replication not started." 
+ } + + wait_for_condition 50 100 { + [r debug digest] eq [r -1 debug digest] + } else { + fail "DEBUG DIGEST mismatch after full SYNC with many scripts" + } + } } } From 68681f2bcf8d17e573c27ff3fc676ddde381204c Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 4 Dec 2017 10:33:04 +0100 Subject: [PATCH 091/102] Fix issue #4505, Lua RDB AUX field loading of existing scripts. Unfortunately, as outlined by @soloestoy in #4505, "lua" AUX RDB field loading in case of duplicated script was still broken. This commit fixes this problem and also a memory leak introduced by the past commit. Note that now we have a regression test able to duplicate the issue, so this commit was actually tested against the regression. The original PR also had a valid fix, but I prefer to hide the details of scripting.c outside scripting.c, and later "SCRIPT LOAD" should also be able to use the function luaCreateFunction() instead of redoing the work. --- src/scripting.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/scripting.c b/src/scripting.c index ea167365..9427b7b6 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -1161,7 +1161,6 @@ int redis_math_randomseed (lua_State *L) { * On error C_ERR is returned and an appropriate error is set in the * client context. 
*/ int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body, int allow_dup) { - sds funcdef = sdsempty(); char fname[43]; if (funcname == NULL) { @@ -1171,9 +1170,16 @@ int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body, int funcname = fname; } - if (allow_dup && dictFind(server.lua_scripts,funcname+2) != NULL) - return C_OK; + if (allow_dup) { + sds sha = sdsnewlen(funcname+2,40); + if (allow_dup && dictFind(server.lua_scripts,sha) != NULL) { + sdsfree(sha); + return C_OK; + } + sdsfree(sha); + } + sds funcdef = sdsempty(); funcdef = sdscat(funcdef,"function "); funcdef = sdscatlen(funcdef,funcname,42); funcdef = sdscatlen(funcdef,"() ",3); From c6eca690ee0df88361dd878563fcbc6b1ce214a2 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 4 Dec 2017 10:55:46 +0100 Subject: [PATCH 092/102] Remove useless variable check from luaCreateFunction(). The block is already inside if (allow_dup). --- src/scripting.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripting.c b/src/scripting.c index 9427b7b6..e6a70e54 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -1172,7 +1172,7 @@ int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body, int if (allow_dup) { sds sha = sdsnewlen(funcname+2,40); - if (allow_dup && dictFind(server.lua_scripts,sha) != NULL) { + if (dictFind(server.lua_scripts,sha) != NULL) { sdsfree(sha); return C_OK; } From 60d26acfc8bd4a4367d60b3a8b74af4031171fd6 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 4 Dec 2017 11:25:20 +0100 Subject: [PATCH 093/102] Refactoring: improve luaCreateFunction() API. The function in its initial form, and after the fixes for the PSYNC2 bugs, required code duplication in multiple spots. 
This commit modifies it in order to always compute the script name independently, and to return the SDS of the SHA of the body: this way it can be used in all the places, including for SCRIPT LOAD, without duplicating the code to create the Lua function name. Note that this requires to re-compute the body SHA1 in the case of EVAL seeing a script for the first time, but this should not change scripting performance in any way because new scripts definition is a rare event happening the first time a script is seen, and the SHA1 computation is anyway not a very slow process against the typical Redis script and compared to the actual Lua byte compiling of the body. Note that the function used to assert() if a duplicated script was loaded, however actually now two times over three, we want the function to handle duplicated scripts just fine: this happens in SCRIPT LOAD and in RDB AUX "lua" loading. Moreover the assert was not defending against some obvious failure mode, so now the function always tests against already defined functions at start. --- src/rdb.c | 2 +- src/scripting.c | 88 +++++++++++++++++++------------------------------ src/server.h | 2 +- 3 files changed, 36 insertions(+), 56 deletions(-) diff --git a/src/rdb.c b/src/rdb.c index 19f25499..28985b2a 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1679,7 +1679,7 @@ int rdbLoadRio(rio *rdb, rdbSaveInfo *rsi) { if (rsi) rsi->repl_offset = strtoll(auxval->ptr,NULL,10); } else if (!strcasecmp(auxkey->ptr,"lua")) { /* Load the script back in memory. */ - if (luaCreateFunction(NULL,server.lua,NULL,auxval,1) == C_ERR) { + if (luaCreateFunction(NULL,server.lua,auxval) == NULL) { rdbExitReportCorruptRDB( "Can't load Lua script from RDB file!
" "BODY: %s", auxval->ptr); diff --git a/src/scripting.c b/src/scripting.c index e6a70e54..a781e68e 100644 --- a/src/scripting.c +++ b/src/scripting.c @@ -1141,42 +1141,35 @@ int redis_math_randomseed (lua_State *L) { * EVAL and SCRIPT commands implementation * ------------------------------------------------------------------------- */ -/* Define a lua function with the specified function name and body. - * The function name musts be a 42 characters long string, since all the - * functions we defined in the Lua context are in the form: +/* Define a Lua function with the specified body. + * The function name will be generated in the following form: * * f_ * - * If 'funcname' is NULL, the function name is created by the function - * on the fly doing the SHA1 of the body, this means that passing the funcname - * is just an optimization in case it's already at hand. - * - * if 'allow_dup' is true, the function can be called with a script already - * in memory without crashing in assert(). In this case C_OK is returned. - * * The function increments the reference count of the 'body' object as a * side effect of a successful call. * - * On success C_OK is returned, and nothing is left on the Lua stack. - * On error C_ERR is returned and an appropriate error is set in the - * client context. */ -int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body, int allow_dup) { - char fname[43]; + * On success a pointer to an SDS string representing the function SHA1 of the + * just added function is returned (and will be valid until the next call + * to scriptingReset() function), otherwise NULL is returned. + * + * The function handles the fact of being called with a script that already + * exists, and in such a case, it behaves like in the success case. + * + * If 'c' is not NULL, on error the client is informed with an appropriate + * error describing the nature of the problem and the Lua interpreter error. 
*/ +sds luaCreateFunction(client *c, lua_State *lua, robj *body) { + char funcname[43]; + dictEntry *de; - if (funcname == NULL) { - fname[0] = 'f'; - fname[1] = '_'; - sha1hex(fname+2,body->ptr,sdslen(body->ptr)); - funcname = fname; - } + funcname[0] = 'f'; + funcname[1] = '_'; + sha1hex(funcname+2,body->ptr,sdslen(body->ptr)); - if (allow_dup) { - sds sha = sdsnewlen(funcname+2,40); - if (dictFind(server.lua_scripts,sha) != NULL) { - sdsfree(sha); - return C_OK; - } + sds sha = sdsnewlen(funcname+2,40); + if ((de = dictFind(server.lua_scripts,sha)) != NULL) { sdsfree(sha); + return dictGetKey(de); } sds funcdef = sdsempty(); @@ -1193,29 +1186,29 @@ int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body, int lua_tostring(lua,-1)); } lua_pop(lua,1); + sdsfree(sha); sdsfree(funcdef); - return C_ERR; + return NULL; } sdsfree(funcdef); + if (lua_pcall(lua,0,0,0)) { if (c != NULL) { addReplyErrorFormat(c,"Error running script (new function): %s\n", lua_tostring(lua,-1)); } lua_pop(lua,1); - return C_ERR; + sdsfree(sha); + return NULL; } /* We also save a SHA1 -> Original script map in a dictionary * so that we can replicate / write in the AOF all the * EVALSHA commands as EVAL using the original script. */ - { - int retval = dictAdd(server.lua_scripts, - sdsnewlen(funcname+2,40),body); - serverAssertWithInfo(c ? c : server.lua_client,NULL,retval == DICT_OK); - incrRefCount(body); - } - return C_OK; + int retval = dictAdd(server.lua_scripts,sha,body); + serverAssertWithInfo(c ? c : server.lua_client,NULL,retval == DICT_OK); + incrRefCount(body); + return sha; } /* This is the Lua script "count" hook that we use to detect scripts timeout. */ @@ -1314,10 +1307,10 @@ void evalGenericCommand(client *c, int evalsha) { addReply(c, shared.noscripterr); return; } - if (luaCreateFunction(c,lua,funcname,c->argv[1],0) == C_ERR) { + if (luaCreateFunction(c,lua,c->argv[1]) == NULL) { lua_pop(lua,1); /* remove the error handler from the stack. 
*/ /* The error is sent to the client by luaCreateFunction() - * itself when it returns C_ERR. */ + * itself when it returns NULL. */ return; } /* Now the following is guaranteed to return non nil */ @@ -1478,22 +1471,9 @@ void scriptCommand(client *c) { addReply(c,shared.czero); } } else if (c->argc == 3 && !strcasecmp(c->argv[1]->ptr,"load")) { - char funcname[43]; - sds sha; - - funcname[0] = 'f'; - funcname[1] = '_'; - sha1hex(funcname+2,c->argv[2]->ptr,sdslen(c->argv[2]->ptr)); - sha = sdsnewlen(funcname+2,40); - if (dictFind(server.lua_scripts,sha) == NULL) { - if (luaCreateFunction(c,server.lua,funcname,c->argv[2],0) - == C_ERR) { - sdsfree(sha); - return; - } - } - addReplyBulkCBuffer(c,funcname+2,40); - sdsfree(sha); + sds sha = luaCreateFunction(c,server.lua,c->argv[2]); + if (sha == NULL) return; /* The error was sent by luaCreateFunction(). */ + addReplyBulkCBuffer(c,sha,40); forceCommandPropagation(c,PROPAGATE_REPL|PROPAGATE_AOF); } else if (c->argc == 2 && !strcasecmp(c->argv[1]->ptr,"kill")) { if (server.lua_caller == NULL) { diff --git a/src/server.h b/src/server.h index 498a0550..16e91256 100644 --- a/src/server.h +++ b/src/server.h @@ -1794,7 +1794,7 @@ void scriptingInit(int setup); int ldbRemoveChild(pid_t pid); void ldbKillForkedSessions(void); int ldbPendingChildren(void); -int luaCreateFunction(client *c, lua_State *lua, char *funcname, robj *body, int allow_dup); +sds luaCreateFunction(client *c, lua_State *lua, robj *body); /* Blocked clients */ void processUnblockedClients(void); From 2869284e44ca64d633a75752981333d5c0520056 Mon Sep 17 00:00:00 2001 From: antirez Date: Mon, 4 Dec 2017 11:59:15 +0100 Subject: [PATCH 094/102] Streams: fix a few type mismatches in t_stream.c. 
--- src/t_stream.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 213a46bb..da993958 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -103,8 +103,10 @@ int64_t lpGetInteger(unsigned char *ele) { /* The following code path should never be used for how listpacks work: * they should always be able to store an int64_t value in integer * encoded form. However the implementation may change. */ - int retval = string2ll((char*)e,v,&v); + long long ll; + int retval = string2ll((char*)e,v,&ll); serverAssert(retval != 0); + v = ll; return v; } @@ -748,7 +750,7 @@ int streamParseIDOrReply(client *c, robj *o, streamID *id, uint64_t missing_seq) /* Parse . form. */ char *dot = strchr(buf,'-'); if (dot) *dot = '\0'; - uint64_t ms, seq; + unsigned long long ms, seq; if (string2ull(buf,&ms) == 0) goto invalid; if (dot && string2ull(dot+1,&seq) == 0) goto invalid; if (!dot) seq = missing_seq; From 664bbfe7604a28e0d359c04a0b73d1a78d255568 Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Thu, 30 Nov 2017 13:38:54 +0800 Subject: [PATCH 095/102] quicklist: fix the return value of quicklistCount --- src/quicklist.c | 2 +- src/quicklist.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/quicklist.c b/src/quicklist.c index c8b72743..faa08c65 100644 --- a/src/quicklist.c +++ b/src/quicklist.c @@ -149,7 +149,7 @@ REDIS_STATIC quicklistNode *quicklistCreateNode(void) { } /* Return cached quicklist count */ -unsigned int quicklistCount(const quicklist *ql) { return ql->count; } +unsigned long quicklistCount(const quicklist *ql) { return ql->count; } /* Free entire quicklist. 
*/ void quicklistRelease(quicklist *quicklist) { diff --git a/src/quicklist.h b/src/quicklist.h index 8f387590..7ca23d6e 100644 --- a/src/quicklist.h +++ b/src/quicklist.h @@ -154,7 +154,7 @@ int quicklistPopCustom(quicklist *quicklist, int where, unsigned char **data, void *(*saver)(unsigned char *data, unsigned int sz)); int quicklistPop(quicklist *quicklist, int where, unsigned char **data, unsigned int *sz, long long *slong); -unsigned int quicklistCount(const quicklist *ql); +unsigned long quicklistCount(const quicklist *ql); int quicklistCompare(unsigned char *p1, unsigned char *p2, int p2_len); size_t quicklistGetLzf(const quicklistNode *node, void **data); From b9491b65d991498bf62f2b5533405e161aee650a Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Sun, 3 Dec 2017 11:51:35 +0800 Subject: [PATCH 096/102] quicklist: change the len of quicklist to unsigned long --- src/debug.c | 4 ++-- src/quicklist.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/debug.c b/src/debug.c index 9e084f9d..0529e5a8 100644 --- a/src/debug.c +++ b/src/debug.c @@ -393,13 +393,13 @@ void debugCommand(client *c) { val = dictGetVal(de); strenc = strEncoding(val->encoding); - char extra[128] = {0}; + char extra[138] = {0}; if (val->encoding == OBJ_ENCODING_QUICKLIST) { char *nextra = extra; int remaining = sizeof(extra); quicklist *ql = val->ptr; /* Add number of quicklist nodes */ - int used = snprintf(nextra, remaining, " ql_nodes:%u", ql->len); + int used = snprintf(nextra, remaining, " ql_nodes:%lu", ql->len); nextra += used; remaining -= used; /* Add average quicklist fill factor */ diff --git a/src/quicklist.h b/src/quicklist.h index 7ca23d6e..955a22cf 100644 --- a/src/quicklist.h +++ b/src/quicklist.h @@ -64,7 +64,7 @@ typedef struct quicklistLZF { char compressed[]; } quicklistLZF; -/* quicklist is a 32 byte struct (on 64-bit systems) describing a quicklist. +/* quicklist is a 40 byte struct (on 64-bit systems) describing a quicklist. 
* 'count' is the number of total entries. * 'len' is the number of quicklist nodes. * 'compress' is: -1 if compression disabled, otherwise it's the number @@ -74,7 +74,7 @@ typedef struct quicklist { quicklistNode *head; quicklistNode *tail; unsigned long count; /* total count of all entries in all ziplists */ - unsigned int len; /* number of quicklistNodes */ + unsigned long len; /* number of quicklistNodes */ int fill : 16; /* fill factor for individual nodes */ unsigned int compress : 16; /* depth of end nodes not to compress;0=off */ } quicklist; From 3f232ebfb8712707f1a02dcc1eb532d05d0b8b71 Mon Sep 17 00:00:00 2001 From: WuYunlong Date: Tue, 5 Dec 2017 14:41:16 +0800 Subject: [PATCH 097/102] fix some notes --- redis.conf | 4 +++- src/server.c | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/redis.conf b/redis.conf index 54ba1298..7eb692a8 100644 --- a/redis.conf +++ b/redis.conf @@ -296,7 +296,9 @@ dir ./ # # 2) if slave-serve-stale-data is set to 'no' the slave will reply with # an error "SYNC with master in progress" to all the kind of commands -# but to INFO and SLAVEOF. +# but to INFO, SLAVEOF, AUTH, PING, SHUTDOWN, REPLCONF, ROLE, CONFIG, +# SUBSCRIBE, UNSUBSCRIBE, PSUBSCRIBE, PUNSUBSCRIBE, PUBLISH, PUBSUB, +# COMMAND, POST, HOST: and LATENCY. # slave-serve-stale-data yes diff --git a/src/server.c b/src/server.c index af19b5a3..ad5eedd2 100644 --- a/src/server.c +++ b/src/server.c @@ -2478,8 +2478,9 @@ int processCommand(client *c) { return C_OK; } - /* Only allow INFO and SLAVEOF when slave-serve-stale-data is no and - * we are a slave with a broken link with master. */ + /* Only allow commands with flag "t", such as INFO, SLAVEOF and so on, + * when slave-serve-stale-data is no and we are a slave with a broken + * link with master. 
*/ if (server.masterhost && server.repl_state != REPL_STATE_CONNECTED && server.repl_serve_stale_data == 0 && !(c->cmd->flags & CMD_STALE)) From de809666f81675bb21ca09049ea67cf1d32ff9b7 Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Tue, 5 Dec 2017 17:19:19 +0800 Subject: [PATCH 098/102] set: fix the int problem for SPOP & SRANDMEMBER --- src/t_set.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/t_set.c b/src/t_set.c index d5a801e1..36299317 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -407,7 +407,7 @@ void spopWithCountCommand(client *c) { /* Get the count argument */ if (getLongFromObjectOrReply(c,c->argv[2],&l,NULL) != C_OK) return; if (l >= 0) { - count = (unsigned) l; + count = (unsigned long) l; } else { addReply(c,shared.outofrangeerr); return; @@ -626,7 +626,7 @@ void srandmemberWithCountCommand(client *c) { if (getLongFromObjectOrReply(c,c->argv[2],&l,NULL) != C_OK) return; if (l >= 0) { - count = (unsigned) l; + count = (unsigned long) l; } else { /* A negative count means: return the same elements multiple times * (i.e. don't remove the extracted element after every extraction). */ From 42387d6c1a3bf14baa70f811a79f20b439b58c12 Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Tue, 5 Dec 2017 17:42:19 +0800 Subject: [PATCH 099/102] set: fix the int problem for qsort --- src/t_set.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/t_set.c b/src/t_set.c index 36299317..8f21f71b 100644 --- a/src/t_set.c +++ b/src/t_set.c @@ -774,15 +774,21 @@ void srandmemberCommand(client *c) { } int qsortCompareSetsByCardinality(const void *s1, const void *s2) { - return setTypeSize(*(robj**)s1)-setTypeSize(*(robj**)s2); + if (setTypeSize(*(robj**)s1) > setTypeSize(*(robj**)s2)) return 1; + if (setTypeSize(*(robj**)s1) < setTypeSize(*(robj**)s2)) return -1; + return 0; } /* This is used by SDIFF and in this case we can receive NULL that should * be handled as empty sets. 
*/ int qsortCompareSetsByRevCardinality(const void *s1, const void *s2) { robj *o1 = *(robj**)s1, *o2 = *(robj**)s2; + unsigned long first = o1 ? setTypeSize(o1) : 0; + unsigned long second = o2 ? setTypeSize(o2) : 0; - return (o2 ? setTypeSize(o2) : 0) - (o1 ? setTypeSize(o1) : 0); + if (first < second) return 1; + if (first > second) return -1; + return 0; } void sinterGenericCommand(client *c, robj **setkeys, From d1176b582c8f482e46821527114c5c1476bc66d4 Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Thu, 30 Nov 2017 11:29:05 +0800 Subject: [PATCH 100/102] dict: fix the int problem --- src/dict.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/dict.c b/src/dict.c index 210d50dc..49bbc92b 100644 --- a/src/dict.c +++ b/src/dict.c @@ -66,7 +66,7 @@ static unsigned int dict_force_resize_ratio = 5; static int _dictExpandIfNeeded(dict *ht); static unsigned long _dictNextPower(unsigned long size); -static int _dictKeyIndex(dict *ht, const void *key, unsigned int hash, dictEntry **existing); +static long _dictKeyIndex(dict *ht, const void *key, uint64_t hash, dictEntry **existing); static int _dictInit(dict *ht, dictType *type, void *privDataPtr); /* -------------------------- hash functions -------------------------------- */ @@ -202,7 +202,7 @@ int dictRehash(dict *d, int n) { de = d->ht[0].table[d->rehashidx]; /* Move all the keys in this bucket from the old to the new hash HT */ while(de) { - unsigned int h; + uint64_t h; nextde = de->next; /* Get the index in the new hash table */ @@ -291,7 +291,7 @@ int dictAdd(dict *d, void *key, void *val) */ dictEntry *dictAddRaw(dict *d, void *key, dictEntry **existing) { - int index; + long index; dictEntry *entry; dictht *ht; @@ -362,7 +362,7 @@ dictEntry *dictAddOrFind(dict *d, void *key) { * dictDelete() and dictUnlink(), please check the top comment * of those functions. 
*/ static dictEntry *dictGenericDelete(dict *d, const void *key, int nofree) { - unsigned int h, idx; + uint64_t h, idx; dictEntry *he, *prevHe; int table; @@ -476,7 +476,7 @@ void dictRelease(dict *d) dictEntry *dictFind(dict *d, const void *key) { dictEntry *he; - unsigned int h, idx, table; + uint64_t h, idx, table; if (d->ht[0].used + d->ht[1].used == 0) return NULL; /* dict is empty */ if (dictIsRehashing(d)) _dictRehashStep(d); @@ -610,7 +610,7 @@ void dictReleaseIterator(dictIterator *iter) dictEntry *dictGetRandomKey(dict *d) { dictEntry *he, *orighe; - unsigned int h; + unsigned long h; int listlen, listele; if (dictSize(d) == 0) return NULL; @@ -955,9 +955,9 @@ static unsigned long _dictNextPower(unsigned long size) * * Note that if we are in the process of rehashing the hash table, the * index is always returned in the context of the second (new) hash table. */ -static int _dictKeyIndex(dict *d, const void *key, unsigned int hash, dictEntry **existing) +static long _dictKeyIndex(dict *d, const void *key, uint64_t hash, dictEntry **existing) { - unsigned int idx, table; + unsigned long idx, table; dictEntry *he; if (existing) *existing = NULL; @@ -1006,7 +1006,7 @@ unsigned int dictGetHash(dict *d, const void *key) { * return value is the reference to the dictEntry if found, or NULL if not found. 
*/ dictEntry **dictFindEntryRefByPtrAndHash(dict *d, const void *oldptr, unsigned int hash) { dictEntry *he, **heref; - unsigned int idx, table; + unsigned long idx, table; if (d->ht[0].used + d->ht[1].used == 0) return NULL; /* dict is empty */ for (table = 0; table <= 1; table++) { From 7c6ddbc37d572600b6364348c9506dc190493e2e Mon Sep 17 00:00:00 2001 From: "zhaozhao.zz" Date: Thu, 30 Nov 2017 11:34:37 +0800 Subject: [PATCH 101/102] dict: fix the int problem for defrag --- src/defrag.c | 2 +- src/dict.c | 4 ++-- src/dict.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index 4a1dcefe..3f0e6627 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -289,7 +289,7 @@ int defragKey(redisDb *db, dictEntry *de) { /* Dirty code: * I can't search in db->expires for that key after i already released * the pointer it holds it won't be able to do the string compare */ - unsigned int hash = dictGetHash(db->dict, de->key); + uint64_t hash = dictGetHash(db->dict, de->key); replaceSateliteDictKeyPtrAndOrDefragDictEntry(db->expires, keysds, newsds, hash, &defragged); } diff --git a/src/dict.c b/src/dict.c index 49bbc92b..97e63680 100644 --- a/src/dict.c +++ b/src/dict.c @@ -995,7 +995,7 @@ void dictDisableResize(void) { dict_can_resize = 0; } -unsigned int dictGetHash(dict *d, const void *key) { +uint64_t dictGetHash(dict *d, const void *key) { return dictHashKey(d, key); } @@ -1004,7 +1004,7 @@ unsigned int dictGetHash(dict *d, const void *key) { * the hash value should be provided using dictGetHash. * no string / key comparison is performed. * return value is the reference to the dictEntry if found, or NULL if not found. 
*/ -dictEntry **dictFindEntryRefByPtrAndHash(dict *d, const void *oldptr, unsigned int hash) { +dictEntry **dictFindEntryRefByPtrAndHash(dict *d, const void *oldptr, uint64_t hash) { dictEntry *he, **heref; unsigned long idx, table; diff --git a/src/dict.h b/src/dict.h index bf316a00..62018cc4 100644 --- a/src/dict.h +++ b/src/dict.h @@ -178,8 +178,8 @@ int dictRehashMilliseconds(dict *d, int ms); void dictSetHashFunctionSeed(uint8_t *seed); uint8_t *dictGetHashFunctionSeed(void); unsigned long dictScan(dict *d, unsigned long v, dictScanFunction *fn, dictScanBucketFunction *bucketfn, void *privdata); -unsigned int dictGetHash(dict *d, const void *key); -dictEntry **dictFindEntryRefByPtrAndHash(dict *d, const void *oldptr, unsigned int hash); +uint64_t dictGetHash(dict *d, const void *key); +dictEntry **dictFindEntryRefByPtrAndHash(dict *d, const void *oldptr, uint64_t hash); /* Hash table types */ extern dictType dictTypeHeapStringCopyKey; From 62a4b817c6e83eedf96a451f45dd943099258fd0 Mon Sep 17 00:00:00 2001 From: antirez Date: Tue, 5 Dec 2017 15:59:56 +0100 Subject: [PATCH 102/102] add linkClient(): adds the client and caches the list node. We have this operation in two places: when caching the master and when linking a new client after the client creation. By having an API for this we avoid incurring in errors when modifying one of the two places forgetting the other. The function is also a good place where to document why we cache the linked list node. Related to #4497 and #4210. --- src/networking.c | 18 ++++++++++++------ src/replication.c | 3 +-- src/server.h | 1 + 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/networking.c b/src/networking.c index b1235ed4..3f880fe0 100644 --- a/src/networking.c +++ b/src/networking.c @@ -67,6 +67,16 @@ int listMatchObjects(void *a, void *b) { return equalStringObjects(a,b); } +/* This function links the client to the global linked list of clients. 
+ * unlinkClient() does the opposite, among other things. */ +void linkClient(client *c) { + listAddNodeTail(server.clients,c); + /* Note that we remember the linked list node where the client is stored, + * this way removing the client in unlinkClient() will not require + * a linear scan, but just a constant time operation. */ + c->client_list_node = listLast(server.clients); +} + client *createClient(int fd) { client *c = zmalloc(sizeof(client)); @@ -134,14 +144,10 @@ client *createClient(int fd) { c->pubsub_channels = dictCreate(&objectKeyPointerValueDictType,NULL); c->pubsub_patterns = listCreate(); c->peerid = NULL; + c->client_list_node = NULL; listSetFreeMethod(c->pubsub_patterns,decrRefCountVoid); listSetMatchMethod(c->pubsub_patterns,listMatchObjects); - if (fd != -1) { - listAddNodeTail(server.clients,c); - c->client_list_node = listLast(server.clients); - } else { - c->client_list_node = NULL; - } + if (fd != -1) linkClient(c); initClientMultiState(c); return c; } diff --git a/src/replication.c b/src/replication.c index 1207e060..064d2bec 100644 --- a/src/replication.c +++ b/src/replication.c @@ -2205,8 +2205,7 @@ void replicationResurrectCachedMaster(int newfd) { server.repl_state = REPL_STATE_CONNECTED; /* Re-add to the list of clients. 
*/ - listAddNodeTail(server.clients,server.master); - server.master->client_list_node = listLast(server.clients); + linkClient(server.master); if (aeCreateFileEvent(server.el, newfd, AE_READABLE, readQueryFromClient, server.master)) { serverLog(LL_WARNING,"Error resurrecting the cached master, impossible to add the readable handler: %s", strerror(errno)); diff --git a/src/server.h b/src/server.h index afae8c58..4731c22e 100644 --- a/src/server.h +++ b/src/server.h @@ -1399,6 +1399,7 @@ int handleClientsWithPendingWrites(void); int clientHasPendingReplies(client *c); void unlinkClient(client *c); int writeToClient(int fd, client *c, int handler_installed); +void linkClient(client *c); #ifdef __GNUC__ void addReplyErrorFormat(client *c, const char *fmt, ...)