SPOP: reimplemented for speed and better distribution.

The old version of SPOP with "count" argument used an API call of dict.c which was actually designed for a different goal, and was not capable of good distribution. We follow a different three-cases approach optimized for different ratiion between sets and requested number of elements. The implementation is simpler and allowed the removal of a large amount of code.
2025-05-03 22:42:14 +00:00 · 2015-02-10 22:59:12 +01:00 · 2015-02-10 22:59:12 +01:00 · 9feee428f2
commit 9feee428f2
parent 55003f7a11
5 changed files with 80 additions and 301 deletions
--- a/src/dict.c
+++ b/src/dict.c
@ -664,81 +664,6 @@ dictEntry *dictGetRandomKey(dict *d)
    return he;
 }

-/* XXX: This is going to be removed soon and SPOP internals
- *      reimplemented.
- *
- * This is a version of dictGetRandomKey() that is modified in order to
- * return multiple entries by jumping at a random place of the hash table
- * and scanning linearly for entries.
- *
- * Returned pointers to hash table entries are stored into 'des' that
- * points to an array of dictEntry pointers. The array must have room for
- * at least 'count' elements, that is the argument we pass to the function
- * to tell how many random elements we need.
- *
- * The function returns the number of items stored into 'des', that may
- * be less than 'count' if the hash table has less than 'count' elements
- * inside.
- *
- * Note that this function is not suitable when you need a good distribution
- * of the returned items, but only when you need to "sample" a given number
- * of continuous elements to run some kind of algorithm or to produce
- * statistics. However the function is much faster than dictGetRandomKey()
- * at producing N elements, and the elements are guaranteed to be non
- * repeating. */
-unsigned int dictGetRandomKeys(dict *d, dictEntry **des, unsigned int count) {
-    unsigned int j; /* internal hash table id, 0 or 1. */
-    unsigned int tables; /* 1 or 2 tables? */
-    unsigned int stored = 0, maxsizemask;
-
-    if (dictSize(d) < count) count = dictSize(d);
-
-    /* Try to do a rehashing work proportional to 'count'. */
-    for (j = 0; j < count; j++) {
-        if (dictIsRehashing(d))
-            _dictRehashStep(d);
-        else
-            break;
-    }
-
-    tables = dictIsRehashing(d) ? 2 : 1;
-    maxsizemask = d->ht[0].sizemask;
-    if (tables > 1 && maxsizemask < d->ht[1].sizemask)
-        maxsizemask = d->ht[1].sizemask;
-
-    /* Pick a random point inside the larger table. */
-    unsigned int i = random() & maxsizemask;
-    while(stored < count) {
-        for (j = 0; j < tables; j++) {
-            /* Invariant of the dict.c rehashing: up to the indexes already
-             * visited in ht[0] during the rehashing, there are no populated
-             * buckets, so we can skip ht[0] for indexes between 0 and idx-1. */
-            if (tables == 2 && j == 0 && i < d->rehashidx) {
-                /* Moreover, if we are currently out of range in the second
-                 * table, there will be no elements in both tables up to
-                 * the current rehashing index, so we jump if possible.
-                 * (this happens when going from big to small table). */
-                if (i >= d->ht[1].size) i = d->rehashidx;
-                continue;
-            }
-            if (i >= d->ht[j].size) continue; /* Out of range for this table. */
-            dictEntry *he = d->ht[j].table[i];
-            while (he) {
-                /* Collect all the elements of the buckets found non
-                 * empty while iterating. */
-                *des = he;
-                des++;
-                he = he->next;
-                stored++;
-                if (stored == count) return stored;
-            }
-        }
-        i = (i+1) & maxsizemask;
-    }
-    return stored; /* Never reached. */
-}
-
-
 /* This function samples the dictionary to return a few keys from random
 * locations.
 *
--- a/src/dict.h
+++ b/src/dict.h
@ -165,7 +165,6 @@ dictEntry *dictNext(dictIterator *iter);
 void dictReleaseIterator(dictIterator *iter);
 dictEntry *dictGetRandomKey(dict *d);
 unsigned int dictGetSomeKeys(dict *d, dictEntry **des, unsigned int count);
-unsigned int dictGetRandomKeys(dict *d, dictEntry **des, unsigned int count);
 void dictPrintStats(dict *d);
 unsigned int dictGenHashFunction(const void *key, int len);
 unsigned int dictGenCaseHashFunction(const unsigned char *buf, int len);
--- a/src/intset.c
+++ b/src/intset.c
@ -261,90 +261,6 @@ int64_t intsetRandom(intset *is) {
    return _intsetGet(is,rand()%intrev32ifbe(is->length));
 }

-/* How many times bigger should the set length be compared to the requested
- * count of members for us to use the Floyd algorithm instead of
- * the Knuth algorithm */
-#define RANDOMMEMBERS_ALGORITHM_SELECTION_RATIO (2)
-
-/* Copies 'count' random members from the set into the 'values' array.
- * 'values' must be an array of int64_t values, of length 'count'.
- * Returns the amount of items returned. If this amount is less than 'count',
- * then the remaining 'values' are left uninitialized. */
-int intsetRandomMembers(intset *is, int64_t* values, int count) {
-
-    /* We don't check that is and values are non-NULL - the caller must
-     * play nice. */
-
-    int length = intsetLen(is);
-
-    if (count > length) {
-        /* Return everything in the set */
-        count = length;
-    }
-
-    /* Choose between the Knuth shuffle algorithm, O(1) space, O(length) time,
-     * and the Floyd algorithm, O(length) space, O(count) time. */
-    if ((RANDOMMEMBERS_ALGORITHM_SELECTION_RATIO * count) > length) {
-
-        /* If the count of members requested is almost the length of the set,
-         * use the Knuth shuffle algorithm, O(1) space, O(length) time. */
-
-        /* First, fill the values array with unique random indexes inside
-         * the set. */
-        int in, im, rn, rm;
-        im = 0;
-        for (in = 0; in < length && im < count; in++) {
-
-            rn = length - in;
-            rm = count - im;
-            if (rand() % rn < rm) {
-                values[im++] = in;
-            }
-        }
-
-    } else {
-
-        /* If the length is considerably more than the count of members
-         * requested, use Robert Floyd's algorithm, O(length) space,
-         * O(count) time.
-         * Based on Jon Bentley's Programming Pearls */
-
-        int64_t *is_used = zcalloc(sizeof(int64_t) * length);
-        int in, im, r;
-
-        r = 0;
-        im = 0;
-
-        for (in = length - count; in < length && im < count; in++) {
-
-            /* Generate a random number r */
-            r = rand() % (in + 1);
-
-            /* Do we already have the value in r? */
-            if (is_used[r]) {
-                /* Use in instead of the generated number */
-                r = in;
-            }
-
-            values[im++] = r ;
-
-            /* Mark it as used */
-            is_used[r] = 1;
-        }
-
-        zfree(is_used);
-    }
-
-    /* Replace each random index with the value stored there in the intset */
-    uint8_t encoding = intrev32ifbe(is->encoding);
-    for (int currentValue = 0; currentValue < count; currentValue++) {
-        values[currentValue] =
-            _intsetGetEncoded(is, values[currentValue], encoding);
-    }
-
-    return count;
-}
-
 /* Sets the value to the value at the given position. When this position is
 * out of range the function returns 0, when in range it returns 1. */
 uint8_t intsetGet(intset *is, uint32_t pos, int64_t *value) {
--- a/src/intset.h
+++ b/src/intset.h
@ -43,7 +43,6 @@ intset *intsetAdd(intset *is, int64_t value, uint8_t *success);
 intset *intsetRemove(intset *is, int64_t value, int *success);
 uint8_t intsetFind(intset *is, int64_t value);
 int64_t intsetRandom(intset *is);
-int intsetRandomMembers(intset *is, int64_t* value, int count);
 uint8_t intsetGet(intset *is, uint32_t pos, int64_t *value);
 uint32_t intsetLen(intset *is);
 size_t intsetBlobLen(intset *is);
--- a/src/t_set.c
+++ b/src/t_set.c
@ -212,106 +212,6 @@ int setTypeRandomElement(robj *setobj, robj **objele, int64_t *llele) {
    return setobj->encoding;
 }

-/* Return a number of random elements from a non empty set.
- *
- * This is a version of setTypeRandomElement() that is modified in order to
- * return multiple entries, using dictGetRandomKeys() and intsetRandomMembers().
- *
- * The elements are stored into 'aux_set' which should be of a set type.
- *
- * The function returns the number of items stored into 'aux_set', that may
- * be less than 'count' if the hash table has less than 'count' elements
- * inside.
- *
- * Note that this function is not suitable when you need a good distribution
- * of the returned items, but only when you need to "sample" a given number
- * of continuous elements to run some kind of algorithm or to produce
- * statistics. However the function is much faster than setTypeRandomElement()
- * at producing N elements, and the elements are guaranteed to be non
- * repeating.
- */
-unsigned long setTypeRandomElements(robj *set, unsigned long count,
-                                    robj *aux_set) {
-    unsigned long set_size;
-    unsigned long elements_to_return = count;
-    unsigned long elements_copied = 0;
-    unsigned long current_element = 0;
-
-    /* Like all setType* functions, we assume good behavior on part of the
-     * caller, so no extra parameter checks are made. */
-
-    /* If the number of elements in the the set is less than the count
-     * requested, just return all of them. */
-    set_size = setTypeSize(set);
-    if (set_size < count) {
-        elements_to_return = set_size;
-    }
-
-    /* TODO: It is definitely faster adding items to the set by directly
-     * handling the Dict or intset inside it, avoiding the constant encoding
-     * checks inside setTypeAdd(). However, We don't want to touch the set
-     * internals in non setType* functions. So, we just call setTypeAdd()
-     * multiple times, but this isn't an optimal solution.
-     * Another option would be to create a bulk-add function:
-     * setTypeAddBulk(). */
-    if (set->encoding == REDIS_ENCODING_HT) {
-        /* Allocate result array */
-        dictEntry **random_elements =
-            zmalloc(sizeof(dictEntry*) * elements_to_return);
-
-        /* Get the random elements */
-        elements_copied =
-            dictGetRandomKeys(set->ptr, random_elements, elements_to_return);
-        redisAssert(elements_copied == elements_to_return);
-
-        /* Put them into the set */
-        for (current_element = 0; current_element < elements_copied;
-             current_element++) {
-
-            /* We get the key and duplicate it, as we know it is a string */
-            setTypeAdd(aux_set,
-                dictGetKey(random_elements[current_element]));
-        }
-
-        zfree(random_elements);
-
-    } else if (set->encoding == REDIS_ENCODING_INTSET) {
-        /* Allocate result array */
-        int64_t *random_elements =
-            zmalloc(sizeof(int64_t) * elements_to_return);
-        robj* element_as_str = NULL;
-
-        elements_copied =
-            intsetRandomMembers((intset*) set->ptr,
-                                random_elements,
-                                elements_to_return);
-
-        redisAssert(elements_copied == elements_to_return);
-
-        /* Put them into the set */
-        for (current_element = 0; current_element < elements_copied;
-             current_element++) {
-
-            element_as_str = createStringObjectFromLongLong(
-                    random_elements[current_element]);
-
-            /* Put the values in the set */
-            setTypeAdd(aux_set,
-                       element_as_str);
-
-            decrRefCount(element_as_str);
-        }
-
-        zfree(random_elements);
-    } else {
-        redisPanic("Unknown set encoding");
-    }
-
-    /* We have a set with random elements. Return the actual elements in
-       the aux_set. */
-    return elements_copied;
-}
-
 unsigned long setTypeSize(robj *subject) {
    if (subject->encoding == REDIS_ENCODING_HT) {
        return dictSize((dict*)subject->ptr);
@ -485,15 +385,18 @@ void scardCommand(redisClient *c) {
    addReplyLongLong(c,setTypeSize(o));
 }

-/* handle the "SPOP key <count>" variant. The normal version of the
+/* Handle the "SPOP key <count>" variant. The normal version of the
 * command is handled by the spopCommand() function itself. */

+/* How many times bigger should be the set compared to the remaining size
+ * for us to use the "create new set" strategy? Read later in the
+ * implementation for more info. */
+#define SPOP_MOVE_STRATEGY_MUL 5
+
 void spopWithCountCommand(redisClient *c) {
    long l;
    unsigned long count, size;
-    unsigned long elements_returned;
-    robj *set, *aux_set;
-    int64_t llele;
+    robj *set;

    /* Get the count argument */
    if (getLongFromObjectOrReply(c,c->argv[2],&l,NULL) != REDIS_OK) return;
@ -516,12 +419,11 @@ void spopWithCountCommand(redisClient *c) {
        return;
    }

-    /* Get the size of the set. It is always > 0, as empty sets get
-     * deleted.  */
    size = setTypeSize(set);

    /* Generate an SPOP keyspace notification */
    notifyKeyspaceEvent(REDIS_NOTIFY_SET,"spop",c->argv[1],c->db->id);
+    server.dirty += count;

    /* CASE 1:
     * The number of requested elements is greater than or equal to
@ -534,64 +436,102 @@ void spopWithCountCommand(redisClient *c) {
        dbDelete(c->db,c->argv[1]);
        notifyKeyspaceEvent(REDIS_NOTIFY_GENERIC,"del",c->argv[1],c->db->id);

-        /* Replicate/AOF this command as an SREM operation */
+        /* Propagate this command as an DEL operation */
        rewriteClientCommandVector(c,2,shared.del,c->argv[1]);
        signalModifiedKey(c->db,c->argv[1]);
        server.dirty++;
        return;
    }

-    /* CASE 2:
-     * The number of requested elements is less than the number
-     * of elements inside the set. */
+    /* Case 2 and 3 require to replicate SPOP as a set of SERM commands.
+     * Prepare our replication argument vector. Also send the array length
+     * which is common to both the code paths. */
+    robj *propargv[3];
+    propargv[0] = createStringObject("SREM",4);
+    propargv[1] = c->argv[1];
+    addReplyMultiBulkLen(c,count);

-    /* We need an auxiliary set. Optimistically, we create a set using an
-     * Intset internally. */
-    aux_set = createIntsetObject();
+    /* Common iteration vars. */
+    robj *objele;
+    int encoding;
+    int64_t llele;
+    unsigned long remaining = size-count; /* Elements left after SPOP. */

-    /* Get the count requested of random elements from the set into our
-     * auxiliary set. */
-    elements_returned = setTypeRandomElements(set, count, aux_set);
-    redisAssert(elements_returned == count);
-
-    {
-        setTypeIterator *si;
-        robj *objele, *propargv[3];
-        int element_encoding;
-
-        addReplyMultiBulkLen(c, elements_returned);
-        propargv[0] = createStringObject("SREM",4);
-        propargv[1] = c->argv[1];
-
-        si = setTypeInitIterator(aux_set);
-        while ((element_encoding = setTypeNext(si, &objele, &llele)) != -1) {
-            if (element_encoding == REDIS_ENCODING_HT) {
-                incrRefCount(objele);
-            } else if (element_encoding == REDIS_ENCODING_INTSET) {
+    /* If we are here, the number of requested elements is less than the
+     * number of elements inside the set. Also we are sure that count < size.
+     * Use two different strategies.
+     *
+     * CASE 2: The number of elements to return is small compared to the
+     * set size. We can just extract random elements and return them to
+     * the set. */
+    if (remaining*SPOP_MOVE_STRATEGY_MUL > count) {
+        while(count--) {
+            encoding = setTypeRandomElement(set,&objele,&llele);
+            if (encoding == REDIS_ENCODING_INTSET) {
                objele = createStringObjectFromLongLong(llele);
            } else {
-                redisPanic("Unknown set encoding");
+                incrRefCount(objele);
            }
-            setTypeRemove(set, objele);
-            addReplyBulk(c, objele);
+
+            /* Return the element to the client and remove from the set. */
+            addReplyBulk(c,objele);
+            setTypeRemove(set,objele);

            /* Replicate/AOF this command as an SREM operation */
            propargv[2] = objele;
-            alsoPropagate(server.sremCommand,c->db->id,propargv,3,REDIS_PROPAGATE_AOF|REDIS_PROPAGATE_REPL);
-
+            alsoPropagate(server.sremCommand,c->db->id,propargv,3,
+                REDIS_PROPAGATE_AOF|REDIS_PROPAGATE_REPL);
            decrRefCount(objele);
-            server.dirty++;
        }
-        decrRefCount(propargv[0]);
+    } else {
+    /* CASE 3: The number of elements to return is very big, approaching
+     * the size of the set itself. After some time extracting random elements
+     * from such a set becomes computationally expensive, so we use
+     * a different strategy, we extract random elements that we don't
+     * want to return (the elements that will remain part of the set),
+     * creating a new set as we do this (that will be stored as the original
+     * set). Then we return the elements left in the original set and
+     * release it. */
+        robj *newset = NULL;
+
+        /* Create a new set with just the remaining elements. */
+        while(remaining--) {
+            encoding = setTypeRandomElement(set,&objele,&llele);
+            if (encoding == REDIS_ENCODING_INTSET) {
+                objele = createStringObjectFromLongLong(llele);
+            } else {
+                incrRefCount(objele);
+            }
+            if (!newset) newset = setTypeCreate(objele);
+            setTypeAdd(newset,objele);
+            setTypeRemove(set,objele);
+            decrRefCount(objele);
+        }
+
+        /* Assign the new set as the key value. */
+        incrRefCount(set); /* Protect the old set value. */
+        dbOverwrite(c->db,c->argv[1],newset);
+
+        /* Tranfer the old set to the client and release it. */
+        setTypeIterator *si;
+        si = setTypeInitIterator(set);
+        while((encoding = setTypeNext(si,&objele,&llele)) != -1) {
+            if (encoding == REDIS_ENCODING_INTSET) {
+                addReplyBulkLongLong(c,llele);
+            } else {
+                addReplyBulk(c,objele);
+            }
+        }
        setTypeReleaseIterator(si);
+        decrRefCount(set);
    }

    /* Don't propagate the command itself even if we incremented the
     * dirty counter. We don't want to propagate an SPOP command since
     * we propagated the command as a set of SREMs operations using
     * the alsoPropagate() API. */
+    decrRefCount(propargv[0]);
    preventCommandPropagation(c);
-    decrRefCount(aux_set);
 }

 void spopCommand(redisClient *c) {