Added dictGetRandomKeys() to dict.c: mass get random entries.

This new function is useful to get a number of random entries from an hash table when we just need to do some sampling without particularly good distribution. It just jumps at a random place of the hash table and returns the first N items encountered by scanning linearly. The main usefulness of this function is to speedup Redis internal sampling of the key space, for example for key eviction or expiry.
2025-03-17 08:00:49 +00:00 · 2014-03-20 15:50:46 +01:00 · 2014-03-20 15:50:46 +01:00 · 5317f5e99a
commit 5317f5e99a
parent 22c9cfaf57
2 changed files with 53 additions and 0 deletions
--- a/src/dict.c
+++ b/src/dict.c
@ -649,6 +649,58 @@ dictEntry *dictGetRandomKey(dict *d)
    return he;
 }

+/* This is a version of dictGetRandomKey() that is modified in order to
+ * return multiple entries by jumping at a random place of the hash table
+ * and scanning linearly for entries.
+ *
+ * Returned pointers to hash table entries are stored into 'des' that
+ * points to an array of dictEntry pointers. The array must have room for
+ * at least 'count' elements, that is the argument we pass to the function
+ * to tell how many random elements we need.
+ *
+ * The function returns the number of items stored into 'des', that may
+ * be less than 'count' if the hash table has less than 'count' elements
+ * inside.
+ *
+ * Note that this function is not suitable when you need a good distribution
+ * of the returned items, but only when you need to "sample" a given number
+ * of continuous elements to run some kind of algorithm or to produce
+ * statistics. However the function is much faster than dictGetRandomKey()
+ * at producing N elements, and the elements are guaranteed to be non
+ * repeating. */
+int dictGetRandomKeys(dict *d, dictEntry **des, int count) {
+    int j; /* internal hash table id, 0 or 1. */
+    int stored = 0;
+
+    if (dictSize(d) < count) count = dictSize(d);
+    while(stored < count) {
+        for (j = 0; j < 2; j++) {
+            /* Pick a random point inside the hash table 0 or 1. */
+            unsigned int i = random() & d->ht[j].sizemask;
+            int size = d->ht[j].size;
+
+            /* Make sure to visit every bucket by iterating 'size' times. */
+            while(size--) {
+                dictEntry *he = d->ht[j].table[i];
+                while (he) {
+                    /* Collect all the elements of the buckets found non
+                     * empty while iterating. */
+                    *des = he;
+                    des++;
+                    he = he->next;
+                    stored++;
+                    if (stored == count) return stored;
+                }
+                i = (i+1) & d->ht[j].sizemask;
+            }
+            /* If there is only one table and we iterated it all, we should
+             * already have 'count' elements. Assert this condition. */
+            assert(dictIsRehashing(d) != 0);
+        }
+    }
+    return stored; /* Never reached. */
+}
+
 /* Function to reverse bits. Algorithm from:
 * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel */
 static unsigned long rev(unsigned long v) {
--- a/src/dict.h
+++ b/src/dict.h
@ -157,6 +157,7 @@ dictIterator *dictGetSafeIterator(dict *d);
 dictEntry *dictNext(dictIterator *iter);
 void dictReleaseIterator(dictIterator *iter);
 dictEntry *dictGetRandomKey(dict *d);
+int dictGetRandomKeys(dict *d, dictEntry **des, int count);
 void dictPrintStats(dict *d);
 unsigned int dictGenHashFunction(const void *key, int len);
 unsigned int dictGenCaseHashFunction(const unsigned char *buf, int len);