diff --git a/src/hyperloglog.c b/src/hyperloglog.c index b426efc5..e095c6cf 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -30,6 +30,7 @@ */ #include +#include #include "redis.h" /* The Redis HyperLogLog implementation is based on the following ideas: @@ -262,6 +263,41 @@ int hllAdd(uint8_t *registers, unsigned char *ele, size_t elesize) { } } +/* Return the approximated cardinality of the set based on the armonic + * mean of the registers values. */ +uint64_t hllCount(uint8_t *registers) { + double m = REDIS_HLL_REGISTERS; + double alpha = 0.7213/(1+1.079/m); + double E = 0; + int ez = 0; /* Number of registers equal to 0. */ + int j; + + for (j = 0; j < REDIS_HLL_REGISTERS; j++) { + uint8_t reg; + + /* Compute SUM(2^-register[0..i]). */ + HLL_GET_REGISTER(reg,registers,j); + if (reg == 0) { + ez++; + E += 1; /* 2^(-reg[j]) is 1 when m is 0. */ + } else { + E += 1.0/(2ULL << reg); /* 2^(-reg[j]) is the same as 1/2^reg[j]. */ + } + } + /* Muliply the inverse of E for alpha_m * m^2 to have the raw estimate. */ + E = (1/E)*alpha*m*m; + + /* Apply corrections for small cardinalities. */ + if (E < m*2.5 && ez != 0) { + E = m*log(m/ez); /* LINEARCOUNTING() */ + } + /* We don't apply the correction for E > 1/30 of 2^32 since we use + * a 64 bit function and 6 bit counters. To apply the correction for + * 1/30 of 2^64 is not needed since it would require a huge set + * to approach such a value. */ + return (uint64_t) E; +} + /* ========================== HyperLogLog commands ========================== */ /* HLLADD var ele ele ele ... ele => :0 or :1 */ @@ -299,7 +335,7 @@ void hllAddCommand(redisClient *c) { } } /* Perform the low level ADD operation for every element. */ - registers = (uint8_t*) o->ptr; + registers = o->ptr; for (j = 2; j < c->argc; j++) { if (hllAdd(registers, (unsigned char*)c->argv[j]->ptr, sdslen(c->argv[j]->ptr))) @@ -315,6 +351,34 @@ void hllAddCommand(redisClient *c) { addReply(c, updated ? shared.cone : shared.czero); } +/* HLLCOUNT var -> approximated cardinality of set. */ +void hllCountCommand(redisClient *c) { + robj *o = lookupKeyRead(c->db,c->argv[1]); + uint8_t *registers; + + if (o == NULL) { + /* No key? Cardinality is zero since no element was added, otherwise + * we would have a key as HHLADD creates it as a side effect. */ + addReply(c,shared.czero); + } else { + /* Key exists, check type */ + if (checkType(c,o,REDIS_STRING)) + return; + + /* If this is a string representing an HLL, the size should match + * exactly. */ + if (stringObjectLen(o) != REDIS_HLL_SIZE) { + addReplyErrorFormat(c, + "HLLCOUNT target key must contain a %d bytes string.", + REDIS_HLL_SIZE); + return; + } + registers = o->ptr; + addReplyLongLong(c,hllCount(registers)); + } +} + + /* This command performs a self-test of the HLL registers implementation. * Something that is not easy to test from within the outside. * diff --git a/src/redis.c b/src/redis.c index b2ddd8b2..3b5bca0f 100644 --- a/src/redis.c +++ b/src/redis.c @@ -269,7 +269,8 @@ struct redisCommand redisCommandTable[] = { {"bitpos",bitposCommand,-3,"r",0,NULL,1,1,1,0,0}, {"wait",waitCommand,3,"rs",0,NULL,0,0,0,0,0}, {"hllselftest",hllSelftestCommand,1,"r",0,NULL,0,0,0,0,0}, - {"hlladd",hllAddCommand,-2,"wm",0,NULL,1,1,1,0,0} + {"hlladd",hllAddCommand,-2,"wm",0,NULL,1,1,1,0,0}, + {"hllcount",hllCountCommand,2,"r",0,NULL,1,1,1,0,0} }; struct evictionPoolEntry *evictionPoolAlloc(void); diff --git a/src/redis.h b/src/redis.h index 19779356..aaef8f78 100644 --- a/src/redis.h +++ b/src/redis.h @@ -1449,6 +1449,7 @@ void replconfCommand(redisClient *c); void waitCommand(redisClient *c); void hllSelftestCommand(redisClient *c); void hllAddCommand(redisClient *c); +void hllCountCommand(redisClient *c); #if defined(__GNUC__) void *calloc(size_t count, size_t size) __attribute__ ((deprecated));