From c55e3fbae5273d8a6fd5582e44d8745b2b81b7df Mon Sep 17 00:00:00 2001 From: Harish Murthy Date: Fri, 9 Dec 2016 14:28:19 +0530 Subject: [PATCH] LogLog-Beta Algorithm support within HLL Config option to use LogLog-Beta Algorithm for Cardinality --- src/config.c | 11 +++++++- src/hyperloglog.c | 66 ++++++++++++++++++++++++++++++----------------- src/server.c | 1 + src/server.h | 2 ++ 4 files changed, 55 insertions(+), 25 deletions(-) diff --git a/src/config.c b/src/config.c index 8f3b81a1..6f4559e6 100644 --- a/src/config.c +++ b/src/config.c @@ -688,6 +688,10 @@ void loadServerConfigFromString(char *config) { err = sentinelHandleConfiguration(argv+1,argc-1); if (err) goto loaderr; } + } else if (!strcasecmp(argv[0],"hll-use-loglogbeta") && argc == 2) { + if ((server.hll_use_loglogbeta = yesnotoi(argv[1])) == -1) { + err = "argument must be 'yes' or 'no'"; goto loaderr; + } } else { err = "Bad directive or wrong number of arguments"; goto loaderr; } @@ -980,7 +984,9 @@ void configSetCommand(client *c) { } config_set_bool_field( "slave-lazy-flush",server.repl_slave_lazy_flush) { } config_set_bool_field( - "no-appendfsync-on-rewrite",server.aof_no_fsync_on_rewrite) { + "no-appendfsync-on-rewrite",server.aof_no_fsync_on_rewrite) { + } config_set_bool_field( + "hll-use-loglogbeta",server.hll_use_loglogbeta) { /* Numerical fields. 
* config_set_numerical_field(name,var,min,max) */ @@ -1245,6 +1251,8 @@ void configGetCommand(client *c) { server.lazyfree_lazy_server_del); config_get_bool_field("slave-lazy-flush", server.repl_slave_lazy_flush); + config_get_bool_field("hll-use-loglogbeta", + server.hll_use_loglogbeta); /* Enum values */ config_get_enum_field("maxmemory-policy", @@ -1963,6 +1971,7 @@ int rewriteConfig(char *path) { rewriteConfigYesNoOption(state,"lazyfree-lazy-expire",server.lazyfree_lazy_expire,CONFIG_DEFAULT_LAZYFREE_LAZY_EXPIRE); rewriteConfigYesNoOption(state,"lazyfree-lazy-server-del",server.lazyfree_lazy_server_del,CONFIG_DEFAULT_LAZYFREE_LAZY_SERVER_DEL); rewriteConfigYesNoOption(state,"slave-lazy-flush",server.repl_slave_lazy_flush,CONFIG_DEFAULT_SLAVE_LAZY_FLUSH); + rewriteConfigYesNoOption(state,"hll-use-loglogbeta",server.hll_use_loglogbeta,CONFIG_DEFAULT_HLL_USE_LOGLOGBETA); /* Rewrite Sentinel config if in Sentinel mode. */ if (server.sentinel_mode) rewriteConfigSentinelOption(state); diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 8ccc16be..67a92872 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -993,33 +993,51 @@ uint64_t hllCount(struct hllhdr *hdr, int *invalid) { } else { serverPanic("Unknown HyperLogLog encoding in hllCount()"); } + + if(server.hll_use_loglogbeta) { + /* For loglog-beta there is a single formula to compute + * cardinality for the entire range + */ - /* Muliply the inverse of E for alpha_m * m^2 to have the raw estimate. */ - E = (1/E)*alpha*m*m; + double zl = log(ez + 1); + double beta = -0.370393911*ez + + 0.070471823*zl + + 0.17393686*pow(zl,2) + + 0.16339839*pow(zl,3) + + -0.09237745*pow(zl,4) + + 0.03738027*pow(zl,5) + + -0.005384159*pow(zl,6) + + 0.00042419*pow(zl,7); + + E = alpha*m*(m-ez)*(1/(E+beta)); + } else { + /* Multiply the inverse of E for alpha_m * m^2 to have the raw estimate. */ + E = (1/E)*alpha*m*m; - /* Use the LINEARCOUNTING algorithm for small cardinalities. 
- * For larger values but up to 72000 HyperLogLog raw approximation is - * used since linear counting error starts to increase. However HyperLogLog - * shows a strong bias in the range 2.5*16384 - 72000, so we try to - * compensate for it. */ - if (E < m*2.5 && ez != 0) { - E = m*log(m/ez); /* LINEARCOUNTING() */ - } else if (m == 16384 && E < 72000) { - /* We did polynomial regression of the bias for this range, this - * way we can compute the bias for a given cardinality and correct - * according to it. Only apply the correction for P=14 that's what - * we use and the value the correction was verified with. */ - double bias = 5.9119*1.0e-18*(E*E*E*E) - -1.4253*1.0e-12*(E*E*E)+ - 1.2940*1.0e-7*(E*E) - -5.2921*1.0e-3*E+ - 83.3216; - E -= E*(bias/100); + /* Use the LINEARCOUNTING algorithm for small cardinalities. + * For larger values but up to 72000 HyperLogLog raw approximation is + * used since linear counting error starts to increase. However HyperLogLog + * shows a strong bias in the range 2.5*16384 - 72000, so we try to + * compensate for it. */ + if (E < m*2.5 && ez != 0) { + E = m*log(m/ez); /* LINEARCOUNTING() */ + } else if (m == 16384 && E < 72000) { + /* We did polynomial regression of the bias for this range, this + * way we can compute the bias for a given cardinality and correct + * according to it. Only apply the correction for P=14 that's what + * we use and the value the correction was verified with. */ + double bias = 5.9119*1.0e-18*(E*E*E*E) + -1.4253*1.0e-12*(E*E*E)+ + 1.2940*1.0e-7*(E*E) + -5.2921*1.0e-3*E+ + 83.3216; + E -= E*(bias/100); + } + /* We don't apply the correction for E > 1/30 of 2^32 since we use + * a 64 bit function and 6 bit counters. To apply the correction for + * 1/30 of 2^64 is not needed since it would require a huge set + * to approach such a value. */ } - /* We don't apply the correction for E > 1/30 of 2^32 since we use - * a 64 bit function and 6 bit counters. 
To apply the correction for - * 1/30 of 2^64 is not needed since it would require a huge set - * to approach such a value. */ return (uint64_t) E; } diff --git a/src/server.c b/src/server.c index 0dc62c2b..f2049700 100644 --- a/src/server.c +++ b/src/server.c @@ -1400,6 +1400,7 @@ void initServerConfig(void) { server.lazyfree_lazy_eviction = CONFIG_DEFAULT_LAZYFREE_LAZY_EVICTION; server.lazyfree_lazy_expire = CONFIG_DEFAULT_LAZYFREE_LAZY_EXPIRE; server.lazyfree_lazy_server_del = CONFIG_DEFAULT_LAZYFREE_LAZY_SERVER_DEL; + server.hll_use_loglogbeta = CONFIG_DEFAULT_HLL_USE_LOGLOGBETA; server.lruclock = getLRUClock(); resetServerSaveParams(); diff --git a/src/server.h b/src/server.h index 7ff151de..07df986c 100644 --- a/src/server.h +++ b/src/server.h @@ -151,6 +151,7 @@ typedef long long mstime_t; /* millisecond time type. */ #define CONFIG_DEFAULT_LAZYFREE_LAZY_EVICTION 0 #define CONFIG_DEFAULT_LAZYFREE_LAZY_EXPIRE 0 #define CONFIG_DEFAULT_LAZYFREE_LAZY_SERVER_DEL 0 +#define CONFIG_DEFAULT_HLL_USE_LOGLOGBETA 0 #define ACTIVE_EXPIRE_CYCLE_LOOKUPS_PER_LOOP 20 /* Loopkups per loop. */ #define ACTIVE_EXPIRE_CYCLE_FAST_DURATION 1000 /* Microseconds */ @@ -1149,6 +1150,7 @@ struct redisServer { int watchdog_period; /* Software watchdog period in ms. 0 = off */ /* System hardware info */ size_t system_memory_size; /* Total memory in system as reported by OS */ + int hll_use_loglogbeta; /* Use loglog-beta algorithm for HLL */ }; typedef struct pubsubPattern {