From 6470b21f59d0c73fac2a08a477ed4ff1770ad3aa Mon Sep 17 00:00:00 2001 From: Otmar Ertl Date: Sat, 10 Mar 2018 20:09:41 +0100 Subject: [PATCH 1/6] replaced tab by spaces --- src/hyperloglog.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hyperloglog.c b/src/hyperloglog.c index ef33979a..8ab9d2a3 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -401,11 +401,11 @@ uint64_t MurmurHash64A (const void * key, int len, unsigned int seed) { uint64_t k; #if (BYTE_ORDER == LITTLE_ENDIAN) - #ifdef USE_ALIGNED_ACCESS - memcpy(&k,data,sizeof(uint64_t)); - #else + #ifdef USE_ALIGNED_ACCESS + memcpy(&k,data,sizeof(uint64_t)); + #else k = *((uint64_t*)data); - #endif + #endif #else k = (uint64_t) data[0]; k |= (uint64_t) data[1] << 8; From 1e9a7748716e1cd234893dd858d07ffa77920e41 Mon Sep 17 00:00:00 2001 From: Otmar Ertl Date: Sat, 10 Mar 2018 20:13:21 +0100 Subject: [PATCH 2/6] improved HyperLogLog cardinality estimation based on method described in https://arxiv.org/abs/1702.01284 that does not rely on any magic constants --- src/hyperloglog.c | 224 +++++++++++++++++++++++----------------------- 1 file changed, 114 insertions(+), 110 deletions(-) diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 8ab9d2a3..7f5f6244 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -192,6 +192,7 @@ struct hllhdr { #define HLL_VALID_CACHE(hdr) (((hdr)->card[7] & (1<<7)) == 0) #define HLL_P 14 /* The greater is P, the smaller the error. */ +#define HLL_Q (63-HLL_P) #define HLL_REGISTERS (1<> 6 | r[1] << 2) & 63; if (r1 == 0) ez++; - r2 = (r[1] >> 4 | r[2] << 4) & 63; if (r2 == 0) ez++; - r3 = (r[2] >> 2) & 63; if (r3 == 0) ez++; - r4 = r[3] & 63; if (r4 == 0) ez++; - r5 = (r[3] >> 6 | r[4] << 2) & 63; if (r5 == 0) ez++; - r6 = (r[4] >> 4 | r[5] << 4) & 63; if (r6 == 0) ez++; - r7 = (r[5] >> 2) & 63; if (r7 == 0) ez++; - r8 = r[6] & 63; if (r8 == 0) ez++; - r9 = (r[6] >> 6 | r[7] << 2) & 63; if (r9 == 0) ez++; - r10 = (r[7] >> 4 | r[8] << 4) & 63; if (r10 == 0) ez++; - r11 = (r[8] >> 2) & 63; if (r11 == 0) ez++; - r12 = r[9] & 63; if (r12 == 0) ez++; - r13 = (r[9] >> 6 | r[10] << 2) & 63; if (r13 == 0) ez++; - r14 = (r[10] >> 4 | r[11] << 4) & 63; if (r14 == 0) ez++; - r15 = (r[11] >> 2) & 63; if (r15 == 0) ez++; + r0 = r[0] & 63; + r1 = (r[0] >> 6 | r[1] << 2) & 63; + r2 = (r[1] >> 4 | r[2] << 4) & 63; + r3 = (r[2] >> 2) & 63; + r4 = r[3] & 63; + r5 = (r[3] >> 6 | r[4] << 2) & 63; + r6 = (r[4] >> 4 | r[5] << 4) & 63; + r7 = (r[5] >> 2) & 63; + r8 = r[6] & 63; + r9 = (r[6] >> 6 | r[7] << 2) & 63; + r10 = (r[7] >> 4 | r[8] << 4) & 63; + r11 = (r[8] >> 2) & 63; + r12 = r[9] & 63; + r13 = (r[9] >> 6 | r[10] << 2) & 63; + r14 = (r[10] >> 4 | r[11] << 4) & 63; + r15 = (r[11] >> 2) & 63; + + regHisto[r0] += 1; + regHisto[r1] += 1; + regHisto[r2] += 1; + regHisto[r3] += 1; + regHisto[r4] += 1; + regHisto[r5] += 1; + regHisto[r6] += 1; + regHisto[r7] += 1; + regHisto[r8] += 1; + regHisto[r9] += 1; + regHisto[r10] += 1; + regHisto[r11] += 1; + regHisto[r12] += 1; + regHisto[r13] += 1; + regHisto[r14] += 1; + regHisto[r15] += 1; - /* Additional parens will allow the compiler to optimize the - * code more with a loss of precision that is not very relevant - * here (floating point math is not commutative!). */ - E += (PE[r0] + PE[r1]) + (PE[r2] + PE[r3]) + (PE[r4] + PE[r5]) + - (PE[r6] + PE[r7]) + (PE[r8] + PE[r9]) + (PE[r10] + PE[r11]) + - (PE[r12] + PE[r13]) + (PE[r14] + PE[r15]); r += 12; } } else { - for (j = 0; j < HLL_REGISTERS; j++) { + for(j = 0; j < HLL_REGISTERS; j++) { unsigned long reg; - HLL_DENSE_GET_REGISTER(reg,registers,j); - if (reg == 0) { - ez++; - /* Increment E at the end of the loop. */ - } else { - E += PE[reg]; /* Precomputed 2^(-reg[j]). */ - } + regHisto[reg] += 1; } - E += ez; /* Add 2^0 'ez' times. */ } - *ezp = ez; - return E; } /* ================== Sparse representation implementation ================= */ @@ -903,76 +902,96 @@ int hllSparseAdd(robj *o, unsigned char *ele, size_t elesize) { return hllSparseSet(o,index,count); } -/* Compute SUM(2^-reg) in the sparse representation. - * PE is an array with a pre-computer table of values 2^-reg indexed by reg. - * As a side effect the integer pointed by 'ezp' is set to the number - * of zero registers. */ -double hllSparseSum(uint8_t *sparse, int sparselen, double *PE, int *ezp, int *invalid) { - double E = 0; - int ez = 0, idx = 0, runlen, regval; +/* Compute the register histogram in the sparse representation. */ +void hllSparseRegHisto(uint8_t *sparse, int sparselen, int *invalid, int* regHisto) { + int idx = 0, runlen, regval; uint8_t *end = sparse+sparselen, *p = sparse; while(p < end) { if (HLL_SPARSE_IS_ZERO(p)) { runlen = HLL_SPARSE_ZERO_LEN(p); idx += runlen; - ez += runlen; - /* Increment E at the end of the loop. */ + regHisto[0] += runlen; p++; } else if (HLL_SPARSE_IS_XZERO(p)) { runlen = HLL_SPARSE_XZERO_LEN(p); idx += runlen; - ez += runlen; - /* Increment E at the end of the loop. */ + regHisto[0] += runlen; p += 2; } else { runlen = HLL_SPARSE_VAL_LEN(p); regval = HLL_SPARSE_VAL_VALUE(p); idx += runlen; - E += PE[regval]*runlen; + regHisto[regval] += runlen; p++; } } if (idx != HLL_REGISTERS && invalid) *invalid = 1; - E += ez; /* Add 2^0 'ez' times. */ - *ezp = ez; - return E; } /* ========================= HyperLogLog Count ============================== * This is the core of the algorithm where the approximated count is computed. - * The function uses the lower level hllDenseSum() and hllSparseSum() functions - * as helpers to compute the SUM(2^-reg) part of the computation, which is - * representation-specific, while all the rest is common. */ + * The function uses the lower level hllDenseRegHisto() and hllSparseRegHisto() + * functions as helpers to compute histogram of register values part of the + * computation, which is representation-specific, while all the rest is common. */ -/* Implements the SUM operation for uint8_t data type which is only used - * internally as speedup for PFCOUNT with multiple keys. */ -double hllRawSum(uint8_t *registers, double *PE, int *ezp) { - double E = 0; - int j, ez = 0; +/* Implements the register histogram calculation for uint8_t data type + * which is only used internally as speedup for PFCOUNT with multiple keys. */ +void hllRawRegHisto(uint8_t *registers, int* regHisto) { uint64_t *word = (uint64_t*) registers; uint8_t *bytes; + int j; for (j = 0; j < HLL_REGISTERS/8; j++) { if (*word == 0) { - ez += 8; + regHisto[0] += 8; } else { bytes = (uint8_t*) word; - if (bytes[0]) E += PE[bytes[0]]; else ez++; - if (bytes[1]) E += PE[bytes[1]]; else ez++; - if (bytes[2]) E += PE[bytes[2]]; else ez++; - if (bytes[3]) E += PE[bytes[3]]; else ez++; - if (bytes[4]) E += PE[bytes[4]]; else ez++; - if (bytes[5]) E += PE[bytes[5]]; else ez++; - if (bytes[6]) E += PE[bytes[6]]; else ez++; - if (bytes[7]) E += PE[bytes[7]]; else ez++; + regHisto[bytes[0]] += 1; + regHisto[bytes[1]] += 1; + regHisto[bytes[2]] += 1; + regHisto[bytes[3]] += 1; + regHisto[bytes[4]] += 1; + regHisto[bytes[5]] += 1; + regHisto[bytes[6]] += 1; + regHisto[bytes[7]] += 1; } word++; } - E += ez; /* 2^(-reg[j]) is 1 when m is 0, add it 'ez' times for every - zero register in the HLL. */ - *ezp = ez; - return E; +} + +/* Helper function sigma as defined in + * "New cardinality estimation algorithms for HyperLogLog sketches" + * Otmar Ertl, arXiv:1702.01284 */ +double hllSigma(double x) { + if (x == 1.) return INFINITY; + double zPrime; + double y = 1; + double z = x; + do { + x *= x; + zPrime = z; + z += x * y; + y += y; + } while(zPrime != z); + return z; +} + +/* Helper function tau as defined in + * "New cardinality estimation algorithms for HyperLogLog sketches" + * Otmar Ertl, arXiv:1702.01284 */ +double hllTau(double x) { + if (x == 0. || x == 1.) return 0.; + double zPrime; + double y = 1.0; + double z = 1 - x; + do { + x = sqrt(x); + zPrime = z; + y *= 0.5; + z -= pow(1 - x, 2)*y; + } while(zPrime != z); + return z / 3; } /* Return the approximated cardinality of the set based on the harmonic @@ -988,49 +1007,34 @@ double hllRawSum(uint8_t *registers, double *PE, int *ezp) { * keys (no need to work with 6-bit integers encoding). */ uint64_t hllCount(struct hllhdr *hdr, int *invalid) { double m = HLL_REGISTERS; - double E, alpha = 0.7213/(1+1.079/m); - int j, ez; /* Number of registers equal to 0. */ + double E; + int j; + double alphaInf = 0.5 / log(2.); + int regHisto[HLL_Q+2] = {0}; - /* We precompute 2^(-reg[j]) in a small table in order to - * speedup the computation of SUM(2^-register[0..i]). */ - static int initialized = 0; - static double PE[64]; - if (!initialized) { - PE[0] = 1; /* 2^(-reg[j]) is 1 when m is 0. */ - for (j = 1; j < 64; j++) { - /* 2^(-reg[j]) is the same as 1/2^reg[j]. */ - PE[j] = 1.0/(1ULL << j); - } - initialized = 1; - } - - /* Compute SUM(2^-register[0..i]). */ + /* Compute register histogram */ if (hdr->encoding == HLL_DENSE) { - E = hllDenseSum(hdr->registers,PE,&ez); + hllDenseRegHisto(hdr->registers,regHisto); } else if (hdr->encoding == HLL_SPARSE) { - E = hllSparseSum(hdr->registers, - sdslen((sds)hdr)-HLL_HDR_SIZE,PE,&ez,invalid); + hllSparseRegHisto(hdr->registers, + sdslen((sds)hdr)-HLL_HDR_SIZE,invalid,regHisto); } else if (hdr->encoding == HLL_RAW) { - E = hllRawSum(hdr->registers,PE,&ez); + hllRawRegHisto(hdr->registers,regHisto); } else { serverPanic("Unknown HyperLogLog encoding in hllCount()"); } - /* Apply loglog-beta to the raw estimate. See: - * "LogLog-Beta and More: A New Algorithm for Cardinality Estimation - * Based on LogLog Counting" Jason Qin, Denys Kim, Yumei Tung - * arXiv:1612.02284 */ - double zl = log(ez + 1); - double beta = -0.370393911*ez + - 0.070471823*zl + - 0.17393686*pow(zl,2) + - 0.16339839*pow(zl,3) + - -0.09237745*pow(zl,4) + - 0.03738027*pow(zl,5) + - -0.005384159*pow(zl,6) + - 0.00042419*pow(zl,7); + /* Estimate cardinality form register histogram. See: + * "New cardinality estimation algorithms for HyperLogLog sketches" + * Otmar Ertl, arXiv:1702.01284 */ + double z = m * hllTau((m-regHisto[HLL_Q+1])/(double)m); + for (j = HLL_Q; j >= 1; --j) { + z += regHisto[j]; + z *= 0.5; + } + z += m * hllSigma(regHisto[0]/(double)m); + E = llroundl(alphaInf*m*m/z); - E = llroundl(alpha*m*(m-ez)*(1/(E+beta))); return (uint64_t) E; } From 633983d4796a48949a0268c8593b26ff5f0206e2 Mon Sep 17 00:00:00 2001 From: Otmar Ertl Date: Sat, 10 Mar 2018 20:22:42 +0100 Subject: [PATCH 3/6] improved definition of HLL_Q --- src/hyperloglog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 7f5f6244..c6c7edaa 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -192,11 +192,11 @@ struct hllhdr { #define HLL_VALID_CACHE(hdr) (((hdr)->card[7] & (1<<7)) == 0) #define HLL_P 14 /* The greater is P, the smaller the error. */ -#define HLL_Q (63-HLL_P) #define HLL_REGISTERS (1< Date: Sat, 10 Mar 2018 20:44:20 +0100 Subject: [PATCH 4/6] made constant static --- src/hyperloglog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hyperloglog.c b/src/hyperloglog.c index c6c7edaa..c97c4364 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -1009,7 +1009,7 @@ uint64_t hllCount(struct hllhdr *hdr, int *invalid) { double m = HLL_REGISTERS; double E; int j; - double alphaInf = 0.5 / log(2.); + static double alphaInf = 0.5 / log(2.); int regHisto[HLL_Q+2] = {0}; /* Compute register histogram */ From 97bde9f6236b65aa5a9165554f7ca690b59c2903 Mon Sep 17 00:00:00 2001 From: Otmar Ertl Date: Sun, 11 Mar 2018 09:18:00 +0100 Subject: [PATCH 5/6] use all 64 bits of the hash value instead of 63 --- src/hyperloglog.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/hyperloglog.c b/src/hyperloglog.c index c97c4364..9fa8eb43 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -192,11 +192,12 @@ struct hllhdr { #define HLL_VALID_CACHE(hdr) (((hdr)->card[7] & (1<<7)) == 0) #define HLL_P 14 /* The greater is P, the smaller the error. */ +#define HLL_Q (64-HLL_P) /* The number of bits of the hash value used for + determining the number of leading zeros. */ #define HLL_REGISTERS (1<>= HLL_P; /* Remove bits used to address the register. */ + hash |= ((uint64_t)1< Date: Wed, 14 Mar 2018 21:00:06 +0100 Subject: [PATCH 6/6] fixed compilation error when using clang as reported by michael-grunder --- src/hyperloglog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hyperloglog.c b/src/hyperloglog.c index 9fa8eb43..f7f1b343 100644 --- a/src/hyperloglog.c +++ b/src/hyperloglog.c @@ -386,6 +386,7 @@ static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected\r\n"; *(p) = (_l>>8) | HLL_SPARSE_XZERO_BIT; \ *((p)+1) = (_l&0xff); \ } while(0) +#define HLL_ALPHA_INF 0.721347520444481703680 /* constant for 0.5/ln(2) */ /* ========================= HyperLogLog algorithm ========================= */ @@ -1012,7 +1013,6 @@ uint64_t hllCount(struct hllhdr *hdr, int *invalid) { double m = HLL_REGISTERS; double E; int j; - static double alphaInf = 0.5 / log(2.); int regHisto[HLL_Q+2] = {0}; /* Compute register histogram */ @@ -1036,7 +1036,7 @@ uint64_t hllCount(struct hllhdr *hdr, int *invalid) { z *= 0.5; } z += m * hllSigma(regHisto[0]/(double)m); - E = llroundl(alphaInf*m*m/z); + E = llroundl(HLL_ALPHA_INF*m*m/z); return (uint64_t) E; }