mirror of
https://github.com/fluencelabs/redis
synced 2025-03-19 09:00:51 +00:00
Merge branch 'hyperloglog-improvement' of https://github.com/oertl/redis into oertl-hyperloglog-improvement
This commit is contained in:
commit
7e3c583532
@ -192,6 +192,8 @@ struct hllhdr {
|
|||||||
#define HLL_VALID_CACHE(hdr) (((hdr)->card[7] & (1<<7)) == 0)
|
#define HLL_VALID_CACHE(hdr) (((hdr)->card[7] & (1<<7)) == 0)
|
||||||
|
|
||||||
#define HLL_P 14 /* The greater is P, the smaller the error. */
|
#define HLL_P 14 /* The greater is P, the smaller the error. */
|
||||||
|
#define HLL_Q (64-HLL_P) /* The number of bits of the hash value used for
|
||||||
|
determining the number of leading zeros. */
|
||||||
#define HLL_REGISTERS (1<<HLL_P) /* With P=14, 16384 registers. */
|
#define HLL_REGISTERS (1<<HLL_P) /* With P=14, 16384 registers. */
|
||||||
#define HLL_P_MASK (HLL_REGISTERS-1) /* Mask to index register. */
|
#define HLL_P_MASK (HLL_REGISTERS-1) /* Mask to index register. */
|
||||||
#define HLL_BITS 6 /* Enough to count up to 63 leading zeroes. */
|
#define HLL_BITS 6 /* Enough to count up to 63 leading zeroes. */
|
||||||
@ -384,6 +386,7 @@ static char *invalid_hll_err = "-INVALIDOBJ Corrupted HLL object detected\r\n";
|
|||||||
*(p) = (_l>>8) | HLL_SPARSE_XZERO_BIT; \
|
*(p) = (_l>>8) | HLL_SPARSE_XZERO_BIT; \
|
||||||
*((p)+1) = (_l&0xff); \
|
*((p)+1) = (_l&0xff); \
|
||||||
} while(0)
|
} while(0)
|
||||||
|
#define HLL_ALPHA_INF 0.721347520444481703680 /* constant for 0.5/ln(2) */
|
||||||
|
|
||||||
/* ========================= HyperLogLog algorithm ========================= */
|
/* ========================= HyperLogLog algorithm ========================= */
|
||||||
|
|
||||||
@ -451,7 +454,7 @@ int hllPatLen(unsigned char *ele, size_t elesize, long *regp) {
|
|||||||
|
|
||||||
/* Count the number of zeroes starting from bit HLL_REGISTERS
|
/* Count the number of zeroes starting from bit HLL_REGISTERS
|
||||||
* (that is a power of two corresponding to the first bit we don't use
|
* (that is a power of two corresponding to the first bit we don't use
|
||||||
* as index). The max run can be 64-P+1 bits.
|
* as index). The max run can be 64-P+1 = Q+1 bits.
|
||||||
*
|
*
|
||||||
* Note that the final "1" ending the sequence of zeroes must be
|
* Note that the final "1" ending the sequence of zeroes must be
|
||||||
* included in the count, so if we find "001" the count is 3, and
|
* included in the count, so if we find "001" the count is 3, and
|
||||||
@ -462,8 +465,10 @@ int hllPatLen(unsigned char *ele, size_t elesize, long *regp) {
|
|||||||
* there are high probabilities to find a 1 after a few iterations. */
|
* there are high probabilities to find a 1 after a few iterations. */
|
||||||
hash = MurmurHash64A(ele,elesize,0xadc83b19ULL);
|
hash = MurmurHash64A(ele,elesize,0xadc83b19ULL);
|
||||||
index = hash & HLL_P_MASK; /* Register index. */
|
index = hash & HLL_P_MASK; /* Register index. */
|
||||||
hash |= ((uint64_t)1<<63); /* Make sure the loop terminates. */
|
hash >>= HLL_P; /* Remove bits used to address the register. */
|
||||||
bit = HLL_REGISTERS; /* First bit not used to address the register. */
|
hash |= ((uint64_t)1<<HLL_Q); /* Make sure the loop terminates
|
||||||
|
and count will be <= Q+1. */
|
||||||
|
bit = 1;
|
||||||
count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */
|
count = 1; /* Initialized to 1 since we count the "00000...1" pattern. */
|
||||||
while((hash & bit) == 0) {
|
while((hash & bit) == 0) {
|
||||||
count++;
|
count++;
|
||||||
@ -510,13 +515,9 @@ int hllDenseAdd(uint8_t *registers, unsigned char *ele, size_t elesize) {
|
|||||||
return hllDenseSet(registers,index,count);
|
return hllDenseSet(registers,index,count);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Compute SUM(2^-reg) in the dense representation.
|
/* Compute the register histogram in the dense representation. */
|
||||||
* PE is an array with a pre-computed table of values 2^-reg indexed by reg.
|
void hllDenseRegHisto(uint8_t *registers, int* regHisto) {
|
||||||
* As a side effect the integer pointed by 'ezp' is set to the number
|
int j;
|
||||||
* of zero registers. */
|
|
||||||
double hllDenseSum(uint8_t *registers, double *PE, int *ezp) {
|
|
||||||
double E = 0;
|
|
||||||
int j, ez = 0;
|
|
||||||
|
|
||||||
/* Redis default is to use 16384 registers 6 bits each. The code works
|
/* Redis default is to use 16384 registers 6 bits each. The code works
|
||||||
* with other values by modifying the defines, but for our target value
|
* with other values by modifying the defines, but for our target value
|
||||||
@ -527,47 +528,49 @@ double hllDenseSum(uint8_t *registers, double *PE, int *ezp) {
|
|||||||
r10, r11, r12, r13, r14, r15;
|
r10, r11, r12, r13, r14, r15;
|
||||||
for (j = 0; j < 1024; j++) {
|
for (j = 0; j < 1024; j++) {
|
||||||
/* Handle 16 registers per iteration. */
|
/* Handle 16 registers per iteration. */
|
||||||
r0 = r[0] & 63; if (r0 == 0) ez++;
|
r0 = r[0] & 63;
|
||||||
r1 = (r[0] >> 6 | r[1] << 2) & 63; if (r1 == 0) ez++;
|
r1 = (r[0] >> 6 | r[1] << 2) & 63;
|
||||||
r2 = (r[1] >> 4 | r[2] << 4) & 63; if (r2 == 0) ez++;
|
r2 = (r[1] >> 4 | r[2] << 4) & 63;
|
||||||
r3 = (r[2] >> 2) & 63; if (r3 == 0) ez++;
|
r3 = (r[2] >> 2) & 63;
|
||||||
r4 = r[3] & 63; if (r4 == 0) ez++;
|
r4 = r[3] & 63;
|
||||||
r5 = (r[3] >> 6 | r[4] << 2) & 63; if (r5 == 0) ez++;
|
r5 = (r[3] >> 6 | r[4] << 2) & 63;
|
||||||
r6 = (r[4] >> 4 | r[5] << 4) & 63; if (r6 == 0) ez++;
|
r6 = (r[4] >> 4 | r[5] << 4) & 63;
|
||||||
r7 = (r[5] >> 2) & 63; if (r7 == 0) ez++;
|
r7 = (r[5] >> 2) & 63;
|
||||||
r8 = r[6] & 63; if (r8 == 0) ez++;
|
r8 = r[6] & 63;
|
||||||
r9 = (r[6] >> 6 | r[7] << 2) & 63; if (r9 == 0) ez++;
|
r9 = (r[6] >> 6 | r[7] << 2) & 63;
|
||||||
r10 = (r[7] >> 4 | r[8] << 4) & 63; if (r10 == 0) ez++;
|
r10 = (r[7] >> 4 | r[8] << 4) & 63;
|
||||||
r11 = (r[8] >> 2) & 63; if (r11 == 0) ez++;
|
r11 = (r[8] >> 2) & 63;
|
||||||
r12 = r[9] & 63; if (r12 == 0) ez++;
|
r12 = r[9] & 63;
|
||||||
r13 = (r[9] >> 6 | r[10] << 2) & 63; if (r13 == 0) ez++;
|
r13 = (r[9] >> 6 | r[10] << 2) & 63;
|
||||||
r14 = (r[10] >> 4 | r[11] << 4) & 63; if (r14 == 0) ez++;
|
r14 = (r[10] >> 4 | r[11] << 4) & 63;
|
||||||
r15 = (r[11] >> 2) & 63; if (r15 == 0) ez++;
|
r15 = (r[11] >> 2) & 63;
|
||||||
|
|
||||||
|
regHisto[r0] += 1;
|
||||||
|
regHisto[r1] += 1;
|
||||||
|
regHisto[r2] += 1;
|
||||||
|
regHisto[r3] += 1;
|
||||||
|
regHisto[r4] += 1;
|
||||||
|
regHisto[r5] += 1;
|
||||||
|
regHisto[r6] += 1;
|
||||||
|
regHisto[r7] += 1;
|
||||||
|
regHisto[r8] += 1;
|
||||||
|
regHisto[r9] += 1;
|
||||||
|
regHisto[r10] += 1;
|
||||||
|
regHisto[r11] += 1;
|
||||||
|
regHisto[r12] += 1;
|
||||||
|
regHisto[r13] += 1;
|
||||||
|
regHisto[r14] += 1;
|
||||||
|
regHisto[r15] += 1;
|
||||||
|
|
||||||
/* Additional parens will allow the compiler to optimize the
|
|
||||||
* code more with a loss of precision that is not very relevant
|
|
||||||
* here (floating point math is not associative!). */
|
|
||||||
E += (PE[r0] + PE[r1]) + (PE[r2] + PE[r3]) + (PE[r4] + PE[r5]) +
|
|
||||||
(PE[r6] + PE[r7]) + (PE[r8] + PE[r9]) + (PE[r10] + PE[r11]) +
|
|
||||||
(PE[r12] + PE[r13]) + (PE[r14] + PE[r15]);
|
|
||||||
r += 12;
|
r += 12;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
for (j = 0; j < HLL_REGISTERS; j++) {
|
for(j = 0; j < HLL_REGISTERS; j++) {
|
||||||
unsigned long reg;
|
unsigned long reg;
|
||||||
|
|
||||||
HLL_DENSE_GET_REGISTER(reg,registers,j);
|
HLL_DENSE_GET_REGISTER(reg,registers,j);
|
||||||
if (reg == 0) {
|
regHisto[reg] += 1;
|
||||||
ez++;
|
|
||||||
/* Increment E at the end of the loop. */
|
|
||||||
} else {
|
|
||||||
E += PE[reg]; /* Precomputed 2^(-reg[j]). */
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
E += ez; /* Add 2^0 'ez' times. */
|
|
||||||
}
|
|
||||||
*ezp = ez;
|
|
||||||
return E;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ================== Sparse representation implementation ================= */
|
/* ================== Sparse representation implementation ================= */
|
||||||
@ -903,76 +906,96 @@ int hllSparseAdd(robj *o, unsigned char *ele, size_t elesize) {
|
|||||||
return hllSparseSet(o,index,count);
|
return hllSparseSet(o,index,count);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Compute SUM(2^-reg) in the sparse representation.
|
/* Compute the register histogram in the sparse representation. */
|
||||||
* PE is an array with a pre-computed table of values 2^-reg indexed by reg.
|
void hllSparseRegHisto(uint8_t *sparse, int sparselen, int *invalid, int* regHisto) {
|
||||||
* As a side effect the integer pointed by 'ezp' is set to the number
|
int idx = 0, runlen, regval;
|
||||||
* of zero registers. */
|
|
||||||
double hllSparseSum(uint8_t *sparse, int sparselen, double *PE, int *ezp, int *invalid) {
|
|
||||||
double E = 0;
|
|
||||||
int ez = 0, idx = 0, runlen, regval;
|
|
||||||
uint8_t *end = sparse+sparselen, *p = sparse;
|
uint8_t *end = sparse+sparselen, *p = sparse;
|
||||||
|
|
||||||
while(p < end) {
|
while(p < end) {
|
||||||
if (HLL_SPARSE_IS_ZERO(p)) {
|
if (HLL_SPARSE_IS_ZERO(p)) {
|
||||||
runlen = HLL_SPARSE_ZERO_LEN(p);
|
runlen = HLL_SPARSE_ZERO_LEN(p);
|
||||||
idx += runlen;
|
idx += runlen;
|
||||||
ez += runlen;
|
regHisto[0] += runlen;
|
||||||
/* Increment E at the end of the loop. */
|
|
||||||
p++;
|
p++;
|
||||||
} else if (HLL_SPARSE_IS_XZERO(p)) {
|
} else if (HLL_SPARSE_IS_XZERO(p)) {
|
||||||
runlen = HLL_SPARSE_XZERO_LEN(p);
|
runlen = HLL_SPARSE_XZERO_LEN(p);
|
||||||
idx += runlen;
|
idx += runlen;
|
||||||
ez += runlen;
|
regHisto[0] += runlen;
|
||||||
/* Increment E at the end of the loop. */
|
|
||||||
p += 2;
|
p += 2;
|
||||||
} else {
|
} else {
|
||||||
runlen = HLL_SPARSE_VAL_LEN(p);
|
runlen = HLL_SPARSE_VAL_LEN(p);
|
||||||
regval = HLL_SPARSE_VAL_VALUE(p);
|
regval = HLL_SPARSE_VAL_VALUE(p);
|
||||||
idx += runlen;
|
idx += runlen;
|
||||||
E += PE[regval]*runlen;
|
regHisto[regval] += runlen;
|
||||||
p++;
|
p++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (idx != HLL_REGISTERS && invalid) *invalid = 1;
|
if (idx != HLL_REGISTERS && invalid) *invalid = 1;
|
||||||
E += ez; /* Add 2^0 'ez' times. */
|
|
||||||
*ezp = ez;
|
|
||||||
return E;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* ========================= HyperLogLog Count ==============================
|
/* ========================= HyperLogLog Count ==============================
|
||||||
* This is the core of the algorithm where the approximated count is computed.
|
* This is the core of the algorithm where the approximated count is computed.
|
||||||
* The function uses the lower level hllDenseSum() and hllSparseSum() functions
|
* The function uses the lower level hllDenseRegHisto() and hllSparseRegHisto()
|
||||||
* as helpers to compute the SUM(2^-reg) part of the computation, which is
|
* functions as helpers to compute the register histogram part of the
|
||||||
* representation-specific, while all the rest is common. */
|
* computation, which is representation-specific, while all the rest is common. */
|
||||||
|
|
||||||
/* Implements the SUM operation for uint8_t data type which is only used
|
/* Implements the register histogram calculation for uint8_t data type
|
||||||
* internally as speedup for PFCOUNT with multiple keys. */
|
* which is only used internally as speedup for PFCOUNT with multiple keys. */
|
||||||
double hllRawSum(uint8_t *registers, double *PE, int *ezp) {
|
void hllRawRegHisto(uint8_t *registers, int* regHisto) {
|
||||||
double E = 0;
|
|
||||||
int j, ez = 0;
|
|
||||||
uint64_t *word = (uint64_t*) registers;
|
uint64_t *word = (uint64_t*) registers;
|
||||||
uint8_t *bytes;
|
uint8_t *bytes;
|
||||||
|
int j;
|
||||||
|
|
||||||
for (j = 0; j < HLL_REGISTERS/8; j++) {
|
for (j = 0; j < HLL_REGISTERS/8; j++) {
|
||||||
if (*word == 0) {
|
if (*word == 0) {
|
||||||
ez += 8;
|
regHisto[0] += 8;
|
||||||
} else {
|
} else {
|
||||||
bytes = (uint8_t*) word;
|
bytes = (uint8_t*) word;
|
||||||
if (bytes[0]) E += PE[bytes[0]]; else ez++;
|
regHisto[bytes[0]] += 1;
|
||||||
if (bytes[1]) E += PE[bytes[1]]; else ez++;
|
regHisto[bytes[1]] += 1;
|
||||||
if (bytes[2]) E += PE[bytes[2]]; else ez++;
|
regHisto[bytes[2]] += 1;
|
||||||
if (bytes[3]) E += PE[bytes[3]]; else ez++;
|
regHisto[bytes[3]] += 1;
|
||||||
if (bytes[4]) E += PE[bytes[4]]; else ez++;
|
regHisto[bytes[4]] += 1;
|
||||||
if (bytes[5]) E += PE[bytes[5]]; else ez++;
|
regHisto[bytes[5]] += 1;
|
||||||
if (bytes[6]) E += PE[bytes[6]]; else ez++;
|
regHisto[bytes[6]] += 1;
|
||||||
if (bytes[7]) E += PE[bytes[7]]; else ez++;
|
regHisto[bytes[7]] += 1;
|
||||||
}
|
}
|
||||||
word++;
|
word++;
|
||||||
}
|
}
|
||||||
E += ez; /* 2^(-reg[j]) is 1 when m is 0, add it 'ez' times for every
|
}
|
||||||
zero register in the HLL. */
|
|
||||||
*ezp = ez;
|
/* Helper function sigma as defined in
|
||||||
return E;
|
* "New cardinality estimation algorithms for HyperLogLog sketches"
|
||||||
|
* Otmar Ertl, arXiv:1702.01284 */
|
||||||
|
double hllSigma(double x) {
|
||||||
|
if (x == 1.) return INFINITY;
|
||||||
|
double zPrime;
|
||||||
|
double y = 1;
|
||||||
|
double z = x;
|
||||||
|
do {
|
||||||
|
x *= x;
|
||||||
|
zPrime = z;
|
||||||
|
z += x * y;
|
||||||
|
y += y;
|
||||||
|
} while(zPrime != z);
|
||||||
|
return z;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Helper function tau as defined in
|
||||||
|
* "New cardinality estimation algorithms for HyperLogLog sketches"
|
||||||
|
* Otmar Ertl, arXiv:1702.01284 */
|
||||||
|
double hllTau(double x) {
|
||||||
|
if (x == 0. || x == 1.) return 0.;
|
||||||
|
double zPrime;
|
||||||
|
double y = 1.0;
|
||||||
|
double z = 1 - x;
|
||||||
|
do {
|
||||||
|
x = sqrt(x);
|
||||||
|
zPrime = z;
|
||||||
|
y *= 0.5;
|
||||||
|
z -= pow(1 - x, 2)*y;
|
||||||
|
} while(zPrime != z);
|
||||||
|
return z / 3;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Return the approximated cardinality of the set based on the harmonic
|
/* Return the approximated cardinality of the set based on the harmonic
|
||||||
@ -988,49 +1011,33 @@ double hllRawSum(uint8_t *registers, double *PE, int *ezp) {
|
|||||||
* keys (no need to work with 6-bit integers encoding). */
|
* keys (no need to work with 6-bit integers encoding). */
|
||||||
uint64_t hllCount(struct hllhdr *hdr, int *invalid) {
|
uint64_t hllCount(struct hllhdr *hdr, int *invalid) {
|
||||||
double m = HLL_REGISTERS;
|
double m = HLL_REGISTERS;
|
||||||
double E, alpha = 0.7213/(1+1.079/m);
|
double E;
|
||||||
int j, ez; /* Number of registers equal to 0. */
|
int j;
|
||||||
|
int regHisto[HLL_Q+2] = {0};
|
||||||
|
|
||||||
/* We precompute 2^(-reg[j]) in a small table in order to
|
/* Compute register histogram */
|
||||||
* speedup the computation of SUM(2^-register[0..i]). */
|
|
||||||
static int initialized = 0;
|
|
||||||
static double PE[64];
|
|
||||||
if (!initialized) {
|
|
||||||
PE[0] = 1; /* 2^(-reg[j]) is 1 when m is 0. */
|
|
||||||
for (j = 1; j < 64; j++) {
|
|
||||||
/* 2^(-reg[j]) is the same as 1/2^reg[j]. */
|
|
||||||
PE[j] = 1.0/(1ULL << j);
|
|
||||||
}
|
|
||||||
initialized = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Compute SUM(2^-register[0..i]). */
|
|
||||||
if (hdr->encoding == HLL_DENSE) {
|
if (hdr->encoding == HLL_DENSE) {
|
||||||
E = hllDenseSum(hdr->registers,PE,&ez);
|
hllDenseRegHisto(hdr->registers,regHisto);
|
||||||
} else if (hdr->encoding == HLL_SPARSE) {
|
} else if (hdr->encoding == HLL_SPARSE) {
|
||||||
E = hllSparseSum(hdr->registers,
|
hllSparseRegHisto(hdr->registers,
|
||||||
sdslen((sds)hdr)-HLL_HDR_SIZE,PE,&ez,invalid);
|
sdslen((sds)hdr)-HLL_HDR_SIZE,invalid,regHisto);
|
||||||
} else if (hdr->encoding == HLL_RAW) {
|
} else if (hdr->encoding == HLL_RAW) {
|
||||||
E = hllRawSum(hdr->registers,PE,&ez);
|
hllRawRegHisto(hdr->registers,regHisto);
|
||||||
} else {
|
} else {
|
||||||
serverPanic("Unknown HyperLogLog encoding in hllCount()");
|
serverPanic("Unknown HyperLogLog encoding in hllCount()");
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Apply loglog-beta to the raw estimate. See:
|
/* Estimate cardinality from register histogram. See:
|
||||||
* "LogLog-Beta and More: A New Algorithm for Cardinality Estimation
|
* "New cardinality estimation algorithms for HyperLogLog sketches"
|
||||||
* Based on LogLog Counting" Jason Qin, Denys Kim, Yumei Tung
|
* Otmar Ertl, arXiv:1702.01284 */
|
||||||
* arXiv:1612.02284 */
|
double z = m * hllTau((m-regHisto[HLL_Q+1])/(double)m);
|
||||||
double zl = log(ez + 1);
|
for (j = HLL_Q; j >= 1; --j) {
|
||||||
double beta = -0.370393911*ez +
|
z += regHisto[j];
|
||||||
0.070471823*zl +
|
z *= 0.5;
|
||||||
0.17393686*pow(zl,2) +
|
}
|
||||||
0.16339839*pow(zl,3) +
|
z += m * hllSigma(regHisto[0]/(double)m);
|
||||||
-0.09237745*pow(zl,4) +
|
E = llroundl(HLL_ALPHA_INF*m*m/z);
|
||||||
0.03738027*pow(zl,5) +
|
|
||||||
-0.005384159*pow(zl,6) +
|
|
||||||
0.00042419*pow(zl,7);
|
|
||||||
|
|
||||||
E = llroundl(alpha*m*(m-ez)*(1/(E+beta)));
|
|
||||||
return (uint64_t) E;
|
return (uint64_t) E;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user