From aaf6db459b34dc2182fe1d84596733a9b9e97ec4 Mon Sep 17 00:00:00 2001
From: antirez <antirez@gmail.com>
Date: Sun, 30 Mar 2014 00:54:08 +0100
Subject: [PATCH] Use endian neutral hash function for HyperLogLog.

We need to be sure that you can save a dataset in a Redis instance,
reload it in a different architecture, and continue to count in the same
HyperLogLog structure.

So 32 and 64 bit, little or bit endian, must all guarantee to output the
same hash for the same element.
---
 src/hyperloglog.c | 42 ++++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 14 deletions(-)

diff --git a/src/hyperloglog.c b/src/hyperloglog.c
index faea9bd2..b94f325f 100644
--- a/src/hyperloglog.c
+++ b/src/hyperloglog.c
@@ -213,33 +213,48 @@
 
 /* ========================= HyperLogLog algorithm  ========================= */
 
-/* Our hahs function is MurmurHash2, 64 bit version. */
+/* Our hash function is MurmurHash2, 64 bit version.
+ * It was modified for Redis in order to provide the same result in
+ * big and little endian archs (endian neutral). */
 uint64_t MurmurHash64A (const void * key, int len, unsigned int seed) {
     const uint64_t m = 0xc6a4a7935bd1e995;
     const int r = 47;
     uint64_t h = seed ^ (len * m);
-    const uint64_t *data = (const uint64_t *)key;
-    const uint64_t *end = data + (len/8);
+    const uint8_t *data = (const uint8_t *)key;
+    const uint8_t *end = data + (len-(len&7));
 
     while(data != end) {
-        uint64_t k = *data++;
+        uint64_t k;
+
+#if (BYTE_ORDER == LITTLE_ENDIAN)
+        k = *((uint64_t*)data);
+#else
+        k = (uint64_t) data[0];
+        k |= (uint64_t) data[1] << 8;
+        k |= (uint64_t) data[2] << 16;
+        k |= (uint64_t) data[3] << 24;
+        k |= (uint64_t) data[4] << 32;
+        k |= (uint64_t) data[5] << 40;
+        k |= (uint64_t) data[6] << 48;
+        k |= (uint64_t) data[7] << 56;
+#endif
+
         k *= m;
         k ^= k >> r;
         k *= m;
         h ^= k;
         h *= m;
+        data += 8;
     }
 
-    const unsigned char *data2 = (const unsigned char*)data;
-
     switch(len & 7) {
-    case 7: h ^= (uint64_t)data2[6] << 48;
-    case 6: h ^= (uint64_t)data2[5] << 40;
-    case 5: h ^= (uint64_t)data2[4] << 32;
-    case 4: h ^= (uint64_t)data2[3] << 24;
-    case 3: h ^= (uint64_t)data2[2] << 16;
-    case 2: h ^= (uint64_t)data2[1] << 8;
-    case 1: h ^= (uint64_t)data2[0];
+    case 7: h ^= (uint64_t)data[6] << 48;
+    case 6: h ^= (uint64_t)data[5] << 40;
+    case 5: h ^= (uint64_t)data[4] << 32;
+    case 4: h ^= (uint64_t)data[3] << 24;
+    case 3: h ^= (uint64_t)data[2] << 16;
+    case 2: h ^= (uint64_t)data[1] << 8;
+    case 1: h ^= (uint64_t)data[0];
             h *= m;
     };
 
@@ -462,7 +477,6 @@ void hllCountCommand(redisClient *c) {
     }
 }
 
-
 /* This command performs a self-test of the HLL registers implementation.
  * Something that is not easy to test from within the outside.
  *