diff --git a/src/lzfP.h b/src/lzfP.h
index c9eae3f6..c6d2e096 100644
--- a/src/lzfP.h
+++ b/src/lzfP.h
@@ -49,7 +49,7 @@
  * the difference between 15 and 14 is very small
  * for small blocks (and 14 is usually a bit faster).
  * For a low-memory/faster configuration, use HLOG == 13;
- * For best compression, use 15 or 16 (or more, up to 23).
+ * For best compression, use 15 or 16 (or more, up to 22).
  */
 #ifndef HLOG
 # define HLOG 16
@@ -94,7 +94,7 @@
 /*
  * Avoid assigning values to errno variable? for some embedding purposes
  * (linux kernel for example), this is necessary. NOTE: this breaks
- * the documentation in lzf.h.
+ * the documentation in lzf.h. Avoiding errno has no speed impact.
  */
 #ifndef AVOID_ERRNO
 # define AVOID_ERRNO 0
@@ -121,16 +121,52 @@
 # define CHECK_INPUT 1
 #endif
 
+/*
+ * Whether to store pointers or offsets inside the hash table. On
+ * 64 bit architectures, pointers take up twice as much space,
+ * and might also be slower. Default is to autodetect.
+ */
+/*#define LZF_USE_OFFSETS autodetect */
+
 /*****************************************************************************/
 /* nothing should be changed below */
 
+#ifdef __cplusplus
+# include <cstring>
+# include <climits>
+using namespace std;
+#else
+# include <string.h>
+# include <limits.h>
+#endif
+
+#ifndef LZF_USE_OFFSETS
+# if defined (WIN32)
+#  define LZF_USE_OFFSETS defined(_M_X64)
+# else
+#  if __cplusplus > 199711L
+#   include <cstdint>
+#  else
+#   include <stdint.h>
+#  endif
+#  define LZF_USE_OFFSETS (UINTPTR_MAX > 0xffffffffU)
+# endif
+#endif
+
 typedef unsigned char u8;
 
-typedef const u8 *LZF_STATE[1 << (HLOG)];
+#if LZF_USE_OFFSETS /* hash slots hold offsets instead of pointers */
+# define LZF_HSLOT_BIAS ((const u8 *)in_data) /* base added to / subtracted from slot values; expands to the name in_data at the point of use */
+  typedef unsigned int LZF_HSLOT; /* slot value: unsigned offset relative to LZF_HSLOT_BIAS */
+#else /* hash slots hold pointers directly */
+# define LZF_HSLOT_BIAS 0 /* no bias: slot values are used as-is */
+  typedef const u8 *LZF_HSLOT; /* slot value: pointer to const octets */
+#endif
+/* the hash table type: an array of 1 << HLOG slots */
+typedef LZF_HSLOT LZF_STATE[1 << (HLOG)];
 
 #if !STRICT_ALIGN
 /* for unaligned accesses we need a 16 bit datatype. */
-# include <limits.h>
 # if USHRT_MAX == 65535
     typedef unsigned short u16;
 # elif UINT_MAX == 65535
@@ -142,17 +178,7 @@ typedef const u8 *LZF_STATE[1 << (HLOG)];
 #endif
 
 #if ULTRA_FAST
-# if defined(VERY_FAST)
-#  undef VERY_FAST
-# endif
-#endif
-
-#if INIT_HTAB
-# ifdef __cplusplus
-#  include <cstring>
-# else
-#  include <string.h>
-# endif
+# undef VERY_FAST /* ULTRA_FAST overrides VERY_FAST; #undef of an undefined name is a no-op, so no defined() guard is needed */
 #endif
 
 #endif
diff --git a/src/lzf_c.c b/src/lzf_c.c
index 9e031ad0..e9c69a0b 100644
--- a/src/lzf_c.c
+++ b/src/lzf_c.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2008 Marc Alexander Lehmann <schmorp@schmorp.de>
+ * Copyright (c) 2000-2010 Marc Alexander Lehmann <schmorp@schmorp.de>
  *
  * Redistribution and use in source and binary forms, with or without modifica-
  * tion, are permitted provided that the following conditions are met:
@@ -40,8 +40,8 @@
 
 /*
  * don't play with this unless you benchmark!
- * decompression is not dependent on the hash function
- * the hashing function might seem strange, just believe me
+ * the data format is not dependent on the hash function.
+ * the hash function might seem strange, just believe me,
  * it works ;)
  */
 #ifndef FRST
@@ -89,9 +89,9 @@
 /*
  * compressed format
  *
- * 000LLLLL <L+1>    ; literal
- * LLLooooo oooooooo ; backref L
- * 111ooooo LLLLLLLL oooooooo ; backref L+7
+ * 000LLLLL <L+1>    ; literal, L+1=1..32 octets
+ * LLLooooo oooooooo ; backref L+2=3..8 octets (L=1..6), o+1=1..8192 offset
+ * 111ooooo LLLLLLLL oooooooo ; backref L+9=9..264 octets, o+1=1..8192 offset
  *
  */
 
@@ -106,7 +106,6 @@ lzf_compress (const void *const in_data, unsigned int in_len,
 #if !LZF_STATE_ARG
   LZF_STATE htab;
 #endif
-  const u8 **hslot;
   const u8 *ip = (const u8 *)in_data;
         u8 *op = (u8 *)out_data;
   const u8 *in_end  = ip + in_len;
@@ -133,10 +132,6 @@ lzf_compress (const void *const in_data, unsigned int in_len,
 
 #if INIT_HTAB
   memset (htab, 0, sizeof (htab));
-# if 0
-  for (hslot = htab; hslot < htab + HSIZE; hslot++)
-    *hslot++ = ip;
-# endif
 #endif
 
   lit = 0; op++; /* start run */
@@ -144,24 +139,23 @@ lzf_compress (const void *const in_data, unsigned int in_len,
   hval = FRST (ip);
   while (ip < in_end - 2)
     {
+      LZF_HSLOT *hslot;
+
       hval = NEXT (hval, ip);
       hslot = htab + IDX (hval);
-      ref = *hslot; *hslot = ip;
+      ref = *hslot + LZF_HSLOT_BIAS; *hslot = ip - LZF_HSLOT_BIAS;
 
       if (1
 #if INIT_HTAB
           && ref < ip /* the next test will actually take care of this, but this is faster */
 #endif
           && (off = ip - ref - 1) < MAX_OFF
-          && ip + 4 < in_end
           && ref > (u8 *)in_data
-#if STRICT_ALIGN
-          && ref[0] == ip[0]
-          && ref[1] == ip[1]
           && ref[2] == ip[2]
+#if STRICT_ALIGN
+          && ((ref[1] << 8) | ref[0]) == ((ip[1] << 8) | ip[0])
 #else
           && *(u16 *)ref == *(u16 *)ip
-          && ref[2] == ip[2]
 #endif
         )
         {
@@ -170,12 +164,13 @@ lzf_compress (const void *const in_data, unsigned int in_len,
           unsigned int maxlen = in_end - ip - len;
           maxlen = maxlen > MAX_REF ? MAX_REF : maxlen;
 
+          if (expect_false (op + 3 + 1 >= out_end)) /* first a faster conservative test */
+            if (op - !lit + 3 + 1 >= out_end) /* second the exact but rare test */
+              return 0;
+
           op [- lit - 1] = lit - 1; /* stop run */
           op -= !lit; /* undo run if length is zero */
 
-          if (expect_false (op + 3 + 1 >= out_end))
-            return 0;
-
           for (;;)
             {
               if (expect_true (maxlen > 16))
@@ -222,6 +217,7 @@ lzf_compress (const void *const in_data, unsigned int in_len,
             }
 
           *op++ = off;
+
           lit = 0; op++; /* start run */
 
           ip += len + 1;
@@ -237,12 +233,12 @@ lzf_compress (const void *const in_data, unsigned int in_len,
           hval = FRST (ip);
 
           hval = NEXT (hval, ip);
-          htab[IDX (hval)] = ip;
+          htab[IDX (hval)] = ip - LZF_HSLOT_BIAS;
           ip++;
 
 # if VERY_FAST && !ULTRA_FAST
           hval = NEXT (hval, ip);
-          htab[IDX (hval)] = ip;
+          htab[IDX (hval)] = ip - LZF_HSLOT_BIAS;
           ip++;
 # endif
 #else
@@ -251,7 +247,7 @@ lzf_compress (const void *const in_data, unsigned int in_len,
           do
             {
               hval = NEXT (hval, ip);
-              htab[IDX (hval)] = ip;
+              htab[IDX (hval)] = ip - LZF_HSLOT_BIAS;
               ip++;
             }
           while (len--);
diff --git a/src/lzf_d.c b/src/lzf_d.c
index 6c723f5e..c32be8e8 100644
--- a/src/lzf_d.c
+++ b/src/lzf_d.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2000-2007 Marc Alexander Lehmann <schmorp@schmorp.de>
+ * Copyright (c) 2000-2010 Marc Alexander Lehmann <schmorp@schmorp.de>
  *
  * Redistribution and use in source and binary forms, with or without modifica-
  * tion, are permitted provided that the following conditions are met:
@@ -43,14 +43,14 @@
 # define SET_ERRNO(n) errno = (n)
 #endif
 
-/*
+#if USE_REP_MOVSB /* opt-in (off when undefined or 0): small win on amd, big loss on intel */
 #if (__i386 || __amd64) && __GNUC__ >= 3
 # define lzf_movsb(dst, src, len)                \
    asm ("rep movsb"                              \
         : "=D" (dst), "=S" (src), "=c" (len)     \
         :  "0" (dst),  "1" (src),  "2" (len));
 #endif
-*/
+#endif /* USE_REP_MOVSB */
 
 unsigned int
 lzf_decompress (const void *const in_data,  unsigned int in_len,
@@ -86,9 +86,17 @@ lzf_decompress (const void *const in_data,  unsigned int in_len,
 #ifdef lzf_movsb
           lzf_movsb (op, ip, ctrl);
 #else
-          do
-            *op++ = *ip++;
-          while (--ctrl);
+          switch (ctrl)
+            {
+              case 32: *op++ = *ip++; case 31: *op++ = *ip++; case 30: *op++ = *ip++; case 29: *op++ = *ip++;
+              case 28: *op++ = *ip++; case 27: *op++ = *ip++; case 26: *op++ = *ip++; case 25: *op++ = *ip++;
+              case 24: *op++ = *ip++; case 23: *op++ = *ip++; case 22: *op++ = *ip++; case 21: *op++ = *ip++;
+              case 20: *op++ = *ip++; case 19: *op++ = *ip++; case 18: *op++ = *ip++; case 17: *op++ = *ip++;
+              case 16: *op++ = *ip++; case 15: *op++ = *ip++; case 14: *op++ = *ip++; case 13: *op++ = *ip++;
+              case 12: *op++ = *ip++; case 11: *op++ = *ip++; case 10: *op++ = *ip++; case  9: *op++ = *ip++;
+              case  8: *op++ = *ip++; case  7: *op++ = *ip++; case  6: *op++ = *ip++; case  5: *op++ = *ip++;
+              case  4: *op++ = *ip++; case  3: *op++ = *ip++; case  2: *op++ = *ip++; case  1: *op++ = *ip++;
+            }
 #endif
         }
       else /* back reference */
@@ -134,12 +142,39 @@ lzf_decompress (const void *const in_data,  unsigned int in_len,
           len += 2;
           lzf_movsb (op, ref, len);
 #else
-          *op++ = *ref++;
-          *op++ = *ref++;
+          switch (len)
+            {
+              default:
+                len += 2;
 
-          do
-            *op++ = *ref++;
-          while (--len);
+                if (op >= ref + len)
+                  {
+                    /* disjoint areas */
+                    memcpy (op, ref, len);
+                    op += len;
+                  }
+                else
+                  {
+                    /* overlapping, use octet by octet copying */
+                    do
+                      *op++ = *ref++;
+                    while (--len);
+                  }
+
+                break;
+
+              case 9: *op++ = *ref++;
+              case 8: *op++ = *ref++;
+              case 7: *op++ = *ref++;
+              case 6: *op++ = *ref++;
+              case 5: *op++ = *ref++;
+              case 4: *op++ = *ref++;
+              case 3: *op++ = *ref++;
+              case 2: *op++ = *ref++;
+              case 1: *op++ = *ref++;
+              case 0: *op++ = *ref++; /* two octets more */
+                      *op++ = *ref++;
+            }
 #endif
         }
     }