[PATCH] optimize hweight64 for x86_64
Based on patch from David Rientjes <rientjes@google.com>, but
changed by AK.
Optimizes the 64-bit hamming weight for x86_64 processors assuming they
have fast multiplication. Uses five fewer bitops than the generic
hweight64. Benchmark on one EMT64 showed ~25% speedup with 2^24
consecutive calls.
Define a new ARCH_HAS_FAST_MULTIPLIER that can be set by other
architectures that can also multiply fast.
Signed-off-by: Andi Kleen <ak@suse.de>
diff --git a/lib/hweight.c b/lib/hweight.c
index 4382576..360556a 100644
--- a/lib/hweight.c
+++ b/lib/hweight.c
@@ -1,5 +1,6 @@
#include <linux/module.h>
#include <asm/types.h>
+#include <asm/bitops.h>
/**
* hweightN - returns the hamming weight of a N-bit word
@@ -40,14 +41,19 @@
#if BITS_PER_LONG == 32
return hweight32((unsigned int)(w >> 32)) + hweight32((unsigned int)w);
#elif BITS_PER_LONG == 64
+#ifdef ARCH_HAS_FAST_MULTIPLIER
+ w -= (w >> 1) & 0x5555555555555555ul;
+ w = (w & 0x3333333333333333ul) + ((w >> 2) & 0x3333333333333333ul);
+ w = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0ful;
+ return (w * 0x0101010101010101ul) >> 56;
+#else
__u64 res = w - ((w >> 1) & 0x5555555555555555ul);
res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul);
res = (res + (res >> 4)) & 0x0F0F0F0F0F0F0F0Ful;
res = res + (res >> 8);
res = res + (res >> 16);
return (res + (res >> 32)) & 0x00000000000000FFul;
-#else
-#error BITS_PER_LONG not defined
+#endif
#endif
}
EXPORT_SYMBOL(hweight64);