Count the number of bits in an integer. Otherwise known as "population count"
or
.
A number of algorithms are presented.
I take no credit for collecting these algorithms; they are copied rather
verbatim from
http://gurmeet.net/puzzles/fast-bit-counting-routines/ .
I did, however change the code somewhat, for e.g. portability reasons.
Naive bit counting
Iterates through all bits.
// Naive bit counting: Iterates through all bits.
static unsigned bitcount (TestType n)
{
unsigned count = 0;
while (n) {
count += (n & 1);
n >>= 1;
}
return count;
}
Sparse Ones
// Sparse-ones: Only iterates as many times as there are 1-bits in the integer.
static unsigned bitcount (TestType n)
{
unsigned count = 0;
while (n)
{
++count;
n &= (n - 1); // Zero the lowest-order one-bit
}
return count;
}
Dense Ones
// Dense-ones: Only iterates as many times as there are zero-bits in the integer.
const unsigned TEST_BITS = sizeof(TestType) * CHAR_BITS;
static unsigned bitcount (TestType n)
{
unsigned count = TEST_BITS;
n = ~n;
while (n)
{
--count;
n &= (n - 1); // Zero the lowest-order one-bit
}
return count;
}
Parallel Counting
// Parallel counting. A general algorithm suitable for any bitness.
const unsigned TEST_BITS = sizeof(TestType) * CHAR_BITS;
static unsigned bitcount (TestType n)
{
for(unsigned m = 0; (1u << m) < TEST_BITS; ++m)
{
TestType divisor = (TestType(1) << (1u << m)) + 1; // 3,5,17,257,...
TestType mask = (~(TestType)0) / divisor;
n = (n & mask) + ((n >> (1u << m)) & mask);
}
return n;
}
Nifty Parallel Counting
// Nifty parallel counting.
const unsigned TEST_BITS = sizeof(TestType) * CHAR_BITS;
static unsigned bitcount(TestType n)
{
TestType m1 = (~(TestType)0) / 3; // Binary 01010101...
TestType m2 = (~(TestType)0) / 5; // Binary 00110011...
TestType m4 = (~(TestType)0) / 17; // Binary 00001111...
TestType h01 = (~(TestType)0) / 255; // Hex 0101...
n = (n & m1) + ((n >> 1) & m1);
n = (n & m2) + ((n >> 2) & m2);
n = (n & m4) + ((n >> 4) & m4);
return (n * h01) >> (TEST_BITS-8);
// ^incidentally same as n % 255
}
WP3 - Nifty Revised
A variant of Nifty Parallel Counting that requires fewer instructions.
This algorithm was called popcount_3 on the Wikipedia article.
const unsigned TEST_BITS = sizeof(TestType) * CHAR_BITS;
//This uses fewer arithmetic operations than any other known
//implementation on machines with fast multiplication.
//It uses 12 arithmetic operations, one of which is a multiply.
static unsigned bitcount (TestType n)
{
TestType m1 = (~(TestType)0) / 3;
TestType m2 = (~(TestType)0) / 5;
TestType m4 = (~(TestType)0) / 17;
TestType h01 = (~(TestType)0) / 255;
n -= (n >> 1) & m1; //put count of each 2 bits into those 2 bits
n = (n & m2) + ((n >> 2) & m2); //put count of each 4 bits into those 4 bits
n = (n + (n >> 4)) & m4; //put count of each 8 bits into those 8 bits
return (n * h01) >> (TEST_BITS-8);
// returns left 8 bits of x + (x<<8) + (x<<16) + (x<<24) + ...
}
WP2 - Nifty Revised, without multipliations
This is a variant of WP3 for machines with slow multiplication.
This algorithm was called popcount_2 on the Wikipedia article.
//This uses fewer arithmetic operations than any other known
//implementation on machines with slow multiplication.
//It uses 17 arithmetic operations.
const unsigned TEST_BITS = sizeof(TestType) * CHAR_BITS;
static unsigned bitcount (TestType n)
{
TestType m1 = (~(TestType)0) / 3;
TestType m2 = (~(TestType)0) / 5;
TestType m4 = (~(TestType)0) / 17;
n -= (n >> 1) & m1; //put count of each 2 bits into those 2 bits
n = (n & m2) + ((n >> 2) & m2); //put count of each 4 bits into those 4 bits
n = (n + (n >> 4)) & m4; //put count of each 8 bits into those 8 bits
if(TEST_BITS > 8) n += n >> 8; //put count of each 16 bits into their lowest 8 bits
if(TEST_BITS > 16) n += n >> 16; //put count of each 32 bits into their lowest 8 bits
if(TEST_BITS > 32) n += n >> 32; //put count of each 64 bits into their lowest 8 bits
return n & 0x7F;
}
Precomputed Table Lookups
#define a(n) n+0, n+1, n+1, n+2 // 2 bits
#define b(n) a(n+0),a(n+1),a(n+1),a(n+2) // 4 bits
#define c(n) b(n+0),b(n+1),b(n+1),b(n+2) // 6 bits
#define d(n) c(n+0),c(n+1),c(n+1),c(n+2) // 8 bits
#define e(n) d(n+0),d(n+1),d(n+1),d(n+2) //10 bits
#define f(n) e(n+0),e(n+1),e(n+1),e(n+2) //12 bits
#define g(n) f(n+0),f(n+1),f(n+1),f(n+2) //14 bits
#define h(n) g(n+0),g(n+1),g(n+1),g(n+2) //16 bits
#define i(n) h(n+0),h(n+1),h(n+1),h(n+2) //18 bits
#define j(n) i(n+0),i(n+1),i(n+1),i(n+2) //20 bits
#define k(n) j(n+0),j(n+1),j(n+1),j(n+2) //22 bits
static const unsigned char bits_in[0x10000] alignas(64) = { h(0) };
#undef k
#undef j
#undef i
#undef h
#undef g
#undef f
#undef e
#undef d
#undef c
#undef b
#undef a
const unsigned TEST_BITS = sizeof(TestType) * CHAR_BITS;
// Table lookups, based on a precomputed table.
template<unsigned PreCompBits, unsigned a=0, bool over = a >= TEST_BITS>
struct BitCountHelper
{
// Using template recursion instead of a for-loop
// will ensure that the compiler will completely
// and totally unroll the loop.
static unsigned bitcount(TestType n)
{
unsigned index = (n >> a) & ((1 << PreCompBits) - 1);
return bits_in[ index ]
+ BitCountHelper<PreCompBits, a + PreCompBits>::bitcount(n);
}
};
template<unsigned PreCompBits, unsigned a>
struct BitCountHelper<PreCompBits,a,true>
{
static unsigned bitcount(TestType) { return 0; }
};
using Test_Precomp2 = BitCountHelper<2>;
using Test_Precomp4 = BitCountHelper<4>;
using Test_Precomp8 = BitCountHelper<8>;
using Test_Precomp12 = BitCountHelper<12>;
using Test_Precomp16 = BitCountHelper<16>;
GCC Built-ins
// This version just uses whatever the compiler authors felt was the best solution.
static unsigned bitcount (TestType n)
{
if(sizeof(n)==sizeof(int)) return __builtin_popcount(n);
if(sizeof(n)==sizeof(long)) return __builtin_popcountl(n);
return __builtin_popcountll(n);
}
I wrote this article because I was unsatisfied with the tables presented
on Gurmeet Manku's web page, especially the ones concerning the use of
precomputed tables.
The performance of precomputed tables depends a lot on whether the data
is found in the cache or not. In a typical timing-profiling-loop testbed,
all data is readily found in the cache.
I set out to test what happens if none of the data is found in the cache.
Here are the results!
Real-time scheduling and CPU affinity was used to get as reliable
timings as possible.
Tests were run on an
Intel(R) Core(TM)2 Quad CPU Q6600 @ 2.40GHz
(x86_64). The SSE4.1
POPCNT instruction was not available.
Mcps = million counts per second.
Random data has equal chance for any number of bits in it.
Dense data has 75% chance that more than half of its bits are set.
Sparse data has 75% chance that fewer than half of its bits are set.
In real random 32-bit data, values that have 16 one-bits on are most common,
followed by those that have 15 or 17 one-bits; and values that
have 0, 1, 31 or 32 one-bits are the rarest. In this testcase though,
values that have 1 bit on are exactly as common as values that have
16 bits on. This is in order to judge the counting algorithms fairly.
I used the
clcache opcode (SSE2) to manually evict the lookup tables
from the CPU's cache. I also wrote an alternative function that evicts
dummy data from the cache, for enabling comparisons between the cost of
dummy page evictions and the cost of actual data missing from the cache.
I tried using a loop that accesses memory data to evict the cache data,
but I could not get stable timings from that approach.
As can be seen, when the precomputed data can not be found in the cache,
the table lookup versions can end up having a rather bad performance,
even worse than the version that simply counts each bit.
At the time of writing, GCC-Builtin appears to use a 8-bit table lookup
method. Why is it slower than Precomp8? I'm not sure, but it may be because
the function call is not inlined. It contains the "call" + "ret" overhead.
At the time of writing, Clang-builtin appears to use the WP3 algorithm.
32-bit integers on amd64 (64-bit platform)
32-bit integers, running on 64-bit platform
-=- -=- -=- -=-
Method name Random data Dense data Sparse data
-=- -=- -=- -=-
Naive 25.11 Mcps 25.36 Mcps 25.48 Mcps
DenseOnes 42.77 Mcps 55.89 Mcps 37.25 Mcps
SparseOnes 42.69 Mcps 37.19 Mcps 53.02 Mcps
GCC-Builtin 72.39 Mcps 72.40 Mcps 72.39 Mcps
Clang-Builtin 242.89 Mcps 243.32 Mcps 242.81 Mcps
Parallel 174.27 Mcps 174.49 Mcps 174.26 Mcps
NiftyParallel 224.63 Mcps 224.14 Mcps 224.82 Mcps
WP3 244.07 Mcps 244.01 Mcps 244.27 Mcps
WP2 188.80 Mcps 188.58 Mcps 188.93 Mcps
Precomp2 @ Good cache 88.39 Mcps 88.35 Mcps 88.39 Mcps
Precomp4 @ Good cache 155.72 Mcps 155.67 Mcps 155.76 Mcps
Precomp8 @ Good cache 264.93 Mcps 265.21 Mcps 264.98 Mcps
Precomp12 @ Good cache 340.89 Mcps 340.24 Mcps 340.40 Mcps
Precomp16 @ Good cache 389.38 Mcps 386.49 Mcps 389.91 Mcps
Precomp22 @ Good cache 115.22 Mcps 142.76 Mcps 139.38 Mcps
Precomp2 @ Bad cache 10.40 Mcps 10.40 Mcps 10.40 Mcps
Precomp4 @ Bad cache 11.08 Mcps 11.08 Mcps 11.08 Mcps
Precomp8 @ Bad cache 16.70 Mcps 16.70 Mcps 16.70 Mcps
Precomp12 @ Bad cache 44.45 Mcps 44.45 Mcps 44.45 Mcps
Precomp16 @ Bad cache 108.17 Mcps 107.96 Mcps 108.24 Mcps
Precomp22 @ Bad cache 47.85 Mcps 52.18 Mcps 51.60 Mcps
64-bit integers on amd64 (64-bit platform)
64-bit integers, running on 64-bit platform
-=- -=- -=- -=-
Method name Random data Dense data Sparse data
-=- -=- -=- -=-
Naive 15.55 Mcps 15.65 Mcps 15.77 Mcps
DenseOnes 28.05 Mcps 34.76 Mcps 23.37 Mcps
SparseOnes 27.30 Mcps 23.02 Mcps 33.78 Mcps
GCC-Builtin 90.83 Mcps 90.69 Mcps 90.68 Mcps
Clang-Builtin 227.88 Mcps 228.42 Mcps 228.01 Mcps
Parallel 133.50 Mcps 133.26 Mcps 133.34 Mcps
NiftyParallel 208.75 Mcps 207.43 Mcps 207.86 Mcps
WP3 224.03 Mcps 220.78 Mcps 221.28 Mcps
WP2 155.79 Mcps 156.09 Mcps 156.09 Mcps
Precomp2 @ Good cache 48.18 Mcps 48.19 Mcps 48.12 Mcps
Precomp4 @ Good cache 88.34 Mcps 88.34 Mcps 88.23 Mcps
Precomp8 @ Good cache 159.59 Mcps 159.61 Mcps 159.68 Mcps
Precomp12 @ Good cache 193.37 Mcps 193.10 Mcps 192.65 Mcps
Precomp16 @ Good cache 225.06 Mcps 239.31 Mcps 239.25 Mcps
Precomp22 @ Good cache 59.99 Mcps 70.83 Mcps 71.74 Mcps
Precomp2 @ Bad cache 4.99 Mcps 4.99 Mcps 4.99 Mcps
Precomp4 @ Bad cache 9.42 Mcps 9.42 Mcps 9.42 Mcps
Precomp8 @ Bad cache 12.51 Mcps 12.51 Mcps 12.51 Mcps
Precomp12 @ Bad cache 20.42 Mcps 20.42 Mcps 20.41 Mcps
Precomp16 @ Bad cache 36.33 Mcps 36.68 Mcps 36.67 Mcps
Precomp22 @ Bad cache 27.94 Mcps 30.04 Mcps 30.24 Mcps
32-bit integers on i386 (32-bit platform)
32-bit integers, running on 32-bit platform
-=- -=- -=- -=-
Method name Random data Dense data Sparse data
-=- -=- -=- -=-
Naive 26.85 Mcps 27.92 Mcps 26.84 Mcps
DenseOnes 42.46 Mcps 55.77 Mcps 35.88 Mcps
SparseOnes 41.94 Mcps 36.37 Mcps 51.77 Mcps
GCC-Builtin 95.50 Mcps 95.50 Mcps 95.54 Mcps
Clang-Builtin 208.72 Mcps 208.40 Mcps 208.14 Mcps
Parallel 147.79 Mcps 149.21 Mcps 148.45 Mcps
NiftyParallel 188.55 Mcps 188.57 Mcps 188.54 Mcps
WP3 203.56 Mcps 201.62 Mcps 203.66 Mcps
WP2 152.92 Mcps 152.80 Mcps 151.06 Mcps
Precomp2 @ Good cache 78.55 Mcps 78.37 Mcps 78.30 Mcps
Precomp4 @ Good cache 121.57 Mcps 121.48 Mcps 121.08 Mcps
Precomp8 @ Good cache 237.33 Mcps 237.25 Mcps 237.54 Mcps
Precomp12 @ Good cache 263.12 Mcps 262.29 Mcps 258.14 Mcps
Precomp16 @ Good cache 310.84 Mcps 317.28 Mcps 322.47 Mcps
Precomp22 @ Good cache 86.23 Mcps 107.45 Mcps 105.26 Mcps
Precomp2 @ Bad cache 9.02 Mcps 9.02 Mcps 9.02 Mcps
Precomp4 @ Bad cache 9.72 Mcps 9.71 Mcps 9.72 Mcps
Precomp8 @ Bad cache 18.45 Mcps 18.45 Mcps 18.45 Mcps
Precomp12 @ Bad cache 41.11 Mcps 40.90 Mcps 41.05 Mcps
Precomp16 @ Bad cache 78.21 Mcps 78.65 Mcps 78.26 Mcps
Precomp22 @ Bad cache 40.13 Mcps 45.77 Mcps 45.44 Mcps
64-bit integers on i386 (32-bit platform)
64-bit integers, running on 32-bit platform
-=- -=- -=- -=-
Method name Random data Dense data Sparse data
-=- -=- -=- -=-
Naive 9.49 Mcps 9.42 Mcps 9.73 Mcps
DenseOnes 9.96 Mcps 12.72 Mcps 8.19 Mcps
SparseOnes 9.44 Mcps 7.78 Mcps 12.08 Mcps
GCC-Builtin 49.24 Mcps 48.86 Mcps 48.94 Mcps
Clang-Builtin 115.83 Mcps 115.78 Mcps 115.73 Mcps
Parallel 44.93 Mcps 44.86 Mcps 44.90 Mcps
NiftyParallel 61.31 Mcps 61.39 Mcps 61.42 Mcps
WP3 85.33 Mcps 85.09 Mcps 85.12 Mcps
WP2 53.01 Mcps 53.09 Mcps 53.29 Mcps
Precomp2 @ Good cache 36.28 Mcps 36.37 Mcps 36.30 Mcps
Precomp4 @ Good cache 63.31 Mcps 63.26 Mcps 63.28 Mcps
Precomp8 @ Good cache 103.20 Mcps 102.73 Mcps 103.34 Mcps
Precomp12 @ Good cache 112.76 Mcps 112.80 Mcps 112.61 Mcps
Precomp16 @ Good cache 136.62 Mcps 142.02 Mcps 141.66 Mcps
Precomp22 @ Good cache 49.79 Mcps 58.58 Mcps 59.49 Mcps
Precomp2 @ Bad cache 5.12 Mcps 5.12 Mcps 5.12 Mcps
Precomp4 @ Bad cache 9.77 Mcps 9.77 Mcps 9.77 Mcps
Precomp8 @ Bad cache 11.29 Mcps 11.28 Mcps 11.28 Mcps
Precomp12 @ Bad cache 12.47 Mcps 12.46 Mcps 12.47 Mcps
Precomp16 @ Bad cache 30.16 Mcps 30.42 Mcps 30.40 Mcps
Precomp22 @ Bad cache 22.00 Mcps 23.66 Mcps 23.80 Mcps
16-bit integers
Only listing those algorithms whose 16-bit specialization does work.
32-bit and 64-bit hosts have nearly identical timings here, so they are not listed separately.
Interestingly there are significant differences in the
performance of WP2 depending on the compiler that was used.
Method name Random data Dense data Sparse data
-"- -"- -"- -"-
Naive 22.59 Mcps 22.87 Mcps 23.28 Mcps
DenseOnes 46.63 Mcps 63.26 Mcps 38.58 Mcps
SparseOnes 44.83 Mcps 38.37 Mcps 58.13 Mcps
GCC-Builtin 72.41 Mcps 72.42 Mcps 72.43 Mcps
WP2 (compiled by GCC) 95.59 Mcps 95.54 Mcps 95.62 Mcps
Clang-Builtin 230.06 Mcps 230.10 Mcps 230.11 Mcps
WP2 (compiled by Clang) 223.99 Mcps 223.78 Mcps 223.97 Mcps
Precomp2 @ Good cache 153.75 Mcps 153.66 Mcps 153.83 Mcps
Precomp4 @ Good cache 265.27 Mcps 265.14 Mcps 265.11 Mcps
Precomp8 @ Good cache 341.36 Mcps 341.12 Mcps 341.42 Mcps
Precomp12 @ Good cache 340.83 Mcps 341.01 Mcps 341.15 Mcps
Precomp16 @ Good cache 341.27 Mcps 341.43 Mcps 341.30 Mcps
Precomp2 @ Bad cache 10.92 Mcps 10.91 Mcps 10.91 Mcps
Precomp4 @ Bad cache 30.36 Mcps 30.36 Mcps 30.36 Mcps
Precomp8 @ Bad cache 51.23 Mcps 51.24 Mcps 51.24 Mcps
Precomp12 @ Bad cache 13.05 Mcps 13.05 Mcps 13.05 Mcps
Precomp16 @ Bad cache 143.50 Mcps 143.54 Mcps 143.51 Mcps
8-bit integers
Only listing those algorithms whose 8-bit specialization does work.
32-bit and 64-bit hosts have nearly identical timings here, so they are not listed separately.
Method name Random data Dense data Sparse data
-"- -"- -"- -"-
Naive 46.91 Mcps 52.60 Mcps 45.77 Mcps
DenseOnes 80.53 Mcps 116.69 Mcps 78.28 Mcps
SparseOnes 76.99 Mcps 76.63 Mcps 101.05 Mcps
GCC-Builtin 70.32 Mcps 70.33 Mcps 70.28 Mcps
WP2 (compiled by GCC) 290.80 Mcps 290.73 Mcps 290.72 Mcps
Clang-Builtin 229.96 Mcps 230.08 Mcps 229.88 Mcps
WP2 (compiled by Clang) 269.05 Mcps 269.32 Mcps 269.29 Mcps
Precomp2 @ Good cache 277.11 Mcps 277.19 Mcps 277.16 Mcps
Precomp4 @ Good cache 397.92 Mcps 398.04 Mcps 397.88 Mcps
Precomp8 @ Good cache 398.70 Mcps 398.33 Mcps 398.54 Mcps
Precomp2 @ Bad cache 31.46 Mcps 31.46 Mcps 31.46 Mcps
Precomp4 @ Bad cache 56.45 Mcps 56.45 Mcps 56.45 Mcps
Precomp8 @ Bad cache 65.06 Mcps 65.07 Mcps 65.06 Mcps