Bit-counting algorithms

Count the number of bits in an integer. Otherwise known as "population count" or hamming weight.

Table of contents [expand all] [collapse all]

Algorithms

A number of algorithms are presented.

I take no credit for collecting these algorithms; they are copied rather verbatim from http://gurmeet.net/puzzles/fast-bit-counting-routines/ .

I did, however change the code somewhat, for e.g. portability reasons.

Naive bit counting

Iterates through all bits.

// Naive bit counting: Iterates through all bits.
static unsigned bitcount (TestType n)
{
    unsigned count = 0;
    while (n) {
        count += (n & 1);
        n >>= 1;
    }
    return count;
}

Sparse Ones

// Sparse-ones: Only iterates as many times as there are 1-bits in the integer.
static unsigned bitcount (TestType n)
{
    unsigned count = 0;
    while (n)
    {
        ++count;
        n &= (n - 1); // Zero the lowest-order one-bit
    }
    return count;
}

Dense Ones

// Dense-ones: Only iterates as many times as there are zero-bits in the integer.
const unsigned TEST_BITS = sizeof(TestType) * CHAR_BITS;

static unsigned bitcount (TestType n)
{
    unsigned count = TEST_BITS;
    n = ~n;
    while (n)
    {
        --count;
        n &= (n - 1); // Zero the lowest-order one-bit
    }
    return count;
}

Parallel Counting

// Parallel counting. A general algorithm suitable for any bitness.
const unsigned TEST_BITS = sizeof(TestType) * CHAR_BITS;

static unsigned bitcount (TestType n)
{
    for(unsigned m = 0; (1u << m) < TEST_BITS; ++m)
    {
        TestType divisor = (TestType(1) << (1u << m)) + 1; // 3,5,17,257,...
        TestType mask = (~(TestType)0) / divisor;

        n = (n & mask) + ((n >> (1u << m)) & mask);
    }
    return n;
}

Nifty Parallel Counting

// Nifty parallel counting. 
const unsigned TEST_BITS = sizeof(TestType) * CHAR_BITS;

static unsigned bitcount(TestType n)
{
    TestType m1 = (~(TestType)0) / 3;   // Binary 01010101...
    TestType m2 = (~(TestType)0) / 5;   // Binary 00110011...
    TestType m4 = (~(TestType)0) / 17;  // Binary 00001111...
    TestType h01 = (~(TestType)0) / 255; // Hex 0101...

    n = (n & m1) + ((n >> 1) & m1);
    n = (n & m2) + ((n >> 2) & m2);
    n = (n & m4) + ((n >> 4) & m4);

    return (n * h01) >> (TEST_BITS-8);
    // ^incidentally same as n % 255
}

WP3 - Nifty Revised

A variant of Nifty Parallel Counting that requires fewer instructions.

This algorithm was called popcount_3 on the Wikipedia article.

const unsigned TEST_BITS = sizeof(TestType) * CHAR_BITS;

//This uses fewer arithmetic operations than any other known  
//implementation on machines with fast multiplication.
//It uses 12 arithmetic operations, one of which is a multiply.
static unsigned bitcount (TestType n)
{
    TestType m1 = (~(TestType)0) / 3;
    TestType m2 = (~(TestType)0) / 5;
    TestType m4 = (~(TestType)0) / 17;
    TestType h01 = (~(TestType)0) / 255;

    n -= (n >> 1) & m1;             //put count of each 2 bits into those 2 bits
    n = (n & m2) + ((n >> 2) & m2); //put count of each 4 bits into those 4 bits 
    n = (n + (n >> 4)) & m4;        //put count of each 8 bits into those 8 bits 

    return (n * h01) >> (TEST_BITS-8);
    // returns left 8 bits of x + (x<<8) + (x<<16) + (x<<24) + ... 
}

WP2 - Nifty Revised, without multipliations

This is a variant of WP3 for machines with slow multiplication.

This algorithm was called popcount_2 on the Wikipedia article.

//This uses fewer arithmetic operations than any other known  
//implementation on machines with slow multiplication.
//It uses 17 arithmetic operations.
const unsigned TEST_BITS = sizeof(TestType) * CHAR_BITS;

static unsigned bitcount (TestType n)
{
    TestType m1 = (~(TestType)0) / 3;
    TestType m2 = (~(TestType)0) / 5;
    TestType m4 = (~(TestType)0) / 17;

    n -= (n >> 1) & m1;              //put count of each 2 bits into those 2 bits
    n = (n & m2) + ((n >> 2) & m2);  //put count of each 4 bits into those 4 bits 
    n = (n + (n >> 4)) & m4;         //put count of each 8 bits into those 8 bits 
    if(TEST_BITS > 8)  n += n >>  8; //put count of each 16 bits into their lowest 8 bits
    if(TEST_BITS > 16) n += n >> 16; //put count of each 32 bits into their lowest 8 bits
    if(TEST_BITS > 32) n += n >> 32; //put count of each 64 bits into their lowest 8 bits
    return n & 0x7F;
}

Precomputed Table Lookups

#define a(n) n+0, n+1, n+1, n+2          // 2 bits
#define b(n) a(n+0),a(n+1),a(n+1),a(n+2) // 4 bits
#define c(n) b(n+0),b(n+1),b(n+1),b(n+2) // 6 bits
#define d(n) c(n+0),c(n+1),c(n+1),c(n+2) // 8 bits
#define e(n) d(n+0),d(n+1),d(n+1),d(n+2) //10 bits
#define f(n) e(n+0),e(n+1),e(n+1),e(n+2) //12 bits
#define g(n) f(n+0),f(n+1),f(n+1),f(n+2) //14 bits
#define h(n) g(n+0),g(n+1),g(n+1),g(n+2) //16 bits
#define i(n) h(n+0),h(n+1),h(n+1),h(n+2) //18 bits
#define j(n) i(n+0),i(n+1),i(n+1),i(n+2) //20 bits
#define k(n) j(n+0),j(n+1),j(n+1),j(n+2) //22 bits

static const unsigned char bits_in[0x10000] alignas(64) = { h(0) };

#undef k
#undef j
#undef i
#undef h
#undef g
#undef f
#undef e
#undef d
#undef c
#undef b
#undef a

const unsigned TEST_BITS = sizeof(TestType) * CHAR_BITS;

// Table lookups, based on a precomputed table.
template<unsigned PreCompBits, unsigned a=0, bool over = a >= TEST_BITS>
struct BitCountHelper
{
    // Using template recursion instead of a for-loop
    // will ensure that the compiler will completely
    // and totally unroll the loop.
    static unsigned bitcount(TestType n)
    {   
        unsigned index = (n >> a) & ((1 << PreCompBits) - 1);
        return bits_in[ index ]
             + BitCountHelper<PreCompBits, a + PreCompBits>::bitcount(n);
    }
};
template<unsigned PreCompBits, unsigned a>
struct BitCountHelper<PreCompBits,a,true>
{
    static unsigned bitcount(TestType) { return 0; }
};

using Test_Precomp2  = BitCountHelper<2>;
using Test_Precomp4  = BitCountHelper<4>;
using Test_Precomp8  = BitCountHelper<8>;
using Test_Precomp12 = BitCountHelper<12>;
using Test_Precomp16 = BitCountHelper<16>;

GCC Built-ins

// This version just uses whatever the compiler authors felt was the best solution.
static unsigned bitcount (TestType n)
{
    if(sizeof(n)==sizeof(int)) return __builtin_popcount(n);
    if(sizeof(n)==sizeof(long)) return __builtin_popcountl(n);
    return __builtin_popcountll(n);
}

The tests

I wrote this article because I was unsatisfied with the tables presented on Gurmeet Manku's web page, especially the ones concerning the use of precomputed tables.

The performance of precomputed tables depends a lot on whether the data is found in the cache or not. In a typical timing-profiling-loop testbed, all data is readily found in the cache.

I set out to test what happens if none of the data is found in the cache.

Here are the results!

Real-time scheduling and CPU affinity was used to get as reliable timings as possible.

Tests were run on an Intel(R) Core(TM)2 Quad CPU Q6600 @ 2.40GHz (x86_64). The SSE4.1 POPCNT instruction was not available.

   Mcps = million counts per second.
   Random data has equal chance for any number of bits in it.
   Dense data has 75% chance that more than half of its bits are set.
   Sparse data has 75% chance that fewer than half of its bits are set.
In real random 32-bit data, values that have 16 one-bits on are most common, followed by those that have 15 or 17 one-bits; and values that have 0, 1, 31 or 32 one-bits are the rarest. In this testcase though, values that have 1 bit on are exactly as common as values that have 16 bits on. This is in order to judge the counting algorithms fairly.

I used the clcache opcode (SSE2) to manually evict the lookup tables from the CPU's cache. I also wrote an alternative function that evicts dummy data from the cache, for enabling comparisons between the cost of dummy page evictions and the cost of actual data missing from the cache.

I tried using a loop that accesses memory data to evict the cache data, but I could not get stable timings from that approach.

As can be seen, when the precomputed data can not be found in the cache, the table lookup versions can end up having a rather bad performance, even worse than the version that simply counts each bit.

At the time of writing, GCC-Builtin appears to use a 8-bit table lookup method. Why is it slower than Precomp8? I'm not sure, but it may be because the function call is not inlined. It contains the "call" + "ret" overhead.

At the time of writing, Clang-builtin appears to use the WP3 algorithm.

32-bit integers on amd64 (64-bit platform)

   32-bit integers, running on 64-bit platform
   -=-                               -=-            -=-            -=-
   Method name               Random data     Dense data    Sparse data
   -=-                               -=-            -=-            -=-
   Naive                      25.11 Mcps     25.36 Mcps     25.48 Mcps
   DenseOnes                  42.77 Mcps     55.89 Mcps     37.25 Mcps
   SparseOnes                 42.69 Mcps     37.19 Mcps     53.02 Mcps
   GCC-Builtin                72.39 Mcps     72.40 Mcps     72.39 Mcps
   Clang-Builtin             242.89 Mcps    243.32 Mcps    242.81 Mcps
   Parallel                  174.27 Mcps    174.49 Mcps    174.26 Mcps
   NiftyParallel             224.63 Mcps    224.14 Mcps    224.82 Mcps
   WP3                       244.07 Mcps    244.01 Mcps    244.27 Mcps
   WP2                       188.80 Mcps    188.58 Mcps    188.93 Mcps
   Precomp2 @ Good cache      88.39 Mcps     88.35 Mcps     88.39 Mcps
   Precomp4 @ Good cache     155.72 Mcps    155.67 Mcps    155.76 Mcps
   Precomp8 @ Good cache     264.93 Mcps    265.21 Mcps    264.98 Mcps
   Precomp12 @ Good cache    340.89 Mcps    340.24 Mcps    340.40 Mcps
   Precomp16 @ Good cache    389.38 Mcps    386.49 Mcps    389.91 Mcps
   Precomp22 @ Good cache    115.22 Mcps    142.76 Mcps    139.38 Mcps
   Precomp2 @ Bad cache       10.40 Mcps     10.40 Mcps     10.40 Mcps
   Precomp4 @ Bad cache       11.08 Mcps     11.08 Mcps     11.08 Mcps
   Precomp8 @ Bad cache       16.70 Mcps     16.70 Mcps     16.70 Mcps
   Precomp12 @ Bad cache      44.45 Mcps     44.45 Mcps     44.45 Mcps
   Precomp16 @ Bad cache     108.17 Mcps    107.96 Mcps    108.24 Mcps
   Precomp22 @ Bad cache      47.85 Mcps     52.18 Mcps     51.60 Mcps

64-bit integers on amd64 (64-bit platform)

   64-bit integers, running on 64-bit platform
   -=-                               -=-            -=-            -=-
   Method name               Random data     Dense data    Sparse data
   -=-                               -=-            -=-            -=-
   Naive                      15.55 Mcps     15.65 Mcps     15.77 Mcps
   DenseOnes                  28.05 Mcps     34.76 Mcps     23.37 Mcps
   SparseOnes                 27.30 Mcps     23.02 Mcps     33.78 Mcps
   GCC-Builtin                90.83 Mcps     90.69 Mcps     90.68 Mcps
   Clang-Builtin             227.88 Mcps    228.42 Mcps    228.01 Mcps
   Parallel                  133.50 Mcps    133.26 Mcps    133.34 Mcps
   NiftyParallel             208.75 Mcps    207.43 Mcps    207.86 Mcps
   WP3                       224.03 Mcps    220.78 Mcps    221.28 Mcps
   WP2                       155.79 Mcps    156.09 Mcps    156.09 Mcps
   Precomp2 @ Good cache      48.18 Mcps     48.19 Mcps     48.12 Mcps
   Precomp4 @ Good cache      88.34 Mcps     88.34 Mcps     88.23 Mcps
   Precomp8 @ Good cache     159.59 Mcps    159.61 Mcps    159.68 Mcps
   Precomp12 @ Good cache    193.37 Mcps    193.10 Mcps    192.65 Mcps
   Precomp16 @ Good cache    225.06 Mcps    239.31 Mcps    239.25 Mcps
   Precomp22 @ Good cache     59.99 Mcps     70.83 Mcps     71.74 Mcps
   Precomp2 @ Bad cache        4.99 Mcps      4.99 Mcps      4.99 Mcps
   Precomp4 @ Bad cache        9.42 Mcps      9.42 Mcps      9.42 Mcps
   Precomp8 @ Bad cache       12.51 Mcps     12.51 Mcps     12.51 Mcps
   Precomp12 @ Bad cache      20.42 Mcps     20.42 Mcps     20.41 Mcps
   Precomp16 @ Bad cache      36.33 Mcps     36.68 Mcps     36.67 Mcps
   Precomp22 @ Bad cache      27.94 Mcps     30.04 Mcps     30.24 Mcps

32-bit integers on i386 (32-bit platform)

   32-bit integers, running on 32-bit platform
   -=-                               -=-            -=-            -=-
   Method name               Random data     Dense data    Sparse data
   -=-                               -=-            -=-            -=-
   Naive                      26.85 Mcps     27.92 Mcps     26.84 Mcps
   DenseOnes                  42.46 Mcps     55.77 Mcps     35.88 Mcps
   SparseOnes                 41.94 Mcps     36.37 Mcps     51.77 Mcps
   GCC-Builtin                95.50 Mcps     95.50 Mcps     95.54 Mcps
   Clang-Builtin             208.72 Mcps    208.40 Mcps    208.14 Mcps
   Parallel                  147.79 Mcps    149.21 Mcps    148.45 Mcps
   NiftyParallel             188.55 Mcps    188.57 Mcps    188.54 Mcps
   WP3                       203.56 Mcps    201.62 Mcps    203.66 Mcps
   WP2                       152.92 Mcps    152.80 Mcps    151.06 Mcps
   Precomp2 @ Good cache      78.55 Mcps     78.37 Mcps     78.30 Mcps
   Precomp4 @ Good cache     121.57 Mcps    121.48 Mcps    121.08 Mcps
   Precomp8 @ Good cache     237.33 Mcps    237.25 Mcps    237.54 Mcps
   Precomp12 @ Good cache    263.12 Mcps    262.29 Mcps    258.14 Mcps
   Precomp16 @ Good cache    310.84 Mcps    317.28 Mcps    322.47 Mcps
   Precomp22 @ Good cache     86.23 Mcps    107.45 Mcps    105.26 Mcps
   Precomp2 @ Bad cache        9.02 Mcps      9.02 Mcps      9.02 Mcps
   Precomp4 @ Bad cache        9.72 Mcps      9.71 Mcps      9.72 Mcps
   Precomp8 @ Bad cache       18.45 Mcps     18.45 Mcps     18.45 Mcps
   Precomp12 @ Bad cache      41.11 Mcps     40.90 Mcps     41.05 Mcps
   Precomp16 @ Bad cache      78.21 Mcps     78.65 Mcps     78.26 Mcps
   Precomp22 @ Bad cache      40.13 Mcps     45.77 Mcps     45.44 Mcps

64-bit integers on i386 (32-bit platform)

   64-bit integers, running on 32-bit platform
   -=-                               -=-            -=-            -=-
   Method name               Random data     Dense data    Sparse data
   -=-                               -=-            -=-            -=-
   Naive                       9.49 Mcps      9.42 Mcps      9.73 Mcps
   DenseOnes                   9.96 Mcps     12.72 Mcps      8.19 Mcps
   SparseOnes                  9.44 Mcps      7.78 Mcps     12.08 Mcps
   GCC-Builtin                49.24 Mcps     48.86 Mcps     48.94 Mcps
   Clang-Builtin             115.83 Mcps    115.78 Mcps    115.73 Mcps
   Parallel                   44.93 Mcps     44.86 Mcps     44.90 Mcps
   NiftyParallel              61.31 Mcps     61.39 Mcps     61.42 Mcps
   WP3                        85.33 Mcps     85.09 Mcps     85.12 Mcps
   WP2                        53.01 Mcps     53.09 Mcps     53.29 Mcps
   Precomp2 @ Good cache      36.28 Mcps     36.37 Mcps     36.30 Mcps
   Precomp4 @ Good cache      63.31 Mcps     63.26 Mcps     63.28 Mcps
   Precomp8 @ Good cache     103.20 Mcps    102.73 Mcps    103.34 Mcps
   Precomp12 @ Good cache    112.76 Mcps    112.80 Mcps    112.61 Mcps
   Precomp16 @ Good cache    136.62 Mcps    142.02 Mcps    141.66 Mcps
   Precomp22 @ Good cache     49.79 Mcps     58.58 Mcps     59.49 Mcps
   Precomp2 @ Bad cache        5.12 Mcps      5.12 Mcps      5.12 Mcps
   Precomp4 @ Bad cache        9.77 Mcps      9.77 Mcps      9.77 Mcps
   Precomp8 @ Bad cache       11.29 Mcps     11.28 Mcps     11.28 Mcps
   Precomp12 @ Bad cache      12.47 Mcps     12.46 Mcps     12.47 Mcps
   Precomp16 @ Bad cache      30.16 Mcps     30.42 Mcps     30.40 Mcps
   Precomp22 @ Bad cache      22.00 Mcps     23.66 Mcps     23.80 Mcps

16-bit integers

Only listing those algorithms whose 16-bit specialization does work. 32-bit and 64-bit hosts have nearly identical timings here, so they are not listed separately.

Interestingly there are significant differences in the performance of WP2 depending on the compiler that was used.

   Method name               Random data     Dense data    Sparse data
   -"-                               -"-            -"-            -"-
   Naive                      22.59 Mcps     22.87 Mcps     23.28 Mcps
   DenseOnes                  46.63 Mcps     63.26 Mcps     38.58 Mcps
   SparseOnes                 44.83 Mcps     38.37 Mcps     58.13 Mcps
   GCC-Builtin                72.41 Mcps     72.42 Mcps     72.43 Mcps
   WP2 (compiled by GCC)      95.59 Mcps     95.54 Mcps     95.62 Mcps
   Clang-Builtin             230.06 Mcps    230.10 Mcps    230.11 Mcps
   WP2 (compiled by Clang)   223.99 Mcps    223.78 Mcps    223.97 Mcps
   Precomp2 @ Good cache     153.75 Mcps    153.66 Mcps    153.83 Mcps
   Precomp4 @ Good cache     265.27 Mcps    265.14 Mcps    265.11 Mcps
   Precomp8 @ Good cache     341.36 Mcps    341.12 Mcps    341.42 Mcps
   Precomp12 @ Good cache    340.83 Mcps    341.01 Mcps    341.15 Mcps
   Precomp16 @ Good cache    341.27 Mcps    341.43 Mcps    341.30 Mcps
   Precomp2 @ Bad cache       10.92 Mcps     10.91 Mcps     10.91 Mcps
   Precomp4 @ Bad cache       30.36 Mcps     30.36 Mcps     30.36 Mcps
   Precomp8 @ Bad cache       51.23 Mcps     51.24 Mcps     51.24 Mcps
   Precomp12 @ Bad cache      13.05 Mcps     13.05 Mcps     13.05 Mcps
   Precomp16 @ Bad cache     143.50 Mcps    143.54 Mcps    143.51 Mcps

8-bit integers

Only listing those algorithms whose 8-bit specialization does work. 32-bit and 64-bit hosts have nearly identical timings here, so they are not listed separately.

   Method name               Random data     Dense data    Sparse data
   -"-                               -"-            -"-            -"-
   Naive                      46.91 Mcps     52.60 Mcps     45.77 Mcps
   DenseOnes                  80.53 Mcps    116.69 Mcps     78.28 Mcps
   SparseOnes                 76.99 Mcps     76.63 Mcps    101.05 Mcps
   GCC-Builtin                70.32 Mcps     70.33 Mcps     70.28 Mcps
   WP2 (compiled by GCC)     290.80 Mcps    290.73 Mcps    290.72 Mcps
   Clang-Builtin             229.96 Mcps    230.08 Mcps    229.88 Mcps
   WP2 (compiled by Clang)   269.05 Mcps    269.32 Mcps    269.29 Mcps
   Precomp2 @ Good cache     277.11 Mcps    277.19 Mcps    277.16 Mcps
   Precomp4 @ Good cache     397.92 Mcps    398.04 Mcps    397.88 Mcps
   Precomp8 @ Good cache     398.70 Mcps    398.33 Mcps    398.54 Mcps
   Precomp2 @ Bad cache       31.46 Mcps     31.46 Mcps     31.46 Mcps
   Precomp4 @ Bad cache       56.45 Mcps     56.45 Mcps     56.45 Mcps
   Precomp8 @ Bad cache       65.06 Mcps     65.07 Mcps     65.06 Mcps

Source code

The source code can be downloaded at http://bisqwit.iki.fi/src/arch/popcount_profiling_tests.zip . It is distributed under the MIT license.


Copyright © 2013 Joel Yliluoma (http://iki.fi/bisqwit/)

Last edited at: 2013-11-07T23:12:38+02:00