Cache friendly string switch in C++11

Switching on strings

C++ has traditionally never supported switching on strings so you typically end up with a chain of if else calling string compare.

Something like this:


std::string str = "meow";
if(str == "foo") {

} else if(str == "bar") {

} else if(str == "car") {

} // ...

But with the addition of user literals and constexpr in c++11 it’s now possible to switch on strings by computing compile time hashes of the strings.

String hashing with literals

With user literals we can create a literal that will return the hash of a string. I will be using the popular string hashing algorithm fnv1a.

Note: the speed of the hashing algorithm is quite important in order to beat the if else, which we will get into later.

Which then lets us do this.


uint32_t stringHash = "foo"_fnv1a; // 0xA9F37ED7

The definition of this operator is (source at end)

constexpr uint32_t operator"" _fnv1a(const char* const pStr, const size_t len);

Switching on strings

User literals are not required to evaluate at compile time but by placing them in a switch which requires a integral expression it forces compile time evaluation, allowing us to use the string hashes in a switch.


switch(hash::fnva1(str.data(), str.length()))
{
    case "foo"_fnva1:
        break;
    case "bar"_fnva1:
        break;
    case "car"_fnva1:
        break;
    // ...
}

Note: I recommend not calling the same hashing implementation for the input string as the literal since the constexpr version is typically less optimal. (Source at end provides both)

Since a switch with duplicate case values is not valid we get compile time hash collision detection for free!


switch(...)
{
   case "foo"_fnva1:
   case "foo"_fnva1: //  error C2196: case value '0xA9F37ED7' already used
        break;
}

So you can tune the speed of your hashing algo until it no longer compiles, unless you have external pre computed hashes (saved to disk etc), that you can leverage making a more standardized hash preferable.

Case insensitive

It’s also easy to support case insensitive by changing the hash we use on the input string with a case independent one.

Case insensitive version of fnva1 is included in the source below for ascii inputs.

Performance

The thing I really love about this style of string matching is that at runtime all we are doing is comparing 32bit ints and these integers are intermediate values in the instruction stream, so it should be very cache friendly.

With strings compares you may have a cache miss for every single one even if you only end up comparing the first character. The strings may also be scattered around in the data section making it hard for the CPU to predict. Although potentially the CPU is smart enough to prefetch the string data as the memory address are encoded in the instruction stream.

Lets take a look at what we end up with for a hash switch, below is the disassembly of a hash switch with 9 cases + default that I have tidied up and added jmp labels / comments.


; Calculate Fnv1a of string we are switching on

 mov         eax,811C9DC5h          ; Fnv1aVal offset bias
 test        r9,r9  
 je          default_case  
 mov         r8,r9
 nop         dword ptr [rax]        ; instruction stream padding (alignment)  

 hash_loop:
     movzx       ecx,byte ptr [rdx] ; load byte of string
     lea         rdx,[rdx+1]  
     xor         ecx,eax  
     imul        eax,ecx,1000193h   ; Fnv1a prime
     sub         r8,1               ; loop counter
     jne         hash_loop          ; finished?

; compare the hashe 'eax' contains the result of `switch(fnva1(str))`
 cmp         eax,6D294357h  
 ja          00007FF7EC4E1658  
 je          case4  
 cmp         eax,13254BC4h  
 je          case3  
 cmp         eax,2A387561h  
 je          case2  
 cmp         eax,486059E7h  
 je          case1  
 cmp         eax,506DFF3Eh  

 ; if none of these matched move to next block of hashes to check.
 jne         check_next_set  

 case0:
     mov         dword ptr [rdi],2  
     mov         al,1  
     mov         rdi,qword ptr [rsp+0000000000000198h]  
     add         rsp,170h  
     pop         rbp  
     ret  
 case1:
     mov         dword ptr [rdi],5  
     mov         al,1  
     mov         rdi,qword ptr [rsp+0000000000000198h]  
     add         rsp,170h  
     pop         rbp  
     ret  
 case2:
     mov         dword ptr [rdi],7  
     mov         al,1  
     mov         rdi,qword ptr [rsp+0000000000000198h]  
     add         rsp,170h  
     pop         rbp  
     ret  
 case3:
     mov         dword ptr [rdi],1  
     mov         al,1  
     mov         rdi,qword ptr [rsp+0000000000000198h]  
     add         rsp,170h  
     pop         rbp  
     ret  
 case4:
     mov         dword ptr [rdi],4  
     mov         al,1  
     mov         rdi,qword ptr [rsp+0000000000000198h]  
     add         rsp,170h  
     pop         rbp  
     ret  

check_next_set:
; compare the hashes
 cmp         eax,7F2346E9h  
 je          case8  
 cmp         eax,0C46EE0B2h  
 je          case7  
 cmp         eax,0DA54F687h  
 je          case6  
 cmp         eax,0EDD0246Eh  
 jne         default  

; fall through if matched case5
 case5:
     mov         dword ptr [rdi],3  
     mov         al,1  
     mov         rdi,qword ptr [rsp+0000000000000198h]  
     add         rsp,170h  
     pop         rbp  
     ret 
 case6: 
     mov         dword ptr [rdi],8  
     mov         al,1  
     mov         rdi,qword ptr [rsp+0000000000000198h]  
     add         rsp,170h  
     pop         rbp  
     ret  
case7:
     mov         dword ptr [rdi],6  
     mov         al,1  
     mov         rdi,qword ptr [rsp+0000000000000198h]  
     add         rsp,170h  
     pop         rbp  
     ret  
 case8:
     mov         dword ptr [rdi],0  
     mov         al,1  
     mov         rdi,qword ptr [rsp+0000000000000198h]  
     add         rsp,170h  
     pop         rbp  
     ret  
 default_case:
     mov         rdi,qword ptr [rsp+198h]  
     xor         al,al  
     add         rsp,170h  
     pop         rbp  
     ret

This confirms we never touch any memory outside the instruction stream after calculating hash of incoming string, lovely.

Benchmarks

Unfortunately the benchmarks reveal it’s not quite a golden bullet and in some cases the if else chain is faster.

NoMatchLong: is 160 characters of Lorem Ipsum.
NoMatchPartial: means string has a prefix matching at least one case.
MatchStart: matches the first if and the first case.

The long string is aligned to 64bytes so it only straddles 3 cache lines.

CPU Caches:
  L1 Data 32K (x6)
  L1 Instruction 32K (x6)
  L2 Unified 262K (x6)
  L3 Unified 12582K (x1)

Benchmark                                   CPU(Time)
-----------------------------------------------------------
switch_hash/"NoMatch"                       8.20 ns
switch_hash/"NoMatchPartial"                13.8 ns
switch_hash/"NoMatchLong"                    215 ns
switch_hash/"MatchStart"                    6.14 ns
switch_hash/"MatchMiddle"                   8.20 ns
switch_hash/"MatchLast"                     5.47 ns

if_chain/"NoMatch"                          8.37 ns
if_chain/"NoMatchPartial"                   8.54 ns
if_chain/"NoMatchLong"                      8.54 ns
if_chain/"MatchStart"                       2.25 ns
if_chain/"MatchMiddle"                      5.16 ns
if_chain/"MatchLast"                        8.16 ns

Note: CPU time is what we care about here.

I was surprised to see how if else was either on par or faster than our hash switch. But after looking at the generated code I could see the compiler had a few tricks up its sleeve and had pre-calculated the string literal lengths for if else.

So it turned:


std::string str = "foo"

if(str == "meow") {
} else if(str == "woof") {
} else if(str == "foo") {
}

Into:


std::string str = "foo"
auto len = str.length();

if(len == 4 && str.equal("meow")) {
} else if(len == 4 && str.equal("woof") {
} else if(len == 3 && str.equal("foo") {
}

Which means that the simple if else would only touch the memory of the siring literals if the lengths where the same.

Allowing if else to beat hash switch in MatchStart as hashing the string takes more time than a single strcmp, lets confirm that by removing the time it takes to compute the input hash time from benchmarks.

switch(hash(str)) becomes switch(hashVal)

switch_pre_hash/"NoMatch"                         3.60 ns
switch_pre_hash/"NoMatchPartial"                  3.37 ns
switch_pre_hash/"NoMatchLong"                     3.07 ns
switch_pre_hash/"MatchStart"                      2.85 ns
switch_pre_hash/"MatchMiddle"                     2.57 ns
switch_pre_hash/"MatchLast"                       3.52 ns

Nice! the hash switch is faster in all cases expect MatchStart. This is explainable by looking at the assembly from before. We can see the compiler re ordered the cases into blocks checked in reverse order. So instead of case0, case1, case2, ... it’s case4, case3, case2, case1. So we actually had to compare 4 hashes before we reached the first case.

Lets see what happens when all strings are the same length meaning if else implementation will always end up calling strcmp as the length checks will pass.

Except NoMatchPartial and NoMatchLong which we will keep long.

switch_hash_same_len/"NoMatch"                    8.72 ns
switch_hash_same_len/"NoMatchPartial"             11.0 ns
switch_hash_same_len/"NoMatchLong"                 218 ns
switch_hash_same_len/"MatchStart"                 8.37 ns
switch_hash_same_len/"MatchMiddle"                7.67 ns
switch_hash_same_len/"MatchLast"                  7.85 ns

if_chain_same_len/"NoMatch"                       17.6 ns
if_chain_same_len/"NoMatchPartial"                8.89 ns
if_chain_same_len/"NoMatchLong"                   8.54 ns
if_chain_same_len/"MatchStart"                    2.46 ns
if_chain_same_len/"MatchMiddle"                   9.63 ns
if_chain_same_len/"MatchLast"                     17.6 ns

So here we can see the hash switch winning in a few cases but also it’s execution time is more consistent hovering around 8ns for inputs under 64 bytes.

While the if else time varies based on how many strings it ends up comparing as expected.

We can also see NoMatch takes twice as long as NoMatchPartial for if else this is because NoMatch is having to check each string unlike NoMatchPartial.

CPU cache

All the benchmarks so far data has been in L1 cache so is representative of performance in a hot loop.

But I would expect this type of code to appear more in program startup logic for argument or text file parsing. Where it’s much more likely the string literals won’t be in the cpu caches.

So lets have a look at the numbers for when the data is cold.

switch_hash_cold/"NoMatch"                         795 ns
switch_hash_cold/"NoMatchPartial"                 1520 ns
switch_hash_cold/"NoMatchLong"                    1343 ns
switch_hash_cold/"MatchStart"                      991 ns
switch_hash_cold/"MatchMiddle"                     991 ns
switch_hash_cold/"MatchLast"                       991 ns

if_chain_cold/"NoMatch"                           1601 ns
if_chain_cold/"NoMatchPartial"                    1733 ns
if_chain_cold/"NoMatchLong"                       1733 ns
if_chain_cold/"MatchStart"                        1733 ns
if_chain_cold/"MatchMiddle"                       1733 ns
if_chain_cold/"MatchLast"                         1733 ns

Looks like the hash switch is faster in all cases, which is what I would would of guessed, since we have less cache misses.

Instruction size

Another metric to consider is instruction size. If we have a program with 1000 if else chains and replace them with hash switches should we expect the binary to grow or shrink?

This is a little hard to measure but below is the byte code size for each implementation with 10 strings.

Version           Function Size     Branches
-------------------------------------------------------------------
hash switch       277 byte          22
if else           545 byte          45

The hash switch has a lot less branches and is almost half the size, which is great since the function takes up less space in the instruction cache.

Since we calculate the hash of the strings the resulting binary won’t contain the strings (unless used elsewhere) so we also save some bytes in the rdata section.

Debug performance

Ideally debug builds are fast enough to be used daily, will using a switch hash slow them down?

switch_hash/"NoMatch"                             36.0 ns
switch_hash/"NoMatchPartial"                      69.8 ns
switch_hash/"NoMatchLong"                          656 ns
switch_hash/"MatchStart"                          24.1 ns
switch_hash/"MatchMiddle"                         37.5 ns
switch_hash/"MatchLast"                           18.0 ns
switch_hash_same_len/"NoMatch"                    28.3 ns
switch_hash_same_len/"NoMatchPartial"             42.4 ns
switch_hash_same_len/"NoMatchLong"                 656 ns
switch_hash_same_len/"MatchStart"                 28.3 ns
switch_hash_same_len/"MatchMiddle"                28.3 ns
switch_hash_same_len/"MatchLast"                  27.6 ns
if_chain/"MatchStart"                             92.1 ns
if_chain/"NoMatch"                                 767 ns
if_chain/"NoMatchPartial"                          767 ns
if_chain/"NoMatchLong"                             750 ns
if_chain/"MatchMiddle"                             420 ns
if_chain/"MatchLast"                               767 ns
if_chain_same_len/"NoMatch"                        816 ns
if_chain_same_len/"NoMatchPartial"                 767 ns
if_chain_same_len/"NoMatchLong"                    767 ns
if_chain_same_len/"MatchStart"                    92.1 ns
if_chain_same_len/"MatchMiddle"                    455 ns
if_chain_same_len/"MatchLast"                      820 ns

Looks like switch hashes are actually about 10x faster in debug builds nice!

Which makes sense since for hash switch the only thing that benefits from optimization is the hash loop.

Compile times

Is there a price for this runtime speed up?

The compiler has to calculate the hash of every string when building the binary if we have 16 strings it’s probably marginal but what about 10,000?

I made two test programs both with 100 functions each function containing either 100 if else or a switch with 100 case statements. Each string was globally unique and about 10 characters long.

Version           Debug Build    Release Build    Binary Size
-------------------------------------------------------------------
hash switch       6,243 ms       6,529 ms         886 KB
if else           4,838 ms       18,736 ms        1,475 KB

As expected the hash switch is slower to compile in debug builds since we still need to calculate all the hashes, but release builds are almost 3 times quicker.

So for a typically program we can expect debug build times to not slow down much but release build times are likely to speed up and binary sizes decrease!

This is ilkley due to the compiler having less code to optimize and not having to store the strings in the compiled binary.

Conclusion

Looks like hash strings are a pretty solid alternative to if else and provide consistently good performance. While the speed of if else is mostly on the back of the length checks the compiler inserted, which a compiler may fail to add or prove useless if lots of the strings are the same length.

hash switch

Pros:

Faster than if else in most cases with small strings
Significantly faster runtime speed in debug builds
Extremely fast if you already have the hash
Cache friendly
Consistent performance
Smaller binary
Faster release build times
Subjectivity easier to read.

Cons:

Slightly slower debug build times
Not ideal on large inputs (> 256 bytes) due to hashing time

Source



namespace hash
{
    typedef uint32_t Fnv1aVal;

    namespace Fnv1aConst
    {
        constexpr static uint32_t default_offset_basis = 2166136261u;
        constexpr static uint32_t prime = 16777619u;

        namespace Internal
        {
            inline char charToLower(const char c) 
            {
                return (c >= 'A' && c <= 'Z') ? c + ('a' - 'A') : c;
            }

            constexpr static inline Fnv1aVal Hash(char const* const pStr, const uint32_t val)
            {
                return !*pStr ? val : Hash(pStr + 1, static_cast<uint32_t>(((val ^ *pStr) * static_cast<uint64_t>(prime)) & 0xFFFFFFFF));
            }

            constexpr static inline Fnv1aVal Hash(char const* const pStr, const size_t strLen, const uint32_t val)
            {
                return (strLen == 0) ? val : Hash(pStr + 1, strLen - 1, static_cast<uint32_t>(((val ^ *pStr) * static_cast<uint64_t>(prime)) & 0xFFFFFFFF));
            }

        } // namespace Internal

        constexpr static inline Fnv1aVal Hash(char const* const pStr)
        {
            return !*pStr ? default_offset_basis : Internal::Hash(pStr, default_offset_basis);
        }

        constexpr static inline Fnv1aVal Hash(char const* const pStr, const size_t strLen)
        {
            return (strLen == 0) ? default_offset_basis : Internal::Hash(pStr, strLen, default_offset_basis);
        }

    } // namespace Fnv1aConst

    namespace literals
    {
        inline constexpr uint32_t operator"" _fnv1a(const char* const pStr, const size_t strLen)
        {
            return Fnv1aConst::Hash(pStr, strLen);
        }

    } // namespace literals

    // Faster version for runtime.
    inline Fnv1aVal Fnv1aHash(const void* key, size_t length)
    {
        Fnv1aVal hash = Fnv1aConst::default_offset_basis;
        auto* s = reinterpret_cast<const uint8_t*>(key);

        for (size_t i = 0; i < length; ++i) {
            hash ^= (uint32_t)*s++;
            hash *= Fnv1aConst::prime;
        }

        return hash;
    }

    // Note: there is no need for a ToLower constexpr version.
    // since you only need to lowercase the input string, and just make the cases lowercase
    inline Fnv1aVal Fnv1aHashLower(const void* key, size_t length)
    {
        Fnv1aVal hash = Fnv1aConst::default_offset_basis;
        auto* s = reinterpret_cast<const uint8_t*>(key);

        for (size_t i = 0; i < length; ++i) {
            hash ^= (uint32_t)Fnv1aConst::Internal::charToLower(*s++);
            hash *= Fnv1aConst::prime;
        }

        return hash;
    }

} // namespace hash


/*
int example(std::string_view str)
{
    using namespace hash::literals;

    int res = 0;
    switch(hash::Fnv1aHash(str.data(), str.length()))
    {
        case "foo"_fnv1a;
            res = 1;
            break;
        case "bar"_fnv1a;
            res = 2;
            break;
    }
    return res;
}
*/