11 minutes
Cache friendly string switch in C++11
Switching on strings
C++ has traditionally never supported switching on strings so you typically end up with a chain of if else calling string compare.
Something like this:
std::string str = "meow";
if(str == "foo") {
} else if(str == "bar") {
} else if(str == "car") {
} // ...
But with the addition of user literals and constexpr in c++11 it’s now possible to switch on strings by computing compile time hashes of the strings.
String hashing with literals
With user literals we can create a literal that will return the hash of a string. I will be using the popular string hashing algorithm fnv1a.
Note: the speed of the hashing algorithm is quite important in order to beat the if else, which we will get into later.
Which then lets us do this.
uint32_t stringHash = "foo"_fnv1a; // 0xA9F37ED7
The definition of this operator is (source at end)
constexpr uint32_t operator"" _fnv1a(const char* const pStr, const size_t len);
Switching on strings
User literals are not required to evaluate at compile time but by placing them in a switch which requires a integral expression it forces compile time evaluation, allowing us to use the string hashes in a switch.
switch(hash::fnva1(str.data(), str.length()))
{
case "foo"_fnva1:
break;
case "bar"_fnva1:
break;
case "car"_fnva1:
break;
// ...
}
Note: I recommend not calling the same hashing implementation for the input string as the literal since the constexpr version is typically less optimal. (Source at end provides both)
Since a switch with duplicate case values is not valid we get compile time hash collision detection for free!
switch(...)
{
case "foo"_fnva1:
case "foo"_fnva1: // error C2196: case value '0xA9F37ED7' already used
break;
}
So you can tune the speed of your hashing algo until it no longer compiles, unless you have external pre computed hashes (saved to disk etc), that you can leverage making a more standardized hash preferable.
Case insensitive
It’s also easy to support case insensitive by changing the hash we use on the input string with a case independent one.
Case insensitive version of fnva1 is included in the source below for ascii inputs.
Performance
The thing I really love about this style of string matching is that at runtime all we are doing is comparing 32bit ints and these integers are intermediate values in the instruction stream, so it should be very cache friendly.
With strings compares you may have a cache miss for every single one even if you only end up comparing the first character. The strings may also be scattered around in the data section making it hard for the CPU to predict. Although potentially the CPU is smart enough to prefetch the string data as the memory address are encoded in the instruction stream.
Lets take a look at what we end up with for a hash switch, below is the disassembly of a hash switch with 9 cases + default that I have tidied up and added jmp labels / comments.
; Calculate Fnv1a of string we are switching on
mov eax,811C9DC5h ; Fnv1aVal offset bias
test r9,r9
je default_case
mov r8,r9
nop dword ptr [rax] ; instruction stream padding (alignment)
hash_loop:
movzx ecx,byte ptr [rdx] ; load byte of string
lea rdx,[rdx+1]
xor ecx,eax
imul eax,ecx,1000193h ; Fnv1a prime
sub r8,1 ; loop counter
jne hash_loop ; finished?
; compare the hashe 'eax' contains the result of `switch(fnva1(str))`
cmp eax,6D294357h
ja 00007FF7EC4E1658
je case4
cmp eax,13254BC4h
je case3
cmp eax,2A387561h
je case2
cmp eax,486059E7h
je case1
cmp eax,506DFF3Eh
; if none of these matched move to next block of hashes to check.
jne check_next_set
case0:
mov dword ptr [rdi],2
mov al,1
mov rdi,qword ptr [rsp+0000000000000198h]
add rsp,170h
pop rbp
ret
case1:
mov dword ptr [rdi],5
mov al,1
mov rdi,qword ptr [rsp+0000000000000198h]
add rsp,170h
pop rbp
ret
case2:
mov dword ptr [rdi],7
mov al,1
mov rdi,qword ptr [rsp+0000000000000198h]
add rsp,170h
pop rbp
ret
case3:
mov dword ptr [rdi],1
mov al,1
mov rdi,qword ptr [rsp+0000000000000198h]
add rsp,170h
pop rbp
ret
case4:
mov dword ptr [rdi],4
mov al,1
mov rdi,qword ptr [rsp+0000000000000198h]
add rsp,170h
pop rbp
ret
check_next_set:
; compare the hashes
cmp eax,7F2346E9h
je case8
cmp eax,0C46EE0B2h
je case7
cmp eax,0DA54F687h
je case6
cmp eax,0EDD0246Eh
jne default
; fall through if matched case5
case5:
mov dword ptr [rdi],3
mov al,1
mov rdi,qword ptr [rsp+0000000000000198h]
add rsp,170h
pop rbp
ret
case6:
mov dword ptr [rdi],8
mov al,1
mov rdi,qword ptr [rsp+0000000000000198h]
add rsp,170h
pop rbp
ret
case7:
mov dword ptr [rdi],6
mov al,1
mov rdi,qword ptr [rsp+0000000000000198h]
add rsp,170h
pop rbp
ret
case8:
mov dword ptr [rdi],0
mov al,1
mov rdi,qword ptr [rsp+0000000000000198h]
add rsp,170h
pop rbp
ret
default_case:
mov rdi,qword ptr [rsp+198h]
xor al,al
add rsp,170h
pop rbp
ret
This confirms we never touch any memory outside the instruction stream after calculating hash of incoming string, lovely.
Benchmarks
Unfortunately the benchmarks reveal it’s not quite a golden bullet and in some cases the if else chain is faster.
NoMatchLong: is 160 characters of Lorem Ipsum.
NoMatchPartial: means string has a prefix matching at least one case.
MatchStart: matches the first if and the first case.
The long string is aligned to 64bytes so it only straddles 3 cache lines.
CPU Caches:
L1 Data 32K (x6)
L1 Instruction 32K (x6)
L2 Unified 262K (x6)
L3 Unified 12582K (x1)
Benchmark CPU(Time)
-----------------------------------------------------------
switch_hash/"NoMatch" 8.20 ns
switch_hash/"NoMatchPartial" 13.8 ns
switch_hash/"NoMatchLong" 215 ns
switch_hash/"MatchStart" 6.14 ns
switch_hash/"MatchMiddle" 8.20 ns
switch_hash/"MatchLast" 5.47 ns
if_chain/"NoMatch" 8.37 ns
if_chain/"NoMatchPartial" 8.54 ns
if_chain/"NoMatchLong" 8.54 ns
if_chain/"MatchStart" 2.25 ns
if_chain/"MatchMiddle" 5.16 ns
if_chain/"MatchLast" 8.16 ns
Note: CPU time is what we care about here.
I was surprised to see how if else was either on par or faster than our hash switch. But after looking at the generated code I could see the compiler had a few tricks up its sleeve and had pre-calculated the string literal lengths for if else.
So it turned:
std::string str = "foo"
if(str == "meow") {
} else if(str == "woof") {
} else if(str == "foo") {
}
Into:
std::string str = "foo"
auto len = str.length();
if(len == 4 && str.equal("meow")) {
} else if(len == 4 && str.equal("woof") {
} else if(len == 3 && str.equal("foo") {
}
Which means that the simple if else would only touch the memory of the siring literals if the lengths where the same.
Allowing if else to beat hash switch in MatchStart as hashing the string takes more time than a single strcmp, lets confirm that by removing the time it takes to compute the input hash time from benchmarks.
switch(hash(str)) becomes switch(hashVal)
switch_pre_hash/"NoMatch" 3.60 ns
switch_pre_hash/"NoMatchPartial" 3.37 ns
switch_pre_hash/"NoMatchLong" 3.07 ns
switch_pre_hash/"MatchStart" 2.85 ns
switch_pre_hash/"MatchMiddle" 2.57 ns
switch_pre_hash/"MatchLast" 3.52 ns
Nice! the hash switch is faster in all cases expect MatchStart. This is explainable by looking at the assembly from before. We can see the compiler re ordered the cases into blocks checked in reverse order. So instead of case0, case1, case2, ... it’s case4, case3, case2, case1. So we actually had to compare 4 hashes before we reached the first case.
Lets see what happens when all strings are the same length meaning if else implementation will always end up calling strcmp as the length checks will pass.
Except NoMatchPartial and NoMatchLong which we will keep long.
switch_hash_same_len/"NoMatch" 8.72 ns
switch_hash_same_len/"NoMatchPartial" 11.0 ns
switch_hash_same_len/"NoMatchLong" 218 ns
switch_hash_same_len/"MatchStart" 8.37 ns
switch_hash_same_len/"MatchMiddle" 7.67 ns
switch_hash_same_len/"MatchLast" 7.85 ns
if_chain_same_len/"NoMatch" 17.6 ns
if_chain_same_len/"NoMatchPartial" 8.89 ns
if_chain_same_len/"NoMatchLong" 8.54 ns
if_chain_same_len/"MatchStart" 2.46 ns
if_chain_same_len/"MatchMiddle" 9.63 ns
if_chain_same_len/"MatchLast" 17.6 ns
So here we can see the hash switch winning in a few cases but also it’s execution time is more consistent hovering around 8ns for inputs under 64 bytes.
While the if else time varies based on how many strings it ends up comparing as expected.
We can also see NoMatch takes twice as long as NoMatchPartial for if else this is because NoMatch is having to check each string unlike NoMatchPartial.
CPU cache
All the benchmarks so far data has been in L1 cache so is representative of performance in a hot loop.
But I would expect this type of code to appear more in program startup logic for argument or text file parsing. Where it’s much more likely the string literals won’t be in the cpu caches.
So lets have a look at the numbers for when the data is cold.
switch_hash_cold/"NoMatch" 795 ns
switch_hash_cold/"NoMatchPartial" 1520 ns
switch_hash_cold/"NoMatchLong" 1343 ns
switch_hash_cold/"MatchStart" 991 ns
switch_hash_cold/"MatchMiddle" 991 ns
switch_hash_cold/"MatchLast" 991 ns
if_chain_cold/"NoMatch" 1601 ns
if_chain_cold/"NoMatchPartial" 1733 ns
if_chain_cold/"NoMatchLong" 1733 ns
if_chain_cold/"MatchStart" 1733 ns
if_chain_cold/"MatchMiddle" 1733 ns
if_chain_cold/"MatchLast" 1733 ns
Looks like the hash switch is faster in all cases, which is what I would would of guessed, since we have less cache misses.
Instruction size
Another metric to consider is instruction size. If we have a program with 1000 if else chains and replace them with hash switches should we expect the binary to grow or shrink?
This is a little hard to measure but below is the byte code size for each implementation with 10 strings.
Version Function Size Branches
-------------------------------------------------------------------
hash switch 277 byte 22
if else 545 byte 45
The hash switch has a lot less branches and is almost half the size, which is great since the function takes up less space in the instruction cache.
Since we calculate the hash of the strings the resulting binary won’t contain the strings (unless used elsewhere) so we also save some bytes in the rdata section.
Debug performance
Ideally debug builds are fast enough to be used daily, will using a switch hash slow them down?
switch_hash/"NoMatch" 36.0 ns
switch_hash/"NoMatchPartial" 69.8 ns
switch_hash/"NoMatchLong" 656 ns
switch_hash/"MatchStart" 24.1 ns
switch_hash/"MatchMiddle" 37.5 ns
switch_hash/"MatchLast" 18.0 ns
switch_hash_same_len/"NoMatch" 28.3 ns
switch_hash_same_len/"NoMatchPartial" 42.4 ns
switch_hash_same_len/"NoMatchLong" 656 ns
switch_hash_same_len/"MatchStart" 28.3 ns
switch_hash_same_len/"MatchMiddle" 28.3 ns
switch_hash_same_len/"MatchLast" 27.6 ns
if_chain/"MatchStart" 92.1 ns
if_chain/"NoMatch" 767 ns
if_chain/"NoMatchPartial" 767 ns
if_chain/"NoMatchLong" 750 ns
if_chain/"MatchMiddle" 420 ns
if_chain/"MatchLast" 767 ns
if_chain_same_len/"NoMatch" 816 ns
if_chain_same_len/"NoMatchPartial" 767 ns
if_chain_same_len/"NoMatchLong" 767 ns
if_chain_same_len/"MatchStart" 92.1 ns
if_chain_same_len/"MatchMiddle" 455 ns
if_chain_same_len/"MatchLast" 820 ns
Looks like switch hashes are actually about 10x faster in debug builds nice!
Which makes sense since for hash switch the only thing that benefits from optimization is the hash loop.
Compile times
Is there a price for this runtime speed up?
The compiler has to calculate the hash of every string when building the binary if we have 16 strings it’s probably marginal but what about 10,000?
I made two test programs both with 100 functions each function containing either 100 if else or a switch with 100 case statements. Each string was globally unique and about 10 characters long.
Version Debug Build Release Build Binary Size
-------------------------------------------------------------------
hash switch 6,243 ms 6,529 ms 886 KB
if else 4,838 ms 18,736 ms 1,475 KB
As expected the hash switch is slower to compile in debug builds since we still need to calculate all the hashes, but release builds are almost 3 times quicker.
So for a typically program we can expect debug build times to not slow down much but release build times are likely to speed up and binary sizes decrease!
This is ilkley due to the compiler having less code to optimize and not having to store the strings in the compiled binary.
Conclusion
Looks like hash strings are a pretty solid alternative to if else and provide consistently good performance. While the speed of if else is mostly on the back of the length checks the compiler inserted, which a compiler may fail to add or prove useless if lots of the strings are the same length.
hash switch
Pros:
- Faster than
if elsein most cases with small strings - Significantly faster runtime speed in debug builds
- Extremely fast if you already have the hash
- Cache friendly
- Consistent performance
- Smaller binary
- Faster release build times
- Subjectivity easier to read.
Cons:
- Slightly slower debug build times
- Not ideal on large inputs (> 256 bytes) due to hashing time
Source
namespace hash
{
typedef uint32_t Fnv1aVal;
namespace Fnv1aConst
{
constexpr static uint32_t default_offset_basis = 2166136261u;
constexpr static uint32_t prime = 16777619u;
namespace Internal
{
inline char charToLower(const char c)
{
return (c >= 'A' && c <= 'Z') ? c + ('a' - 'A') : c;
}
constexpr static inline Fnv1aVal Hash(char const* const pStr, const uint32_t val)
{
return !*pStr ? val : Hash(pStr + 1, static_cast<uint32_t>(((val ^ *pStr) * static_cast<uint64_t>(prime)) & 0xFFFFFFFF));
}
constexpr static inline Fnv1aVal Hash(char const* const pStr, const size_t strLen, const uint32_t val)
{
return (strLen == 0) ? val : Hash(pStr + 1, strLen - 1, static_cast<uint32_t>(((val ^ *pStr) * static_cast<uint64_t>(prime)) & 0xFFFFFFFF));
}
} // namespace Internal
constexpr static inline Fnv1aVal Hash(char const* const pStr)
{
return !*pStr ? default_offset_basis : Internal::Hash(pStr, default_offset_basis);
}
constexpr static inline Fnv1aVal Hash(char const* const pStr, const size_t strLen)
{
return (strLen == 0) ? default_offset_basis : Internal::Hash(pStr, strLen, default_offset_basis);
}
} // namespace Fnv1aConst
namespace literals
{
inline constexpr uint32_t operator"" _fnv1a(const char* const pStr, const size_t strLen)
{
return Fnv1aConst::Hash(pStr, strLen);
}
} // namespace literals
// Faster version for runtime.
inline Fnv1aVal Fnv1aHash(const void* key, size_t length)
{
Fnv1aVal hash = Fnv1aConst::default_offset_basis;
auto* s = reinterpret_cast<const uint8_t*>(key);
for (size_t i = 0; i < length; ++i) {
hash ^= (uint32_t)*s++;
hash *= Fnv1aConst::prime;
}
return hash;
}
// Note: there is no need for a ToLower constexpr version.
// since you only need to lowercase the input string, and just make the cases lowercase
inline Fnv1aVal Fnv1aHashLower(const void* key, size_t length)
{
Fnv1aVal hash = Fnv1aConst::default_offset_basis;
auto* s = reinterpret_cast<const uint8_t*>(key);
for (size_t i = 0; i < length; ++i) {
hash ^= (uint32_t)Fnv1aConst::Internal::charToLower(*s++);
hash *= Fnv1aConst::prime;
}
return hash;
}
} // namespace hash
/*
int example(std::string_view str)
{
using namespace hash::literals;
int res = 0;
switch(hash::Fnv1aHash(str.data(), str.length()))
{
case "foo"_fnv1a;
res = 1;
break;
case "bar"_fnv1a;
res = 2;
break;
}
return res;
}
*/