From 824c3aa83d9acc45567421c0d08ca6eb4d718a43 Mon Sep 17 00:00:00 2001 From: Johannes Stoelp Date: Wed, 15 Nov 2023 02:36:11 +0100 Subject: post: cas, aba and ll/sc first draft --- content/2023-09-01-cas-llsc-aba/.clang-format | 15 + content/2023-09-01-cas-llsc-aba/.gitignore | 3 + content/2023-09-01-cas-llsc-aba/Makefile | 37 ++ content/2023-09-01-cas-llsc-aba/a64-basic-llsc.cc | 89 +++ content/2023-09-01-cas-llsc-aba/a64-llsc.cc | 176 ++++++ content/2023-09-01-cas-llsc-aba/cas-aba-issue.cc | 87 +++ .../2023-09-01-cas-llsc-aba/cas-ptr-generation.cc | 134 +++++ content/2023-09-01-cas-llsc-aba/index.md | 670 +++++++++++++++++++++ content/2023-09-01-cas-llsc-aba/list-abc.drawio | 61 ++ content/2023-09-01-cas-llsc-aba/list-abc.svg | 4 + content/2023-09-01-cas-llsc-aba/list-ac.drawio | 58 ++ content/2023-09-01-cas-llsc-aba/list-ac.svg | 4 + content/2023-09-01-cas-llsc-aba/list-b.drawio | 55 ++ content/2023-09-01-cas-llsc-aba/list-b.svg | 4 + 14 files changed, 1397 insertions(+) create mode 100644 content/2023-09-01-cas-llsc-aba/.clang-format create mode 100644 content/2023-09-01-cas-llsc-aba/.gitignore create mode 100644 content/2023-09-01-cas-llsc-aba/Makefile create mode 100644 content/2023-09-01-cas-llsc-aba/a64-basic-llsc.cc create mode 100644 content/2023-09-01-cas-llsc-aba/a64-llsc.cc create mode 100644 content/2023-09-01-cas-llsc-aba/cas-aba-issue.cc create mode 100644 content/2023-09-01-cas-llsc-aba/cas-ptr-generation.cc create mode 100644 content/2023-09-01-cas-llsc-aba/index.md create mode 100644 content/2023-09-01-cas-llsc-aba/list-abc.drawio create mode 100644 content/2023-09-01-cas-llsc-aba/list-abc.svg create mode 100644 content/2023-09-01-cas-llsc-aba/list-ac.drawio create mode 100644 content/2023-09-01-cas-llsc-aba/list-ac.svg create mode 100644 content/2023-09-01-cas-llsc-aba/list-b.drawio create mode 100644 content/2023-09-01-cas-llsc-aba/list-b.svg (limited to 'content') diff --git a/content/2023-09-01-cas-llsc-aba/.clang-format b/content/2023-09-01-cas-llsc-aba/.clang-format new file mode 100644 index 0000000..1c7765b --- /dev/null +++ b/content/2023-09-01-cas-llsc-aba/.clang-format @@ -0,0 +1,15 @@ +# dotfiles -- clang-format +# author: johannst +# doc : https://clang.llvm.org/docs/ClangFormatStyleOptions.html +--- + +Language: Cpp +BasedOnStyle: Chromium + +AllowShortFunctionsOnASingleLine: None + +AccessModifierOffset: -2 +IndentWidth: 2 + +AlignConsecutiveMacros: true +AlignConsecutiveAssignments: true diff --git a/content/2023-09-01-cas-llsc-aba/.gitignore b/content/2023-09-01-cas-llsc-aba/.gitignore new file mode 100644 index 0000000..94b26cc --- /dev/null +++ b/content/2023-09-01-cas-llsc-aba/.gitignore @@ -0,0 +1,3 @@ +*.elf +.$*.drawio.dtmp +.$*.drawio.bkp diff --git a/content/2023-09-01-cas-llsc-aba/Makefile b/content/2023-09-01-cas-llsc-aba/Makefile new file mode 100644 index 0000000..602418f --- /dev/null +++ b/content/2023-09-01-cas-llsc-aba/Makefile @@ -0,0 +1,37 @@ +BINS = cas-aba-issue cas-ptr-generation a64-basic-llsc a64-llsc + +CFLAGS_cas-ptr-generation = -mcx16 +CFLAGS = -g -O3 -Wall -Wextra $(CFLAGS_$(*)) + +A64_SYSROOT = /usr/aarch64-linux-gnu + +run: $(BINS:%=run-%) +build: $(BINS:%=%.elf) + +# -- HOST NATIVE TARGETS ------------------------------------------------------- + +run-%: %.elf + ./$^ + +%.elf : %.cc + $(CXX) $(CFLAGS) -o $@ $^ -latomic + +disasm: cas-ptr-generation.elf + objdump -C -d -j .text -M intel --no-show-raw-insn --visualize-jumps=color $^ | less -R + +# -- ARM64 CROSS TARGETS ------------------------------------------------------- 
+ +a64-%.elf: a64-%.cc + clang++ $(CFLAGS) -target aarch64-linux-gnu --sysroot=$(A64_SYSROOT) -fuse-ld=lld -o $@ $^ + #llvm-objdump -j .text -C -d $@ + +run-a64-%: a64-%.elf + qemu-aarch64 -L $(A64_SYSROOT) -E LD_LIBRARY_PATH=$(A64_SYSROOT)/lib64 $^ + +# -- MISC TARGETS -------------------------------------------------------------- + +fmt: + clang-format -i *.cc + +clean: + $(RM) *.elf diff --git a/content/2023-09-01-cas-llsc-aba/a64-basic-llsc.cc b/content/2023-09-01-cas-llsc-aba/a64-basic-llsc.cc new file mode 100644 index 0000000..82c68e1 --- /dev/null +++ b/content/2023-09-01-cas-llsc-aba/a64-basic-llsc.cc @@ -0,0 +1,89 @@ +#include +#include +#include +#include + +#ifndef __aarch64__ +#error "This must be compiled for arm64!" +#endif + +// NOTES on the inline assembly: +// +// * AArch64 constraint. +// https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html +// +// Q: A memory address which uses a single base register with no offset. +// +// * Output constraint. +// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Output-Operands +// +// Use the '&' constraint modifier on all output operands that must not +// overlap an input. Otherwise, GCC may allocate the output operand in the +// same register as an unrelated input operand, on the assumption that the +// assembler code consumes its inputs before producing outputs. This +// assumption may be false if the assembler code actually consists of more +// than one instruction. + +// LDXR: Load exclusive register wrapper. +// +// Read from ADDR and marked address for exclusive access (exclusive monitor). +// +// Return value read from memory. +// +// NOTE: No memory ordering semantics. +// +// https://developer.arm.com/documentation/ddi0596/latest/Base-Instructions/LDXR--Load-Exclusive-Register-?lang=en +inline uint64_t ldxr(uint64_t* addr) { + uint64_t ret; + asm volatile("ldxr %0, %1" : "=r"(ret) : "Q"(*addr) : "memory"); + return ret; +} + +// STXR: Store exclusive register wrapper. +// +// Conditionally write VAL to ADDR if ADDR is marked for exclusive access by a +// previous exclusive load (eg LDXR). +// +// Return 0 if the write was successful, 1 otherwise. +// +// NOTE: No memory ordering semantics. +// +// https://developer.arm.com/documentation/ddi0596/latest/Base-Instructions/STXR--Store-Exclusive-Register-?lang=en +inline bool stxr(uint64_t* addr, uint64_t val) { + uint32_t ret; + asm volatile("stxr %w0, %2, %1" + : "=&r"(ret), "=Q"(*addr) + : "r"(val) + : "memory"); + return ret == 0; +} + +int main() { + uint64_t mem = 42; + + auto T1 = std::thread([&mem]() { + // Write to exclusive location (does clear exclusive monitor). + mem = 2222; + // Full memory barrier. + __sync_synchronize(); + }); + + uint64_t old = ldxr(&mem); + + // Some artificial delay w/o an explicit context switch (eg syscall) as that + // would clear the exclusive monitor, though it can still be interupted by + // the scheduler. + // Delay is "tuned" for my ARM silicon. + for (int i = 0; i < (1 << 13); ++i) { + asm volatile("nop"); + } + + // Full memory barrier. + __sync_synchronize(); + + bool ok = stxr(&mem, 1111); + printf("old: %lu -> mem: %lu | ok: %d\n", old, mem, ok); + + T1.join(); + return ok ? 
0 : 1; +} diff --git a/content/2023-09-01-cas-llsc-aba/a64-llsc.cc b/content/2023-09-01-cas-llsc-aba/a64-llsc.cc new file mode 100644 index 0000000..37563b5 --- /dev/null +++ b/content/2023-09-01-cas-llsc-aba/a64-llsc.cc @@ -0,0 +1,176 @@ +#include +#include +#include + +#include +#include + +#include + +#ifndef __aarch64__ +#error "This must be compiled for arm64!" +#endif + +// LDXR: Load exclusive register wrapper. +// +// Read from ADDR and marked address for exclusive access (exclusive monitor). +// +// Return value read from memory. +// +// NOTE: No memory ordering semantics. +// +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDXR--Load-Exclusive-Register-?lang=en +uint64_t ldxr(uint64_t* addr) { + uint64_t ret; + asm volatile("ldxr %0, %1" : "=r"(ret) : "Q"(*addr) : "memory"); + return ret; +} + +// STXR: Store exclusive register wrapper. +// +// Conditionally write VAL to ADDR if ADDR is marked for exclusive access by a +// previous exclusive load (eg LDXR). +// +// Return 0 if the write was successful, 1 otherwise. +// +// NOTE: No memory ordering semantics. +// +// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STXR--Store-Exclusive-Register-?lang=en +bool stxr(uint64_t* addr, uint64_t val) { + uint32_t ret; + asm volatile("stxr %w0, %2, %1" + : "=&r"(ret), "=Q"(*addr) + : "r"(val) + : "memory"); + return ret == 0; +} + +inline void mem_barrier() { + asm volatile("dmb ish" : /* out */ : /* in */ : "memory"); +} + +// -- BLOCK STACK -------------------------------------------------------------- + +struct BlockStack { + struct Block { + Block* next{nullptr}; + unsigned mem{0}; + }; + + void push(Block* blk) { + printf("[%d] push blk->mem=%d\n", gettid(), blk->mem); + + do { + blk->next = load_head_ex(); + } while (!store_head_ex(blk)); + } + + Block* pop(bool delay = false) { + Block *blk, *next; + do { + blk = load_head_ex(); + next = blk->next; + + // Spin some time to give other thread a chance to replace head. + // Delay somewhat tuned for my ARM device. + for (int i = 0; delay && (i < (1 << 16)); ++i) { + asm volatile("nop"); + } + delay = false; + } while (!store_head_ex(next)); + + printf("[%d] pop blk->mem=%d\n", gettid(), blk->mem); + return blk; + } + +private: + Block* load_head_ex() const { + static_assert(sizeof(void*) == sizeof(uint64_t), + "Pointer size miss-match!"); + + Block* blk = (Block*)ldxr((uint64_t*)&m_head); + mem_barrier(); + return blk; + } + + bool store_head_ex(Block* blk) { + static_assert(sizeof(void*) == sizeof(uint64_t), + "Pointer size miss-match!"); + + const bool success = stxr((uint64_t*)&m_head, (uint64_t)blk); + mem_barrier(); + return success; + } + + Block* m_head{nullptr}; +}; + +int main() { + BlockStack::Block B1, B2, B3; + B1.mem = 1; + B2.mem = 2; + B3.mem = 3; + + // Create free memory block list. + // S: head -> 1 -> 2 -> 3. + BlockStack S; + S.push(&B3); + S.push(&B2); + S.push(&B1); + + // Atomics to coordinate when threads are being released. + std::atomic worker(0); + std::atomic release(false); + + auto T1 = std::thread([&S, &worker, &release]() { + // Notify thread T1 ready, and wait for release. + worker.fetch_add(1); + while (!release.load()) {} + + // Read head=1 & next=2, and add some artificial delay by executing NOPs. + // This gives T2 some time to pop 1 & 2 and push back 1, which triggered the + // ABA problem with the CAS instruction. 
+ // + // However with the LL/SC atomics, our exclusive monitor for T1 after the + // exclusive load of the head is cleared after T2 updated the head. + // Therefore the exclusive store after the NOP delay will fail and we will + // load the new head pointer, circumventing the ABA problem. + // + // NOTE: This does not work under qemu user emulation (aarch64) as qemu does + // not implement the strong LL/SC semantics. + // https://elixir.bootlin.com/qemu/v8.1.2/source/target/arm/tcg/translate-a64.c#L2411 + auto* B1 = S.pop(true /* nops delay */); + assert(B1->mem == 1); + // Pops 3, correct, no ABA problem. + auto* B2 = S.pop(); + assert(B2->mem == 3); + }); + + auto T2 = std::thread([&S, &worker, &release]() { + // Notify thread T2 ready, and wait for release. + worker.fetch_add(1); + while (!release.load()) {} + + // Artificial sleep, such that T1 entered pop and read head & next. + // Delay somewhat tuned for my ARM device. + for (int i = 0; i < (1<<8); ++i) { + asm volatile("nop"); + } + + // Pop 1 & 2. + auto* B1 = S.pop(); + assert(B1->mem == 1); + auto* B2 = S.pop(); + assert(B2->mem == 2); + + // Re-insert 1. + S.push(B1); + }); + + // Wait T1 & T2 ready, then release both threads. + while (worker.load() != 2) {} + release.store(true); + + T2.join(); + T1.join(); +} diff --git a/content/2023-09-01-cas-llsc-aba/cas-aba-issue.cc b/content/2023-09-01-cas-llsc-aba/cas-aba-issue.cc new file mode 100644 index 0000000..2482781 --- /dev/null +++ b/content/2023-09-01-cas-llsc-aba/cas-aba-issue.cc @@ -0,0 +1,87 @@ +#include +#include + +#include +#include + +#include + +// -- BLOCK STACK -------------------------------------------------------------- + +struct BlockStack { + struct Block { + Block* next{nullptr}; + unsigned mem{0}; + }; + + void push(Block* blk) { + printf("[%d] push blk->mem=%d\n", gettid(), blk->mem); + + do { + blk->next = m_head; + } while (!m_head.compare_exchange_weak(blk->next, blk)); + } + + Block* pop(int s = 0) { + Block *blk, *next; + do { + blk = m_head; + next = blk->next; + // Simulate thread being interrupted. + if (s) { + sleep(s); + } + } while (!m_head.compare_exchange_weak(blk, next)); + + printf("[%d] pop blk->mem=%d\n", gettid(), blk->mem); + return blk; + } + +private: + std::atomic m_head{nullptr}; +}; + +// -- MAIN --------------------------------------------------------------------- + +int main() { + BlockStack::Block B1, B2, B3; + B1.mem = 1; + B2.mem = 2; + B3.mem = 3; + + // Create free memory block list. + // S: head -> 1 -> 2 -> 3. + BlockStack S; + S.push(&B3); + S.push(&B2); + S.push(&B1); + + auto T1 = std::thread([&S]() { + // Read head=1 & next=2, then get interrupted (simulated by sleep). + // After resuming, T2 already popped 1 & 2 and pushed again 1 (ABA). + // The CAS will set head pointing to 2 and pop block 1. Block 2 is inserted + // again into the free list, even it is not free. + auto* B1 = S.pop(2); + assert(B1->mem == 1); + // Pops 2, which is currently owned by T2, double usage (broken). + auto* B2 = S.pop(); + assert(B2->mem == 2); + }); + + auto T2 = std::thread([&S]() { + // Artificial sleep, such that T1 entered pop and read head & next. + sleep(1); + + // Pop 1 & 2. + auto* B1 = S.pop(); + assert(B1->mem == 1); + auto* B2 = S.pop(); + assert(B2->mem == 2); + + // Re-insert 1. 
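+    // After this push the free list is head -> 1 -> 3 again, while T1 still
+    // holds the stale blk=1 / next=2 pointers it read before the simulated
+    // interruption.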
+ S.push(B1); + }); + + T2.join(); + T1.join(); +} diff --git a/content/2023-09-01-cas-llsc-aba/cas-ptr-generation.cc b/content/2023-09-01-cas-llsc-aba/cas-ptr-generation.cc new file mode 100644 index 0000000..319481b --- /dev/null +++ b/content/2023-09-01-cas-llsc-aba/cas-ptr-generation.cc @@ -0,0 +1,134 @@ +#include +#include +#include + +#include +#include + +#include + +#if __x86_64 != 1 +#error "Example only for x86_64" +#endif + +// Only used because we look at the disassembly. +#define NOINLINE __attribute__((noinline)) + +// -- UTILITIES ---------------------------------------------------------------- + +template +NOINLINE bool cmpxchg16b(T* dest, T oldval, T newval) { + static_assert(sizeof(T) == 16, "Required for CMPXCHG16B"); + static_assert(alignof(T) == 16, "Required for CMPXCHG16B"); + + const auto as_u128 = [](T* ptr) { + return reinterpret_cast(ptr); + }; + // Use legacy __sync atomic builtin instead __atomic builtin because gcc will + // directly emit the 16byte compare exchange instruction instead of emitting a + // library call. This is nice when looking at the disassembly, weeh. + // + // -mcx16 + // This option enables GCC to generate "CMPXCHG16B" instructions in 64-bit + // code to implement compare-and- exchange operations on 16-byte aligned + // 128-bit objects. This is useful for atomic updates of data + // structures exceeding one machine word in size. The compiler uses this + // instruction to implement __sync Builtins. However, for __atomic Builtins + // operating on 128-bit integers, a library call is always used. + return __sync_bool_compare_and_swap(as_u128(dest), *as_u128(&oldval), + *as_u128(&newval)); +} + +// -- BLOCK STACK -------------------------------------------------------------- + +struct BlockStack { + struct Block { + Block* next{nullptr}; + unsigned mem{0}; + }; + + void push(Block* blk) { + printf("[%d] push blk->mem=%d\n", gettid(), blk->mem); + + GenerationPtr old_head, new_head; + do { + old_head = m_head; + blk->next = old_head.ptr; + new_head.ptr = blk; + new_head.gen = old_head.gen + 1; + } while (!cmpxchg16b(&m_head, old_head, new_head)); + } + + Block* pop(int s = 0) { + GenerationPtr old_head, new_head; + do { + old_head = m_head; + new_head.ptr = old_head.ptr->next; + new_head.gen = old_head.gen + 1; + + // Simulate thread being interrupted. + if (s) { + sleep(s); + } + } while (!cmpxchg16b(&m_head, old_head, new_head)); + + printf("[%d] pop blk->mem=%d\n", gettid(), old_head.ptr->mem); + return old_head.ptr; + } + +private: + struct GenerationPtr { + Block* ptr{nullptr}; + uint64_t gen{0}; + } __attribute__((aligned(16))); + + GenerationPtr m_head; +}; + +// -- MAIN --------------------------------------------------------------------- + +int main() { + BlockStack::Block B1, B2, B3; + B1.mem = 1; + B2.mem = 2; + B3.mem = 3; + + // Create free memory block list. + // S: head -> 1 -> 2 -> 3. + BlockStack S; + S.push(&B3); + S.push(&B2); + S.push(&B1); + + auto T1 = std::thread([&S]() { + // Read head=1 & next=2, then get interrupted (simulated by sleep). + // After resuming, T2 already popped 1 & 2 and pushed again 1 (ABA). + // + // The first CAS will fail even block 1 is currently at the beginning of the + // list as the generation of the OLD_ROOT does not match the generation of + // M_ROOT. The CAS loop will retry, read the new M_ROOT, and hence the new + // NEXT pointer pointing to 3 and then successfully update M_ROOT. + auto* B1 = S.pop(2); + assert(B1->mem == 1); + // Pops 3, as it should be. 
+ auto* B3 = S.pop(); + assert(B3->mem == 3); + }); + + auto T2 = std::thread([&S]() { + // Artificial sleep, such that T1 entered pop and read head & next. + sleep(1); + + // Pop 1 & 2. + auto* B1 = S.pop(); + assert(B1->mem == 1); + auto* B2 = S.pop(); + assert(B2->mem == 2); + + // Re-insert 1. + S.push(B1); + }); + + T2.join(); + T1.join(); +} diff --git a/content/2023-09-01-cas-llsc-aba/index.md b/content/2023-09-01-cas-llsc-aba/index.md new file mode 100644 index 0000000..177404f --- /dev/null +++ b/content/2023-09-01-cas-llsc-aba/index.md @@ -0,0 +1,670 @@ ++++ +title = "CAS, ABA and LL/SC" + +[taxonomies] +tags = ["threading", "linux", "arm", "x86", "qemu"] ++++ + +If this zoo of abbreviations does not mean anything to you then "[fly you +fools][fly-you-fools]" as long as you still can :^) + +Jokes aside. I recently had the opportunity to ramp up a new co-worker on the +different semantics of `atomic` instructions (compare and swap `CAS` vs +load-linked store-conditional `LL/SC`), `lock-free` programming and the `ABA` +problem. + +In the realm of `lock-free` programming and the `ABA` problem, I shared a war +story from some years ago when I was debugging a random memory corruption in +our multi-threaded simulator. Sharing that story actually gave the idea to +write this post as reference for potential new co-workers or all the internet +people <3. + +After many hours of debugging the random memory corruption, which *obviously* +did not occur very often, it turned out to be a bug in a custom memory +allocator. To be more precise, in a primitive used by the allocator to manage a +list of free memory blocks. The allocator was used from multiple threads +concurrently and hence `pushing` and `popping` free memory blocks had to be +thread-safe. + +The operations to manage the free block list were implemented with an +`lock-free` algorithm, which we will visit later when analyzing the bug. +Before, let us revisit the fundamental semantics of atomic `compare-and-swap +(CAS)` operations, such that everyone has the same understanding. If you are +already familiar with CAS, feel free to skip ahead to the exploration of the +[lock-free memory block primitive](#lock-free-free-memory-block-primitive). + +## compare-and-swap (CAS) + +The following pseudo code gives a good mental model for an atomic CAS +operation. The described compare-and-swap is executed atomically without giving +any other execution stream the chance to interfere in the middle of this +operation. +```cpp +// bool cas(T* val, T* expected, T desired); +// +// Does the following operation atomically. +// +bool is_expected = (*val == *expected); +if (is_expected) { + *val = desired; +} else { + *expected = *val; +} +return is_expected; +``` + +Below are some concrete examples using the cpp [`std::atomic`][cpp-std-atomic] +type. +```cpp +// bool std::atomic::compare_exchange_weak(T& expected, T desired); + +std::atomic val(42); +int expected = 43; +// val != expected -> expected = val +assert(val.compare_exchange_weak(expected, 1337) == false); +// val == expected -> val = desired +assert(val.compare_exchange_weak(expected, 1337) == true); +assert(val.load() == 1337); +``` + +When talking about atomics we also need to talk about [memory +ordering][cpp-mem-order]. + +Memory ordering concerns the ordering of memory accesses, both atomic and +non-atomic, in relation to atomic operations. It also determines how +side-effects, such as unrelated writes for example, are observed across +threads. 
Essentially, memory ordering sets the rules for optimizers to reorder
instructions and dictates where compilers must insert memory barrier
instructions, ensuring the memory ordering guarantees, in cases where the
underlying hardware's memory model is relatively weak and subject to
instruction reordering.

For the purpose of our discussion, we'll focus on the concept of `sequentially
consistent` ordering, as it is the most intuitive and also the default in C++
when utilizing std::atomic. When dealing with atomic loads and stores, it
provides the following guarantees:
1. `atomic store`: On the current thread no read / write preceding the *store*
   can be reordered after the *store*. Other threads will observe all writes
   before they observe the atomic store.
1. `atomic load`: On the current thread no read / write following the *load*
   can be reordered before the *load*. The current thread observes all writes
   done on other threads which happen before the store to the atomic.

In the example below that means, on *Thread 1* the `M = X` cannot be reordered
after the `A.store(1)`. When *Thread 2* sees the atomic variable change, it is
guaranteed that it observes the write to M on *Thread 1* and hence `print(M)`
would give X.
```
Thread 1                  Thread 2

M = X                     if (A.load() == 1)
A.store(1)                  print(M)
```

An atomic CAS operation performs an atomic load and store, which means that no
read or write can be reordered across the CAS operation in any direction.

With that, we just scratched the surface of the rabbit hole, but it is
sufficient for the remaining discussion. The 2017 cppcon talk from [Fedor Pikus
"C++ atomics, from basic to advanced. What do they really do?"][yt-atomics]
goes into more depth and is highly recommended. I also added some more
[references](#references) at the end of the post.

## `lock-free` free memory block primitive

The code below gives the essence of the primitive to manage the free memory
blocks with the goal of being thread-safe. Essentially it is a linked list of
`BlockStack::Block` objects where the head pointer to the beginning of the list
is swapped atomically.

Details such as how the list is initially filled up or how it grows dynamically
at runtime are left out here. The same goes for any sort of error handling, as
none of this is relevant for the discussion.

```cpp
struct BlockStack {
  struct Block {
    Block* next{nullptr};
    // ..
  };

  void push(Block* blk) {
    do {
      blk->next = m_head;
    } while (!m_head.compare_exchange_weak(blk->next, blk));
  }

  Block* pop() {
    Block* blk;
    do {
      blk = m_head;
      // eg, if (blk == nullptr) expand with new free Blocks.
    } while (!m_head.compare_exchange_weak(blk, blk->next));
    return blk;
  }

private:
  std::atomic<Block*> m_head{nullptr};
};
```
> The `std::atomic::compare_exchange_weak` overload used has an additional
> argument for the memory ordering with the default value of sequential
> consistent. This ensures in the push() method, that when another thread
> observes the new head pointer, the write to *blk->next* before the CAS is
> also visible.

Taking a first look at the `push()` and `pop()` methods, the implementation
looks quite sound.

`push()` inserts the `blk` object at the beginning of the list. That means the
newly inserted block needs to point to the current list head and the list head
pointer must be updated to point to the newly inserted block.
The head pointer is swapped atomically, which only succeeds if no one else has
updated the head pointer in the meantime and it still points to `blk->next`. In
case the CAS was not successful, the insert operation is retried in a loop
until the block is inserted.

`pop()` on the other hand removes the block at the beginning of the list and
sets the second block as the new beginning of the list. This is done by
updating the head pointer to point to the second block entry in the list.
Similar to push, the current head is read once and the head pointer swapped
atomically. This operation is again repeated in a loop until the CAS is
successful.

However, the `pop()` method has a subtle difference compared to the `push()`
method, as it dereferences the current head block, which makes it vulnerable to
the `ABA problem`. We will see in a moment what the ABA problem is and why it
is called that, but first let us slightly re-write the pop method to make it
easier to spot the bug.

```cpp, linenos
Block* pop() {
  Block* blk;
  Block* next;
  do {
    blk = m_head;
    next = blk->next;  // Deref "our" head block.
  } while (!m_head.compare_exchange_weak(blk, next));
  return blk;
}
```

Now let us assume we have two threads *Thread 1* and *Thread 2*. Both threads
access the shared `BlockStack` instance which currently contains three free
blocks (A), (B) and (C).

<img src="list-abc.svg">

*Thread 1* calls pop() and gets interrupted between lines 6 and 7.
- `blk` points to (A)
- `next` points to (B)

In the meantime *Thread 2* performs the following actions:
- pop() -> (A)
- pop() -> (B)
- push((A))

After this, *Thread 2* owns block (B) and the `BlockStack` instance contains
the free blocks (A) and (C).

<img src="list-ac.svg">

Next, *Thread 1* is resumed and continues with the atomic swap of `m_head`. The
swap will succeed as `m_head (A) == blk (A)` is true, however this operation
will re-insert block (B) as a free block since *Thread 1* read `next = (B)`
before being interrupted.

This leaves us with the following state after *Thread 1* returns from the pop
operation.

<img src="list-b.svg">

We can easily see that this state is catastrophic, as the next pop operation
will return block (B), which is currently used and owned by *Thread 2*.
Additionally, block (C) and the rest of the list are leaked.

The example above describes the `ABA problem`: the "inner" state of the block
list is altered by removing (B), but the observable "outer" state is unchanged
because (A) was removed and re-inserted again, hence ABA.

The following program [cas-aba-issue.cc](cas-aba-issue.cc) demonstrates the
above described issue and can be used for further experiments. It can be
compiled as follows.
```
clang++ -o cas-aba-issue cas-aba-issue.cc -latomic
```

So, how can the current implementation be fixed?

The obvious approach would be to rewrite the push() and pop() methods to use a
`lock`.

However, the point of this post is to look at `lock-free` approaches and learn
about the different atomic instructions. Therefore, we will discuss two
approaches which require specific instructions and are hence only applicable if
the underlying hardware supports those instructions.
1. Generational pointers using `double-word CAS`.
2. `Exclusive` accesses using load-linked store-conditional.

## Generational pointers

In this approach the raw head pointer is replaced with a head object that holds
the raw pointer as well as a counter (the `generation`).
The idea is that whenever the head object is swapped, the generation is
increased. In practice, this makes it resilient against the ambiguous raw
pointer in the ABA problem, as in the example discussed above the head objects
would not compare equal due to a different generation.
> Theoretically, a case can be constructed where the ABA problem occurs, if the
> counter overflows, but that is not really relevant in practice.

For the following discussion we assume that we are in an x86_64 environment.

The `GenerationPtr` will be our new head object, holding the raw pointer as
well as the generation counter.
```cpp
struct GenerationPtr {
  Block* ptr{nullptr};
  uint64_t gen{0};
};
```

To be able to atomically compare and swap such an object, the host must support
a double-word CAS, and in this specific case a 16 byte CAS. Most x86_64 hosts
support the [`cmpxchg16b`][cmpxchg16b] instruction, which atomically compares
and swaps 16 bytes when combined with the `lock` prefix.

We create ourselves a helper function for the 16 byte CAS based on the legacy
`__sync_bool_compare_and_swap` builtin, as both gcc and clang directly emit
cmpxchg16b instructions instead of library calls when the `-mcx16` target
option is enabled. This is irrelevant for the discussion, but it makes looking
at the disassembly easier.
```cpp
// bool __sync_bool_compare_and_swap (type *ptr, type oldval, type newval);

template <typename T>
bool cmpxchg16b(T* dest, T oldval, T newval) {
  static_assert(sizeof(T) == 16, "Required for cmpxchg16b");
  static_assert(alignof(T) == 16, "Required for cmpxchg16b");

  const auto as_u128 = [](T* ptr) {
    return reinterpret_cast<__uint128_t*>(ptr);
  };
  return __sync_bool_compare_and_swap(as_u128(dest), *as_u128(&oldval),
                                      *as_u128(&newval));
}
```

The codegen with clang 15.0.7 for this function gives the following result on
my machine.

```asm
; generated by clang version 15.0.7 with -mcx16

00000000000016e0 <bool cmpxchg16b<BlockStack::GenerationPtr>(BlockStack::GenerationPtr* dest,
                                                             BlockStack::GenerationPtr oldval,
                                                             BlockStack::GenerationPtr newval)>:
  ; Per SystemV ABI
  ;   rdi = dest ptr
  ;   rsi = oldval.ptr
  ;   rdx = oldval.gen
  ;   rcx = newval.ptr
  ;   r8  = newval.gen
  16e0: push   rbx
  16e1: mov    rbx,rcx          ; shuffle args for cmpxchg16b
  16e4: mov    rax,rsi          ; shuffle args for cmpxchg16b
  16e7: mov    rcx,r8           ; shuffle args for cmpxchg16b
  ; CMPXCHG16B
  ;   compare rdx:rax (oldval) with [rdi] (dest*)
  ;   if equal then load rcx:rbx (newval) into [rdi] (dest*)
  16ea: lock cmpxchg16b OWORD PTR [rdi]
  16ef: sete   al
  16f2: pop    rbx
  16f3: ret
```

With that we can update the type of `BlockStack::m_head` to GenerationPtr and
adapt the push() and pop() methods.

```cpp, linenos
// BlockStack ..

void push(Block* blk) {
  GenerationPtr old_head, new_head;
  do {
    old_head = m_head;
    blk->next = old_head.ptr;
    new_head.ptr = blk;
    new_head.gen = old_head.gen + 1;
  } while (!cmpxchg16b(&m_head, old_head, new_head));
}

Block* pop() {
  GenerationPtr old_head, new_head;
  do {
    old_head = m_head;
    new_head.ptr = old_head.ptr->next;
    new_head.gen = old_head.gen + 1;
  } while (!cmpxchg16b(&m_head, old_head, new_head));
  return old_head.ptr;
}
```
> For the purpose of this discussion we assume that the
> __sync_bool_compare_and_swap builtin acts as a full barrier. In practice the
> __sync builtins should not be used anymore; instead, the newer __atomic
> builtins, which allow specifying the memory ordering, should be used.

A note about the loads of m_head in lines 6 and 16.
Those loads are not atomic and the value of m_head could be updated from
another thread in the middle of loading the values. However, if we read an
inconsistent m_head, the CAS will fail and we will retry our operation.

The following program [cas-ptr-generation.cc](cas-ptr-generation.cc) provides
the full example. When run, it demonstrates how the _generation pointer_
prevents the ABA problem. It can be compiled as follows.
```
clang++ -o cas-ptr-generation cas-ptr-generation.cc -latomic -mcx16
```

## Load linked store conditional

So far we looked at atomic CAS instructions, which have read-modify-write
semantics and are typically found in CISC ISAs. RISC ISAs on the other hand
usually provide load-linked (LL) store-conditional (SC) instruction pairs to
implement atomic accesses. Compared to the compare-and-swap instruction, LL/SC
are two distinct instructions. The LL instruction loads the current value and
marks the memory region for exclusive access, and the SC instruction only
succeeds and updates the memory location if the memory location is still marked
as exclusive at the time of the store-conditional. If there was an update to
the memory location between the LL/SC pair, the exclusive marker is removed.

The pseudo code below gives an example of how atomic operations are typically
implemented with LL/SC instructions. The example shows an atomic increment of a
location in memory.
```c
/// Atomically increment *MEM and return OLD value.
uint64_t atomic_fetch_inc(uint64_t* mem) {
  uint64_t old;
  bool ok;

  do {
    old = ll(mem);           // Load current value and mark for ex access.
    ok  = sc(mem, old + 1);  // Try to store incremented value.
  } while (!ok);

  return old;
}
```

For the remaining discussion we will focus on the ARM architecture and use the
[A64 instruction set][a64-insn], as I have some ARM silicon at home to run the
examples on.

We can take a look into the [ARM ARM (Architecture Reference
Manual)][arm-arm-a] for the A-profile to find the description of the LL/SC
instructions in the chapter *B2.9 Synchronization and semaphores*, where
`Load-Exclusive` refers to load-linked and `Store-Exclusive` refers to
store-conditional.
> The model for the use of a Load-Exclusive/Store-Exclusive instruction pair
> accessing a non-aborting memory address x is:
> - The Load-Exclusive instruction reads a value from memory address x.
> - The corresponding Store-Exclusive instruction succeeds in writing back to
>   memory address x only if no other observer, process, or thread has
>   performed a more recent store to address x. The Store-Exclusive instruction
>   returns a status bit that indicates whether the Write Memory succeeded.

The A64 instruction set supports multiple LL/SC instruction pairs for different
operand sizes and with different memory ordering semantics. We will make use of
the [LDXR][a64-ldxr] and [STXR][a64-stxr] instructions, which are exclusive
load register and store register instructions without any memory ordering
semantics. We will use explicit `memory barriers` rather than instruction pairs
with `release-acquire` memory ordering semantics, as this post does not aim to
discuss the different memory orderings, but rather to give an overview of
different atomic instructions.

Table *B2-3 Synchronization primitives and associated instruction, A64
instruction set* in the ARM ARM has a full overview of all the available LL/SC
instruction pairs in the A64 instruction set.
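As an aside, a CAS primitive itself can be built from the same LL/SC pattern,
which is why LL/SC is often considered the more general primitive. Below is a
minimal pseudo code sketch, reusing the hypothetical `ll()` / `sc()` helpers
from the example above (these are not real A64 instructions or an actual
library API).
```c
/// Atomically compare *MEM against EXPECTED and store DESIRED on a match.
/// Returns true if the swap happened (pseudo code, hypothetical ll()/sc()).
bool atomic_cas(uint64_t* mem, uint64_t expected, uint64_t desired) {
  do {
    if (ll(mem) != expected) {
      // A real implementation would also clear the exclusive marker here
      // (eg with CLREX on A64) before bailing out.
      return false;
    }
    // Retry if the exclusive marker was lost between ll() and sc().
  } while (!sc(mem, desired));
  return true;
}
```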
The following code gives a starting point to play with the LL/SC instructions
on ARM, by providing inline assembly wrapper functions to emit the LDXR and
STXR instructions.

```cpp
/// Load exclusive register.
inline uint64_t ldxr(uint64_t* addr) {
  uint64_t ret;
  asm volatile("ldxr %0, %1" : "=r"(ret) : "Q"(*addr) : "memory");
  return ret;
}

/// Store exclusive register.
inline bool stxr(uint64_t* addr, uint64_t val) {
  uint32_t ret;
  asm volatile("stxr %w0, %2, %1" : "=&r"(ret), "=Q"(*addr) : "r"(val) : "memory");
  return ret == 0;
}

int main() {
  uint64_t mem = 42;

  auto T1 = std::thread([&mem]() {
    // Write to exclusive location (does clear exclusive monitor).
    mem = 2222;
    // Full memory barrier.
    __sync_synchronize();
  });

  uint64_t old = ldxr(&mem);

  // Some artificial delay w/o an explicit context switch (eg syscall) as that
  // would clear the exclusive monitor, though it can still be interrupted by
  // the scheduler.
  // Delay is "tuned" for my ARM device.
  for (int i = 0; i < (1 << 14); ++i) {
    asm volatile("nop");
  }

  // Full memory barrier.
  __sync_synchronize();

  bool ok = stxr(&mem, 1111);
  printf("old: %lu -> mem: %lu | ok: %d\n", old, mem, ok);

  T1.join();
  return ok ? 0 : 1;
}
```

The full source code with additional comments is available here
[a64-basic-llsc.cc](a64-basic-llsc.cc). It can be compiled and run natively on
an ARM system as shown below. We run the program in a loop until we hit the
case where T1 wins the race and does the memory store in between the LDXR/STXR
pair on the main thread.
```
> g++ -o a64-basic-llsc a64-basic-llsc.cc -lpthread
> while ./a64-basic-llsc; do : ; done
...
old: 42 -> mem: 1111 | ok: 1
old: 42 -> mem: 1111 | ok: 1
old: 42 -> mem: 1111 | ok: 1
old: 42 -> mem: 2222 | ok: 0
```

With those basics covered we can also implement the LL/SC version of the
`BlockStack`. The push() and pop() methods now leverage the exclusive load and
store instructions to atomically update the `m_head` pointer.

```cpp
struct BlockStack {
  struct Block {
    Block* next{nullptr};
    // ..
  };

  void push(Block* blk) {
    do {
      blk->next = load_head_ex();
    } while (!store_head_ex(blk));
  }

  Block* pop() {
    Block *blk, *next;
    do {
      blk = load_head_ex();
      next = blk->next;
    } while (!store_head_ex(next));
    return blk;
  }

private:
  // Assume 64bit ptr size for ldxr/stxr instructions.
  static_assert(sizeof(void*) == sizeof(uint64_t), "Ptr size mismatch!");

  Block* load_head_ex() const {
    Block* blk = (Block*)ldxr((uint64_t*)&m_head);
    mem_barrier();
    return blk;
  }

  bool store_head_ex(Block* blk) {
    const bool success = stxr((uint64_t*)&m_head, (uint64_t)blk);
    mem_barrier();
    return success;
  }

  Block* m_head{nullptr};
};
```

The full version of this implementation with additional comments as explanation
is available here [a64-llsc.cc](a64-llsc.cc). It can be compiled and run as
follows on an ARM system. Depending on the timing, the example must be run
multiple times to pass the assertions, as all the artificial delays hacked into
the implementation are tuned according to my ARM device.
```
g++ -o a64-llsc a64-llsc.cc -lpthread
./a64-llsc
```

In fact, passing the assertions in the example above does not guarantee that we
drive the two threads and the `BlockStack` object exactly into the state which
triggered the ABA problem in the CAS example.
In the CAS +example we could use a _sleep()_ to control the exact order of events, but here +we can neither use any _sleep()s_ nor any _atomics_ to synchronize the threads +execution, as any of those would most certainly lead to clearing the exclusive +monitor of the _ldxr_ instruction to read the head pointer. +All we can do is to add some _nop_ delays, hoping the exclusive monitor is not +cleared due to any of those artificial hacks we introduced (or some +interruption by the scheduler), praise the order of events is exactly as we +want them to be and feel happy when the assertions pass. + +The sources of the example are good to explain how the stronger guarantees of +the LL/SC instructions prevent the ABA problem. But in practice, it is really +hard to prove that the program ran exactly as we wanted it to :^) + +## The END + +Personally, I would say that in 99% of the cases, nobody will have to write +code as we did in this post. Of course, there are always exceptions. + +Always prefer to write portable and generic algorithms, and start with a lock +rather than a lock-free algorithm. And most important, measure and profile your +code and do not do premature optimizations and use "cool" stuff just for the +sake of using it. Use the right tool for the job and seek for the simplest +solution. + +However, we specifically wanted to write code like this to introduce CAS vs +LL/SC atomics and the ABA problem in multi-threaded lock-free algorithms. + +Any corrections, improvements or comments are happily welcome, for that find my +mail on [memzero.de][memzero] + +## Appendix: QEMU ARM user emulation + +Unfortunately, we can not use the `qemu-aarch64` userspace emulator for the +experiments here, as for the ARM guest target, QEMU implements the load / store +exclusive instructions not with the full architectural semantics, but rather a +simplification which seems sufficient for the typical guest images. + +When QEMU emulates a _load exclusive_ instruction, it saves the guest address +as well as the data loaded from the guest memory in the cpu state +[CPUARMState::{exclusive_addr, exclusive_val}][qemu-a64-cpu-state]. On a _store +exclusive_, the guest address of the store is compared with the saved +_exclusive_addr_ and the data to be written with the saved _exclusive_val_. +Iff both compare true, the _store exclusive_ is carried out successfully. + +The following pseudo code taken from a comment in +[QEMU:ranslate-a64.c][qemu-a64-ex] shows how QEMU emits _store exclusive_ +instructions where `env` refers to the cpu state. +```c +/// gen_store_exclusive(..) +if (env->exclusive_addr == addr && env->exclusive_val == [addr]) { + [addr] = {Rt}; + {Rd} = 0; +} else { + {Rd} = 1; +} +env->exclusive_addr = -1; +``` + +The *exclusive_val* and *exclusive_addr* fields act as _exclusive monitor_. +However, they are only maintained on load / store exclusive instructions and +not in normal store instructions and this is where QEMU weakens the semantics +of the exclusive instructions. A sequence of normal write instructions can +invalidate and restore the state of the "_exclusive monitor_" in the qemu cpu +state. + +With everything learned about the ABA problem in this post, one can now +understand why QEMU is a bad idea for running our experiments. QEMU will just +trigger the ABA problem even with our [a64-llsc.cc](a64-llsc.cc) example using +load / store exclusive instructions. 
+ +## Abbreviations +- CAS: compare-and-swap +- LL/SC: load-linked store-conditional +- CISC: complex instruction set computer +- RISC: reduced instruction set computer + +## Source files +- [cas-aba-issue.cc](cas-aba-issue.cc) +- [cas-ptr-generation.cc](cas-ptr-generation.cc) +- [a64-basic-llsc.cc](a64-basic-llsc.cc) +- [a64-llsc.cc](a64-llsc.cc) + +## References +- [std::atomic][cpp-std-atomic] +- [cpp memory ordering][cpp-mem-order] +- [Fedor Pikus "C++ atomics, from basic to advanced. What do they really do?"][yt-atomics] +- [Memory Consistency Models: A Tutorial][mem-order-tutorial] +- [gcc _sync_ builtins][gcc-sync] +- [gcc inline asm: output constraints][gcc-asm-output] +- [gcc inline asm: machine constraints][gcc-asm-machine] +- [x86_64: cmpxchg16b instruction][cmpxchg16b] +- [ARM architecture reference manual A-profile][arm-arm-a] +- [ARM exclusive monitor][a64-ex-monitor] +- [ARM A64: instruction manual][a64-insn] +- [ARM A64: stxr instruction][a64-stxr] +- [ARM A64: ldxr instruction][a64-ldxr] +- [arm64: atomics: fix use of acquire + release for full barrier semantics][linux-a64-patch] +- [QEMU A64 load / store exclusive][qemu-a64-ex] +- [QEMU A64 CPUARMState::{exclusive_addr, exclusive_val}][qemu-a64-cpu-state] + +[fly-you-fools]: https://media.giphy.com/media/v1.Y2lkPTc5MGI3NjExMWJpdjFmMGlmdjcyaWZqM21yaXlzb3JlamN6NGJjZmwzdHkxOWlieCZlcD12MV9pbnRlcm5hbF9naWZfYnlfaWQmY3Q9Zw/74ExXfqsbJ8l2/giphy.gif +[yt-atomics]: https://youtu.be/ZQFzMfHIxng?t=2596 +[cpp-mem-order]: https://en.cppreference.com/w/cpp/atomic/memory_order +[cpp-std-atomic]: https://en.cppreference.com/w/cpp/atomic/atomic +[gcc-sync]: https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html +[gcc-asm-machine]: https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html +[gcc-asm-output]: https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Output-Operands +[cmpxchg16b]: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b +[a64-insn]: https://developer.arm.com/documentation/ddi0602/latest/?lang=en +[a64-stxr]: https://developer.arm.com/documentation/ddi0596/latest/Base-Instructions/STXR--Store-Exclusive-Register-?lang=en +[a64-ldxr]: https://developer.arm.com/documentation/ddi0596/latest/Base-Instructions/LDXR--Load-Exclusive-Register-?lang=en +[a64-ex-monitor]: https://developer.arm.com/documentation/100934/0100/Exclusive-monitors?lang=en +[arm-arm-a]: https://developer.arm.com/documentation/ddi0487/latest/ +[memzero]: https://memzero.de +[qemu-a64-ex]: https://elixir.bootlin.com/qemu/v8.1.2/source/target/arm/tcg/translate-a64.c#L2411 +[qemu-a64-cpu-state]: https://elixir.bootlin.com/qemu/v8.1.2/source/target/arm/cpu.h#L687 +[mem-order-tutorial]: https://www.cs.utexas.edu/~bornholt/post/memory-models.html +[linux-a64-patch]: https://patchwork.kernel.org/project/linux-arm-kernel/patch/1391516953-14541-1-git-send-email-will.deacon@arm.com/ diff --git a/content/2023-09-01-cas-llsc-aba/list-abc.drawio b/content/2023-09-01-cas-llsc-aba/list-abc.drawio new file mode 100644 index 0000000..a099e1a --- /dev/null +++ b/content/2023-09-01-cas-llsc-aba/list-abc.drawio @@ -0,0 +1,61 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/content/2023-09-01-cas-llsc-aba/list-abc.svg b/content/2023-09-01-cas-llsc-aba/list-abc.svg new file mode 100644 index 0000000..a80e38b --- /dev/null +++ b/content/2023-09-01-cas-llsc-aba/list-abc.svg @@ -0,0 +1,4 @@ + + + +
A
A
next
next
B
B
next
next
C
C
next
next
m_head
m_head
..
..
next
next
Text is not SVG - cannot display
\ No newline at end of file diff --git a/content/2023-09-01-cas-llsc-aba/list-ac.drawio b/content/2023-09-01-cas-llsc-aba/list-ac.drawio new file mode 100644 index 0000000..859b9e1 --- /dev/null +++ b/content/2023-09-01-cas-llsc-aba/list-ac.drawio @@ -0,0 +1,58 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/content/2023-09-01-cas-llsc-aba/list-ac.svg b/content/2023-09-01-cas-llsc-aba/list-ac.svg new file mode 100644 index 0000000..9371124 --- /dev/null +++ b/content/2023-09-01-cas-llsc-aba/list-ac.svg @@ -0,0 +1,4 @@ + + + +
A
A
next
next
C
C
next
next
m_head
m_head
..
..
next
next
B
B
next
next
Thread 2
Thread 2
Text is not SVG - cannot display
\ No newline at end of file diff --git a/content/2023-09-01-cas-llsc-aba/list-b.drawio b/content/2023-09-01-cas-llsc-aba/list-b.drawio new file mode 100644 index 0000000..13a14e2 --- /dev/null +++ b/content/2023-09-01-cas-llsc-aba/list-b.drawio @@ -0,0 +1,55 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/content/2023-09-01-cas-llsc-aba/list-b.svg b/content/2023-09-01-cas-llsc-aba/list-b.svg new file mode 100644 index 0000000..2bd0020 --- /dev/null +++ b/content/2023-09-01-cas-llsc-aba/list-b.svg @@ -0,0 +1,4 @@ + + + +
C
C
next
next
m_head
m_head
..
..
next
next
B
B
next
next
Thread 2
Thread 2
A
A
next
next
Thread 1
Thread 1
Text is not SVG - cannot display
\ No newline at end of file -- cgit v1.2.3