Diffstat (limited to 'content/2023-09-01-cas-llsc-aba/a64-llsc.cc')
-rw-r--r-- | content/2023-09-01-cas-llsc-aba/a64-llsc.cc | 176
1 files changed, 176 insertions, 0 deletions
diff --git a/content/2023-09-01-cas-llsc-aba/a64-llsc.cc b/content/2023-09-01-cas-llsc-aba/a64-llsc.cc
new file mode 100644
index 0000000..37563b5
--- /dev/null
+++ b/content/2023-09-01-cas-llsc-aba/a64-llsc.cc
@@ -0,0 +1,176 @@
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+
+#include <atomic>
+#include <thread>
+
+#include <unistd.h>
+
+#ifndef __aarch64__
+#error "This must be compiled for arm64!"
+#endif
+
+// LDXR: Load exclusive register wrapper.
+//
+// Read from ADDR and mark the address for exclusive access (exclusive monitor).
+//
+// Return the value read from memory.
+//
+// NOTE: No memory ordering semantics.
+//
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDXR--Load-Exclusive-Register-?lang=en
+uint64_t ldxr(uint64_t* addr) {
+  uint64_t ret;
+  asm volatile("ldxr %0, %1" : "=r"(ret) : "Q"(*addr) : "memory");
+  return ret;
+}
+
+// STXR: Store exclusive register wrapper.
+//
+// Conditionally write VAL to ADDR if ADDR is marked for exclusive access by a
+// previous exclusive load (eg LDXR).
+//
+// Return true if the write was successful, false otherwise.
+//
+// NOTE: No memory ordering semantics.
+//
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STXR--Store-Exclusive-Register-?lang=en
+bool stxr(uint64_t* addr, uint64_t val) {
+  uint32_t ret;
+  asm volatile("stxr %w0, %2, %1"
+               : "=&r"(ret), "=Q"(*addr)
+               : "r"(val)
+               : "memory");
+  return ret == 0;
+}
+
+inline void mem_barrier() {
+  asm volatile("dmb ish" : /* out */ : /* in */ : "memory");
+}
+
+// -- BLOCK STACK --------------------------------------------------------------
+
+struct BlockStack {
+  struct Block {
+    Block* next{nullptr};
+    unsigned mem{0};
+  };
+
+  void push(Block* blk) {
+    printf("[%d] push blk->mem=%d\n", gettid(), blk->mem);
+
+    do {
+      blk->next = load_head_ex();
+    } while (!store_head_ex(blk));
+  }
+
+  Block* pop(bool delay = false) {
+    Block *blk, *next;
+    do {
+      blk = load_head_ex();
+      next = blk->next;
+
+      // Spin for some time to give the other thread a chance to replace the
+      // head. Delay somewhat tuned for my ARM device.
+      for (int i = 0; delay && (i < (1 << 16)); ++i) {
+        asm volatile("nop");
+      }
+      delay = false;
+    } while (!store_head_ex(next));
+
+    printf("[%d] pop blk->mem=%d\n", gettid(), blk->mem);
+    return blk;
+  }
+
+private:
+  Block* load_head_ex() const {
+    static_assert(sizeof(void*) == sizeof(uint64_t),
+                  "Pointer size mismatch!");
+
+    Block* blk = (Block*)ldxr((uint64_t*)&m_head);
+    mem_barrier();
+    return blk;
+  }
+
+  bool store_head_ex(Block* blk) {
+    static_assert(sizeof(void*) == sizeof(uint64_t),
+                  "Pointer size mismatch!");
+
+    const bool success = stxr((uint64_t*)&m_head, (uint64_t)blk);
+    mem_barrier();
+    return success;
+  }
+
+  Block* m_head{nullptr};
+};
+
+int main() {
+  BlockStack::Block B1, B2, B3;
+  B1.mem = 1;
+  B2.mem = 2;
+  B3.mem = 3;
+
+  // Create free memory block list.
+  // S: head -> 1 -> 2 -> 3.
+  BlockStack S;
+  S.push(&B3);
+  S.push(&B2);
+  S.push(&B1);
+
+  // Atomics to coordinate when the threads are released.
+  std::atomic<size_t> worker(0);
+  std::atomic<bool> release(false);
+
+  auto T1 = std::thread([&S, &worker, &release]() {
+    // Notify thread T1 ready, and wait for release.
+    worker.fetch_add(1);
+    while (!release.load()) {}
+
+    // Read head=1 & next=2, and add some artificial delay by executing NOPs.
+    // This gives T2 some time to pop 1 & 2 and push back 1, which would
+    // trigger the ABA problem with a CAS instruction.
+    //
+    // However, with the LL/SC atomics, the exclusive monitor armed by T1's
+    // exclusive load of the head is cleared once T2 updates the head.
+    // Therefore the exclusive store after the NOP delay fails, the loop
+    // retries and loads the new head pointer, circumventing the ABA problem.
+    //
+    // NOTE: This does not work under qemu user emulation (aarch64), as qemu
+    // does not implement the strong LL/SC semantics.
+    // https://elixir.bootlin.com/qemu/v8.1.2/source/target/arm/tcg/translate-a64.c#L2411
+    auto* B1 = S.pop(true /* nops delay */);
+    assert(B1->mem == 1);
+    // Pops 3, correct, no ABA problem.
+    auto* B2 = S.pop();
+    assert(B2->mem == 3);
+  });
+
+  auto T2 = std::thread([&S, &worker, &release]() {
+    // Notify thread T2 ready, and wait for release.
+    worker.fetch_add(1);
+    while (!release.load()) {}
+
+    // Artificial delay, so that T1 has entered pop and read head & next.
+    // Delay somewhat tuned for my ARM device.
+    for (int i = 0; i < (1 << 8); ++i) {
+      asm volatile("nop");
+    }
+
+    // Pop 1 & 2.
+    auto* B1 = S.pop();
+    assert(B1->mem == 1);
+    auto* B2 = S.pop();
+    assert(B2->mem == 2);
+
+    // Re-insert 1.
+    S.push(B1);
+  });
+
+  // Wait until T1 & T2 are ready, then release both threads.
+  while (worker.load() != 2) {}
+  release.store(true);
+
+  T2.join();
+  T1.join();
+}
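For reference (not part of the diff above), a minimal usage sketch of the ldxr()/stxr() wrappers defined in the file: an LL/SC based fetch-add. The store-exclusive fails, and the loop retries, whenever the exclusive monitor was cleared between the paired exclusive load and store.

// Sketch only, assuming the ldxr()/stxr() wrappers from a64-llsc.cc above.
// Returns the value at ADDR before the increment, like __atomic_fetch_add.
uint64_t fetch_add_llsc(uint64_t* addr, uint64_t inc) {
  uint64_t old;
  do {
    old = ldxr(addr);        // arm the exclusive monitor and read
  } while (!stxr(addr, old + inc));  // retry if the monitor was cleared
  return old;
}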
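For contrast, the CAS-based pop that the comments refer to could look roughly like the sketch below; CasBlockStack is a made-up name for illustration, using std::atomic<Block*>::compare_exchange_weak. Because the CAS only checks that the head pointer value is unchanged, T2 popping 1 & 2 and pushing 1 back lets the CAS succeed with a stale next pointer; this is the ABA problem the LL/SC version avoids.

#include <atomic>

// Illustrative sketch only (not part of this commit): an ABA-prone CAS pop.
struct CasBlockStack {
  struct Block {
    Block* next{nullptr};
    unsigned mem{0};
  };

  Block* pop() {
    Block* blk = m_head.load();
    Block* next;
    do {
      next = blk->next;
      // Succeeds as long as m_head still equals blk, even if the blocks
      // below it changed in the meantime (ABA).
    } while (!m_head.compare_exchange_weak(blk, next));
    return blk;
  }

  std::atomic<Block*> m_head{nullptr};
};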