Diffstat (limited to 'content/2023-09-01-cas-llsc-aba/a64-llsc.cc')
-rw-r--r--  content/2023-09-01-cas-llsc-aba/a64-llsc.cc  176
1 file changed, 176 insertions, 0 deletions
diff --git a/content/2023-09-01-cas-llsc-aba/a64-llsc.cc b/content/2023-09-01-cas-llsc-aba/a64-llsc.cc
new file mode 100644
index 0000000..37563b5
--- /dev/null
+++ b/content/2023-09-01-cas-llsc-aba/a64-llsc.cc
@@ -0,0 +1,176 @@
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+
+#include <atomic>
+#include <thread>
+
+#include <unistd.h>
+
+#ifndef __aarch64__
+#error "This must be compiled for arm64!"
+#endif
+
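+// Build & run on an arm64 Linux host; an illustrative invocation would be:
+//   g++ -O2 -pthread a64-llsc.cc -o a64-llsc && ./a64-llsc
+//
+// NOTE: gettid() is exposed via <unistd.h> only since glibc 2.30.
+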
+// LDXR: Load exclusive register wrapper.
+//
+// Reads from ADDR and marks the address for exclusive access (arms the
+// exclusive monitor).
+//
+// Returns the value read from memory.
+//
+// NOTE: No memory ordering semantics.
+//
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDXR--Load-Exclusive-Register-?lang=en
+uint64_t ldxr(uint64_t* addr) {
+ uint64_t ret;
+ asm volatile("ldxr %0, %1" : "=r"(ret) : "Q"(*addr) : "memory");
+ return ret;
+}
+
+// STXR: Store exclusive register wrapper.
+//
+// Conditionally writes VAL to ADDR if ADDR is still marked for exclusive
+// access by a previous exclusive load (e.g., LDXR).
+//
+// Returns true if the write was successful, false otherwise (the raw STXR
+// status is 0 on success, 1 on failure).
+//
+// NOTE: No memory ordering semantics.
+//
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STXR--Store-Exclusive-Register-?lang=en
+bool stxr(uint64_t* addr, uint64_t val) {
+ uint32_t ret;
+ asm volatile("stxr %w0, %2, %1"
+ : "=&r"(ret), "=Q"(*addr)
+ : "r"(val)
+ : "memory");
+ return ret == 0;
+}
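+
+// A minimal sketch of the generic LL/SC retry loop built from the two
+// wrappers above (fetch_add_ex is illustrative and not used below): keep
+// retrying the exclusive store until no other observer touched the
+// monitored address between the load and the store.
+uint64_t fetch_add_ex(uint64_t* addr, uint64_t inc) {
+  uint64_t old;
+  do {
+    old = ldxr(addr);  // Arm the exclusive monitor.
+  } while (!stxr(addr, old + inc));  // Fails if the monitor was cleared.
+  return old;
+}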
+
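+// DMB ISH: full data memory barrier over the inner-shareable domain,
+// ordering the exclusive accesses against surrounding loads and stores.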
+inline void mem_barrier() {
+ asm volatile("dmb ish" : /* out */ : /* in */ : "memory");
+}
+
+// -- BLOCK STACK --------------------------------------------------------------
+
+struct BlockStack {
+ struct Block {
+ Block* next{nullptr};
+ unsigned mem{0};
+ };
+
+ void push(Block* blk) {
+ printf("[%d] push blk->mem=%d\n", gettid(), blk->mem);
+
+ do {
+ blk->next = load_head_ex();
+ } while (!store_head_ex(blk));
+ }
+
+ Block* pop(bool delay = false) {
+ Block *blk, *next;
+ do {
+ blk = load_head_ex();
+ next = blk->next;
+
+ // Spin for some time to give the other thread a chance to replace the
+ // head. Delay somewhat tuned for my ARM device.
+ for (int i = 0; delay && (i < (1 << 16)); ++i) {
+ asm volatile("nop");
+ }
+ delay = false;
+ } while (!store_head_ex(next));
+
+ printf("[%d] pop blk->mem=%d\n", gettid(), blk->mem);
+ return blk;
+ }
+
+private:
+ Block* load_head_ex() const {
+ static_assert(sizeof(void*) == sizeof(uint64_t),
+ "Pointer size miss-match!");
+
+ Block* blk = (Block*)ldxr((uint64_t*)&m_head);
+ mem_barrier();
+ return blk;
+ }
+
+ bool store_head_ex(Block* blk) {
+ static_assert(sizeof(void*) == sizeof(uint64_t),
+ "Pointer size miss-match!");
+
+ const bool success = stxr((uint64_t*)&m_head, (uint64_t)blk);
+ mem_barrier();
+ return success;
+ }
+
+ Block* m_head{nullptr};
+};
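+
+// For contrast, a sketch of the same pop built on compare-and-swap
+// (illustrative only; pop_cas is not used below). CAS compares only the
+// *value* of the head: if another thread pops 1 & 2 and pushes 1 back
+// between the load and the CAS, the head still equals the stale blk, the
+// CAS succeeds, and the freed block 2 becomes the new head. This is the
+// ABA problem that the exclusive monitor above side-steps.
+BlockStack::Block* pop_cas(std::atomic<BlockStack::Block*>& head) {
+  BlockStack::Block *blk, *next;
+  do {
+    blk = head.load();
+    next = blk->next;  // May already be stale when the CAS executes.
+  } while (!head.compare_exchange_weak(blk, next));
+  return blk;
+}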
+
+int main() {
+ BlockStack::Block B1, B2, B3;
+ B1.mem = 1;
+ B2.mem = 2;
+ B3.mem = 3;
+
+ // Create free memory block list.
+ // S: head -> 1 -> 2 -> 3.
+ BlockStack S;
+ S.push(&B3);
+ S.push(&B2);
+ S.push(&B1);
+
+ // Atomics to coordinate when threads are being released.
+ std::atomic<size_t> worker(0);
+ std::atomic<bool> release(false);
+
+ auto T1 = std::thread([&S, &worker, &release]() {
+ // Notify thread T1 ready, and wait for release.
+ worker.fetch_add(1);
+ while (!release.load()) {}
+
+ // Read head=1 & next=2, and add some artificial delay by executing NOPs.
+ // This gives T2 time to pop 1 & 2 and push 1 back, which would trigger
+ // the ABA problem with a CAS instruction (cf. the pop_cas sketch above).
+ //
+ // However, with the LL/SC atomics, T1's exclusive monitor (armed by the
+ // exclusive load of the head) is cleared when T2 updates the head.
+ // Therefore the exclusive store after the NOP delay fails, and the loop
+ // reloads the new head pointer, circumventing the ABA problem.
+ //
+ // NOTE: This does not work under qemu user emulation (aarch64), as qemu
+ // does not implement the strong LL/SC semantics.
+ // https://elixir.bootlin.com/qemu/v8.1.2/source/target/arm/tcg/translate-a64.c#L2411
+ auto* B1 = S.pop(true /* nops delay */);
+ assert(B1->mem == 1);
+ // Pops 3, correct, no ABA problem.
+ auto* B2 = S.pop();
+ assert(B2->mem == 3);
+ });
+
+ auto T2 = std::thread([&S, &worker, &release]() {
+ // Notify thread T2 ready, and wait for release.
+ worker.fetch_add(1);
+ while (!release.load()) {}
+
+ // Artificial delay so that T1 has already entered pop and read
+ // head & next. Delay somewhat tuned for my ARM device.
+ for (int i = 0; i < (1 << 8); ++i) {
+ asm volatile("nop");
+ }
+
+ // Pop 1 & 2.
+ auto* B1 = S.pop();
+ assert(B1->mem == 1);
+ auto* B2 = S.pop();
+ assert(B2->mem == 2);
+
+ // Re-insert 1.
+ S.push(B1);
+ });
+
+ // Wait T1 & T2 ready, then release both threads.
+ while (worker.load() != 2) {}
+ release.store(true);
+
+ T2.join();
+ T1.join();
+}