Diffstat (limited to 'content/2023-09-01-cas-llsc-aba/a64-llsc.cc')
-rw-r--r-- | content/2023-09-01-cas-llsc-aba/a64-llsc.cc | 176
1 files changed, 176 insertions, 0 deletions
diff --git a/content/2023-09-01-cas-llsc-aba/a64-llsc.cc b/content/2023-09-01-cas-llsc-aba/a64-llsc.cc
new file mode 100644
index 0000000..37563b5
--- /dev/null
+++ b/content/2023-09-01-cas-llsc-aba/a64-llsc.cc
@@ -0,0 +1,176 @@
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+
+#include <atomic>
+#include <thread>
+
+#include <unistd.h>
+
+#ifndef __aarch64__
+#error "This must be compiled for arm64!"
+#endif
+
+// LDXR: Load exclusive register wrapper.
+//
+// Read from ADDR and mark the address for exclusive access (exclusive monitor).
+//
+// Return the value read from memory.
+//
+// NOTE: No memory ordering semantics.
+//
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDXR--Load-Exclusive-Register-?lang=en
+uint64_t ldxr(uint64_t* addr) {
+  uint64_t ret;
+  asm volatile("ldxr %0, %1" : "=r"(ret) : "Q"(*addr) : "memory");
+  return ret;
+}
+
+// STXR: Store exclusive register wrapper.
+//
+// Conditionally write VAL to ADDR if ADDR is marked for exclusive access by a
+// previous exclusive load (eg LDXR).
+//
+// Return true if the write was successful, false otherwise.
+//
+// NOTE: No memory ordering semantics.
+//
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STXR--Store-Exclusive-Register-?lang=en
+bool stxr(uint64_t* addr, uint64_t val) {
+  uint32_t ret;
+  asm volatile("stxr %w0, %2, %1"
+               : "=&r"(ret), "=Q"(*addr)
+               : "r"(val)
+               : "memory");
+  return ret == 0;
+}
+
+inline void mem_barrier() {
+  asm volatile("dmb ish" : /* out */ : /* in */ : "memory");
+}
+
+// -- BLOCK STACK --------------------------------------------------------------
+
+struct BlockStack {
+  struct Block {
+    Block* next{nullptr};
+    unsigned mem{0};
+  };
+
+  void push(Block* blk) {
+    printf("[%d] push blk->mem=%d\n", gettid(), blk->mem);
+
+    do {
+      blk->next = load_head_ex();
+    } while (!store_head_ex(blk));
+  }
+
+  Block* pop(bool delay = false) {
+    Block *blk, *next;
+    do {
+      blk = load_head_ex();
+      next = blk->next;
+
+      // Spin for some time to give the other thread a chance to replace the
+      // head. Delay somewhat tuned for my ARM device.
+      for (int i = 0; delay && (i < (1 << 16)); ++i) {
+        asm volatile("nop");
+      }
+      delay = false;
+    } while (!store_head_ex(next));
+
+    printf("[%d] pop blk->mem=%d\n", gettid(), blk->mem);
+    return blk;
+  }
+
+private:
+  Block* load_head_ex() const {
+    static_assert(sizeof(void*) == sizeof(uint64_t),
+                  "Pointer size mismatch!");
+
+    Block* blk = (Block*)ldxr((uint64_t*)&m_head);
+    mem_barrier();
+    return blk;
+  }
+
+  bool store_head_ex(Block* blk) {
+    static_assert(sizeof(void*) == sizeof(uint64_t),
+                  "Pointer size mismatch!");
+
+    const bool success = stxr((uint64_t*)&m_head, (uint64_t)blk);
+    mem_barrier();
+    return success;
+  }
+
+  Block* m_head{nullptr};
+};
+
+int main() {
+  BlockStack::Block B1, B2, B3;
+  B1.mem = 1;
+  B2.mem = 2;
+  B3.mem = 3;
+
+  // Create free memory block list.
+  // S: head -> 1 -> 2 -> 3.
+  BlockStack S;
+  S.push(&B3);
+  S.push(&B2);
+  S.push(&B1);
+
+  // Atomics to coordinate when the threads are released.
+  std::atomic<size_t> worker(0);
+  std::atomic<bool> release(false);
+
+  auto T1 = std::thread([&S, &worker, &release]() {
+    // Notify thread T1 ready, and wait for release.
+    worker.fetch_add(1);
+    while (!release.load()) {}
+
+    // Read head=1 & next=2, and add some artificial delay by executing NOPs.
+    // This gives T2 some time to pop 1 & 2 and push back 1, which would
+    // trigger the ABA problem with a CAS instruction.
+    //
+    // However, with the LL/SC atomics, the exclusive monitor armed by T1's
+    // exclusive load of the head is cleared once T2 updates the head.
+    // Therefore the exclusive store after the NOP delay fails, the loop
+    // retries and loads the new head pointer, circumventing the ABA problem.
+    //
+    // NOTE: This does not work under qemu user emulation (aarch64), as qemu
+    // does not implement the strong LL/SC semantics.
+    // https://elixir.bootlin.com/qemu/v8.1.2/source/target/arm/tcg/translate-a64.c#L2411
+    auto* B1 = S.pop(true /* nops delay */);
+    assert(B1->mem == 1);
+    // Pops 3, correct, no ABA problem.
+    auto* B2 = S.pop();
+    assert(B2->mem == 3);
+  });
+
+  auto T2 = std::thread([&S, &worker, &release]() {
+    // Notify thread T2 ready, and wait for release.
+    worker.fetch_add(1);
+    while (!release.load()) {}
+
+    // Artificial delay, so that T1 has entered pop and read head & next.
+    // Delay somewhat tuned for my ARM device.
+    for (int i = 0; i < (1 << 8); ++i) {
+      asm volatile("nop");
+    }
+
+    // Pop 1 & 2.
+    auto* B1 = S.pop();
+    assert(B1->mem == 1);
+    auto* B2 = S.pop();
+    assert(B2->mem == 2);
+
+    // Re-insert 1.
+    S.push(B1);
+  });
+
+  // Wait until T1 & T2 are ready, then release both threads.
+  while (worker.load() != 2) {}
+  release.store(true);
+
+  T2.join();
+  T1.join();
+}
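For reference (not part of the diff above), a minimal usage sketch of the ldxr()/stxr() wrappers defined in the file: an LL/SC based fetch-add. The store-exclusive fails, and the loop retries, whenever the exclusive monitor was cleared between the paired exclusive load and store.

// Sketch only, assuming the ldxr()/stxr() wrappers from a64-llsc.cc above.
// Returns the value at ADDR before the increment, like __atomic_fetch_add.
uint64_t fetch_add_llsc(uint64_t* addr, uint64_t inc) {
  uint64_t old;
  do {
    old = ldxr(addr);        // arm the exclusive monitor and read
  } while (!stxr(addr, old + inc));  // retry if the monitor was cleared
  return old;
}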
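For contrast, the CAS-based pop that the comments refer to could look roughly like the sketch below; CasBlockStack is a made-up name for illustration, using std::atomic<Block*>::compare_exchange_weak. Because the CAS only checks that the head pointer value is unchanged, T2 popping 1 & 2 and pushing 1 back lets the CAS succeed with a stale next pointer; this is the ABA problem the LL/SC version avoids.

#include <atomic>

// Illustrative sketch only (not part of this commit): an ABA-prone CAS pop.
struct CasBlockStack {
  struct Block {
    Block* next{nullptr};
    unsigned mem{0};
  };

  Block* pop() {
    Block* blk = m_head.load();
    Block* next;
    do {
      next = blk->next;
      // Succeeds as long as m_head still equals blk, even if the blocks
      // below it changed in the meantime (ABA).
    } while (!m_head.compare_exchange_weak(blk, next));
    return blk;
  }

  std::atomic<Block*> m_head{nullptr};
};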