#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <thread>
#include <unistd.h>

#ifndef __aarch64__
#error "This must be compiled for arm64!"
#endif

// LDXR: Load exclusive register wrapper.
//
// Read from ADDR and mark the address for exclusive access (exclusive
// monitor).
//
// Return the value read from memory.
//
// NOTE: No memory ordering semantics.
//
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDXR--Load-Exclusive-Register-?lang=en
uint64_t ldxr(uint64_t* addr) {
  uint64_t ret;
  asm volatile("ldxr %0, %1" : "=r"(ret) : "Q"(*addr) : "memory");
  return ret;
}

// STXR: Store exclusive register wrapper.
//
// Conditionally write VAL to ADDR if ADDR is marked for exclusive access by a
// previous exclusive load (eg LDXR).
//
// Return true if the write was successful, false otherwise.
//
// NOTE: No memory ordering semantics.
//
// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STXR--Store-Exclusive-Register-?lang=en
bool stxr(uint64_t* addr, uint64_t val) {
  uint32_t ret;
  asm volatile("stxr %w0, %2, %1"
               : "=&r"(ret), "=Q"(*addr)
               : "r"(val)
               : "memory");
  return ret == 0;
}

inline void mem_barrier() {
  asm volatile("dmb ish" : /* out */ : /* in */ : "memory");
}

// -- BLOCK STACK --------------------------------------------------------------

struct BlockStack {
  struct Block {
    Block* next{nullptr};
    unsigned mem{0};
  };

  void push(Block* blk) {
    printf("[%d] push blk->mem=%d\n", gettid(), blk->mem);
    do {
      blk->next = load_head_ex();
    } while (!store_head_ex(blk));
  }

  Block* pop(bool delay = false) {
    Block *blk, *next;
    do {
      blk = load_head_ex();
      next = blk->next;

      // Spin some time to give the other thread a chance to replace the head.
      // Delay somewhat tuned for my ARM device.
      for (int i = 0; delay && (i < (1 << 16)); ++i) {
        asm volatile("nop");
      }
      delay = false;
    } while (!store_head_ex(next));
    printf("[%d] pop blk->mem=%d\n", gettid(), blk->mem);
    return blk;
  }

 private:
  Block* load_head_ex() const {
    static_assert(sizeof(void*) == sizeof(uint64_t), "Pointer size mismatch!");
    Block* blk = (Block*)ldxr((uint64_t*)&m_head);
    mem_barrier();
    return blk;
  }

  bool store_head_ex(Block* blk) {
    static_assert(sizeof(void*) == sizeof(uint64_t), "Pointer size mismatch!");
    const bool success = stxr((uint64_t*)&m_head, (uint64_t)blk);
    mem_barrier();
    return success;
  }

  Block* m_head{nullptr};
};

int main() {
  BlockStack::Block B1, B2, B3;
  B1.mem = 1;
  B2.mem = 2;
  B3.mem = 3;

  // Create free memory block list.
  // S: head -> 1 -> 2 -> 3.
  BlockStack S;
  S.push(&B3);
  S.push(&B2);
  S.push(&B1);

  // Atomics to coordinate when threads are being released.
  std::atomic<int> worker(0);
  std::atomic<bool> release(false);

  auto T1 = std::thread([&S, &worker, &release]() {
    // Notify thread T1 ready, and wait for release.
    worker.fetch_add(1);
    while (!release.load()) {}

    // Read head=1 & next=2, and add some artificial delay by executing NOPs.
    // This gives T2 some time to pop 1 & 2 and push back 1, which would
    // trigger the ABA problem with a CAS instruction.
    //
    // However, with the LL/SC atomics, T1's exclusive monitor armed by the
    // exclusive load of the head is cleared once T2 updates the head.
    // Therefore the exclusive store after the NOP delay fails and we re-load
    // the new head pointer, circumventing the ABA problem.
    //
    // NOTE: This does not work under qemu user emulation (aarch64) as qemu
    // does not implement the strong LL/SC semantics.
    // https://elixir.bootlin.com/qemu/v8.1.2/source/target/arm/tcg/translate-a64.c#L2411
    auto* B1 = S.pop(true /* nops delay */);
    assert(B1->mem == 1);

    // Pops 3, correct, no ABA problem.
    auto* B2 = S.pop();
    assert(B2->mem == 3);
  });

  auto T2 = std::thread([&S, &worker, &release]() {
    // Notify thread T2 ready, and wait for release.
    worker.fetch_add(1);
    while (!release.load()) {}

    // Artificial sleep, such that T1 entered pop and read head & next.
    // Delay somewhat tuned for my ARM device.
    for (int i = 0; i < (1 << 8); ++i) {
      asm volatile("nop");
    }

    // Pop 1 & 2.
    auto* B1 = S.pop();
    assert(B1->mem == 1);
    auto* B2 = S.pop();
    assert(B2->mem == 2);

    // Re-insert 1.
    S.push(B1);
  });

  // Wait until T1 & T2 are ready, then release both threads.
  while (worker.load() != 2) {}
  release.store(true);

  T2.join();
  T1.join();
}
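// ----------------------------------------------------------------------------
// For contrast, a minimal sketch of the CAS-based pop that the comments above
// refer to. This struct is NOT part of the original test program and is not
// used by main(); it only illustrates (under the assumption of an equivalent
// Block layout and a std::atomic<Block*> head) why a plain compare-and-swap
// is vulnerable to ABA: the CAS only compares the head *value*, so if another
// thread pops 1 & 2 and pushes 1 back inside the window below, the CAS still
// succeeds and installs the stale `next` (2), which is no longer on the list.
// A common mitigation for CAS-based stacks is to pair the pointer with a
// generation counter; the LL/SC version above does not need that, since any
// intervening store to the head clears the exclusive monitor.
struct CasBlockStack {
  struct Block {
    Block* next{nullptr};
    unsigned mem{0};
  };

  void push(Block* blk) {
    blk->next = m_head.load();
    while (!m_head.compare_exchange_weak(blk->next, blk)) {}
  }

  Block* pop() {
    Block *blk, *next;
    do {
      blk = m_head.load();
      next = blk->next;
      // ABA window: head may change from `blk` to something else and back to
      // `blk` here; the CAS below cannot detect this and succeeds anyway.
    } while (!m_head.compare_exchange_weak(blk, next));
    return blk;
  }

 private:
  std::atomic<Block*> m_head{nullptr};
};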