#include <cassert>
#include <cstdint>
#include <cstdio>

#include <atomic>
#include <thread>

#include <unistd.h>

#if __x86_64 != 1
#error "Example only for x86_64"
#endif

// Only used because we look at the disassembly.
#define NOINLINE __attribute__((noinline))

// -- UTILITIES ----------------------------------------------------------------

template <typename T>
NOINLINE bool cmpxchg16b(T* dest, T oldval, T newval) {
  static_assert(sizeof(T) == 16, "Required for CMPXCHG16B");
  static_assert(alignof(T) == 16, "Required for CMPXCHG16B");

  const auto as_u128 = [](T* ptr) {
    return reinterpret_cast<unsigned __int128*>(ptr);
  };
  // Use legacy __sync atomic builtin instead __atomic builtin because gcc will
  // directly emit the 16byte compare exchange instruction instead of emitting a
  // library call. This is nice when looking at the disassembly, weeh.
  //
  // -mcx16
  //   This option enables GCC to generate "CMPXCHG16B" instructions in 64-bit
  //   code to implement compare-and- exchange operations on 16-byte aligned
  //   128-bit objects. This is useful for atomic updates of data
  //   structures exceeding one machine word in size. The compiler uses this
  //   instruction to implement __sync Builtins. However, for __atomic Builtins
  //   operating on 128-bit integers, a library call is always used.
  return __sync_bool_compare_and_swap(as_u128(dest), *as_u128(&oldval),
                                      *as_u128(&newval));
}

// -- BLOCK STACK --------------------------------------------------------------

struct BlockStack {
  struct Block {
    Block* next{nullptr};
    unsigned mem{0};
  };

  void push(Block* blk) {
    printf("[%d] push blk->mem=%d\n", gettid(), blk->mem);

    GenerationPtr old_head, new_head;
    do {
      old_head     = m_head;
      blk->next    = old_head.ptr;
      new_head.ptr = blk;
      new_head.gen = old_head.gen + 1;
    } while (!cmpxchg16b(&m_head, old_head, new_head));
  }

  Block* pop(int s = 0) {
    GenerationPtr old_head, new_head;
    do {
      old_head     = m_head;
      new_head.ptr = old_head.ptr->next;
      new_head.gen = old_head.gen + 1;

      // Simulate thread being interrupted.
      if (s) {
        sleep(s);
      }
    } while (!cmpxchg16b(&m_head, old_head, new_head));

    printf("[%d] pop blk->mem=%d\n", gettid(), old_head.ptr->mem);
    return old_head.ptr;
  }

private:
  struct GenerationPtr {
    Block* ptr{nullptr};
    uint64_t gen{0};
  } __attribute__((aligned(16)));

  GenerationPtr m_head;
};

// -- MAIN ---------------------------------------------------------------------

int main() {
  BlockStack::Block B1, B2, B3;
  B1.mem = 1;
  B2.mem = 2;
  B3.mem = 3;

  // Create free memory block list.
  // S: head -> 1 -> 2 -> 3.
  BlockStack S;
  S.push(&B3);
  S.push(&B2);
  S.push(&B1);

  auto T1 = std::thread([&S]() {
    // Read head=1 & next=2, then get interrupted (simulated by sleep).
    // After resuming, T2 already popped 1 & 2 and pushed again 1 (ABA).
    //
    // The first CAS will fail even block 1 is currently at the beginning of the
    // list as the generation of the OLD_ROOT does not match the generation of
    // M_ROOT. The CAS loop will retry, read the new M_ROOT, and hence the new
    // NEXT pointer pointing to 3 and then successfully update M_ROOT.
    auto* B1 = S.pop(2);
    assert(B1->mem == 1);
    // Pops 3, as it should be.
    auto* B3 = S.pop();
    assert(B3->mem == 3);
  });

  auto T2 = std::thread([&S]() {
    // Artificial sleep, such that T1 entered pop and read head & next.
    sleep(1);

    // Pop 1 & 2.
    auto* B1 = S.pop();
    assert(B1->mem == 1);
    auto* B2 = S.pop();
    assert(B2->mem == 2);

    // Re-insert 1.
    S.push(B1);
  });

  T2.join();
  T1.join();
}