#include <cassert>
#include <cstdint>
#include <cstdio>
#include <atomic>
#include <thread>
#include <unistd.h>
#if __x86_64 != 1
#error "Example only for x86_64"
#endif
// Only used because we look at the disassembly.
#define NOINLINE __attribute__((noinline))
// -- UTILITIES ----------------------------------------------------------------
template <typename T>
NOINLINE bool cmpxchg16b(T* dest, T oldval, T newval) {
static_assert(sizeof(T) == 16, "Required for CMPXCHG16B");
static_assert(alignof(T) == 16, "Required for CMPXCHG16B");
const auto as_u128 = [](T* ptr) {
return reinterpret_cast<unsigned __int128*>(ptr);
};
// Use legacy __sync atomic builtin instead __atomic builtin because gcc will
// directly emit the 16byte compare exchange instruction instead of emitting a
// library call. This is nice when looking at the disassembly, weeh.
//
// -mcx16
// This option enables GCC to generate "CMPXCHG16B" instructions in 64-bit
// code to implement compare-and- exchange operations on 16-byte aligned
// 128-bit objects. This is useful for atomic updates of data
// structures exceeding one machine word in size. The compiler uses this
// instruction to implement __sync Builtins. However, for __atomic Builtins
// operating on 128-bit integers, a library call is always used.
return __sync_bool_compare_and_swap(as_u128(dest), *as_u128(&oldval),
*as_u128(&newval));
}
// -- BLOCK STACK --------------------------------------------------------------
struct BlockStack {
struct Block {
Block* next{nullptr};
unsigned mem{0};
};
void push(Block* blk) {
printf("[%d] push blk->mem=%d\n", gettid(), blk->mem);
GenerationPtr old_head, new_head;
do {
old_head = m_head;
blk->next = old_head.ptr;
new_head.ptr = blk;
new_head.gen = old_head.gen + 1;
} while (!cmpxchg16b(&m_head, old_head, new_head));
}
Block* pop(int s = 0) {
GenerationPtr old_head, new_head;
do {
old_head = m_head;
new_head.ptr = old_head.ptr->next;
new_head.gen = old_head.gen + 1;
// Simulate thread being interrupted.
if (s) {
sleep(s);
}
} while (!cmpxchg16b(&m_head, old_head, new_head));
printf("[%d] pop blk->mem=%d\n", gettid(), old_head.ptr->mem);
return old_head.ptr;
}
private:
struct GenerationPtr {
Block* ptr{nullptr};
uint64_t gen{0};
} __attribute__((aligned(16)));
GenerationPtr m_head;
};
// -- MAIN ---------------------------------------------------------------------
int main() {
BlockStack::Block B1, B2, B3;
B1.mem = 1;
B2.mem = 2;
B3.mem = 3;
// Create free memory block list.
// S: head -> 1 -> 2 -> 3.
BlockStack S;
S.push(&B3);
S.push(&B2);
S.push(&B1);
auto T1 = std::thread([&S]() {
// Read head=1 & next=2, then get interrupted (simulated by sleep).
// After resuming, T2 already popped 1 & 2 and pushed again 1 (ABA).
//
// The first CAS will fail even block 1 is currently at the beginning of the
// list as the generation of the OLD_ROOT does not match the generation of
// M_ROOT. The CAS loop will retry, read the new M_ROOT, and hence the new
// NEXT pointer pointing to 3 and then successfully update M_ROOT.
auto* B1 = S.pop(2);
assert(B1->mem == 1);
// Pops 3, as it should be.
auto* B3 = S.pop();
assert(B3->mem == 3);
});
auto T2 = std::thread([&S]() {
// Artificial sleep, such that T1 entered pop and read head & next.
sleep(1);
// Pop 1 & 2.
auto* B1 = S.pop();
assert(B1->mem == 1);
auto* B2 = S.pop();
assert(B2->mem == 2);
// Re-insert 1.
S.push(B1);
});
T2.join();
T1.join();
}