#include #include #include #include #include #include #if __x86_64 != 1 #error "Example only for x86_64" #endif // Only used because we look at the disassembly. #define NOINLINE __attribute__((noinline)) // -- UTILITIES ---------------------------------------------------------------- template NOINLINE bool cmpxchg16b(T* dest, T oldval, T newval) { static_assert(sizeof(T) == 16, "Required for CMPXCHG16B"); static_assert(alignof(T) == 16, "Required for CMPXCHG16B"); const auto as_u128 = [](T* ptr) { return reinterpret_cast(ptr); }; // Use legacy __sync atomic builtin instead __atomic builtin because gcc will // directly emit the 16byte compare exchange instruction instead of emitting a // library call. This is nice when looking at the disassembly, weeh. // // -mcx16 // This option enables GCC to generate "CMPXCHG16B" instructions in 64-bit // code to implement compare-and- exchange operations on 16-byte aligned // 128-bit objects. This is useful for atomic updates of data // structures exceeding one machine word in size. The compiler uses this // instruction to implement __sync Builtins. However, for __atomic Builtins // operating on 128-bit integers, a library call is always used. return __sync_bool_compare_and_swap(as_u128(dest), *as_u128(&oldval), *as_u128(&newval)); } // -- BLOCK STACK -------------------------------------------------------------- struct BlockStack { struct Block { Block* next{nullptr}; unsigned mem{0}; }; void push(Block* blk) { printf("[%d] push blk->mem=%d\n", gettid(), blk->mem); GenerationPtr old_head, new_head; do { old_head = m_head; blk->next = old_head.ptr; new_head.ptr = blk; new_head.gen = old_head.gen + 1; } while (!cmpxchg16b(&m_head, old_head, new_head)); } Block* pop(int s = 0) { GenerationPtr old_head, new_head; do { old_head = m_head; new_head.ptr = old_head.ptr->next; new_head.gen = old_head.gen + 1; // Simulate thread being interrupted. if (s) { sleep(s); } } while (!cmpxchg16b(&m_head, old_head, new_head)); printf("[%d] pop blk->mem=%d\n", gettid(), old_head.ptr->mem); return old_head.ptr; } private: struct GenerationPtr { Block* ptr{nullptr}; uint64_t gen{0}; } __attribute__((aligned(16))); GenerationPtr m_head; }; // -- MAIN --------------------------------------------------------------------- int main() { BlockStack::Block B1, B2, B3; B1.mem = 1; B2.mem = 2; B3.mem = 3; // Create free memory block list. // S: head -> 1 -> 2 -> 3. BlockStack S; S.push(&B3); S.push(&B2); S.push(&B1); auto T1 = std::thread([&S]() { // Read head=1 & next=2, then get interrupted (simulated by sleep). // After resuming, T2 already popped 1 & 2 and pushed again 1 (ABA). // // The first CAS will fail even block 1 is currently at the beginning of the // list as the generation of the OLD_ROOT does not match the generation of // M_ROOT. The CAS loop will retry, read the new M_ROOT, and hence the new // NEXT pointer pointing to 3 and then successfully update M_ROOT. auto* B1 = S.pop(2); assert(B1->mem == 1); // Pops 3, as it should be. auto* B3 = S.pop(); assert(B3->mem == 3); }); auto T2 = std::thread([&S]() { // Artificial sleep, such that T1 entered pop and read head & next. sleep(1); // Pop 1 & 2. auto* B1 = S.pop(); assert(B1->mem == 1); auto* B2 = S.pop(); assert(B2->mem == 2); // Re-insert 1. S.push(B1); }); T2.join(); T1.join(); }