content/2023-09-01-cas-llsc-aba/cas-ptr-generation.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134

#include <cassert>
#include <cstdint>
#include <cstdio>

#include <atomic>
#include <thread>

#include <unistd.h>

#if __x86_64 != 1
#error "Example only for x86_64"
#endif

// Only used because we look at the disassembly.
#define NOINLINE __attribute__((noinline))

// -- UTILITIES ----------------------------------------------------------------

template <typename T>
NOINLINE bool cmpxchg16b(T* dest, T oldval, T newval) {
  static_assert(sizeof(T) == 16, "Required for CMPXCHG16B");
  static_assert(alignof(T) == 16, "Required for CMPXCHG16B");

  const auto as_u128 = [](T* ptr) {
    return reinterpret_cast<unsigned __int128*>(ptr);
  };
  // Use legacy __sync atomic builtin instead __atomic builtin because gcc will
  // directly emit the 16byte compare exchange instruction instead of emitting a
  // library call. This is nice when looking at the disassembly, weeh.
  //
  // -mcx16
  //   This option enables GCC to generate "CMPXCHG16B" instructions in 64-bit
  //   code to implement compare-and- exchange operations on 16-byte aligned
  //   128-bit objects. This is useful for atomic updates of data
  //   structures exceeding one machine word in size. The compiler uses this
  //   instruction to implement __sync Builtins. However, for __atomic Builtins
  //   operating on 128-bit integers, a library call is always used.
  return __sync_bool_compare_and_swap(as_u128(dest), *as_u128(&oldval),
                                      *as_u128(&newval));
}

// -- BLOCK STACK --------------------------------------------------------------

struct BlockStack {
  struct Block {
    Block* next{nullptr};
    unsigned mem{0};
  };

  void push(Block* blk) {
    printf("[%d] push blk->mem=%d\n", gettid(), blk->mem);

    GenerationPtr old_head, new_head;
    do {
      old_head     = m_head;
      blk->next    = old_head.ptr;
      new_head.ptr = blk;
      new_head.gen = old_head.gen + 1;
    } while (!cmpxchg16b(&m_head, old_head, new_head));
  }

  Block* pop(int s = 0) {
    GenerationPtr old_head, new_head;
    do {
      old_head     = m_head;
      new_head.ptr = old_head.ptr->next;
      new_head.gen = old_head.gen + 1;

      // Simulate thread being interrupted.
      if (s) {
        sleep(s);
      }
    } while (!cmpxchg16b(&m_head, old_head, new_head));

    printf("[%d] pop blk->mem=%d\n", gettid(), old_head.ptr->mem);
    return old_head.ptr;
  }

private:
  struct GenerationPtr {
    Block* ptr{nullptr};
    uint64_t gen{0};
  } __attribute__((aligned(16)));

  GenerationPtr m_head;
};

// -- MAIN ---------------------------------------------------------------------

int main() {
  BlockStack::Block B1, B2, B3;
  B1.mem = 1;
  B2.mem = 2;
  B3.mem = 3;

  // Create free memory block list.
  // S: head -> 1 -> 2 -> 3.
  BlockStack S;
  S.push(&B3);
  S.push(&B2);
  S.push(&B1);

  auto T1 = std::thread([&S]() {
    // Read head=1 & next=2, then get interrupted (simulated by sleep).
    // After resuming, T2 already popped 1 & 2 and pushed again 1 (ABA).
    //
    // The first CAS will fail even block 1 is currently at the beginning of the
    // list as the generation of the OLD_ROOT does not match the generation of
    // M_ROOT. The CAS loop will retry, read the new M_ROOT, and hence the new
    // NEXT pointer pointing to 3 and then successfully update M_ROOT.
    auto* B1 = S.pop(2);
    assert(B1->mem == 1);
    // Pops 3, as it should be.
    auto* B3 = S.pop();
    assert(B3->mem == 3);
  });

  auto T2 = std::thread([&S]() {
    // Artificial sleep, such that T1 entered pop and read head & next.
    sleep(1);

    // Pop 1 & 2.
    auto* B1 = S.pop();
    assert(B1->mem == 1);
    auto* B2 = S.pop();
    assert(B2->mem == 2);

    // Re-insert 1.
    S.push(B1);
  });

  T2.join();
  T1.join();
}