#include <cassert>
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <thread>
#ifndef __aarch64__
#error "This must be compiled for arm64!"
#endif
// NOTES on the inline assembly:
//
// * AArch64 constraint.
// https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
//
// Q: A memory address which uses a single base register with no offset.
//
// * Output constraint.
// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Output-Operands
//
// Use the '&' constraint modifier on all output operands that must not
// overlap an input. Otherwise, GCC may allocate the output operand in the
// same register as an unrelated input operand, on the assumption that the
// assembler code consumes its inputs before producing outputs. This
// assumption may be false if the assembler code actually consists of more
// than one instruction.
// LDXR: Load exclusive register wrapper.
//
// Read from ADDR and mark the address for exclusive access (exclusive monitor).
//
// Return value read from memory.
//
// NOTE: No memory ordering semantics.
//
// https://developer.arm.com/documentation/ddi0596/latest/Base-Instructions/LDXR--Load-Exclusive-Register-?lang=en
inline uint64_t ldxr(uint64_t* addr) {
uint64_t ret;
// "=r"(ret): RET receives the loaded value. "Q"(*addr): pass the pointee as
// a base-register-only memory operand (see the NOTES block above), which
// also tells the compiler this memory is read. The "memory" clobber keeps
// surrounding loads/stores from being reordered across the asm.
asm volatile("ldxr %0, %1" : "=r"(ret) : "Q"(*addr) : "memory");
return ret;
}
// STXR: Store exclusive register wrapper.
//
// Conditionally write VAL to ADDR if ADDR is marked for exclusive access by a
// previous exclusive load (eg LDXR).
//
// Return true if the write was successful, false otherwise.
//
// NOTE: No memory ordering semantics.
//
// https://developer.arm.com/documentation/ddi0596/latest/Base-Instructions/STXR--Store-Exclusive-Register-?lang=en
inline bool stxr(uint64_t* addr, uint64_t val) {
uint32_t ret;
// "=&r"(ret): early-clobber — the status register is written by the same
// instruction that still consumes the VAL input, so it must not share a
// register with it (see the '&' note in the NOTES block above).
// "=Q"(*addr): the pointee is an output memory operand. %w0 selects the
// 32-bit view of the status register; STXR stores 0 on success, 1 on
// failure, so RET == 0 means the exclusive store went through.
asm volatile("stxr %w0, %2, %1"
: "=&r"(ret), "=Q"(*addr)
: "r"(val)
: "memory");
return ret == 0;
}
// Demonstrates the arm64 exclusive monitor: a plain store from another
// thread to the monitored address should make the following STXR fail.
// Exit code: 0 if the exclusive store succeeded, 1 if it was defeated.
int main() {
  uint64_t mem = 42;

  auto T1 = std::thread([&mem]() {
    // Write to the exclusive location (clears the exclusive monitor).
    mem = 2222;
    // Full memory barrier.
    __sync_synchronize();
  });

  // Load-exclusive: read MEM and arm the exclusive monitor for it.
  uint64_t old = ldxr(&mem);

  // Some artificial delay w/o an explicit context switch (eg syscall) as that
  // would clear the exclusive monitor, though it can still be interrupted by
  // the scheduler.
  // Delay is "tuned" for my ARM silicon.
  for (int i = 0; i < (1 << 13); ++i) {
    asm volatile("nop");
  }

  // Full memory barrier.
  __sync_synchronize();

  // Store-exclusive: succeeds only if the monitor is still armed, ie T1's
  // store has not hit MEM in the meantime.
  bool ok = stxr(&mem, 1111);

  // Join BEFORE reading MEM below: reading it while T1 may still be storing
  // to it would be a data race (undefined behavior). The STXR result is
  // already captured, so joining here does not change the experiment.
  T1.join();

  // PRIu64 instead of %lu: uint64_t is not `unsigned long` on every ABI.
  printf("old: %" PRIu64 " -> mem: %" PRIu64 " | ok: %d\n", old, mem, ok);
  return ok ? 0 : 1;
}