blob: 82c68e193a0aefffd876352801e118600284b979 (
plain) (
tree)
|
|
#include <cassert>
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <thread>
#ifndef __aarch64__
#error "This must be compiled for arm64!"
#endif
// NOTES on the inline assembly:
//
// * AArch64 constraint.
// https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
//
// Q: A memory address which uses a single base register with no offset.
//
// * Output constraint.
// https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Output-Operands
//
// Use the '&' constraint modifier on all output operands that must not
// overlap an input. Otherwise, GCC may allocate the output operand in the
// same register as an unrelated input operand, on the assumption that the
// assembler code consumes its inputs before producing outputs. This
// assumption may be false if the assembler code actually consists of more
// than one instruction.
// LDXR: Load exclusive register wrapper.
//
// Read from ADDR and mark the address for exclusive access (exclusive monitor).
//
// Return value read from memory.
//
// NOTE: No memory ordering semantics.
//
// https://developer.arm.com/documentation/ddi0596/latest/Base-Instructions/LDXR--Load-Exclusive-Register-?lang=en
inline uint64_t ldxr(uint64_t* addr) {
uint64_t ret;
asm volatile("ldxr %0, %1" : "=r"(ret) : "Q"(*addr) : "memory");
return ret;
}
// STXR: Store exclusive register wrapper.
//
// Conditionally write VAL to ADDR if ADDR is marked for exclusive access by a
// previous exclusive load (eg LDXR).
//
// Return true if the write was successful, false otherwise.
//
// NOTE: No memory ordering semantics.
//
// https://developer.arm.com/documentation/ddi0596/latest/Base-Instructions/STXR--Store-Exclusive-Register-?lang=en
inline bool stxr(uint64_t* addr, uint64_t val) {
uint32_t ret;
asm volatile("stxr %w0, %2, %1"
: "=&r"(ret), "=Q"(*addr)
: "r"(val)
: "memory");
return ret == 0;
}
int main() {
  uint64_t mem = 42;
  // Competing writer: a plain store to MEM clears the exclusive monitor
  // opened by the LDXR below, which should make the later STXR fail.
  auto T1 = std::thread([&mem]() {
    // Write to exclusive location (does clear exclusive monitor).
    mem = 2222;
    // Full memory barrier.
    __sync_synchronize();
  });
  const uint64_t old = ldxr(&mem);
  // Some artificial delay w/o an explicit context switch (eg syscall) as that
  // would clear the exclusive monitor, though it can still be interrupted by
  // the scheduler.
  // Delay is "tuned" for my ARM silicon.
  for (int i = 0; i < (1 << 13); ++i) {
    asm volatile("nop");
  }
  // Full memory barrier.
  __sync_synchronize();
  const bool ok = stxr(&mem, 1111);
  // Join before reading MEM: the printf below would otherwise race with
  // T1's concurrent store (undefined behavior). The STXR has already
  // executed, so joining here does not perturb the experiment.
  T1.join();
  // PRIu64 keeps the format string portable: uint64_t is not `unsigned
  // long` on all targets (eg LLP64), where "%lu" would be wrong.
  printf("old: %" PRIu64 " -> mem: %" PRIu64 " | ok: %d\n", old, mem, ok);
  // Exit 0 when the exclusive store succeeded, 1 when the monitor was lost.
  return ok ? 0 : 1;
}
|