#include #include #include #include #ifndef __aarch64__ #error "This must be compiled for arm64!" #endif // NOTES on the inline assembly: // // * AArch64 constraint. // https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html // // Q: A memory address which uses a single base register with no offset. // // * Output constraint. // https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#Output-Operands // // Use the '&' constraint modifier on all output operands that must not // overlap an input. Otherwise, GCC may allocate the output operand in the // same register as an unrelated input operand, on the assumption that the // assembler code consumes its inputs before producing outputs. This // assumption may be false if the assembler code actually consists of more // than one instruction. // LDXR: Load exclusive register wrapper. // // Read from ADDR and marked address for exclusive access (exclusive monitor). // // Return value read from memory. // // NOTE: No memory ordering semantics. // // https://developer.arm.com/documentation/ddi0596/latest/Base-Instructions/LDXR--Load-Exclusive-Register-?lang=en inline uint64_t ldxr(uint64_t* addr) { uint64_t ret; asm volatile("ldxr %0, %1" : "=r"(ret) : "Q"(*addr) : "memory"); return ret; } // STXR: Store exclusive register wrapper. // // Conditionally write VAL to ADDR if ADDR is marked for exclusive access by a // previous exclusive load (eg LDXR). // // Return 0 if the write was successful, 1 otherwise. // // NOTE: No memory ordering semantics. // // https://developer.arm.com/documentation/ddi0596/latest/Base-Instructions/STXR--Store-Exclusive-Register-?lang=en inline bool stxr(uint64_t* addr, uint64_t val) { uint32_t ret; asm volatile("stxr %w0, %2, %1" : "=&r"(ret), "=Q"(*addr) : "r"(val) : "memory"); return ret == 0; } int main() { uint64_t mem = 42; auto T1 = std::thread([&mem]() { // Write to exclusive location (does clear exclusive monitor). mem = 2222; // Full memory barrier. __sync_synchronize(); }); uint64_t old = ldxr(&mem); // Some artificial delay w/o an explicit context switch (eg syscall) as that // would clear the exclusive monitor, though it can still be interupted by // the scheduler. // Delay is "tuned" for my ARM silicon. for (int i = 0; i < (1 << 13); ++i) { asm volatile("nop"); } // Full memory barrier. __sync_synchronize(); bool ok = stxr(&mem, 1111); printf("old: %lu -> mem: %lu | ok: %d\n", old, mem, ok); T1.join(); return ok ? 0 : 1; }