commit 89ec3480b48b5f7c95293b8df207700f11b3e7f0
parent 3c940a4af437a98fd7b190e07c0d1c2136e3ddda
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 29 May 2026 20:20:50 -0700
aa64: decode load/store exclusive + acquire/release atomics
Adversarial round-trip of _Atomic RMW on a global surfaced a decode gap:
the disassembler rendered ldaxr/stlxr/ldar (the exclusive-monitor sequence
codegen emits for atomics) as .inst, so cc -S dropped them and the
round-trip lost the atomic ops entirely. The assembler already encoded the
whole family; only the decode side was missing.
Add AA64_FMT_LDST_EXCL + print_ldst_excl and 14 table rows covering
ldxr/ldaxr/ldar/stxr/stlxr/stlr (Wt+Xt via a size-bit-free mask) and the
byte/half ldaxrb/ldaxrh/ldarb/ldarh/stlxrb/stlxrh/stlrb/stlrh. The printer
keys register width on size and the operand shape (Ws,Rt,[Xn] vs Rt,[Xn])
on L/o2.
Atomics were the one core-op family the corpus fan-out missed; add
roundtrip/atomic_{rmw,cas,ops} and decode case aa64_ldst_excl. Round-trip
aa64 852/0/1; asm/ISA-unit/inline green.
Diffstat:
11 files changed, 105 insertions(+), 0 deletions(-)
diff --git a/src/arch/aa64/isa.c b/src/arch/aa64/isa.c
@@ -310,6 +310,27 @@ const AA64InsnDesc aa64_insn_table[] = {
AA64_ASMFL_SF1,
{0, 0}},
+ /* ----- Load/store exclusive + acquire/release ordered -----
+ * Family bits[29:24]=001000, o1[21]=0 (single register). The word/dword
+ * mnemonics leave size bit30 free (mask 0xBFE08000) so one row decodes
+ * both Wt and Xt; the byte/half mnemonics pin the full size (0xFFE08000).
+ * print_ldst_excl keys the register width on size and the operand shape on
+ * L[22]/o2[23]. */
+ {MN("ldxr"), 0x88400000u, 0xBFE08000u, AA64_FMT_LDST_EXCL, 0, {0, 0}},
+ {MN("ldaxr"), 0x88408000u, 0xBFE08000u, AA64_FMT_LDST_EXCL, 0, {0, 0}},
+ {MN("ldar"), 0x88C08000u, 0xBFE08000u, AA64_FMT_LDST_EXCL, 0, {0, 0}},
+ {MN("stxr"), 0x88000000u, 0xBFE08000u, AA64_FMT_LDST_EXCL, 0, {0, 0}},
+ {MN("stlxr"), 0x88008000u, 0xBFE08000u, AA64_FMT_LDST_EXCL, 0, {0, 0}},
+ {MN("stlr"), 0x88808000u, 0xBFE08000u, AA64_FMT_LDST_EXCL, 0, {0, 0}},
+ {MN("ldaxrb"), 0x08408000u, 0xFFE08000u, AA64_FMT_LDST_EXCL, 0, {0, 0}},
+ {MN("ldaxrh"), 0x48408000u, 0xFFE08000u, AA64_FMT_LDST_EXCL, 0, {0, 0}},
+ {MN("ldarb"), 0x08C08000u, 0xFFE08000u, AA64_FMT_LDST_EXCL, 0, {0, 0}},
+ {MN("ldarh"), 0x48C08000u, 0xFFE08000u, AA64_FMT_LDST_EXCL, 0, {0, 0}},
+ {MN("stlxrb"), 0x08008000u, 0xFFE08000u, AA64_FMT_LDST_EXCL, 0, {0, 0}},
+ {MN("stlxrh"), 0x48008000u, 0xFFE08000u, AA64_FMT_LDST_EXCL, 0, {0, 0}},
+ {MN("stlrb"), 0x08808000u, 0xFFE08000u, AA64_FMT_LDST_EXCL, 0, {0, 0}},
+ {MN("stlrh"), 0x48808000u, 0xFFE08000u, AA64_FMT_LDST_EXCL, 0, {0, 0}},
+
/* ----- Load/store pair, pre-indexed (opc=10 X / opc=01 D) ----- */
{MN("stp"),
0xA9800000u,
@@ -739,6 +760,31 @@ static void print_ldst_simm9(StrBuf* sb, u32 w, const AA64InsnDesc* d) {
strbuf_putc(sb, ']');
}
+/* Load/store exclusive (LDXR/LDAXR/STXR/STLXR) and acquire/release ordered
+ * (LDAR/STLR), incl. byte/half variants. Fields: size[31:30] picks the
+ * transfer register width (Wt for byte/half/word, Xt for dword); o2[23] and
+ * L[22] select the form; Rs[20:16] is the store-exclusive status register
+ * (Ws). A store-exclusive (L=0, o2=0) prints `Ws, Rt, [Xn]`; everything else
+ * (loads + store-release) prints `Rt, [Xn]`. */
+static void print_ldst_excl(StrBuf* sb, u32 w) {
+ u32 size = (w >> 30) & 3u;
+ u32 o2 = (w >> 23) & 1u;
+ u32 L = (w >> 22) & 1u;
+ u32 Rs = (w >> 16) & 0x1fu;
+ u32 Rn = (w >> 5) & 0x1fu;
+ u32 Rt = w & 0x1fu;
+ int sf = (size == 3u);
+ int store_excl = (L == 0u && o2 == 0u);
+ if (store_excl) {
+ emit_reg(sb, Rs, /*sf=*/0, 0); /* Ws: status result, always 32-bit */
+ strbuf_puts(sb, ", ");
+ }
+ emit_reg(sb, Rt, sf, 0);
+ strbuf_puts(sb, ", [");
+ emit_reg(sb, Rn, /*sf=*/1, 1);
+ strbuf_putc(sb, ']');
+}
+
static void print_ldstp_common(StrBuf* sb, AA64LdStPPre f, int pre) {
/* opc=10 → 64-bit X; opc=00 → 32-bit W; opc=01 (V=1) → D (FP);
* opc=00 (V=1) → S; opc=10 (V=1) → Q (not yet emitted). */
@@ -1081,6 +1127,9 @@ void aa64_print_operands(StrBuf* sb, const AA64InsnDesc* desc, u32 word,
case AA64_FMT_LDST_SIMM9:
print_ldst_simm9(sb, word, desc);
break;
+ case AA64_FMT_LDST_EXCL:
+ print_ldst_excl(sb, word);
+ break;
case AA64_FMT_BR_IMM:
print_br_imm(sb, word, vaddr);
break;
diff --git a/src/arch/aa64/isa.h b/src/arch/aa64/isa.h
@@ -66,6 +66,8 @@ typedef enum AA64Format {
AA64_FMT_FP_CVT, /* FP precision convert (FCVT single<->double) */
AA64_FMT_FP_INT_CVT, /* FP<->int convert + FMOV gpr<->fp
* (SCVTF/UCVTF/FCVTZS/FCVTZU/FMOV) */
+ AA64_FMT_LDST_EXCL, /* load/store exclusive + acquire/release ordered
+ * (LDXR/LDAXR/STXR/STLXR/LDAR/STLR + b/h) */
} AA64Format;
/* ---- AsmFlags column on AA64InsnDesc ----
diff --git a/test/asm/decode/aa64_ldst_excl.expected.txt b/test/asm/decode/aa64_ldst_excl.expected.txt
@@ -0,0 +1,20 @@
+0: ldxr w0, [x1]
+4: ldaxr w2, [x3]
+8: ldar w4, [x5]
+c: stxr w6, w7, [x8]
+10: stlxr w9, w10, [x11]
+14: stlr w12, [x13]
+18: ldxr x14, [x15]
+1c: ldaxr x16, [x17]
+20: ldar x18, [x19]
+24: stxr w20, x21, [x22]
+28: stlxr w23, x24, [x25]
+2c: stlr x26, [x27]
+30: ldaxrb w0, [x1]
+34: ldaxrh w2, [x3]
+38: ldarb w4, [x5]
+3c: ldarh w6, [x7]
+40: stlxrb w8, w9, [x10]
+44: stlxrh w11, w12, [x13]
+48: stlrb w14, [x15]
+4c: stlrh w16, [x17]
diff --git a/test/asm/decode/aa64_ldst_excl.hex b/test/asm/decode/aa64_ldst_excl.hex
@@ -0,0 +1 @@
+207c5f8862fc5f88a4fcdf88077d06886afd0988acfd9f88ee7d5fc830fe5fc872fedfc8d57e14c838ff17c87aff9fc820fc5f0862fc5f48a4fcdf08e6fcdf4849fd0808acfd0b48eefd9f0830fe9f48
diff --git a/test/asm/decode/aa64_ldst_excl.targets b/test/asm/decode/aa64_ldst_excl.targets
@@ -0,0 +1 @@
+aa64
diff --git a/test/asm/roundtrip/atomic_cas.c b/test/asm/roundtrip/atomic_cas.c
@@ -0,0 +1,9 @@
+/* Atomic compare-exchange on a global: the ldaxr/stlxr CAS loop plus the
+ * branch on success. Exit: g becomes 42. */
+_Atomic int g = 40;
+int test_main(void) {
+ int expected = 40;
+ __atomic_compare_exchange_n(&g, &expected, 42, 0, __ATOMIC_SEQ_CST,
+ __ATOMIC_SEQ_CST);
+ return __atomic_load_n(&g, __ATOMIC_SEQ_CST);
+}
diff --git a/test/asm/roundtrip/atomic_cas.expected b/test/asm/roundtrip/atomic_cas.expected
@@ -0,0 +1 @@
+42
diff --git a/test/asm/roundtrip/atomic_ops.c b/test/asm/roundtrip/atomic_ops.c
@@ -0,0 +1,10 @@
+/* A spread of atomic RMW operators (and/or/xor/sub) on a global, exercising
+ * the exclusive-monitor loop for each. Exit computed to 42. */
+_Atomic unsigned g = 0xFF;
+int test_main(void) {
+ __atomic_fetch_and(&g, 0x3F, __ATOMIC_SEQ_CST); /* 0x3F = 63 */
+ __atomic_fetch_or(&g, 0x40, __ATOMIC_SEQ_CST); /* 0x7F = 127 */
+ __atomic_fetch_xor(&g, 0x01, __ATOMIC_SEQ_CST); /* 0x7E = 126 */
+ __atomic_fetch_sub(&g, 84, __ATOMIC_SEQ_CST); /* 42 */
+ return (int)__atomic_load_n(&g, __ATOMIC_SEQ_CST);
+}
diff --git a/test/asm/roundtrip/atomic_ops.expected b/test/asm/roundtrip/atomic_ops.expected
@@ -0,0 +1 @@
+42
diff --git a/test/asm/roundtrip/atomic_rmw.c b/test/asm/roundtrip/atomic_rmw.c
@@ -0,0 +1,10 @@
+/* Atomic read-modify-write on a file-scope _Atomic global. The address
+ * escapes (global), so codegen emits the real exclusive-monitor sequence
+ * (ldaxr/stlxr loop) plus a barrier, and the final load is ldar — the
+ * acquire/release/exclusive atomics the disassembler must decode and the
+ * round-trip re-encode. Exit: 30 + 8 + 4 = 42. */
+_Atomic int g = 30;
+int test_main(void) {
+ __atomic_fetch_add(&g, 8, __ATOMIC_SEQ_CST);
+ return __atomic_load_n(&g, __ATOMIC_SEQ_CST) + 4;
+}
diff --git a/test/asm/roundtrip/atomic_rmw.expected b/test/asm/roundtrip/atomic_rmw.expected
@@ -0,0 +1 @@
+42