kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

asm.c (110411B)


      1 /* AArch64 standalone .s instruction parser.
      2  *
      3  * Per-mnemonic dispatch: each entry in the mnemonic table names a
      4  * parse function that reads operand tokens through the asm-driver
      5  * surface and emits the encoded word via the inline encoders in
      6  * aa64_isa.h.  Encoders are the single source of truth for bit
      7  * layout — the disassembler shares them through aa64_*_unpack.
      8  *
      9  * Aliases (`mov`, `neg`, `cmp`, `mul`, ...) live in this table as
     10  * dedicated rows that pick the canonical form's encoder with the
     11  * alias-specific operand shape.  When a mnemonic admits multiple
     12  * forms (e.g. `mov` register-vs-immediate, `add` register-vs-
     13  * immediate), the parser branches on operand shape after reading
     14  * the first non-Rd operand. */
     15 
     16 #include "arch/aa64/asm.h"
     17 
     18 #include <string.h>
     19 
     20 #include "arch/aa64/isa.h"
     21 #include "arch/aa64/regs.h"
     22 #include "arch/arch.h"
     23 #include "asm/asm_helpers.h"
     24 #include "asm/asm_lex.h"
     25 #include "cg/type.h"
     26 #include "core/arena.h"
     27 #include "core/pool.h"
     28 #include "core/slice.h"
     29 #include "core/strbuf.h"
     30 #include "obj/obj.h"
     31 
     32 /* ---- public handle ---- */
     33 
/* AArch64 assembler handle.  `base` holds the generic ArchAsm callbacks that
 * aa64_asm_open fills in; the aa64_arch_asm_* adapters cast an ArchAsm* back
 * to AA64Asm*. */
struct AA64Asm {
  ArchAsm base;
  Compiler* c; /* compiler passed to aa64_asm_open */

  /* Inline-asm bound state (set by aa64_inline_bind, cleared otherwise).
   * Operand indexing per GCC convention: 0..nout-1 are outputs, then
   * nout..nout+nin-1 are inputs.  Templates address into this combined
   * list via %N / %wN / %xN / %aN.  out_ops is mutable (the binder fills
   * in result locations); in_ops + constraints + clobbers are read-only
   * borrows. */
  const AsmConstraint* outs;
  Operand* out_ops;
  const AsmConstraint* ins;
  const Operand* in_ops;
  const Sym* clobbers;
  /* Counts handed to aa64_inline_bind alongside outs / ins / clobbers. */
  u32 nout;
  u32 nin;
  u32 nclob;
};
     53 
/* Forward declarations: aa64_asm_open stores these two callbacks in
 * AA64Asm.base before their definitions appear below. */
static void aa64_arch_asm_insn(ArchAsm* base, AsmDriver* d, Sym mnemonic);
static void aa64_arch_asm_destroy(ArchAsm* base);
     56 
     57 AA64Asm* aa64_asm_open(Compiler* c) {
     58   AA64Asm* a = arena_new(c->tu, AA64Asm);
     59   memset(a, 0, sizeof *a);
     60   a->base.insn = aa64_arch_asm_insn;
     61   a->base.destroy = aa64_arch_asm_destroy;
     62   a->c = c;
     63   return a;
     64 }
     65 
     66 void aa64_asm_close(AA64Asm* a) { (void)a; }
     67 
     68 ArchAsm* aa64_arch_asm_new(Compiler* c) { return &aa64_asm_open(c)->base; }
     69 
     70 static void aa64_arch_asm_insn(ArchAsm* base, AsmDriver* d, Sym mnemonic) {
     71   aa64_asm_insn((AA64Asm*)base, d, mnemonic);
     72 }
     73 
     74 static void aa64_arch_asm_destroy(ArchAsm* base) {
     75   aa64_asm_close((AA64Asm*)base);
     76 }
     77 
     78 void aa64_inline_bind(AA64Asm* a, const AsmConstraint* outs, u32 nout,
     79                       Operand* out_ops, const AsmConstraint* ins, u32 nin,
     80                       const Operand* in_ops, const Sym* clobbers, u32 nclob) {
     81   a->outs = outs;
     82   a->out_ops = out_ops;
     83   a->ins = ins;
     84   a->in_ops = in_ops;
     85   a->clobbers = clobbers;
     86   a->nout = nout;
     87   a->nin = nin;
     88   a->nclob = nclob;
     89 }
     90 
     91 /* ---- helpers ---- */
     92 
     93 static int tok_punct(AsmTok t, u32 p) { return asm_driver_tok_is_punct(t, p); }
     94 
     95 static int icase_eq(const char* a, size_t an, const char* b) {
     96   size_t i;
     97   for (i = 0; i < an; ++i) {
     98     char x = a[i], y = b[i];
     99     if (x >= 'A' && x <= 'Z') x = (char)(x + ('a' - 'A'));
    100     if (y >= 'A' && y <= 'Z') y = (char)(y + ('a' - 'A'));
    101     if (x != y || !y) return 0;
    102   }
    103   return b[an] == '\0';
    104 }
    105 
/* A parsed register operand: the 5-bit encoded register number plus flags
 * describing how it was spelled.  GPR spellings handled by
 * parse_reg_from_ident (case-insensitive):
 *   wN, wzr            → is64=0, num=N / 31
 *   xN, xzr, lr (=x30) → is64=1, num=N / 31
 *   sp                 → is64=1, num=31  (is_sp set)
 *   wsp                → is64=0, num=31  (is_sp set)
 * Aliases:
 *   fp = x29
 *   ip0 = x16, ip1 = x17  (PLT scratch — useful for hand-written PLTs)
 * SIMD/FP spellings are filled in by parse_fp_pair_reg_from_ident and
 * parse_fp_scalar_reg_from_ident, which set is_fp and fp_bytes. */
typedef struct AA64Reg {
  u32 num;
  u8 is64;
  u8 is_sp;    /* 1 if the spelling was "sp" / "wsp" */
  u8 is_fp;    /* 1 for SIMD/FP register spellings accepted in FP forms */
  u8 fp_bytes; /* FP access width in bytes: 1/2/4/8/16 for Bn/Hn/Sn/Dn/Qn */
} AA64Reg;
    123 
    124 static int parse_reg_from_ident(AsmDriver* d, Sym ident, AA64Reg* out) {
    125   Slice sl = pool_slice(asm_driver_pool(d), ident);
    126   const char* p = sl.s;
    127   size_t n = sl.len;
    128   if (!p || !n) return 0;
    129   /* "sp" */
    130   if (icase_eq(p, n, "sp")) {
    131     out->num = 31;
    132     out->is64 = 1;
    133     out->is_sp = 1;
    134     out->is_fp = 0;
    135     return 1;
    136   }
    137   if (icase_eq(p, n, "wsp")) {
    138     out->num = 31;
    139     out->is64 = 0;
    140     out->is_sp = 1;
    141     out->is_fp = 0;
    142     return 1;
    143   }
    144   if (icase_eq(p, n, "lr")) {
    145     out->num = 30;
    146     out->is64 = 1;
    147     out->is_sp = 0;
    148     out->is_fp = 0;
    149     return 1;
    150   }
    151   if (icase_eq(p, n, "fp")) {
    152     out->num = 29;
    153     out->is64 = 1;
    154     out->is_sp = 0;
    155     out->is_fp = 0;
    156     return 1;
    157   }
    158   if (icase_eq(p, n, "ip0")) {
    159     out->num = 16;
    160     out->is64 = 1;
    161     out->is_sp = 0;
    162     out->is_fp = 0;
    163     return 1;
    164   }
    165   if (icase_eq(p, n, "ip1")) {
    166     out->num = 17;
    167     out->is64 = 1;
    168     out->is_sp = 0;
    169     out->is_fp = 0;
    170     return 1;
    171   }
    172   if (icase_eq(p, n, "xzr")) {
    173     out->num = 31;
    174     out->is64 = 1;
    175     out->is_sp = 0;
    176     out->is_fp = 0;
    177     return 1;
    178   }
    179   if (icase_eq(p, n, "wzr")) {
    180     out->num = 31;
    181     out->is64 = 0;
    182     out->is_sp = 0;
    183     out->is_fp = 0;
    184     return 1;
    185   }
    186   /* W/X<num> */
    187   if ((p[0] == 'w' || p[0] == 'W' || p[0] == 'x' || p[0] == 'X') && n >= 2) {
    188     u32 r = 0;
    189     size_t i;
    190     for (i = 1; i < n; ++i) {
    191       char c = p[i];
    192       if (c < '0' || c > '9') return 0;
    193       r = r * 10 + (u32)(c - '0');
    194       if (r > 31) return 0;
    195     }
    196     out->num = r;
    197     out->is64 = (p[0] == 'x' || p[0] == 'X') ? 1 : 0;
    198     out->is_sp = 0;
    199     out->is_fp = 0;
    200     return 1;
    201   }
    202   return 0;
    203 }
    204 
    205 static int parse_fp_pair_reg_from_ident(AsmDriver* d, Sym ident, AA64Reg* out) {
    206   Slice sl = pool_slice(asm_driver_pool(d), ident);
    207   const char* p = sl.s;
    208   size_t n = sl.len;
    209   if (!p || n < 2 || (p[0] != 'd' && p[0] != 'D' && p[0] != 'q' && p[0] != 'Q'))
    210     return 0;
    211   u32 r = 0;
    212   for (size_t i = 1; i < n; ++i) {
    213     char c = p[i];
    214     if (c < '0' || c > '9') return 0;
    215     r = r * 10 + (u32)(c - '0');
    216     if (r > 31) return 0;
    217   }
    218   out->num = r;
    219   out->is64 = 1;
    220   out->is_sp = 0;
    221   out->is_fp = 1;
    222   out->fp_bytes = (p[0] == 'q' || p[0] == 'Q') ? 16u : 8u;
    223   return 1;
    224 }
    225 
    226 /* Scalar SIMD/FP transfer register for ldr/str/ldur/stur: b/h/s/d/q with the
    227  * access width in fp_bytes (1/2/4/8/16). Unlike parse_fp_pair_reg_from_ident
    228  * (ldp/stp, d/q only) this accepts the sub-64-bit scalar widths a single-reg
    229  * FP load/store can carry. */
    230 static int parse_fp_scalar_reg_from_ident(AsmDriver* d, Sym ident,
    231                                           AA64Reg* out) {
    232   Slice sl = pool_slice(asm_driver_pool(d), ident);
    233   const char* p = sl.s;
    234   size_t n = sl.len;
    235   u8 bytes;
    236   u32 r = 0;
    237   size_t i;
    238   if (!p || n < 2) return 0;
    239   switch (p[0]) {
    240     case 'b':
    241     case 'B':
    242       bytes = 1;
    243       break;
    244     case 'h':
    245     case 'H':
    246       bytes = 2;
    247       break;
    248     case 's':
    249     case 'S':
    250       bytes = 4;
    251       break;
    252     case 'd':
    253     case 'D':
    254       bytes = 8;
    255       break;
    256     case 'q':
    257     case 'Q':
    258       bytes = 16;
    259       break;
    260     default:
    261       return 0;
    262   }
    263   for (i = 1; i < n; ++i) {
    264     char c = p[i];
    265     if (c < '0' || c > '9') return 0;
    266     r = r * 10 + (u32)(c - '0');
    267     if (r > 31) return 0;
    268   }
    269   out->num = r;
    270   out->is64 = 1;
    271   out->is_sp = 0;
    272   out->is_fp = 1;
    273   out->fp_bytes = bytes;
    274   return 1;
    275 }
    276 
    277 static AA64Reg parse_reg(AsmDriver* d) {
    278   AsmTok t = asm_driver_next(d);
    279   AA64Reg r;
    280   memset(&r, 0, sizeof r);
    281   if (t.kind != ASM_TOK_IDENT || !parse_reg_from_ident(d, t.v.ident, &r))
    282     asm_driver_panic(d, "asm: expected register");
    283   return r;
    284 }
    285 
    286 /* Non-consuming lookahead: is the next operand a register? Used to pick
    287  * between the register and immediate forms of dual-form mnemonics (e.g.
    288  * `and Rd,Rn,Rm` vs `and Rd,Rn,#imm`). */
    289 static int peek_is_reg(AsmDriver* d) {
    290   AsmTok t = asm_driver_peek(d);
    291   AA64Reg r;
    292   return t.kind == ASM_TOK_IDENT && parse_reg_from_ident(d, t.v.ident, &r);
    293 }
    294 
    295 /* ldr/str transfer register: GPR (Wt/Xt) or scalar SIMD/FP (Bt..Qt). */
    296 static AA64Reg parse_ldst_reg(AsmDriver* d) {
    297   AsmTok t = asm_driver_next(d);
    298   AA64Reg r;
    299   memset(&r, 0, sizeof r);
    300   if (t.kind != ASM_TOK_IDENT ||
    301       (!parse_reg_from_ident(d, t.v.ident, &r) &&
    302        !parse_fp_scalar_reg_from_ident(d, t.v.ident, &r)))
    303     asm_driver_panic(d, "asm: ldr/str: expected register");
    304   return r;
    305 }
    306 
/* The (size, V, opc, scale) load/store encoding fields that ldst_encoding
 * resolves from the transfer register and mnemonic flavor. GPR width comes
 * from fixed_size (the sized mnemonics ldrb/ldrsw/…) or the register; FP uses
 * V=1 with size/opc keyed on the scalar width (b/h/s/d = size 0/1/2/3, opc
 * store=0/load=1; the 128-bit q is size=0 opc=2/3). `scale` is the byte
 * access width for the scaled unsigned-imm12 form. */
typedef struct {
  u32 size, V, opc, scale;
} AA64LdStEnc;
    316 
    317 static AA64LdStEnc ldst_encoding(AsmDriver* d, AA64Reg rt, int is_load,
    318                                  int fixed_size, int sign_ext) {
    319   AA64LdStEnc e;
    320   if (rt.is_fp) {
    321     if (fixed_size >= 0 || sign_ext)
    322       asm_driver_panic(d, "asm: sized/signed ld/st takes a GPR, not an FP reg");
    323     e.V = 1;
    324     e.scale = rt.fp_bytes;
    325     e.size = (rt.fp_bytes == 1)   ? 0u
    326              : (rt.fp_bytes == 2) ? 1u
    327              : (rt.fp_bytes == 4) ? 2u
    328              : (rt.fp_bytes == 8) ? 3u
    329                                   : 0u; /* 16 (Q): size=0, opc carries width */
    330     e.opc = (rt.fp_bytes == 16) ? (is_load ? 3u : 2u) : (is_load ? 1u : 0u);
    331   } else {
    332     e.V = 0;
    333     e.size = (fixed_size >= 0) ? (u32)fixed_size : (rt.is64 ? 3u : 2u);
    334     e.scale = 1u << e.size;
    335     e.opc = !is_load    ? AA64_LDST_OPC_STR
    336             : !sign_ext ? AA64_LDST_OPC_LDR
    337             : rt.is64   ? 2u  /* LDRS*, 64-bit dst */
    338                         : 3u; /* LDRS*, 32-bit dst */
    339   }
    340   return e;
    341 }
    342 
    343 static AA64Reg parse_ldstp_reg(AsmDriver* d) {
    344   AsmTok t = asm_driver_next(d);
    345   AA64Reg r;
    346   memset(&r, 0, sizeof r);
    347   if (t.kind != ASM_TOK_IDENT ||
    348       (!parse_reg_from_ident(d, t.v.ident, &r) &&
    349        !parse_fp_pair_reg_from_ident(d, t.v.ident, &r))) {
    350     asm_driver_panic(d, "asm: expected register");
    351   }
    352   return r;
    353 }
    354 
    355 static void reject_sp_reg(AsmDriver* d, AA64Reg r, const char* what) {
    356   if (r.is_sp)
    357     asm_driver_panic(d, "asm: %.*s: SP register not allowed",
    358                      SLICE_ARG(slice_from_cstr(what)));
    359 }
    360 
    361 static void require_sp_spelling(AsmDriver* d, AA64Reg r, const char* what) {
    362   if (r.num == 31u && !r.is_sp)
    363     asm_driver_panic(d, "asm: %.*s: zero register not allowed in SP operand",
    364                      SLICE_ARG(slice_from_cstr(what)));
    365 }
    366 
    367 /* Parse "#imm" (with optional + / -) or a bare expression — GNU as is
    368  * lenient about the leading hash.  Returns an i64. */
    369 static i64 parse_imm_const(AsmDriver* d) {
    370   (void)asm_driver_eat_punct(d, '#');
    371   return asm_driver_parse_const(d);
    372 }
    373 
/* Parse a possibly-symbolic operand.  A leading '#' is consumed when present
 * (its absence is not an error); the symbol and constant parts are produced
 * by asm_driver_parse_sym_expr through *sym_out and *val_out. */
static void parse_imm_sym(AsmDriver* d, ObjSymId* sym_out, i64* val_out) {
  (void)asm_driver_eat_punct(d, '#');
  asm_driver_parse_sym_expr(d, sym_out, val_out);
}
    379 
/* Relocation modifier on an aarch64 operand, as produced by parse_reloc_mod
 * (GNU-as `:lo12:`, `:got:`, `:got_lo12:` prefixes) and parse_reloc_suffix
 * (Mach-O `@...` suffixes).  AA64_RELMOD_NONE means no modifier was present. */
typedef enum AA64RelMod {
  AA64_RELMOD_NONE = 0,
  AA64_RELMOD_PAGE,     /* explicit adrp page reloc (Mach-O `@PAGE`); == bare
                         * adrp */
  AA64_RELMOD_LO12,     /* `:lo12:` / `@PAGEOFF` */
  AA64_RELMOD_GOT,      /* `:got:` / `@GOTPAGE` */
  AA64_RELMOD_GOT_LO12, /* `:got_lo12:` / `@GOTPAGEOFF` */
} AA64RelMod;
    390 
    391 /* True when the assembler's target object format is Mach-O, which spells
    392  * operand relocations as `@PAGE`/`@PAGEOFF` suffixes; ELF/COFF spell them as
    393  * `:lo12:`/`:got:` prefixes.  kit as parses the dialect of its target only
    394  * (no hybrid), mirroring what `cc -S` emits for that format. */
    395 static int target_is_macho(AsmDriver* d) {
    396   return asm_driver_compiler(d)->target.obj == KIT_OBJ_MACHO;
    397 }
    398 
    399 /* If the next token is ':', consume a `:name:` relocation modifier prefix and
    400  * return its kind.  A leading ':' is unambiguous at an operand position (a
    401  * label's ':' only appears at end-of-mnemonic).  Returns AA64_RELMOD_NONE and
    402  * leaves the stream untouched when there is no modifier. */
    403 static AA64RelMod parse_reloc_mod(AsmDriver* d) {
    404   if (!tok_punct(asm_driver_peek(d), ':')) return AA64_RELMOD_NONE;
    405   (void)asm_driver_next(d); /* eat ':' */
    406   AsmTok name = asm_driver_next(d);
    407   if (name.kind != ASM_TOK_IDENT)
    408     asm_driver_panic(d, "asm: expected relocation modifier name after ':'");
    409   Slice s = pool_slice(asm_driver_pool(d), name.v.ident);
    410   AA64RelMod mod;
    411   if (icase_eq(s.s, s.len, "lo12"))
    412     mod = AA64_RELMOD_LO12;
    413   else if (icase_eq(s.s, s.len, "got"))
    414     mod = AA64_RELMOD_GOT;
    415   else if (icase_eq(s.s, s.len, "got_lo12"))
    416     mod = AA64_RELMOD_GOT_LO12;
    417   else
    418     asm_driver_panic(d, "asm: unsupported relocation modifier");
    419   asm_driver_expect_punct(d, ':', "':' closing relocation modifier");
    420   return mod;
    421 }
    422 
    423 /* Mach-O operand relocation suffix: after a symbol(+addend), an optional
    424  * `@PAGE` / `@PAGEOFF` / `@GOTPAGE` / `@GOTPAGEOFF`.  Maps to the same
    425  * AA64RelMod the ELF `:mod:` prefix produces, so downstream encoding/reloc
    426  * emission is shared.  `@PAGE` is the explicit spelling of an adrp page reloc
    427  * (a bare adrp on ELF).  Returns AA64_RELMOD_NONE, stream untouched, when the
    428  * next token is not '@'. */
    429 static AA64RelMod parse_reloc_suffix(AsmDriver* d) {
    430   if (!tok_punct(asm_driver_peek(d), '@')) return AA64_RELMOD_NONE;
    431   (void)asm_driver_next(d); /* eat '@' */
    432   AsmTok name = asm_driver_next(d);
    433   if (name.kind != ASM_TOK_IDENT)
    434     asm_driver_panic(d, "asm: expected relocation suffix name after '@'");
    435   Slice s = pool_slice(asm_driver_pool(d), name.v.ident);
    436   if (icase_eq(s.s, s.len, "PAGE")) return AA64_RELMOD_PAGE;
    437   if (icase_eq(s.s, s.len, "PAGEOFF")) return AA64_RELMOD_LO12;
    438   if (icase_eq(s.s, s.len, "GOTPAGE")) return AA64_RELMOD_GOT;
    439   if (icase_eq(s.s, s.len, "GOTPAGEOFF")) return AA64_RELMOD_GOT_LO12;
    440   asm_driver_panic(d, "asm: unsupported relocation suffix");
    441 }
    442 
    443 /* The R_AARCH64_LDST{8,16,32,64}_ABS_LO12_NC reloc for an access log2-size. */
    444 static RelocKind aa64_ldst_lo12_reloc(AsmDriver* d, u32 size) {
    445   switch (size) {
    446     case 0:
    447       return R_AARCH64_LDST8_ABS_LO12_NC;
    448     case 1:
    449       return R_AARCH64_LDST16_ABS_LO12_NC;
    450     case 2:
    451       return R_AARCH64_LDST32_ABS_LO12_NC;
    452     case 3:
    453       return R_AARCH64_LDST64_ABS_LO12_NC;
    454     default:
    455       asm_driver_panic(d,
    456                        "asm: ldr/str: :lo12: not valid for this access size");
    457   }
    458 }
    459 
    460 /* Printer-side inverse of the operand reloc-modifier parsers above: how a
    461  * relocated aarch64 operand is spelled in `cc -S` text for the target object
    462  * format.  ELF uses a `:mod:` prefix; Mach-O uses an `@MOD` suffix — and even
    463  * a bare adrp page reloc needs an explicit `@PAGE` there.  Kept adjacent to
    464  * the `.s` parser (parse_reloc_mod / parse_reloc_suffix and their call sites)
    465  * so the emit and parse spellings stay in lockstep.  See ArchAsmOps. */
    466 static int aa64_reloc_operand(u16 kind, KitObjFmt fmt, ArchRelocOperand* out) {
    467   ArchRelocSurg surg;
    468   const char* elf;   /* `:mod:` prefix */
    469   const char* macho; /* `@MOD` suffix */
    470   switch (kind) {
    471     case R_AARCH64_CALL26:
    472     case R_AARCH64_JUMP26:
    473     case R_AARCH64_CONDBR19:
    474     case R_AARCH64_ADR_PREL_LO21:
    475       surg = ARCH_RELOC_SURG_TAIL, elf = "", macho = "";
    476       break;
    477     case R_AARCH64_ADR_PREL_PG_HI21:
    478       surg = ARCH_RELOC_SURG_TAIL, elf = "", macho = "@PAGE";
    479       break;
    480     case R_AARCH64_ADR_GOT_PAGE:
    481       surg = ARCH_RELOC_SURG_TAIL, elf = ":got:", macho = "@GOTPAGE";
    482       break;
    483     case R_AARCH64_ADD_ABS_LO12_NC:
    484       surg = ARCH_RELOC_SURG_TAIL, elf = ":lo12:", macho = "@PAGEOFF";
    485       break;
    486     case R_AARCH64_LDST8_ABS_LO12_NC:
    487     case R_AARCH64_LDST16_ABS_LO12_NC:
    488     case R_AARCH64_LDST32_ABS_LO12_NC:
    489     case R_AARCH64_LDST64_ABS_LO12_NC:
    490       surg = ARCH_RELOC_SURG_MEM, elf = ":lo12:", macho = "@PAGEOFF";
    491       break;
    492     case R_AARCH64_LD64_GOT_LO12_NC:
    493       surg = ARCH_RELOC_SURG_MEM, elf = ":got_lo12:", macho = "@GOTPAGEOFF";
    494       break;
    495     default:
    496       return 0; /* TLV and anything else: keep the numeric operand */
    497   }
    498   out->surg = surg;
    499   out->addend_bias = 0; /* aarch64 relocs store the symbol offset directly */
    500   if (fmt == KIT_OBJ_MACHO) {
    501     out->prefix = "";
    502     out->suffix = macho;
    503   } else {
    504     out->prefix = elf;
    505     out->suffix = "";
    506   }
    507   return 1;
    508 }
    509 
    510 /* Intra-section local branches whose target codegen resolved in place (no
    511  * relocation): b, b.<cc>, cbz/cbnz, tbz/tbnz, and adr (address-of-label, e.g.
    512  * `&&label`). Excludes bl (a call — always relocated), adrp (page-relative; its
    513  * lo12 partner carries the reloc), and register-form branches. Moved here from
    514  * the printer so branch-mnemonic knowledge is arch-local. */
    515 static int aa64_is_local_branch(KitSlice m) {
    516   if (m.len == 1 && m.s[0] == 'b') return 1;
    517   if (m.len >= 2 && m.s[0] == 'b' && m.s[1] == '.') return 1;
    518   if (m.len == 3 && memcmp(m.s, "cbz", 3) == 0) return 1;
    519   if (m.len == 4 && memcmp(m.s, "cbnz", 4) == 0) return 1;
    520   if (m.len == 3 && memcmp(m.s, "tbz", 3) == 0) return 1;
    521   if (m.len == 4 && memcmp(m.s, "tbnz", 4) == 0) return 1;
    522   return 0;
    523 }
    524 
/* AArch64 ArchAsmOps table: exposes the reloc-operand spelling and
 * local-branch classification helpers defined above. */
const ArchAsmOps aa64_asm_ops = {
    .reloc_operand = aa64_reloc_operand,
    .is_local_branch = aa64_is_local_branch,
};
    529 
    530 static void emit32(AsmDriver* d, u32 word) {
    531   MCEmitter* mc = asm_driver_mc(d);
    532   (void)asm_driver_cur_section(d);
    533   u8 buf[4];
    534   buf[0] = (u8)(word & 0xff);
    535   buf[1] = (u8)((word >> 8) & 0xff);
    536   buf[2] = (u8)((word >> 16) & 0xff);
    537   buf[3] = (u8)((word >> 24) & 0xff);
    538   mc->emit_bytes(mc, buf, 4);
    539 }
    540 
    541 static int parse_cond_from_ident(AsmDriver* d, Sym ident, u32* out) {
    542   Slice sl = pool_slice(asm_driver_pool(d), ident);
    543   const char* s = sl.s;
    544   size_t n = sl.len;
    545   if (!s) return 0;
    546   if (icase_eq(s, n, "eq"))
    547     *out = 0;
    548   else if (icase_eq(s, n, "ne"))
    549     *out = 1;
    550   else if (icase_eq(s, n, "cs") || icase_eq(s, n, "hs"))
    551     *out = 2;
    552   else if (icase_eq(s, n, "cc") || icase_eq(s, n, "lo"))
    553     *out = 3;
    554   else if (icase_eq(s, n, "mi"))
    555     *out = 4;
    556   else if (icase_eq(s, n, "pl"))
    557     *out = 5;
    558   else if (icase_eq(s, n, "vs"))
    559     *out = 6;
    560   else if (icase_eq(s, n, "vc"))
    561     *out = 7;
    562   else if (icase_eq(s, n, "hi"))
    563     *out = 8;
    564   else if (icase_eq(s, n, "ls"))
    565     *out = 9;
    566   else if (icase_eq(s, n, "ge"))
    567     *out = 10;
    568   else if (icase_eq(s, n, "lt"))
    569     *out = 11;
    570   else if (icase_eq(s, n, "gt"))
    571     *out = 12;
    572   else if (icase_eq(s, n, "le"))
    573     *out = 13;
    574   else if (icase_eq(s, n, "al"))
    575     *out = 14;
    576   else
    577     return 0;
    578   return 1;
    579 }
    580 
    581 static u32 parse_cond(AsmDriver* d, const char* what) {
    582   AsmTok t = asm_driver_next(d);
    583   u32 cond = 0;
    584   if (t.kind != ASM_TOK_IDENT || !parse_cond_from_ident(d, t.v.ident, &cond))
    585     asm_driver_panic(d, "asm: %.*s: expected condition code",
    586                      SLICE_ARG(slice_from_cstr(what)));
    587   return cond;
    588 }
    589 
    590 static void expect_comma(AsmDriver* d, const char* what) {
    591   if (!asm_driver_eat_comma(d))
    592     asm_driver_panic(d, "asm: expected ',' (%.*s)",
    593                      SLICE_ARG(slice_from_cstr(what)));
    594 }
    595 
    596 /* ---- per-mnemonic parsers ---- */
    597 
    598 /* ret [Xn] — Xn defaults to x30. */
    599 static void p_ret(AsmDriver* d) {
    600   if (asm_driver_at_eol(d)) {
    601     emit32(d, aa64_ret(30));
    602     return;
    603   }
    604   AA64Reg r = parse_reg(d);
    605   if (!r.is64) asm_driver_panic(d, "asm: ret: 64-bit register expected");
    606   emit32(d, aa64_ret(r.num));
    607 }
    608 
    609 static void p_br(AsmDriver* d) {
    610   AA64Reg r = parse_reg(d);
    611   if (!r.is64) asm_driver_panic(d, "asm: br: 64-bit register expected");
    612   emit32(d, aa64_br(r.num));
    613 }
    614 
    615 static void p_blr(AsmDriver* d) {
    616   AA64Reg r = parse_reg(d);
    617   if (!r.is64) asm_driver_panic(d, "asm: blr: 64-bit register expected");
    618   emit32(d, aa64_blr(r.num));
    619 }
    620 
    621 static void p_nop(AsmDriver* d) {
    622   (void)d;
    623   emit32(d, aa64_nop());
    624 }
    625 
    626 /* Memory barriers (DMB / DSB / ISB / CLREX).
    627  *
    628  *   dmb <option>        ; option in {sy, ish, nsh, osh, ld, st, ishld,
    629  *                                    ishst, nshld, nshst, oshld, oshst}
    630  *   dmb #imm4           ; numeric form
    631  *   dsb <option> | #imm4
    632  *   isb [<option>]      ; option defaults to sy when omitted
    633  *   clrex [#imm4]       ; option defaults to sy (15) when omitted */
    634 static u32 parse_barrier_option(AsmDriver* d, int allow_dmb_ld_st) {
    635   if (asm_driver_at_eol(d)) return AA64_BARRIER_OPT_SY;
    636   AsmTok t = asm_driver_peek(d);
    637   if (t.kind == ASM_TOK_IDENT) {
    638     (void)asm_driver_next(d);
    639     Slice sl = pool_slice(asm_driver_pool(d), t.v.ident);
    640     const char* s = sl.s;
    641     size_t n = sl.len;
    642     if (icase_eq(s, n, "sy")) return AA64_BARRIER_OPT_SY;
    643     if (icase_eq(s, n, "ish")) return AA64_BARRIER_OPT_ISH;
    644     if (icase_eq(s, n, "ishld")) return AA64_BARRIER_OPT_ISHLD;
    645     if (icase_eq(s, n, "ishst")) return AA64_BARRIER_OPT_ISHST;
    646     if (icase_eq(s, n, "nsh")) return AA64_BARRIER_OPT_NSH;
    647     if (icase_eq(s, n, "nshld")) return AA64_BARRIER_OPT_NSHLD;
    648     if (icase_eq(s, n, "nshst")) return AA64_BARRIER_OPT_NSHST;
    649     if (icase_eq(s, n, "osh")) return AA64_BARRIER_OPT_OSH;
    650     if (icase_eq(s, n, "oshld")) return AA64_BARRIER_OPT_OSHLD;
    651     if (icase_eq(s, n, "oshst")) return AA64_BARRIER_OPT_OSHST;
    652     if (allow_dmb_ld_st) {
    653       if (icase_eq(s, n, "ld")) return AA64_BARRIER_OPT_LD;
    654       if (icase_eq(s, n, "st")) return AA64_BARRIER_OPT_ST;
    655     }
    656     asm_driver_panic(d, "asm: unknown barrier option");
    657   }
    658   /* Numeric form: '#imm4'. */
    659   i64 imm = parse_imm_const(d);
    660   if (imm < 0 || imm > 15) asm_driver_panic(d, "asm: barrier imm out of range");
    661   return (u32)imm;
    662 }
    663 
    664 static void p_dmb(AsmDriver* d) {
    665   u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/1);
    666   emit32(d, aa64_dmb(opt));
    667 }
    668 static void p_dsb(AsmDriver* d) {
    669   u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/0);
    670   emit32(d, aa64_dsb(opt));
    671 }
    672 static void p_isb(AsmDriver* d) {
    673   u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/0);
    674   emit32(d, aa64_isb(opt));
    675 }
    676 static void p_clrex(AsmDriver* d) {
    677   u32 opt = parse_barrier_option(d, /*allow_dmb_ld_st=*/0);
    678   emit32(d, aa64_clrex(opt));
    679 }
    680 
    681 /* System-register access (MRS/MSR register form).
    682  *
    683  *   mrs Xt, <sysreg>     ; read  system register into Xt
    684  *   msr <sysreg>, Xt     ; write Xt into the system register
    685  *
    686  * A system register is named (resolved against the shared isa.c name table
    687  * via aa64_sysreg_by_name) or given by the architectural generic spelling
    688  * S<op0>_<op1>_C<crn>_C<crm>_<op2> (e.g. s3_3_c13_c0_2 == tpidr_el0), which
    689  * covers any encodable register. The immediate PSTATE forms (msr daifset,
    690  * #imm / spsel, #imm) are not handled. */
    691 
    692 /* Consume the next token as a system-register name into the 5 fields. */
    693 static void parse_sysreg(AsmDriver* d, const char* what, u32* op0, u32* op1,
    694                          u32* crn, u32* crm, u32* op2) {
    695   AsmTok t = asm_driver_peek(d);
    696   if (t.kind != ASM_TOK_IDENT)
    697     asm_driver_panic(d, "asm: expected system register");
    698   (void)asm_driver_next(d);
    699   Slice sl = pool_slice(asm_driver_pool(d), t.v.ident);
    700   if (!aa64_sysreg_by_name(sl.s, sl.len, op0, op1, crn, crm, op2))
    701     asm_driver_panic(d, what);
    702 }
    703 
    704 static void p_mrs_(AsmDriver* d) {
    705   AA64Reg rt = parse_reg(d);
    706   if (!rt.is64 || rt.is_sp)
    707     asm_driver_panic(d, "asm: mrs: destination must be a 64-bit GPR");
    708   expect_comma(d, "mrs");
    709   u32 op0, op1, crn, crm, op2;
    710   parse_sysreg(d, "asm: mrs: unknown system register", &op0, &op1, &crn, &crm,
    711                &op2);
    712   emit32(d, aa64_sysreg_move(/*is_read=*/1, op0, op1, crn, crm, op2, rt.num));
    713 }
    714 
    715 static void p_msr_(AsmDriver* d) {
    716   u32 op0, op1, crn, crm, op2;
    717   parse_sysreg(d,
    718                "asm: msr: unknown system register (immediate PSTATE forms "
    719                "like daifset are unsupported)",
    720                &op0, &op1, &crn, &crm, &op2);
    721   expect_comma(d, "msr");
    722   AA64Reg rt = parse_reg(d);
    723   if (!rt.is64 || rt.is_sp)
    724     asm_driver_panic(d, "asm: msr: source must be a 64-bit GPR");
    725   emit32(d, aa64_sysreg_move(/*is_read=*/0, op0, op1, crn, crm, op2, rt.num));
    726 }
    727 
/* mov:
 *   mov Rd, Rm        → aa64_mov_reg (ORR Rd, ZR, Rm), or ADD Rd, Rn, #0 when
 *                       either side is spelled sp/wsp
 *   mov Rd, #imm      → first single instruction that can encode imm:
 *                       MOVZ (all set bits lie in one 16-bit halfword)
 *                       MOVN (the complement's set bits lie in one halfword)
 *                       ORR Rd, ZR, #bitmask (aa64_logimm_encode succeeds)
 *                       otherwise: panic (multi-step expansion deferred).
 * For a W destination the immediate is truncated to 32 bits and only the two
 * low halfwords are tried. */
static void p_mov(AsmDriver* d) {
  AA64Reg rd = parse_reg(d);
  expect_comma(d, "mov");
  /* Peek first: the token is only consumed once it is known to be a
   * register; otherwise it is left for the expression parser below. */
  AsmTok t = asm_driver_peek(d);
  if (t.kind == ASM_TOK_IDENT) {
    AA64Reg src;
    memset(&src, 0, sizeof src);
    if (parse_reg_from_ident(d, t.v.ident, &src)) {
      (void)asm_driver_next(d);
      if (src.is64 != rd.is64)
        asm_driver_panic(d, "asm: mov: register width mismatch");
      /* mov involving SP encodes as `ADD Rd, Rsp, #0` per AArch64;
       * approximate with that exact form. */
      if (rd.is_sp || src.is_sp) {
        /* In the ADD form register 31 means SP on both sides, so a
         * zero-register spelling on the other operand is an error. */
        require_sp_spelling(d, rd, "mov sp");
        require_sp_spelling(d, src, "mov sp");
        emit32(d, aa64_add_imm(rd.is64, rd.num, src.num, 0, 0));
        return;
      }
      emit32(d, aa64_mov_reg(rd.is64, rd.num, src.num));
      return;
    }
    /* fall through: identifier that is not a register → treat as
     * symbol/equate via expression below. */
  }
  /* Immediate. */
  i64 imm = parse_imm_const(d);
  if (rd.is_sp) asm_driver_panic(d, "asm: mov: cannot move imm into SP");
  u64 uv = (u64)imm;
  u64 mask = rd.is64 ? ~0ull : 0xffffffffull;
  uv &= mask; /* W destination: keep only the low 32 bits */
  /* Try MOVZ with each halfword (four for X, two for W). */
  for (u32 hw = 0; hw < (rd.is64 ? 4u : 2u); ++hw) {
    u64 shift = (u64)hw * 16;
    u64 hwmask = 0xffffull << shift;
    if ((uv & ~hwmask) == 0) {
      u32 v = (u32)((uv >> shift) & 0xffff);
      emit32(d, aa64_movz(rd.is64, rd.num, v, hw));
      return;
    }
  }
  /* Try MOVN with one halfword (encodes ~imm in that halfword). */
  u64 nv = (~uv) & mask;
  for (u32 hw = 0; hw < (rd.is64 ? 4u : 2u); ++hw) {
    u64 shift = (u64)hw * 16;
    u64 hwmask = 0xffffull << shift;
    if ((nv & ~hwmask) == 0) {
      u32 v = (u32)((nv >> shift) & 0xffff);
      emit32(d, aa64_movn(rd.is64, rd.num, v, hw));
      return;
    }
  }
  /* Try the ORR-bitmask alias (mov Rd,#imm → ORR Rd,ZR,#bitmask). */
  {
    u32 N = 0, immr = 0, imms = 0;
    if (aa64_logimm_encode(uv, rd.is64, &N, &immr, &imms)) {
      emit32(d, aa64_orr_imm(rd.is64, rd.num, AA64_ZR, N, immr, imms));
      return;
    }
  }
  asm_driver_panic(d, "asm: mov: immediate cannot be encoded in one insn");
}
    795 
    796 /* mvn Rd, Rm */
    797 static void p_mvn(AsmDriver* d) {
    798   AA64Reg rd = parse_reg(d);
    799   expect_comma(d, "mvn");
    800   AA64Reg rm = parse_reg(d);
    801   if (rd.is64 != rm.is64) asm_driver_panic(d, "asm: mvn: width mismatch");
    802   emit32(d, aa64_mvn(rd.is64, rd.num, rm.num));
    803 }
    804 
    805 /* movz / movn / movk Rd, #imm[, lsl #shift] */
    806 static void p_movwide(AsmDriver* d, u32 opc) {
    807   AA64Reg rd = parse_reg(d);
    808   expect_comma(d, "movz/n/k");
    809   i64 imm = parse_imm_const(d);
    810   u32 hw = 0;
    811   if (asm_driver_eat_comma(d)) {
    812     /* lsl #N (N is 0/16/32/48). */
    813     AsmTok lid = asm_driver_next(d);
    814     if (lid.kind != ASM_TOK_IDENT) asm_driver_panic(d, "asm: expected 'lsl'");
    815     Slice lsl = pool_slice(asm_driver_pool(d), lid.v.ident);
    816     const char* lp = lsl.s;
    817     size_t ln = lsl.len;
    818     if (!lp || !icase_eq(lp, ln, "lsl"))
    819       asm_driver_panic(d, "asm: expected 'lsl'");
    820     i64 sh = parse_imm_const(d);
    821     if (sh % 16 != 0 || sh < 0 || sh > 48)
    822       asm_driver_panic(d, "asm: movz/n/k: bad lsl shift");
    823     hw = (u32)(sh / 16);
    824   }
    825   u32 word = ((rd.is64 & 1u) << 31) | ((opc & 3u) << 29) |
    826              AA64_MOVEWIDE_FAMILY_MATCH | ((hw & 3u) << 21) |
    827              (((u32)imm & 0xffffu) << 5) | (rd.num & 0x1fu);
    828   emit32(d, word);
    829 }
    830 
    831 /* svc / brk / hlt #imm */
    832 static void p_except(AsmDriver* d, u32 form) {
    833   i64 imm = parse_imm_const(d);
    834   switch (form) {
    835     case 0:
    836       emit32(d, aa64_svc((u32)imm));
    837       break;
    838     case 1:
    839       emit32(d, aa64_brk((u32)imm));
    840       break;
    841     case 2: {
    842       /* HLT */
    843       u32 word = AA64_EXCEPT_FAMILY_MATCH | ((u32)2 << 21) |
    844                  (((u32)imm & 0xffffu) << 5);
    845       emit32(d, word);
    846       break;
    847     }
    848     default:
    849       asm_driver_panic(d, "asm: bad exception form");
    850   }
    851 }
    852 
    853 /* Read optional `, lsl|lsr|asr|ror #imm` shift modifier. Returns 1 if
    854  * present. */
    855 static int parse_shift_mod(AsmDriver* d, u32* shift_out, u32* imm6_out) {
    856   AsmTok t = asm_driver_peek(d);
    857   if (t.kind != ASM_TOK_IDENT) return 0;
    858   Slice sl = pool_slice(asm_driver_pool(d), t.v.ident);
    859   const char* p = sl.s;
    860   size_t n = sl.len;
    861   u32 sh;
    862   if (icase_eq(p, n, "lsl"))
    863     sh = 0;
    864   else if (icase_eq(p, n, "lsr"))
    865     sh = 1;
    866   else if (icase_eq(p, n, "asr"))
    867     sh = 2;
    868   else if (icase_eq(p, n, "ror"))
    869     sh = 3;
    870   else
    871     return 0;
    872   (void)asm_driver_next(d);
    873   i64 imm = parse_imm_const(d);
    874   if (imm < 0 || imm > 63)
    875     asm_driver_panic(d, "asm: shift amount out of range");
    876   *shift_out = sh;
    877   *imm6_out = (u32)imm;
    878   return 1;
    879 }
    880 
    881 /* add / sub family.
    882  * Forms:
    883  *   add Rd, Rn, Rm[, lsl #s]   shifted-register
    884  *   add Rd, Rn, #imm           immediate
    885  *   add Rd, Rn, #imm, lsl #12  immediate w/ shift
    886  * S-suffixed (adds/subs) sets flags. */
    887 static void p_addsub(AsmDriver* d, int is_sub, int set_flags) {
    888   AA64Reg rd = parse_reg(d);
    889   expect_comma(d, "add/sub");
    890   AA64Reg rn = parse_reg(d);
    891   expect_comma(d, "add/sub");
    892   AsmTok t = asm_driver_peek(d);
    893   /* `add Rd, Rn, <sym lo12>` — ADD (immediate), zero imm12, plus an
    894    * R_AARCH64_ADD_ABS_LO12_NC relocation (the low-12 PIC/abs sequence). ELF
    895    * spells the modifier as a `:lo12:` prefix (leading ':'); Mach-O spells it
    896    * as a `sym@PAGEOFF` suffix, so the trigger there is a non-register IDENT
    897    * third operand (probe with parse_reg_from_ident so `add x0,x1,x2` stays the
    898    * register path). */
    899   int symbolic = 0;
    900   if (!is_sub && !set_flags) {
    901     if (target_is_macho(d)) {
    902       AA64Reg probe;
    903       memset(&probe, 0, sizeof probe);
    904       symbolic = (t.kind == ASM_TOK_IDENT &&
    905                   !parse_reg_from_ident(d, t.v.ident, &probe));
    906     } else {
    907       symbolic = tok_punct(t, ':');
    908     }
    909   }
    910   if (symbolic) {
    911     AA64RelMod mod;
    912     ObjSymId sym = OBJ_SYM_NONE;
    913     i64 off = 0;
    914     if (target_is_macho(d)) {
    915       parse_imm_sym(d, &sym, &off);
    916       mod = parse_reloc_suffix(d);
    917     } else {
    918       mod = parse_reloc_mod(d);
    919       parse_imm_sym(d, &sym, &off);
    920     }
    921     if (mod != AA64_RELMOD_LO12)
    922       asm_driver_panic(d,
    923                        "asm: add: only :lo12: (ELF) / @PAGEOFF (Mach-O) is "
    924                        "valid here");
    925     if (rd.is64 != rn.is64)
    926       asm_driver_panic(d, "asm: add lo12: width mismatch");
    927     u32 word = aa64_addsubimm_pack((AA64AddSubImm){.sf = rd.is64,
    928                                                    .op = 0,
    929                                                    .S = 0,
    930                                                    .sh = 0,
    931                                                    .imm12 = 0,
    932                                                    .Rn = rn.num,
    933                                                    .Rd = rd.num});
    934     emit32(d, word);
    935     MCEmitter* mc = asm_driver_mc(d);
    936     mc->emit_reloc_at(mc, asm_driver_cur_section(d), mc->pos(mc) - 4,
    937                       R_AARCH64_ADD_ABS_LO12_NC, sym, off, 1, 0);
    938     return;
    939   }
    940   if (tok_punct(t, '#') || t.kind == ASM_TOK_NUM || tok_punct(t, '-') ||
    941       tok_punct(t, '+')) {
    942     /* immediate form */
    943     if (rd.is64 != rn.is64)
    944       asm_driver_panic(d, "asm: add/sub imm: width mismatch");
    945     require_sp_spelling(d, rn, "add/sub imm");
    946     if (set_flags) {
    947       reject_sp_reg(d, rd, "add/sub imm");
    948     } else {
    949       require_sp_spelling(d, rd, "add/sub imm");
    950     }
    951     i64 imm = parse_imm_const(d);
    952     u32 sh = 0;
    953     if (asm_driver_eat_comma(d)) {
    954       AsmTok lid = asm_driver_next(d);
    955       if (lid.kind != ASM_TOK_IDENT)
    956         asm_driver_panic(d, "asm: expected 'lsl #12'");
    957       Slice lsl = pool_slice(asm_driver_pool(d), lid.v.ident);
    958       const char* lp = lsl.s;
    959       size_t ln = lsl.len;
    960       if (!lp || !icase_eq(lp, ln, "lsl"))
    961         asm_driver_panic(d, "asm: expected 'lsl'");
    962       i64 s = parse_imm_const(d);
    963       if (s == 12)
    964         sh = 1;
    965       else if (s == 0)
    966         sh = 0;
    967       else
    968         asm_driver_panic(d, "asm: add/sub imm: lsl must be 0 or 12");
    969     }
    970     if (imm < 0 || imm > 0xfff)
    971       asm_driver_panic(d, "asm: add/sub imm out of range");
    972     u32 word = aa64_addsubimm_pack((AA64AddSubImm){.sf = rd.is64,
    973                                                    .op = (u32)is_sub,
    974                                                    .S = (u32)set_flags,
    975                                                    .sh = sh,
    976                                                    .imm12 = (u32)imm,
    977                                                    .Rn = rn.num,
    978                                                    .Rd = rd.num});
    979     emit32(d, word);
    980     return;
    981   }
    982   /* register form */
    983   AA64Reg rm = parse_reg(d);
    984   reject_sp_reg(d, rd, "add/sub reg");
    985   reject_sp_reg(d, rn, "add/sub reg");
    986   reject_sp_reg(d, rm, "add/sub reg");
    987   if (rd.is64 != rm.is64 || rd.is64 != rn.is64)
    988     asm_driver_panic(d, "asm: add/sub reg: width mismatch");
    989   u32 shift = 0, imm6 = 0;
    990   if (asm_driver_eat_comma(d)) {
    991     if (!parse_shift_mod(d, &shift, &imm6))
    992       asm_driver_panic(d, "asm: add/sub reg: expected shift modifier");
    993   }
    994   u32 word = aa64_addsubsr_pack((AA64AddSubSR){.sf = rd.is64,
    995                                                .op = (u32)is_sub,
    996                                                .S = (u32)set_flags,
    997                                                .shift = shift,
    998                                                .Rm = rm.num,
    999                                                .imm6 = imm6,
   1000                                                .Rn = rn.num,
   1001                                                .Rd = rd.num});
   1002   emit32(d, word);
   1003 }
   1004 
   1005 /* cmp Rn, Rm | cmp Rn, #imm  → SUBS ZR, Rn, ... */
   1006 static void p_cmp(AsmDriver* d, int is_neg /* cmn flips op */) {
   1007   AA64Reg rn = parse_reg(d);
   1008   expect_comma(d, "cmp");
   1009   AsmTok t = asm_driver_peek(d);
   1010   if (tok_punct(t, '#') || t.kind == ASM_TOK_NUM || tok_punct(t, '-') ||
   1011       tok_punct(t, '+')) {
   1012     require_sp_spelling(d, rn, "cmp imm");
   1013     i64 imm = parse_imm_const(d);
   1014     u32 sh = 0;
   1015     if (asm_driver_eat_comma(d)) {
   1016       AsmTok lid = asm_driver_next(d);
   1017       Slice lsl = (lid.kind == ASM_TOK_IDENT)
   1018                       ? pool_slice(asm_driver_pool(d), lid.v.ident)
   1019                       : SLICE_NULL;
   1020       const char* lp = lsl.s;
   1021       size_t ln = lsl.len;
   1022       if (!lp || !icase_eq(lp, ln, "lsl"))
   1023         asm_driver_panic(d, "asm: cmp imm: expected 'lsl'");
   1024       i64 s = parse_imm_const(d);
   1025       if (s == 12)
   1026         sh = 1;
   1027       else if (s != 0)
   1028         asm_driver_panic(d, "asm: cmp imm: lsl must be 0 or 12");
   1029     }
   1030     if (imm < 0 || imm > 0xfff)
   1031       asm_driver_panic(d, "asm: cmp imm out of range");
   1032     u32 word = aa64_addsubimm_pack((AA64AddSubImm){.sf = rn.is64,
   1033                                                    .op = (u32)(!is_neg),
   1034                                                    .S = 1,
   1035                                                    .sh = sh,
   1036                                                    .imm12 = (u32)imm,
   1037                                                    .Rn = rn.num,
   1038                                                    .Rd = AA64_ZR});
   1039     emit32(d, word);
   1040     return;
   1041   }
   1042   AA64Reg rm = parse_reg(d);
   1043   reject_sp_reg(d, rn, "cmp reg");
   1044   reject_sp_reg(d, rm, "cmp reg");
   1045   if (rm.is64 != rn.is64) asm_driver_panic(d, "asm: cmp: width mismatch");
   1046   u32 shift = 0, imm6 = 0;
   1047   if (asm_driver_eat_comma(d)) parse_shift_mod(d, &shift, &imm6);
   1048   u32 word = aa64_addsubsr_pack((AA64AddSubSR){.sf = rn.is64,
   1049                                                .op = (u32)(!is_neg),
   1050                                                .S = 1,
   1051                                                .shift = shift,
   1052                                                .Rm = rm.num,
   1053                                                .imm6 = imm6,
   1054                                                .Rn = rn.num,
   1055                                                .Rd = AA64_ZR});
   1056   emit32(d, word);
   1057 }
   1058 
   1059 static void p_condsel(AsmDriver* d, u32 op, u32 op2, const char* what) {
   1060   AA64Reg rd = parse_reg(d);
   1061   expect_comma(d, what);
   1062   AA64Reg rn = parse_reg(d);
   1063   expect_comma(d, what);
   1064   AA64Reg rm = parse_reg(d);
   1065   expect_comma(d, what);
   1066   u32 cond = parse_cond(d, what);
   1067   if (rd.is_sp || rn.is_sp || rm.is_sp)
   1068     asm_driver_panic(d, "asm: %.*s: SP register not allowed",
   1069                      SLICE_ARG(slice_from_cstr(what)));
   1070   if (rd.is64 != rn.is64 || rd.is64 != rm.is64)
   1071     asm_driver_panic(d, "asm: %.*s: width mismatch",
   1072                      SLICE_ARG(slice_from_cstr(what)));
   1073   u32 word = aa64_condsel_pack((AA64CondSel){.sf = (u32)rd.is64,
   1074                                              .op = op,
   1075                                              .S = 0,
   1076                                              .Rm = rm.num,
   1077                                              .cond = cond,
   1078                                              .op2 = op2,
   1079                                              .Rn = rn.num,
   1080                                              .Rd = rd.num});
   1081   emit32(d, word);
   1082 }
   1083 
   1084 static void p_cset_like(AsmDriver* d, u32 op, u32 op2, const char* what) {
   1085   AA64Reg rd = parse_reg(d);
   1086   expect_comma(d, what);
   1087   u32 cond = parse_cond(d, what);
   1088   if (rd.is_sp)
   1089     asm_driver_panic(d, "asm: %.*s: SP register not allowed",
   1090                      SLICE_ARG(slice_from_cstr(what)));
   1091   u32 word = aa64_condsel_pack((AA64CondSel){.sf = (u32)rd.is64,
   1092                                              .op = op,
   1093                                              .S = 0,
   1094                                              .Rm = AA64_ZR,
   1095                                              .cond = cond ^ 1u,
   1096                                              .op2 = op2,
   1097                                              .Rn = AA64_ZR,
   1098                                              .Rd = rd.num});
   1099   emit32(d, word);
   1100 }
   1101 
   1102 /* neg / negs Rd, Rm  → SUB / SUBS Rd, ZR, Rm */
   1103 static void p_neg(AsmDriver* d, int set_flags) {
   1104   AA64Reg rd = parse_reg(d);
   1105   expect_comma(d, "neg");
   1106   AA64Reg rm = parse_reg(d);
   1107   reject_sp_reg(d, rd, "neg");
   1108   reject_sp_reg(d, rm, "neg");
   1109   if (rd.is64 != rm.is64) asm_driver_panic(d, "asm: neg: width mismatch");
   1110   u32 shift = 0, imm6 = 0;
   1111   if (asm_driver_eat_comma(d)) parse_shift_mod(d, &shift, &imm6);
   1112   u32 word = aa64_addsubsr_pack((AA64AddSubSR){.sf = rd.is64,
   1113                                                .op = 1,
   1114                                                .S = (u32)set_flags,
   1115                                                .shift = shift,
   1116                                                .Rm = rm.num,
   1117                                                .imm6 = imm6,
   1118                                                .Rn = AA64_ZR,
   1119                                                .Rd = rd.num});
   1120   emit32(d, word);
   1121 }
   1122 
   1123 /* Logical family: shifted-register `<op> Rd,Rn,Rm{,shift}` or, for the
   1124  * non-negated AND/ORR/EOR/ANDS, the bitmask-immediate `<op> Rd,Rn,#imm`.
   1125  * N is the SR-form negate bit (BIC/ORN/EON/BICS); those have no immediate
   1126  * form, so an immediate third operand is only valid when N==0. */
   1127 static void p_log_sr(AsmDriver* d, u32 opc, u32 N) {
   1128   AA64Reg rd = parse_reg(d);
   1129   expect_comma(d, "logical");
   1130   AA64Reg rn = parse_reg(d);
   1131   expect_comma(d, "logical");
   1132   if (!peek_is_reg(d)) {
   1133     /* Bitmask-immediate form. AND/ORR/EOR use the SP-capable destination;
   1134      * ANDS uses ZR. Rn is always a GPR (caller's parse_reg already enforced
   1135      * GP for the two register operands). */
   1136     if (N) asm_driver_panic(d, "asm: logical: immediate form has no negation");
   1137     if (rd.is64 != rn.is64) asm_driver_panic(d, "asm: logical: width mismatch");
   1138     u64 imm = (u64)parse_imm_const(d);
   1139     u32 bN = 0, immr = 0, imms = 0;
   1140     if (!aa64_logimm_encode(imm, rd.is64, &bN, &immr, &imms))
   1141       asm_driver_panic(d, "asm: logical: immediate is not a valid bitmask");
   1142     emit32(d, aa64_logimm_pack((AA64LogImm){.sf = rd.is64,
   1143                                             .opc = opc,
   1144                                             .N = bN,
   1145                                             .immr = immr,
   1146                                             .imms = imms,
   1147                                             .Rn = rn.num,
   1148                                             .Rd = rd.num}));
   1149     return;
   1150   }
   1151   AA64Reg rm = parse_reg(d);
   1152   if (rd.is64 != rn.is64 || rd.is64 != rm.is64)
   1153     asm_driver_panic(d, "asm: logical: width mismatch");
   1154   u32 shift = 0, imm6 = 0;
   1155   if (asm_driver_eat_comma(d)) parse_shift_mod(d, &shift, &imm6);
   1156   u32 word = aa64_logsr_pack((AA64LogSR){.sf = rd.is64,
   1157                                          .opc = opc,
   1158                                          .shift = shift,
   1159                                          .N = N,
   1160                                          .Rm = rm.num,
   1161                                          .imm6 = imm6,
   1162                                          .Rn = rn.num,
   1163                                          .Rd = rd.num});
   1164   emit32(d, word);
   1165 }
   1166 
   1167 /* Data-processing 3-source: madd/msub Rd, Rn, Rm, Ra. */
   1168 static void p_dp3(AsmDriver* d, u32 o0) {
   1169   AA64Reg rd = parse_reg(d);
   1170   expect_comma(d, "dp3");
   1171   AA64Reg rn = parse_reg(d);
   1172   expect_comma(d, "dp3");
   1173   AA64Reg rm = parse_reg(d);
   1174   expect_comma(d, "dp3");
   1175   AA64Reg ra = parse_reg(d);
   1176   if (rd.is64 != rn.is64 || rd.is64 != rm.is64 || rd.is64 != ra.is64)
   1177     asm_driver_panic(d, "asm: dp3: width mismatch");
   1178   u32 word = aa64_dp3_pack((AA64DP3){.sf = rd.is64,
   1179                                      .op31 = 0,
   1180                                      .o0 = o0,
   1181                                      .Rm = rm.num,
   1182                                      .Ra = ra.num,
   1183                                      .Rn = rn.num,
   1184                                      .Rd = rd.num});
   1185   emit32(d, word);
   1186 }
   1187 
   1188 /* mul Rd, Rn, Rm  → MADD Rd, Rn, Rm, ZR */
   1189 static void p_mul(AsmDriver* d, u32 o0) {
   1190   AA64Reg rd = parse_reg(d);
   1191   expect_comma(d, "mul");
   1192   AA64Reg rn = parse_reg(d);
   1193   expect_comma(d, "mul");
   1194   AA64Reg rm = parse_reg(d);
   1195   if (rd.is64 != rn.is64 || rd.is64 != rm.is64)
   1196     asm_driver_panic(d, "asm: mul: width mismatch");
   1197   u32 word = aa64_dp3_pack((AA64DP3){.sf = rd.is64,
   1198                                      .op31 = 0,
   1199                                      .o0 = o0,
   1200                                      .Rm = rm.num,
   1201                                      .Ra = AA64_ZR,
   1202                                      .Rn = rn.num,
   1203                                      .Rd = rd.num});
   1204   emit32(d, word);
   1205 }
   1206 
   1207 /* DP2: udiv/sdiv/lslv/lsrv/asrv/rorv Rd, Rn, Rm. */
   1208 static void p_dp2(AsmDriver* d, u32 opcode) {
   1209   AA64Reg rd = parse_reg(d);
   1210   expect_comma(d, "dp2");
   1211   AA64Reg rn = parse_reg(d);
   1212   expect_comma(d, "dp2");
   1213   AA64Reg rm = parse_reg(d);
   1214   if (rd.is64 != rn.is64 || rd.is64 != rm.is64)
   1215     asm_driver_panic(d, "asm: dp2: width mismatch");
   1216   u32 word = aa64_dp2_pack((AA64DP2){.sf = rd.is64,
   1217                                      .opcode = opcode,
   1218                                      .Rm = rm.num,
   1219                                      .Rn = rn.num,
   1220                                      .Rd = rd.num});
   1221   emit32(d, word);
   1222 }
   1223 
   1224 /* Shift aliases: `<op> Rd, Rn, (Rm | #imm)`.
   1225  *   register form  → LSLV/LSRV/ASRV (DP2 variable shift)
   1226  *   immediate form → UBFM (lsl/lsr) / SBFM (asr) bitfield alias
   1227  * `kind` indexes the three shifts: 0=lsl 1=lsr 2=asr. The immediate aliases
   1228  * are exactly what the disassembler prints for these UBFM/SBFM encodings, so
   1229  * `cc -S | as` round-trips. (ROR's immediate form is EXTR, which the
   1230  * disassembler doesn't decode, so it is left out — `rorv` covers the register
   1231  * rotate.) */
   1232 static void p_shift(AsmDriver* d, u32 kind) {
   1233   static const u32 dp2op[3] = {AA64_DP2_LSLV_OP, AA64_DP2_LSRV_OP,
   1234                                AA64_DP2_ASRV_OP};
   1235   AA64Reg rd = parse_reg(d);
   1236   expect_comma(d, "shift");
   1237   AA64Reg rn = parse_reg(d);
   1238   if (rd.is64 != rn.is64) asm_driver_panic(d, "asm: shift: width mismatch");
   1239   expect_comma(d, "shift");
   1240   if (peek_is_reg(d)) {
   1241     AA64Reg rm = parse_reg(d);
   1242     if (rd.is64 != rm.is64) asm_driver_panic(d, "asm: shift: width mismatch");
   1243     emit32(d, aa64_dp2_pack((AA64DP2){.sf = rd.is64,
   1244                                       .opcode = dp2op[kind],
   1245                                       .Rm = rm.num,
   1246                                       .Rn = rn.num,
   1247                                       .Rd = rd.num}));
   1248     return;
   1249   }
   1250   i64 sv = parse_imm_const(d);
   1251   u32 width = rd.is64 ? 64u : 32u;
   1252   if (sv < 0 || (u64)sv >= width)
   1253     asm_driver_panic(d, "asm: shift: amount out of range");
   1254   u32 shift = (u32)sv, immr = 0, imms = 0;
   1255   if (kind == 2) { /* asr → SBFM */
   1256     aa64_asr_imm_fields(shift, rd.is64, &immr, &imms);
   1257     emit32(d, aa64_bitfield(rd.is64, 0u, immr, imms, rd.num, rn.num));
   1258   } else { /* lsl/lsr → UBFM */
   1259     if (kind == 0)
   1260       aa64_lsl_imm_fields(shift, rd.is64, &immr, &imms);
   1261     else
   1262       aa64_lsr_imm_fields(shift, rd.is64, &immr, &imms);
   1263     emit32(d, aa64_bitfield(rd.is64, 2u, immr, imms, rd.num, rn.num));
   1264   }
   1265 }
   1266 
   1267 /* Branch immediate / conditional / compare-and-branch. */
   1268 
   1269 static void emit_branch_imm(AsmDriver* d, u32 op_bl, ObjSymId target,
   1270                             i64 addend, i64 const_disp) {
   1271   MCEmitter* mc = asm_driver_mc(d);
   1272   /* Emit a B/BL with imm26 = 0; record a CALL26/JUMP26 reloc against
   1273    * either the symbol or the constant displacement. */
   1274   u32 word = aa64_brimm_pack((AA64BrImm){.op = op_bl, .imm26 = 0});
   1275   emit32(d, word);
   1276   u32 ofs = mc->pos(mc) - 4;
   1277   RelocKind k = op_bl ? R_AARCH64_CALL26 : R_AARCH64_JUMP26;
   1278   if (target != OBJ_SYM_NONE) {
   1279     mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs, k, target, addend, 1,
   1280                       0);
   1281   } else {
   1282     /* Pure constant displacement is rare in real .s; reject it now.
   1283      * The recommended form is to use a label and let the assembler
   1284      * compute the displacement. */
   1285     (void)const_disp;
   1286     asm_driver_panic(d, "asm: branch with pure constant disp not supported");
   1287   }
   1288 }
   1289 
   1290 static void p_b(AsmDriver* d, u32 op_bl) {
   1291   ObjSymId sym = OBJ_SYM_NONE;
   1292   i64 off = 0;
   1293   /* GNU as accepts `b sym`, `bl sym+8`, etc. */
   1294   parse_imm_sym(d, &sym, &off);
   1295   if (sym == OBJ_SYM_NONE)
   1296     asm_driver_panic(d, "asm: b/bl: symbolic target required");
   1297   emit_branch_imm(d, op_bl, sym, off, 0);
   1298 }
   1299 
   1300 static void p_b_cond(AsmDriver* d, u32 cond) {
   1301   ObjSymId sym = OBJ_SYM_NONE;
   1302   i64 off = 0;
   1303   parse_imm_sym(d, &sym, &off);
   1304   if (sym == OBJ_SYM_NONE)
   1305     asm_driver_panic(d, "asm: b.cond: symbolic target required");
   1306   /* Emit the instruction with imm19=0 + R_AARCH64_CONDBR19 reloc. */
   1307   u32 word = aa64_brcond_pack((AA64BrCond){.imm19 = 0, .cond = cond});
   1308   emit32(d, word);
   1309   MCEmitter* mc = asm_driver_mc(d);
   1310   u32 ofs = mc->pos(mc) - 4;
   1311   mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs, R_AARCH64_CONDBR19, sym,
   1312                     off, 1, 0);
   1313 }
   1314 
   1315 static void p_cbz(AsmDriver* d, u32 op) {
   1316   AA64Reg rt = parse_reg(d);
   1317   expect_comma(d, "cbz");
   1318   ObjSymId sym = OBJ_SYM_NONE;
   1319   i64 off = 0;
   1320   parse_imm_sym(d, &sym, &off);
   1321   if (sym == OBJ_SYM_NONE)
   1322     asm_driver_panic(d, "asm: cbz: symbolic target required");
   1323   u32 word =
   1324       aa64_cb_pack((AA64CB){.sf = rt.is64, .op = op, .imm19 = 0, .Rt = rt.num});
   1325   emit32(d, word);
   1326   MCEmitter* mc = asm_driver_mc(d);
   1327   u32 ofs = mc->pos(mc) - 4;
   1328   mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs, R_AARCH64_CONDBR19, sym,
   1329                     off, 1, 0);
   1330 }
   1331 
/* Memory-operand parser.  Recognized shapes:
 *   [Xn]                                base only
 *   [Xn, #imm]                          base + immediate offset
 *   [Xn, #imm]!                         pre-index (writeback)
 *   [Xn], #imm                          post-index (writeback)
 *   [Xn, Xm]                            register offset (LSL #0)
 *   [Xn, Xm, LSL #s]                    register offset, scaled
 *   [Xn, Wm, {U,S}XTW {#s}]             32-bit index, extended
 *   [Xn, Xm, {U,S}XTX {#s}]             64-bit index, extended
 *   [Xn, :lo12:sym] / [Xn, sym@PAGEOFF] relocated low-12 offset (ELF / Mach-O)
 *
 * imm is the literal byte offset (no scaling).  When has_index is set,
 * `index` is the index register, `option` its 3-bit extend code, and
 * shift_present records whether an explicit `#s` was written (with the
 * amount in `shift`).  pre_index / post_index flag the writeback forms.
 * The parser zero-fills the struct first, so unset fields read as 0. */
typedef struct AA64Mem {
  AA64Reg base;  /* base register; the parser requires it to be 64-bit */
  AA64Reg index; /* index register, meaningful only when has_index is set */
  i64 imm; /* byte offset (literal as written) */
  u32 option; /* AA64_LDST_OPTION_* extend code for the index register */
  u32 shift;  /* explicit index shift amount; valid when shift_present */
  AA64RelMod reloc_mod; /* :lo12: / :got_lo12: on the offset, or NONE */
  ObjSymId reloc_sym;   /* symbol when reloc_mod != NONE */
  i64 reloc_off;        /* addend when reloc_mod != NONE */
  u8 pre_index;     /* `[...]!` was written */
  u8 post_index;    /* `[Xn], #imm` was written */
  u8 has_offset;    /* an immediate or relocated offset was supplied */
  u8 has_index;     /* a register index was supplied */
  u8 shift_present; /* an explicit `#s` followed the extend keyword */
  u8 pad[3];
} AA64Mem;
   1362 
   1363 /* Parse the optional extend/shift modifier of a register-offset memory
   1364  * operand: `LSL #s`, `UXTW {#s}`, `SXTW {#s}`, `UXTX {#s}`, `SXTX {#s}`.
   1365  * The index register width (32 vs 64) must agree with the extend kind.
   1366  * Fills m->option / m->shift / m->shift_present. */
   1367 static void parse_mem_extend(AsmDriver* d, AA64Mem* m) {
   1368   AsmTok t = asm_driver_next(d);
   1369   if (t.kind != ASM_TOK_IDENT)
   1370     asm_driver_panic(d, "asm: ldr/str: expected extend (lsl/sxtw/uxtw/...)");
   1371   Slice sl = pool_slice(asm_driver_pool(d), t.v.ident);
   1372   const char* p = sl.s;
   1373   size_t n = sl.len;
   1374   int need64 = 0; /* index must be 64-bit */
   1375   if (icase_eq(p, n, "lsl") || icase_eq(p, n, "uxtx")) {
   1376     m->option = AA64_LDST_OPTION_LSL;
   1377     need64 = 1;
   1378   } else if (icase_eq(p, n, "sxtx")) {
   1379     m->option = AA64_LDST_OPTION_SXTX;
   1380     need64 = 1;
   1381   } else if (icase_eq(p, n, "uxtw")) {
   1382     m->option = AA64_LDST_OPTION_UXTW;
   1383     need64 = 0;
   1384   } else if (icase_eq(p, n, "sxtw")) {
   1385     m->option = AA64_LDST_OPTION_SXTW;
   1386     need64 = 0;
   1387   } else {
   1388     asm_driver_panic(d, "asm: ldr/str: unknown index extend");
   1389   }
   1390   if (need64 && !m->index.is64)
   1391     asm_driver_panic(d, "asm: ldr/str: index must be 64-bit for this extend");
   1392   if (!need64 && m->index.is64)
   1393     asm_driver_panic(d, "asm: ldr/str: index must be 32-bit for sxtw/uxtw");
   1394   /* LSL requires an explicit shift; the extends accept an optional one. */
   1395   AsmTok nt = asm_driver_peek(d);
   1396   if (tok_punct(nt, '#') || nt.kind == ASM_TOK_NUM) {
   1397     i64 s = parse_imm_const(d);
   1398     if (s < 0) asm_driver_panic(d, "asm: ldr/str: negative index shift");
   1399     m->shift = (u32)s;
   1400     m->shift_present = 1;
   1401   } else if (m->option == AA64_LDST_OPTION_LSL) {
   1402     asm_driver_panic(d, "asm: ldr/str: lsl requires a shift amount");
   1403   }
   1404 }
   1405 
   1406 static AA64Mem parse_mem(AsmDriver* d) {
   1407   AA64Mem m;
   1408   memset(&m, 0, sizeof m);
   1409   if (!asm_driver_eat_punct(d, '[')) asm_driver_panic(d, "asm: expected '['");
   1410   m.base = parse_reg(d);
   1411   if (!m.base.is64)
   1412     asm_driver_panic(d, "asm: ldr/str: base register must be 64-bit");
   1413   require_sp_spelling(d, m.base, "ldr/str base");
   1414   if (asm_driver_eat_comma(d)) {
   1415     /* A relocation offset (ELF `:lo12:sym`/`:got_lo12:sym` prefix, or Mach-O
   1416      * `sym@PAGEOFF`/`sym@GOTPAGEOFF` suffix), a register index, or a plain
   1417      * `#imm`/expression. */
   1418     AsmTok t = asm_driver_peek(d);
   1419     AA64Reg idx;
   1420     memset(&idx, 0, sizeof idx);
   1421     if (!target_is_macho(d) && tok_punct(t, ':')) {
   1422       m.reloc_mod = parse_reloc_mod(d);
   1423       parse_imm_sym(d, &m.reloc_sym, &m.reloc_off);
   1424       m.has_offset = 1; /* imm field stays 0; reloc supplies the low bits */
   1425     } else if (t.kind == ASM_TOK_IDENT &&
   1426                parse_reg_from_ident(d, t.v.ident, &idx)) {
   1427       (void)asm_driver_next(d);
   1428       reject_sp_reg(d, idx, "ldr/str index");
   1429       m.index = idx;
   1430       m.has_index = 1;
   1431       m.option = idx.is64 ? AA64_LDST_OPTION_LSL : AA64_LDST_OPTION_UXTW;
   1432       if (asm_driver_eat_comma(d)) parse_mem_extend(d, &m);
   1433     } else if (target_is_macho(d) && t.kind == ASM_TOK_IDENT) {
   1434       /* Mach-O: `[Xn, sym@PAGEOFF]` / `[Xn, sym@GOTPAGEOFF]`. */
   1435       parse_imm_sym(d, &m.reloc_sym, &m.reloc_off);
   1436       m.reloc_mod = parse_reloc_suffix(d);
   1437       if (m.reloc_mod != AA64_RELMOD_LO12 &&
   1438           m.reloc_mod != AA64_RELMOD_GOT_LO12)
   1439         asm_driver_panic(
   1440             d, "asm: ldr/str: symbol offset needs @PAGEOFF/@GOTPAGEOFF");
   1441       m.has_offset = 1;
   1442     } else {
   1443       m.imm = parse_imm_const(d);
   1444       m.has_offset = 1;
   1445     }
   1446   }
   1447   if (!asm_driver_eat_punct(d, ']')) asm_driver_panic(d, "asm: expected ']'");
   1448   if (asm_driver_eat_punct(d, '!')) {
   1449     if (m.has_index)
   1450       asm_driver_panic(d, "asm: ldr/str: writeback not allowed with index");
   1451     m.pre_index = 1;
   1452   } else if (asm_driver_eat_comma(d)) {
   1453     /* Post-index: `[Xn], #imm`. */
   1454     if (m.has_index || m.has_offset)
   1455       asm_driver_panic(d, "asm: ldr/str: malformed post-index operand");
   1456     m.imm = parse_imm_const(d);
   1457     m.has_offset = 1;
   1458     m.post_index = 1;
   1459   }
   1460   return m;
   1461 }
   1462 
   1463 /* ldr/str Rt, [Xn, #imm] — chooses scaled or unscaled form based on
   1464  * alignment of imm. */
   1465 /* Core load/store. `fixed_size` is the access log2-size (0=byte..3=dword) for
   1466  * ldrb/ldrh/ldrsw etc., or -1 to derive it from the register width (ldr/str).
   1467  * `sign_ext` selects the signed-load opc (10 = sign-extend to 64-bit, 11 = to
   1468  * 32-bit), keyed on the destination register width. */
   1469 static void p_ldst_core(AsmDriver* d, int is_load, int fixed_size,
   1470                         int sign_ext) {
   1471   AA64Reg rt = parse_ldst_reg(d);
   1472   reject_sp_reg(d, rt, "ldr/str");
   1473   expect_comma(d, "ldr/str");
   1474   AA64Mem m = parse_mem(d);
   1475   AA64LdStEnc e = ldst_encoding(d, rt, is_load, fixed_size, sign_ext);
   1476   u32 size = e.size, opc = e.opc, V = e.V;
   1477   if (m.reloc_mod != AA64_RELMOD_NONE) {
   1478     /* [Xn, :lo12:sym] / [Xn, :got_lo12:sym] — unsigned-imm12 form with a zero
   1479      * immediate; the relocation supplies the low 12 bits.  :got_lo12: only
   1480      * applies to a 64-bit `ldr` (the GOT entry is an 8-byte pointer); llvm-mc
   1481      * rejects it on stores, signed loads, and sub-word loads. */
   1482     if (m.reloc_mod == AA64_RELMOD_GOT_LO12 &&
   1483         !(V == 0 && size == 3 && opc == AA64_LDST_OPC_LDR))
   1484       asm_driver_panic(d, "asm: :got_lo12: only valid on a 64-bit ldr");
   1485     u32 word = aa64_ldst_uimm_pack((AA64LdStUimm){.size = size,
   1486                                                   .V = V,
   1487                                                   .opc = opc,
   1488                                                   .imm12 = 0,
   1489                                                   .Rn = m.base.num,
   1490                                                   .Rt = rt.num});
   1491     emit32(d, word);
   1492     RelocKind k = (m.reloc_mod == AA64_RELMOD_GOT_LO12)
   1493                       ? R_AARCH64_LD64_GOT_LO12_NC
   1494                       : aa64_ldst_lo12_reloc(d, size);
   1495     MCEmitter* mc = asm_driver_mc(d);
   1496     mc->emit_reloc_at(mc, asm_driver_cur_section(d), mc->pos(mc) - 4, k,
   1497                       m.reloc_sym, m.reloc_off, 1, 0);
   1498     return;
   1499   }
   1500   if (m.has_index) {
   1501     /* Register-offset form.  The S bit (scale by access size) is set when an
   1502      * explicit shift equal to the access log2-size is written.  An explicit
   1503      * `#0` is legal and stays unscaled (S=0); for byte access #0 == size so it
   1504      * sets S — matching llvm-mc, where the only legal amounts are 0 or size. */
   1505     u32 S = 0;
   1506     if (m.shift_present) {
   1507       if (m.shift == size)
   1508         S = 1;
   1509       else if (m.shift != 0)
   1510         asm_driver_panic(d,
   1511                          "asm: ldr/str: index shift must be 0 or access size");
   1512     }
   1513     u32 word = aa64_ldst_regoff_pack((AA64LdStRegOff){.size = size,
   1514                                                       .V = V,
   1515                                                       .opc = opc,
   1516                                                       .Rm = m.index.num,
   1517                                                       .option = m.option,
   1518                                                       .S = S,
   1519                                                       .Rn = m.base.num,
   1520                                                       .Rt = rt.num});
   1521     emit32(d, word);
   1522     return;
   1523   }
   1524   if (m.pre_index || m.post_index) {
   1525     /* Immediate writeback (unscaled signed imm9). */
   1526     if (m.imm < -256 || m.imm > 255)
   1527       asm_driver_panic(d, "asm: ldr/str: writeback imm9 out of range");
   1528     u32 imm9 = (u32)((u64)m.imm & 0x1ffu);
   1529     u32 idx = m.pre_index ? AA64_LDST_IDX_PRE : AA64_LDST_IDX_POST;
   1530     u32 word = aa64_ldst_wback_pack((AA64LdStWBack){.size = size,
   1531                                                     .V = V,
   1532                                                     .opc = opc,
   1533                                                     .imm9 = imm9,
   1534                                                     .idx = idx,
   1535                                                     .Rn = m.base.num,
   1536                                                     .Rt = rt.num});
   1537     emit32(d, word);
   1538     return;
   1539   }
   1540   {
   1541     /* Try scaled unsigned-imm12 first. */
   1542     u32 scale = e.scale;
   1543     if (m.imm >= 0 && (i64)((u64)m.imm % scale) == 0 &&
   1544         (u64)m.imm / scale <= 0xfff) {
   1545       u32 imm12 = (u32)((u64)m.imm / scale);
   1546       u32 word = aa64_ldst_uimm_pack((AA64LdStUimm){.size = size,
   1547                                                     .V = V,
   1548                                                     .opc = opc,
   1549                                                     .imm12 = imm12,
   1550                                                     .Rn = m.base.num,
   1551                                                     .Rt = rt.num});
   1552       emit32(d, word);
   1553       return;
   1554     }
   1555     /* Fall back to unscaled signed-imm9 (LDUR/STUR). */
   1556     if (m.imm >= -256 && m.imm <= 255) {
   1557       u32 imm9 = (u32)((u64)m.imm & 0x1ffu);
   1558       u32 word = aa64_ldst_simm9_pack((AA64LdStSimm9){.size = size,
   1559                                                       .V = V,
   1560                                                       .opc = opc,
   1561                                                       .imm9 = imm9,
   1562                                                       .Rn = m.base.num,
   1563                                                       .Rt = rt.num});
   1564       emit32(d, word);
   1565       return;
   1566     }
   1567     asm_driver_panic(d, "asm: ldr/str: immediate out of range");
   1568   }
   1569 }
   1570 
   1571 /* ldr/str: access width follows the register (Wt=word, Xt=dword). */
   1572 static void p_ldr_str(AsmDriver* d, int is_load) {
   1573   p_ldst_core(d, is_load, /*fixed_size=*/-1, /*sign_ext=*/0);
   1574 }
   1575 /* Byte/half + signed sub-word loads/stores (fixed access width). */
   1576 static void p_ldrb(AsmDriver* d) { p_ldst_core(d, 1, 0, 0); }
   1577 static void p_strb(AsmDriver* d) { p_ldst_core(d, 0, 0, 0); }
   1578 static void p_ldrh(AsmDriver* d) { p_ldst_core(d, 1, 1, 0); }
   1579 static void p_strh(AsmDriver* d) { p_ldst_core(d, 0, 1, 0); }
   1580 static void p_ldrsb(AsmDriver* d) { p_ldst_core(d, 1, 0, 1); }
   1581 static void p_ldrsh(AsmDriver* d) { p_ldst_core(d, 1, 1, 1); }
   1582 static void p_ldrsw(AsmDriver* d) { p_ldst_core(d, 1, 2, 1); }
   1583 
   1584 /* ldur/stur — unscaled signed-imm9. `fixed_size` is the access log2-size
   1585  * (0=byte..3=dword) for sturb/ldurb/sturh/ldurh/ldursw etc., or -1 to derive
   1586  * it from the register width (stur/ldur). `sign_ext` selects the signed-load
   1587  * opc (ldursb/ldursh/ldursw), keyed on the destination register width — the
   1588  * unscaled mirror of p_ldst_core. */
   1589 static void p_ldur_stur(AsmDriver* d, int is_load, int fixed_size,
   1590                         int sign_ext) {
   1591   AA64Reg rt = parse_ldst_reg(d);
   1592   reject_sp_reg(d, rt, "ldur/stur");
   1593   expect_comma(d, "ldur/stur");
   1594   AA64Mem m = parse_mem(d);
   1595   AA64LdStEnc e = ldst_encoding(d, rt, is_load, fixed_size, sign_ext);
   1596   if (m.imm < -256 || m.imm > 255)
   1597     asm_driver_panic(d, "asm: ldur/stur: imm9 out of range");
   1598   u32 imm9 = (u32)((u64)m.imm & 0x1ffu);
   1599   u32 word = aa64_ldst_simm9_pack((AA64LdStSimm9){.size = e.size,
   1600                                                   .V = e.V,
   1601                                                   .opc = e.opc,
   1602                                                   .imm9 = imm9,
   1603                                                   .Rn = m.base.num,
   1604                                                   .Rt = rt.num});
   1605   emit32(d, word);
   1606 }
   1607 
   1608 /* ldp / stp Rt, Rt2, [Xn, #imm] or [Xn, #imm]! */
   1609 static void p_ldp_stp(AsmDriver* d, int is_load) {
   1610   AA64Reg rt = parse_ldstp_reg(d);
   1611   expect_comma(d, "ldp/stp");
   1612   AA64Reg rt2 = parse_ldstp_reg(d);
   1613   expect_comma(d, "ldp/stp");
   1614   reject_sp_reg(d, rt, "ldp/stp");
   1615   reject_sp_reg(d, rt2, "ldp/stp");
   1616   if (rt.is64 != rt2.is64 || rt.is_fp != rt2.is_fp ||
   1617       rt.fp_bytes != rt2.fp_bytes)
   1618     asm_driver_panic(d, "asm: ldp/stp: width mismatch");
   1619   AA64Mem m = parse_mem(d);
   1620   u32 scale = rt.is_fp ? (u32)rt.fp_bytes : (rt.is64 ? 8u : 4u);
   1621   if ((i64)((u64)m.imm % scale) != 0)
   1622     asm_driver_panic(d, "asm: ldp/stp: imm not scale-aligned");
   1623   i64 imm7 = m.imm / (i64)scale;
   1624   if (imm7 < -64 || imm7 > 63)
   1625     asm_driver_panic(d, "asm: ldp/stp: imm7 out of range");
   1626   AA64LdStPPre f = {
   1627       .opc = rt.is_fp ? (rt.fp_bytes == 16u ? 2u : 1u) : (rt.is64 ? 2u : 0u),
   1628       .V = rt.is_fp ? 1u : 0u,
   1629       .L = is_load ? 1u : 0u,
   1630       .imm7 = (u32)imm7 & 0x7fu,
   1631       .Rt2 = rt2.num,
   1632       .Rn = m.base.num,
   1633       .Rt = rt.num};
   1634   if (m.pre_index)
   1635     emit32(d, aa64_ldstp_pre_pack(f));
   1636   else if (m.post_index)
   1637     emit32(d, aa64_ldstp_post_pack(f));
   1638   else
   1639     emit32(d, aa64_ldstp_soff_pack(f));
   1640 }
   1641 
   1642 /* adr / adrp Rd, sym */
   1643 static void p_adr(AsmDriver* d, int is_adrp) {
   1644   AA64Reg rd = parse_reg(d);
   1645   expect_comma(d, "adr");
   1646   /* adrp page reloc on a symbol: ELF spells a bare symbol (`:got:` selects the
   1647    * GOT page); Mach-O spells `sym@PAGE` / `sym@GOTPAGE`.  adr takes a bare
   1648    * symbol on both.  cc -S emits the form matching the target format. */
   1649   AA64RelMod mod = AA64_RELMOD_NONE;
   1650   ObjSymId sym = OBJ_SYM_NONE;
   1651   i64 off = 0;
   1652   if (target_is_macho(d)) {
   1653     parse_imm_sym(d, &sym, &off);
   1654     mod = parse_reloc_suffix(d);
   1655   } else {
   1656     mod = parse_reloc_mod(d);
   1657     parse_imm_sym(d, &sym, &off);
   1658   }
   1659   if (!is_adrp) {
   1660     if (mod != AA64_RELMOD_NONE)
   1661       asm_driver_panic(d, "asm: adr: no relocation modifier valid here");
   1662   } else if (mod != AA64_RELMOD_NONE && mod != AA64_RELMOD_PAGE &&
   1663              mod != AA64_RELMOD_GOT) {
   1664     asm_driver_panic(d,
   1665                      "asm: adrp: only @PAGE/@GOTPAGE (Mach-O) or :got: "
   1666                      "(ELF) valid here");
   1667   }
   1668   if (sym == OBJ_SYM_NONE)
   1669     asm_driver_panic(d, "asm: adr/adrp: symbol required");
   1670   AA64PCRelAdr f = {.op = is_adrp ? AA64_ADR_OP_ADRP : AA64_ADR_OP_ADR,
   1671                     .immlo = 0,
   1672                     .immhi = 0,
   1673                     .Rd = rd.num};
   1674   emit32(d, aa64_pcrel_adr_pack(f));
   1675   MCEmitter* mc = asm_driver_mc(d);
   1676   u32 ofs = mc->pos(mc) - 4;
   1677   RelocKind k = !is_adrp                 ? R_AARCH64_ADR_PREL_LO21
   1678                 : mod == AA64_RELMOD_GOT ? R_AARCH64_ADR_GOT_PAGE
   1679                                          : R_AARCH64_ADR_PREL_PG_HI21;
   1680   mc->emit_reloc_at(mc, asm_driver_cur_section(d), ofs, k, sym, off, 1, 0);
   1681 }
   1682 
   1683 /* ---- atomics / exclusive ----
   1684  *
   1685  * Every form here addresses a bare base register `[Xn]` (no offset, no
   1686  * index, no writeback).  parse_mem already rejects malformed shapes; we
   1687  * additionally reject any offset/index so `ldxr w0,[x1,#4]` is an error,
   1688  * matching llvm/gas. */
   1689 static AA64Mem parse_mem_bare(AsmDriver* d, const char* what) {
   1690   AA64Mem m = parse_mem(d);
   1691   if (m.has_offset || m.has_index || m.pre_index || m.post_index)
   1692     asm_driver_panic(d, "asm: %.*s: expected bare [Xn] address",
   1693                      SLICE_ARG(slice_from_cstr(what)));
   1694   return m;
   1695 }
   1696 
   1697 /* Map an access log2-size (0..3) onto the GPR width the operand register
   1698  * must have: byte/half/word use Wt (32-bit), dword uses Xt (64-bit). */
   1699 static void require_gpr_width(AsmDriver* d, AA64Reg r, u32 size,
   1700                               const char* what) {
   1701   reject_sp_reg(d, r, what);
   1702   u32 want64 = (size == 3u) ? 1u : 0u;
   1703   if ((u32)r.is64 != want64)
   1704     asm_driver_panic(d, "asm: %.*s: register width mismatch",
   1705                      SLICE_ARG(slice_from_cstr(what)));
   1706 }
   1707 
   1708 /* Load-exclusive / load-acquire: `<op> Wt|Xt, [Xn]`.
   1709  *   o2/o0 select the family member (see aa64_ldstex_pack).  size is the
   1710  *   access log2-size; Rs/Rt2 are fixed to 11111. */
   1711 static void p_ldex(AsmDriver* d, u32 size, u32 o2, u32 o0, const char* what) {
   1712   AA64Reg rt = parse_reg(d);
   1713   require_gpr_width(d, rt, size, what);
   1714   expect_comma(d, what);
   1715   AA64Mem m = parse_mem_bare(d, what);
   1716   emit32(d, aa64_ldstex_pack((AA64LdStEx){.size = size,
   1717                                           .o2 = o2,
   1718                                           .L = 1u,
   1719                                           .o1 = 0u,
   1720                                           .Rs = AA64_ZR,
   1721                                           .o0 = o0,
   1722                                           .Rt2 = AA64_ZR,
   1723                                           .Rn = m.base.num,
   1724                                           .Rt = rt.num}));
   1725 }
   1726 
   1727 /* Store-release without status: `stlr Wt|Xt, [Xn]` (o2=1, L=0, o0=1). */
   1728 static void p_stlr(AsmDriver* d, u32 size, const char* what) {
   1729   AA64Reg rt = parse_reg(d);
   1730   require_gpr_width(d, rt, size, what);
   1731   expect_comma(d, what);
   1732   AA64Mem m = parse_mem_bare(d, what);
   1733   emit32(d, aa64_ldstex_pack((AA64LdStEx){.size = size,
   1734                                           .o2 = 1u,
   1735                                           .L = 0u,
   1736                                           .o1 = 0u,
   1737                                           .Rs = AA64_ZR,
   1738                                           .o0 = 1u,
   1739                                           .Rt2 = AA64_ZR,
   1740                                           .Rn = m.base.num,
   1741                                           .Rt = rt.num}));
   1742 }
   1743 
   1744 /* Store-exclusive with status: `<op> Ws, Wt|Xt, [Xn]` (L=0).  Ws (the
   1745  * 32-bit status result) must be a W register and distinct from Rt/Rn. */
   1746 /* Store-exclusive constraint (ARM ARM): the status register Ws must differ
   1747  * from the stored value Rt and from the base Rn, else the result is
   1748  * UNPREDICTABLE.  The base is exempt when it is SP (reg #31 names SP, not the
   1749  * WZR the status reg would be).  CAS/LSE atomics do NOT share this rule. */
   1750 static void reject_stex_alias(AsmDriver* d, AA64Reg rs, AA64Reg rt, AA64Mem m,
   1751                               const char* what) {
   1752   if (rs.num == rt.num)
   1753     asm_driver_panic(d, "asm: %.*s: status reg cannot be the value reg",
   1754                      SLICE_ARG(slice_from_cstr(what)));
   1755   if (!m.base.is_sp && rs.num == m.base.num)
   1756     asm_driver_panic(d, "asm: %.*s: status reg cannot be the base reg",
   1757                      SLICE_ARG(slice_from_cstr(what)));
   1758 }
   1759 
   1760 static void p_stex(AsmDriver* d, u32 size, u32 o0, const char* what) {
   1761   AA64Reg rs = parse_reg(d);
   1762   reject_sp_reg(d, rs, what);
   1763   if (rs.is64)
   1764     asm_driver_panic(d, "asm: %.*s: status reg must be 32-bit",
   1765                      SLICE_ARG(slice_from_cstr(what)));
   1766   expect_comma(d, what);
   1767   AA64Reg rt = parse_reg(d);
   1768   require_gpr_width(d, rt, size, what);
   1769   expect_comma(d, what);
   1770   AA64Mem m = parse_mem_bare(d, what);
   1771   reject_stex_alias(d, rs, rt, m, what);
   1772   emit32(d, aa64_ldstex_pack((AA64LdStEx){.size = size,
   1773                                           .o2 = 0u,
   1774                                           .L = 0u,
   1775                                           .o1 = 0u,
   1776                                           .Rs = rs.num,
   1777                                           .o0 = o0,
   1778                                           .Rt2 = AA64_ZR,
   1779                                           .Rn = m.base.num,
   1780                                           .Rt = rt.num}));
   1781 }
   1782 
   1783 /* Compare-and-swap: `<op> Ws, Wt, [Xn]` / `<op> Xs, Xt, [Xn]`.  Rs and Rt
   1784  * share the operand width selected by `size` (word or dword). */
   1785 static void p_cas(AsmDriver* d, u32 size, u32 L, u32 o0, const char* what) {
   1786   AA64Reg rs = parse_reg(d);
   1787   require_gpr_width(d, rs, size, what);
   1788   expect_comma(d, what);
   1789   AA64Reg rt = parse_reg(d);
   1790   require_gpr_width(d, rt, size, what);
   1791   expect_comma(d, what);
   1792   AA64Mem m = parse_mem_bare(d, what);
   1793   emit32(d, aa64_cas_pack((AA64Cas){.size = size,
   1794                                     .L = L,
   1795                                     .Rs = rs.num,
   1796                                     .o0 = o0,
   1797                                     .Rn = m.base.num,
   1798                                     .Rt = rt.num}));
   1799 }
   1800 
   1801 /* LSE atomic memory op: `<op> Ws, Wt, [Xn]` / `<op> Xs, Xt, [Xn]`.
   1802  *   o3=1 selects SWP; otherwise opc names LDADD/LDCLR/LDEOR/LDSET. */
   1803 static void p_lse(AsmDriver* d, u32 size, u32 A, u32 R, u32 o3, u32 opc,
   1804                   const char* what) {
   1805   AA64Reg rs = parse_reg(d);
   1806   require_gpr_width(d, rs, size, what);
   1807   expect_comma(d, what);
   1808   AA64Reg rt = parse_reg(d);
   1809   require_gpr_width(d, rt, size, what);
   1810   expect_comma(d, what);
   1811   AA64Mem m = parse_mem_bare(d, what);
   1812   emit32(d, aa64_lse_atomic_pack((AA64LseAtomic){.size = size,
   1813                                                  .A = A,
   1814                                                  .R = R,
   1815                                                  .Rs = rs.num,
   1816                                                  .o3 = o3,
   1817                                                  .opc = opc,
   1818                                                  .Rn = m.base.num,
   1819                                                  .Rt = rt.num}));
   1820 }
   1821 
   1822 /* ---- mnemonic dispatch table ---- */
   1823 
/* Uniform parser signature stored in the mnemonic table: a parser receives
 * only the driver and returns nothing. */
typedef void (*P_Fn)(AsmDriver*);

/* One row of the mnemonic dispatch table: a name paired with the parser to
 * run for it. */
typedef struct AA64Mn {
  const char* name; /* mnemonic spelling (presumably the lookup key) */
  P_Fn fn;          /* parser invoked for this row */
  /* NOTE(review): P_Fn takes no argument besides the driver, so how `arg`
   * reaches the parser is not visible here — confirm at the dispatch site. */
  u32 arg; /* per-fn discriminator (alias parameter) */
} AA64Mn;
   1831 
   1832 /* Wrapper functions for the discriminator-taking parsers, since the
   1833  * table holds a uniform P_Fn pointer.  Each wraps a single (fn, arg)
   1834  * tuple. */
   1835 static void p_addsub_add(AsmDriver* d) { p_addsub(d, /*is_sub=*/0, 0); }
   1836 static void p_addsub_adds(AsmDriver* d) { p_addsub(d, 0, 1); }
   1837 static void p_addsub_sub(AsmDriver* d) { p_addsub(d, 1, 0); }
   1838 static void p_addsub_subs(AsmDriver* d) { p_addsub(d, 1, 1); }
   1839 static void p_cmp_w(AsmDriver* d) { p_cmp(d, 0); }
   1840 static void p_cmn_w(AsmDriver* d) { p_cmp(d, 1); }
   1841 static void p_csel_(AsmDriver* d) { p_condsel(d, 0, 0, "csel"); }
   1842 static void p_csinc_(AsmDriver* d) { p_condsel(d, 0, 1, "csinc"); }
   1843 static void p_csinv_(AsmDriver* d) { p_condsel(d, 1, 0, "csinv"); }
   1844 static void p_csneg_(AsmDriver* d) { p_condsel(d, 1, 1, "csneg"); }
   1845 static void p_cset_(AsmDriver* d) { p_cset_like(d, 0, 1, "cset"); }
   1846 static void p_csetm_(AsmDriver* d) { p_cset_like(d, 1, 0, "csetm"); }
   1847 static void p_neg_w(AsmDriver* d) { p_neg(d, 0); }
   1848 static void p_negs_w(AsmDriver* d) { p_neg(d, 1); }
   1849 static void p_and_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_AND_OPC, 0); }
   1850 static void p_bic_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_AND_OPC, 1); }
   1851 static void p_orr_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ORR_OPC, 0); }
   1852 static void p_orn_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ORR_OPC, 1); }
   1853 static void p_eor_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_EOR_OPC, 0); }
   1854 static void p_eon_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_EOR_OPC, 1); }
   1855 static void p_ands_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ANDS_OPC, 0); }
   1856 static void p_bics_w(AsmDriver* d) { p_log_sr(d, AA64_LOG_ANDS_OPC, 1); }
   1857 static void p_madd(AsmDriver* d) { p_dp3(d, 0); }
   1858 static void p_msub(AsmDriver* d) { p_dp3(d, 1); }
   1859 static void p_mul_w(AsmDriver* d) { p_mul(d, 0); }
   1860 static void p_mneg_w(AsmDriver* d) { p_mul(d, 1); }
   1861 static void p_udiv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_UDIV_OP); }
   1862 static void p_sdiv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_SDIV_OP); }
   1863 static void p_lslv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_LSLV_OP); }
   1864 static void p_lsrv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_LSRV_OP); }
   1865 static void p_asrv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_ASRV_OP); }
   1866 static void p_rorv_w(AsmDriver* d) { p_dp2(d, AA64_DP2_RORV_OP); }
   1867 static void p_lsl_(AsmDriver* d) { p_shift(d, 0); }
   1868 static void p_lsr_(AsmDriver* d) { p_shift(d, 1); }
   1869 static void p_asr_(AsmDriver* d) { p_shift(d, 2); }
   1870 static void p_b_(AsmDriver* d) { p_b(d, 0); }
   1871 static void p_bl_(AsmDriver* d) { p_b(d, 1); }
   1872 static void p_cbz_(AsmDriver* d) { p_cbz(d, 0); }
   1873 static void p_cbnz_(AsmDriver* d) { p_cbz(d, 1); }
   1874 static void p_movz_(AsmDriver* d) { p_movwide(d, AA64_MOVZ_OPC); }
   1875 static void p_movn_(AsmDriver* d) { p_movwide(d, AA64_MOVN_OPC); }
   1876 static void p_movk_(AsmDriver* d) { p_movwide(d, AA64_MOVK_OPC); }
   1877 static void p_svc_(AsmDriver* d) { p_except(d, 0); }
   1878 static void p_brk_(AsmDriver* d) { p_except(d, 1); }
   1879 static void p_hlt_(AsmDriver* d) { p_except(d, 2); }
   1880 static void p_ldr_(AsmDriver* d) { p_ldr_str(d, 1); }
   1881 static void p_str_(AsmDriver* d) { p_ldr_str(d, 0); }
   1882 static void p_ldur_(AsmDriver* d) { p_ldur_stur(d, 1, -1, 0); }
   1883 static void p_stur_(AsmDriver* d) { p_ldur_stur(d, 0, -1, 0); }
   1884 static void p_ldurb(AsmDriver* d) { p_ldur_stur(d, 1, 0, 0); }
   1885 static void p_sturb(AsmDriver* d) { p_ldur_stur(d, 0, 0, 0); }
   1886 static void p_ldurh(AsmDriver* d) { p_ldur_stur(d, 1, 1, 0); }
   1887 static void p_sturh(AsmDriver* d) { p_ldur_stur(d, 0, 1, 0); }
   1888 static void p_ldursb(AsmDriver* d) { p_ldur_stur(d, 1, 0, 1); }
   1889 static void p_ldursh(AsmDriver* d) { p_ldur_stur(d, 1, 1, 1); }
   1890 static void p_ldursw(AsmDriver* d) { p_ldur_stur(d, 1, 2, 1); }
   1891 static void p_ldp_(AsmDriver* d) { p_ldp_stp(d, 1); }
   1892 static void p_stp_(AsmDriver* d) { p_ldp_stp(d, 0); }
   1893 static void p_adr_(AsmDriver* d) { p_adr(d, 0); }
   1894 static void p_adrp_(AsmDriver* d) { p_adr(d, 1); }
   1895 
   1896 /* b.cond family.  cond codes follow the standard ARMv8 numbering. */
   1897 static void p_b_eq(AsmDriver* d) { p_b_cond(d, 0); }
   1898 static void p_b_ne(AsmDriver* d) { p_b_cond(d, 1); }
   1899 static void p_b_cs(AsmDriver* d) { p_b_cond(d, 2); }
   1900 static void p_b_hs(AsmDriver* d) { p_b_cond(d, 2); }
   1901 static void p_b_cc(AsmDriver* d) { p_b_cond(d, 3); }
   1902 static void p_b_lo(AsmDriver* d) { p_b_cond(d, 3); }
   1903 static void p_b_mi(AsmDriver* d) { p_b_cond(d, 4); }
   1904 static void p_b_pl(AsmDriver* d) { p_b_cond(d, 5); }
   1905 static void p_b_vs(AsmDriver* d) { p_b_cond(d, 6); }
   1906 static void p_b_vc(AsmDriver* d) { p_b_cond(d, 7); }
   1907 static void p_b_hi(AsmDriver* d) { p_b_cond(d, 8); }
   1908 static void p_b_ls(AsmDriver* d) { p_b_cond(d, 9); }
   1909 static void p_b_ge(AsmDriver* d) { p_b_cond(d, 10); }
   1910 static void p_b_lt(AsmDriver* d) { p_b_cond(d, 11); }
   1911 static void p_b_gt(AsmDriver* d) { p_b_cond(d, 12); }
   1912 static void p_b_le(AsmDriver* d) { p_b_cond(d, 13); }
   1913 static void p_b_al(AsmDriver* d) { p_b_cond(d, 14); }
   1914 
   1915 /* ---- Scalar floating-point ----
   1916  * Sn/Dn/Hn are the single/double/half views of the FP register file; the
   1917  * 2-bit ftype (0=s,1=d,3=h) drives both the encoding and the operand text. */
   1918 static int parse_fp_scalar_from_ident(AsmDriver* d, Sym ident, u32* num,
   1919                                       u32* ftype) {
   1920   Slice sl = pool_slice(asm_driver_pool(d), ident);
   1921   const char* p = sl.s;
   1922   size_t n = sl.len;
   1923   u32 ft, r = 0;
   1924   size_t i;
   1925   if (!p || n < 2) return 0;
   1926   if (p[0] == 's' || p[0] == 'S')
   1927     ft = 0u;
   1928   else if (p[0] == 'd' || p[0] == 'D')
   1929     ft = 1u;
   1930   else if (p[0] == 'h' || p[0] == 'H')
   1931     ft = 3u;
   1932   else
   1933     return 0;
   1934   for (i = 1; i < n; ++i) {
   1935     char c = p[i];
   1936     if (c < '0' || c > '9') return 0;
   1937     r = r * 10u + (u32)(c - '0');
   1938     if (r > 31u) return 0;
   1939   }
   1940   *num = r;
   1941   *ftype = ft;
   1942   return 1;
   1943 }
   1944 
   1945 static void parse_fp_scalar(AsmDriver* d, u32* num, u32* ftype) {
   1946   AsmTok t = asm_driver_next(d);
   1947   if (t.kind != ASM_TOK_IDENT ||
   1948       !parse_fp_scalar_from_ident(d, t.v.ident, num, ftype))
   1949     asm_driver_panic(d, "asm: expected FP register (Sn/Dn/Hn)");
   1950 }
   1951 
/* A register operand that may be either a GPR or a scalar FP register — used
 * by fmov, whose three forms differ only by operand class.  `is_fp` says
 * which of the class-specific fields below is meaningful; parse_fp_or_gpr
 * zero-fills the struct before setting the fields for the class it found. */
typedef struct FpOrGpr {
  int is_fp; /* 1 = scalar FP register, 0 = general-purpose register */
  u32 num;   /* register number, valid for both classes */
  u32 ftype; /* when is_fp */
  int is64;  /* when !is_fp */
} FpOrGpr;
   1960 
   1961 static FpOrGpr parse_fp_or_gpr(AsmDriver* d) {
   1962   AsmTok t = asm_driver_next(d);
   1963   FpOrGpr r;
   1964   AA64Reg g;
   1965   memset(&r, 0, sizeof r);
   1966   if (t.kind == ASM_TOK_IDENT &&
   1967       parse_fp_scalar_from_ident(d, t.v.ident, &r.num, &r.ftype)) {
   1968     r.is_fp = 1;
   1969     return r;
   1970   }
   1971   memset(&g, 0, sizeof g);
   1972   if (t.kind == ASM_TOK_IDENT && parse_reg_from_ident(d, t.v.ident, &g)) {
   1973     r.is_fp = 0;
   1974     r.num = g.num;
   1975     r.is64 = (int)g.is64;
   1976     return r;
   1977   }
   1978   asm_driver_panic(d, "asm: fmov: expected register");
   1979   return r; /* unreachable */
   1980 }
   1981 
   1982 static void p_fp_dp2(AsmDriver* d, u32 op) {
   1983   u32 rd, rn, rm, ftd, ftn, ftm;
   1984   parse_fp_scalar(d, &rd, &ftd);
   1985   expect_comma(d, "fp");
   1986   parse_fp_scalar(d, &rn, &ftn);
   1987   expect_comma(d, "fp");
   1988   parse_fp_scalar(d, &rm, &ftm);
   1989   if (ftd != ftn || ftd != ftm)
   1990     asm_driver_panic(d, "asm: fp: operand type mismatch");
   1991   emit32(d, aa64_fp_dp2(ftd, op, rd, rn, rm));
   1992 }
   1993 static void p_fp_dp1(AsmDriver* d, u32 op) {
   1994   u32 rd, rn, ftd, ftn;
   1995   parse_fp_scalar(d, &rd, &ftd);
   1996   expect_comma(d, "fp");
   1997   parse_fp_scalar(d, &rn, &ftn);
   1998   if (ftd != ftn) asm_driver_panic(d, "asm: fp: operand type mismatch");
   1999   emit32(d, aa64_fp_dp1(ftd, op, rd, rn));
   2000 }
   2001 static void p_fadd(AsmDriver* d) { p_fp_dp2(d, AA64_FP_DP2_FADD); }
   2002 static void p_fsub(AsmDriver* d) { p_fp_dp2(d, AA64_FP_DP2_FSUB); }
   2003 static void p_fmul(AsmDriver* d) { p_fp_dp2(d, AA64_FP_DP2_FMUL); }
   2004 static void p_fdiv(AsmDriver* d) { p_fp_dp2(d, AA64_FP_DP2_FDIV); }
   2005 static void p_fmax(AsmDriver* d) { p_fp_dp2(d, AA64_FP_DP2_FMAX); }
   2006 static void p_fmin(AsmDriver* d) { p_fp_dp2(d, AA64_FP_DP2_FMIN); }
   2007 static void p_fnmul(AsmDriver* d) { p_fp_dp2(d, AA64_FP_DP2_FNMUL); }
   2008 static void p_fneg(AsmDriver* d) { p_fp_dp1(d, AA64_FP_DP1_FNEG); }
   2009 static void p_fabs(AsmDriver* d) { p_fp_dp1(d, AA64_FP_DP1_FABS); }
   2010 static void p_fsqrt(AsmDriver* d) { p_fp_dp1(d, AA64_FP_DP1_FSQRT); }
   2011 
   2012 static void p_fcmp(AsmDriver* d) {
   2013   u32 rn, rm, ftn, ftm;
   2014   parse_fp_scalar(d, &rn, &ftn);
   2015   expect_comma(d, "fcmp");
   2016   parse_fp_scalar(d, &rm, &ftm);
   2017   if (ftn != ftm) asm_driver_panic(d, "asm: fcmp: operand type mismatch");
   2018   emit32(d, aa64_fcmp_reg(ftn, rn, rm));
   2019 }
   2020 static void p_fcvt(AsmDriver* d) {
   2021   u32 rd, rn, ftd, ftn;
   2022   parse_fp_scalar(d, &rd, &ftd);
   2023   expect_comma(d, "fcvt");
   2024   parse_fp_scalar(d, &rn, &ftn);
   2025   emit32(d, aa64_fcvt_prec(ftn /*src*/, ftd /*dst*/, rd, rn));
   2026 }
   2027 /* scvtf/ucvtf: FP dst, GPR src. */
   2028 static void p_cvtf(AsmDriver* d, u32 opcode) {
   2029   u32 fd, ft;
   2030   AA64Reg rn;
   2031   parse_fp_scalar(d, &fd, &ft);
   2032   expect_comma(d, "cvtf");
   2033   rn = parse_reg(d);
   2034   emit32(d, aa64_fp_int_cvt((u32)rn.is64, ft, opcode, fd, rn.num));
   2035 }
   2036 /* fcvtzs/fcvtzu: GPR dst, FP src. */
   2037 static void p_fcvtz(AsmDriver* d, u32 opcode) {
   2038   AA64Reg rd;
   2039   u32 fn, ft;
   2040   rd = parse_reg(d);
   2041   expect_comma(d, "fcvtz");
   2042   parse_fp_scalar(d, &fn, &ft);
   2043   emit32(d, aa64_fp_int_cvt((u32)rd.is64, ft, opcode, rd.num, fn));
   2044 }
   2045 static void p_scvtf(AsmDriver* d) { p_cvtf(d, AA64_FP_ICVT_SCVTF); }
   2046 static void p_ucvtf(AsmDriver* d) { p_cvtf(d, AA64_FP_ICVT_UCVTF); }
   2047 static void p_fcvtzs(AsmDriver* d) { p_fcvtz(d, AA64_FP_ICVT_FCVTZS); }
   2048 static void p_fcvtzu(AsmDriver* d) { p_fcvtz(d, AA64_FP_ICVT_FCVTZU); }
   2049 
   2050 /* Data-processing (1 source): clz/rbit/rev16, and rev (whose opcode2 is the
   2051  * width: 2 for 32-bit, 3 for 64-bit). */
   2052 static void p_dp1_op(AsmDriver* d, u32 opcode2) {
   2053   AA64Reg rd = parse_reg(d);
   2054   AA64Reg rn;
   2055   expect_comma(d, "dp1");
   2056   rn = parse_reg(d);
   2057   if (rd.is64 != rn.is64) asm_driver_panic(d, "asm: dp1: width mismatch");
   2058   emit32(d, aa64_dp1(rd.is64, opcode2, rd.num, rn.num));
   2059 }
   2060 static void p_clz(AsmDriver* d) { p_dp1_op(d, AA64_DP1_CLZ); }
   2061 static void p_rbit(AsmDriver* d) { p_dp1_op(d, AA64_DP1_RBIT); }
   2062 static void p_rev16(AsmDriver* d) { p_dp1_op(d, AA64_DP1_REV16); }
   2063 static void p_rev(AsmDriver* d) {
   2064   AA64Reg rd = parse_reg(d);
   2065   AA64Reg rn;
   2066   expect_comma(d, "rev");
   2067   rn = parse_reg(d);
   2068   if (rd.is64 != rn.is64) asm_driver_panic(d, "asm: rev: width mismatch");
   2069   emit32(d, aa64_dp1(rd.is64, rd.is64 ? AA64_DP1_REV64 : AA64_DP1_REV32, rd.num,
   2070                      rn.num));
   2071 }
   2072 
   2073 /* Bitfield move (opc: 0=sbfm, 1=bfm, 2=ubfm): Rd, Rn, #immr, #imms. */
   2074 static void p_bitfield(AsmDriver* d, u32 opc) {
   2075   AA64Reg rd = parse_reg(d);
   2076   AA64Reg rn;
   2077   i64 immr, imms;
   2078   expect_comma(d, "bitfield");
   2079   rn = parse_reg(d);
   2080   expect_comma(d, "bitfield");
   2081   immr = parse_imm_const(d);
   2082   expect_comma(d, "bitfield");
   2083   imms = parse_imm_const(d);
   2084   if (rd.is64 != rn.is64) asm_driver_panic(d, "asm: bitfield: width mismatch");
   2085   emit32(d, aa64_bitfield(rd.is64, opc, (u32)immr, (u32)imms, rd.num, rn.num));
   2086 }
   2087 static void p_sbfm(AsmDriver* d) { p_bitfield(d, 0u); }
   2088 static void p_bfm(AsmDriver* d) { p_bitfield(d, 1u); }
   2089 static void p_ubfm(AsmDriver* d) { p_bitfield(d, 2u); }
   2090 
   2091 static void p_bfx(AsmDriver* d, u32 opc, const char* what) {
   2092   AA64Reg rd = parse_reg(d);
   2093   AA64Reg rn;
   2094   i64 lsb, width;
   2095   u32 reg_width;
   2096   expect_comma(d, what);
   2097   rn = parse_reg(d);
   2098   reject_sp_reg(d, rd, what);
   2099   reject_sp_reg(d, rn, what);
   2100   if (rd.is64 != rn.is64)
   2101     asm_driver_panic(d, "asm: %.*s: width mismatch",
   2102                      SLICE_ARG(slice_from_cstr(what)));
   2103   expect_comma(d, what);
   2104   lsb = parse_imm_const(d);
   2105   expect_comma(d, what);
   2106   width = parse_imm_const(d);
   2107   reg_width = rd.is64 ? 64u : 32u;
   2108   if (lsb < 0 || width <= 0 || (u64)lsb >= reg_width ||
   2109       (u64)width > (u64)reg_width - (u64)lsb) {
   2110     asm_driver_panic(d, "asm: %.*s: bit range out of bounds",
   2111                      SLICE_ARG(slice_from_cstr(what)));
   2112   }
   2113   emit32(d, aa64_bitfield(rd.is64, opc, (u32)lsb, (u32)(lsb + width - 1),
   2114                           rd.num, rn.num));
   2115 }
   2116 
   2117 static void p_sbfx(AsmDriver* d) { p_bfx(d, 0u, "sbfx"); }
   2118 static void p_ubfx(AsmDriver* d) { p_bfx(d, 2u, "ubfx"); }
   2119 
   2120 static void p_sxt(AsmDriver* d, u32 bits, const char* what) {
   2121   AA64Reg rd = parse_reg(d);
   2122   AA64Reg rn;
   2123   expect_comma(d, what);
   2124   rn = parse_reg(d);
   2125   reject_sp_reg(d, rd, what);
   2126   reject_sp_reg(d, rn, what);
   2127   if (rn.is64)
   2128     asm_driver_panic(d, "asm: %.*s: source must be a W register",
   2129                      SLICE_ARG(slice_from_cstr(what)));
   2130   if (bits == 32u && !rd.is64)
   2131     asm_driver_panic(d, "asm: sxtw: destination must be an X register");
   2132   emit32(d, aa64_bitfield(rd.is64, 0u, 0u, bits - 1u, rd.num, rn.num));
   2133 }
   2134 
   2135 static void p_uxt(AsmDriver* d, u32 bits, const char* what) {
   2136   AA64Reg rd = parse_reg(d);
   2137   AA64Reg rn;
   2138   u32 sf;
   2139   expect_comma(d, what);
   2140   rn = parse_reg(d);
   2141   reject_sp_reg(d, rd, what);
   2142   reject_sp_reg(d, rn, what);
   2143   if (rn.is64)
   2144     asm_driver_panic(d, "asm: %.*s: source must be a W register",
   2145                      SLICE_ARG(slice_from_cstr(what)));
   2146   if (bits == 32u && !rd.is64)
   2147     asm_driver_panic(d, "asm: uxtw: destination must be an X register");
   2148   sf = bits == 32u ? 1u : 0u;
   2149   emit32(d, aa64_bitfield(sf, 2u, 0u, bits - 1u, rd.num, rn.num));
   2150 }
   2151 
   2152 static void p_sxtb(AsmDriver* d) { p_sxt(d, 8u, "sxtb"); }
   2153 static void p_sxth(AsmDriver* d) { p_sxt(d, 16u, "sxth"); }
   2154 static void p_sxtw(AsmDriver* d) { p_sxt(d, 32u, "sxtw"); }
   2155 static void p_uxtb(AsmDriver* d) { p_uxt(d, 8u, "uxtb"); }
   2156 static void p_uxth(AsmDriver* d) { p_uxt(d, 16u, "uxth"); }
   2157 static void p_uxtw(AsmDriver* d) { p_uxt(d, 32u, "uxtw"); }
   2158 
   2159 /* fmov: Vd,Vn (FP reg move) | Rd,Vn (fp->gpr) | Vd,Rn (gpr->fp). */
   2160 static void p_fmov(AsmDriver* d) {
   2161   FpOrGpr a = parse_fp_or_gpr(d);
   2162   FpOrGpr b;
   2163   expect_comma(d, "fmov");
   2164   b = parse_fp_or_gpr(d);
   2165   if (a.is_fp && b.is_fp) {
   2166     if (a.ftype != b.ftype)
   2167       asm_driver_panic(d, "asm: fmov: operand type mismatch");
   2168     emit32(d, aa64_fp_dp1(a.ftype, AA64_FP_DP1_FMOV, a.num, b.num));
   2169   } else if (!a.is_fp && b.is_fp) {
   2170     emit32(d, aa64_fp_int_cvt((u32)a.is64, b.ftype, AA64_FP_ICVT_FMOV_TO_GPR,
   2171                               a.num, b.num));
   2172   } else if (a.is_fp && !b.is_fp) {
   2173     emit32(d, aa64_fp_int_cvt((u32)b.is64, a.ftype, AA64_FP_ICVT_FMOV_TO_FP,
   2174                               a.num, b.num));
   2175   } else {
   2176     asm_driver_panic(d, "asm: fmov: gpr,gpr form not supported (use mov)");
   2177   }
   2178 }
   2179 
   /* ---- atomics / exclusive wrappers ----
    *
    * Access sizes are passed to the encoders as log2 of the byte count.
    * Mnemonics without a b/h suffix (e.g. `ldxr`) cover both the w and x
    * variants; because the encoders key on an explicit size, the
    * width-sensing wrappers below peek at the operand register and pick
    * SIZE_W or SIZE_X before dispatching. */
   #define AA64_ATOMIC_SIZE_B 0u /* byte */
   #define AA64_ATOMIC_SIZE_H 1u /* halfword */
   #define AA64_ATOMIC_SIZE_W 2u /* word */
   #define AA64_ATOMIC_SIZE_X 3u /* doubleword */
   2190 
   2191 /* Load-exclusive family: o2,o0 select ldxr/ldaxr/ldar. */
   2192 #define DEF_LDEX(fn, sz, o2, o0, name) \
   2193   static void fn(AsmDriver* d) { p_ldex(d, sz, o2, o0, name); }
   2194 /* ldxr / ldxrb / ldxrh: o2=0 o0=0.  The non-b/h stem derives size from
   2195  * the register width, so we route it through a width-sensing wrapper. */
   2196 static void p_ldxr_wx(AsmDriver* d) {
   2197   /* Peek the destination register to choose word vs dword size. */
   2198   AsmTok t = asm_driver_peek(d);
   2199   AA64Reg r;
   2200   memset(&r, 0, sizeof r);
   2201   if (t.kind != ASM_TOK_IDENT || !parse_reg_from_ident(d, t.v.ident, &r))
   2202     asm_driver_panic(d, "asm: ldxr: expected register");
   2203   p_ldex(d, r.is64 ? AA64_ATOMIC_SIZE_X : AA64_ATOMIC_SIZE_W, 0u, 0u, "ldxr");
   2204 }
   2205 DEF_LDEX(p_ldxrb, AA64_ATOMIC_SIZE_B, 0u, 0u, "ldxrb")
   2206 DEF_LDEX(p_ldxrh, AA64_ATOMIC_SIZE_H, 0u, 0u, "ldxrh")
   2207 static void p_ldaxr_wx(AsmDriver* d) {
   2208   AsmTok t = asm_driver_peek(d);
   2209   AA64Reg r;
   2210   memset(&r, 0, sizeof r);
   2211   if (t.kind != ASM_TOK_IDENT || !parse_reg_from_ident(d, t.v.ident, &r))
   2212     asm_driver_panic(d, "asm: ldaxr: expected register");
   2213   p_ldex(d, r.is64 ? AA64_ATOMIC_SIZE_X : AA64_ATOMIC_SIZE_W, 0u, 1u, "ldaxr");
   2214 }
   2215 DEF_LDEX(p_ldaxrb, AA64_ATOMIC_SIZE_B, 0u, 1u, "ldaxrb")
   2216 DEF_LDEX(p_ldaxrh, AA64_ATOMIC_SIZE_H, 0u, 1u, "ldaxrh")
   2217 static void p_ldar_wx(AsmDriver* d) {
   2218   AsmTok t = asm_driver_peek(d);
   2219   AA64Reg r;
   2220   memset(&r, 0, sizeof r);
   2221   if (t.kind != ASM_TOK_IDENT || !parse_reg_from_ident(d, t.v.ident, &r))
   2222     asm_driver_panic(d, "asm: ldar: expected register");
   2223   p_ldex(d, r.is64 ? AA64_ATOMIC_SIZE_X : AA64_ATOMIC_SIZE_W, 1u, 1u, "ldar");
   2224 }
   2225 DEF_LDEX(p_ldarb, AA64_ATOMIC_SIZE_B, 1u, 1u, "ldarb")
   2226 DEF_LDEX(p_ldarh, AA64_ATOMIC_SIZE_H, 1u, 1u, "ldarh")
   2227 
   2228 /* stlr (no status): width-driven for the non-b/h stem. */
   2229 static void p_stlr_wx(AsmDriver* d) {
   2230   AsmTok t = asm_driver_peek(d);
   2231   AA64Reg r;
   2232   memset(&r, 0, sizeof r);
   2233   if (t.kind != ASM_TOK_IDENT || !parse_reg_from_ident(d, t.v.ident, &r))
   2234     asm_driver_panic(d, "asm: stlr: expected register");
   2235   p_stlr(d, r.is64 ? AA64_ATOMIC_SIZE_X : AA64_ATOMIC_SIZE_W, "stlr");
   2236 }
   2237 static void p_stlrb_(AsmDriver* d) { p_stlr(d, AA64_ATOMIC_SIZE_B, "stlrb"); }
   2238 static void p_stlrh_(AsmDriver* d) { p_stlr(d, AA64_ATOMIC_SIZE_H, "stlrh"); }
   2239 
   2240 /* Store-exclusive family: o0 selects stxr vs stlxr.  Status reg is always
   2241  * 32-bit; the stored value reg drives the size for the non-b/h stem. */
   2242 static void p_stxr_wx(AsmDriver* d) {
   2243   AA64Reg rs = parse_reg(d);
   2244   reject_sp_reg(d, rs, "stxr");
   2245   if (rs.is64) asm_driver_panic(d, "asm: stxr: status reg must be 32-bit");
   2246   expect_comma(d, "stxr");
   2247   AsmTok t = asm_driver_peek(d);
   2248   AA64Reg rt;
   2249   memset(&rt, 0, sizeof rt);
   2250   if (t.kind != ASM_TOK_IDENT || !parse_reg_from_ident(d, t.v.ident, &rt))
   2251     asm_driver_panic(d, "asm: stxr: expected value register");
   2252   u32 size = rt.is64 ? AA64_ATOMIC_SIZE_X : AA64_ATOMIC_SIZE_W;
   2253   rt = parse_reg(d);
   2254   require_gpr_width(d, rt, size, "stxr");
   2255   expect_comma(d, "stxr");
   2256   AA64Mem m = parse_mem_bare(d, "stxr");
   2257   reject_stex_alias(d, rs, rt, m, "stxr");
   2258   emit32(d, aa64_ldstex_pack((AA64LdStEx){.size = size,
   2259                                           .o2 = 0u,
   2260                                           .L = 0u,
   2261                                           .o1 = 0u,
   2262                                           .Rs = rs.num,
   2263                                           .o0 = 0u,
   2264                                           .Rt2 = AA64_ZR,
   2265                                           .Rn = m.base.num,
   2266                                           .Rt = rt.num}));
   2267 }
   2268 static void p_stlxr_wx(AsmDriver* d) {
   2269   AA64Reg rs = parse_reg(d);
   2270   reject_sp_reg(d, rs, "stlxr");
   2271   if (rs.is64) asm_driver_panic(d, "asm: stlxr: status reg must be 32-bit");
   2272   expect_comma(d, "stlxr");
   2273   AsmTok t = asm_driver_peek(d);
   2274   AA64Reg rt;
   2275   memset(&rt, 0, sizeof rt);
   2276   if (t.kind != ASM_TOK_IDENT || !parse_reg_from_ident(d, t.v.ident, &rt))
   2277     asm_driver_panic(d, "asm: stlxr: expected value register");
   2278   u32 size = rt.is64 ? AA64_ATOMIC_SIZE_X : AA64_ATOMIC_SIZE_W;
   2279   rt = parse_reg(d);
   2280   require_gpr_width(d, rt, size, "stlxr");
   2281   expect_comma(d, "stlxr");
   2282   AA64Mem m = parse_mem_bare(d, "stlxr");
   2283   reject_stex_alias(d, rs, rt, m, "stlxr");
   2284   emit32(d, aa64_ldstex_pack((AA64LdStEx){.size = size,
   2285                                           .o2 = 0u,
   2286                                           .L = 0u,
   2287                                           .o1 = 0u,
   2288                                           .Rs = rs.num,
   2289                                           .o0 = 1u,
   2290                                           .Rt2 = AA64_ZR,
   2291                                           .Rn = m.base.num,
   2292                                           .Rt = rt.num}));
   2293 }
   2294 static void p_stxrb_(AsmDriver* d) {
   2295   p_stex(d, AA64_ATOMIC_SIZE_B, 0u, "stxrb");
   2296 }
   2297 static void p_stxrh_(AsmDriver* d) {
   2298   p_stex(d, AA64_ATOMIC_SIZE_H, 0u, "stxrh");
   2299 }
   2300 static void p_stlxrb_(AsmDriver* d) {
   2301   p_stex(d, AA64_ATOMIC_SIZE_B, 1u, "stlxrb");
   2302 }
   2303 static void p_stlxrh_(AsmDriver* d) {
   2304   p_stex(d, AA64_ATOMIC_SIZE_H, 1u, "stlxrh");
   2305 }
   2306 
   2307 /* CAS family: width-driven for the non-b/h stems (Rs/Rt are same width). */
   2308 #define DEF_CAS(fn, L, o0, name)                                             \
   2309   static void fn##_wx(AsmDriver* d) {                                        \
   2310     AsmTok t = asm_driver_peek(d);                                           \
   2311     AA64Reg r;                                                               \
   2312     memset(&r, 0, sizeof r);                                                 \
   2313     if (t.kind != ASM_TOK_IDENT || !parse_reg_from_ident(d, t.v.ident, &r))  \
   2314       asm_driver_panic(d, "asm: " name ": expected register");               \
   2315     p_cas(d, r.is64 ? AA64_ATOMIC_SIZE_X : AA64_ATOMIC_SIZE_W, L, o0, name); \
   2316   }                                                                          \
   2317   static void fn##b(AsmDriver* d) {                                          \
   2318     p_cas(d, AA64_ATOMIC_SIZE_B, L, o0, name "b");                           \
   2319   }                                                                          \
   2320   static void fn##h(AsmDriver* d) {                                          \
   2321     p_cas(d, AA64_ATOMIC_SIZE_H, L, o0, name "h");                           \
   2322   }
   2323 DEF_CAS(p_cas, 0u, 0u, "cas")
   2324 DEF_CAS(p_casa, 1u, 0u, "casa")
   2325 DEF_CAS(p_casl, 0u, 1u, "casl")
   2326 DEF_CAS(p_casal, 1u, 1u, "casal")
   2327 
   2328 /* LSE atomic family: A/R from the suffix, o3/opc from the stem.  Each
   2329  * mnemonic generates a width-driven stem plus b/h wrappers. */
   2330 #define DEF_LSE(fn, A, R, o3, opc, name)                                      \
   2331   static void fn##_wx(AsmDriver* d) {                                         \
   2332     AsmTok t = asm_driver_peek(d);                                            \
   2333     AA64Reg r;                                                                \
   2334     memset(&r, 0, sizeof r);                                                  \
   2335     if (t.kind != ASM_TOK_IDENT || !parse_reg_from_ident(d, t.v.ident, &r))   \
   2336       asm_driver_panic(d, "asm: " name ": expected register");                \
   2337     p_lse(d, r.is64 ? AA64_ATOMIC_SIZE_X : AA64_ATOMIC_SIZE_W, A, R, o3, opc, \
   2338           name);                                                              \
   2339   }                                                                           \
   2340   static void fn##b(AsmDriver* d) {                                           \
   2341     p_lse(d, AA64_ATOMIC_SIZE_B, A, R, o3, opc, name "b");                    \
   2342   }                                                                           \
   2343   static void fn##h(AsmDriver* d) {                                           \
   2344     p_lse(d, AA64_ATOMIC_SIZE_H, A, R, o3, opc, name "h");                    \
   2345   }
   2346 /* SWP (o3=1, opc=000). */
   2347 DEF_LSE(p_swp, 0u, 0u, 1u, AA64_LSE_OPC_SWP, "swp")
   2348 DEF_LSE(p_swpa, 1u, 0u, 1u, AA64_LSE_OPC_SWP, "swpa")
   2349 DEF_LSE(p_swpl, 0u, 1u, 1u, AA64_LSE_OPC_SWP, "swpl")
   2350 DEF_LSE(p_swpal, 1u, 1u, 1u, AA64_LSE_OPC_SWP, "swpal")
   2351 /* LDADD. */
   2352 DEF_LSE(p_ldadd, 0u, 0u, 0u, AA64_LSE_OPC_LDADD, "ldadd")
   2353 DEF_LSE(p_ldadda, 1u, 0u, 0u, AA64_LSE_OPC_LDADD, "ldadda")
   2354 DEF_LSE(p_ldaddl, 0u, 1u, 0u, AA64_LSE_OPC_LDADD, "ldaddl")
   2355 DEF_LSE(p_ldaddal, 1u, 1u, 0u, AA64_LSE_OPC_LDADD, "ldaddal")
   2356 /* LDCLR. */
   2357 DEF_LSE(p_ldclr, 0u, 0u, 0u, AA64_LSE_OPC_LDCLR, "ldclr")
   2358 DEF_LSE(p_ldclra, 1u, 0u, 0u, AA64_LSE_OPC_LDCLR, "ldclra")
   2359 DEF_LSE(p_ldclrl, 0u, 1u, 0u, AA64_LSE_OPC_LDCLR, "ldclrl")
   2360 DEF_LSE(p_ldclral, 1u, 1u, 0u, AA64_LSE_OPC_LDCLR, "ldclral")
   2361 /* LDEOR. */
   2362 DEF_LSE(p_ldeor, 0u, 0u, 0u, AA64_LSE_OPC_LDEOR, "ldeor")
   2363 DEF_LSE(p_ldeora, 1u, 0u, 0u, AA64_LSE_OPC_LDEOR, "ldeora")
   2364 DEF_LSE(p_ldeorl, 0u, 1u, 0u, AA64_LSE_OPC_LDEOR, "ldeorl")
   2365 DEF_LSE(p_ldeoral, 1u, 1u, 0u, AA64_LSE_OPC_LDEOR, "ldeoral")
   2366 /* LDSET. */
   2367 DEF_LSE(p_ldset, 0u, 0u, 0u, AA64_LSE_OPC_LDSET, "ldset")
   2368 DEF_LSE(p_ldseta, 1u, 0u, 0u, AA64_LSE_OPC_LDSET, "ldseta")
   2369 DEF_LSE(p_ldsetl, 0u, 1u, 0u, AA64_LSE_OPC_LDSET, "ldsetl")
   2370 DEF_LSE(p_ldsetal, 1u, 1u, 0u, AA64_LSE_OPC_LDSET, "ldsetal")
   2371 
   2372 static const AA64Mn kTable[] = {
   2373     {"fadd", p_fadd, 0},
   2374     {"fsub", p_fsub, 0},
   2375     {"fmul", p_fmul, 0},
   2376     {"fdiv", p_fdiv, 0},
   2377     {"fmax", p_fmax, 0},
   2378     {"fmin", p_fmin, 0},
   2379     {"fnmul", p_fnmul, 0},
   2380     {"fneg", p_fneg, 0},
   2381     {"fabs", p_fabs, 0},
   2382     {"fsqrt", p_fsqrt, 0},
   2383     {"fmov", p_fmov, 0},
   2384     {"fcmp", p_fcmp, 0},
   2385     {"fcvt", p_fcvt, 0},
   2386     {"scvtf", p_scvtf, 0},
   2387     {"ucvtf", p_ucvtf, 0},
   2388     {"fcvtzs", p_fcvtzs, 0},
   2389     {"fcvtzu", p_fcvtzu, 0},
   2390     {"clz", p_clz, 0},
   2391     {"rbit", p_rbit, 0},
   2392     {"rev", p_rev, 0},
   2393     {"rev16", p_rev16, 0},
   2394     {"sbfm", p_sbfm, 0},
   2395     {"ubfm", p_ubfm, 0},
   2396     {"bfm", p_bfm, 0},
   2397     {"sbfx", p_sbfx, 0},
   2398     {"ubfx", p_ubfx, 0},
   2399     {"sxtb", p_sxtb, 0},
   2400     {"sxth", p_sxth, 0},
   2401     {"sxtw", p_sxtw, 0},
   2402     {"uxtb", p_uxtb, 0},
   2403     {"uxth", p_uxth, 0},
   2404     {"uxtw", p_uxtw, 0},
   2405     {"nop", p_nop, 0},
   2406     {"dmb", p_dmb, 0},
   2407     {"dsb", p_dsb, 0},
   2408     {"isb", p_isb, 0},
   2409     {"clrex", p_clrex, 0},
   2410     {"ret", p_ret, 0},
   2411     {"br", p_br, 0},
   2412     {"blr", p_blr, 0},
   2413     {"mov", p_mov, 0},
   2414     {"mvn", p_mvn, 0},
   2415     {"movz", p_movz_, 0},
   2416     {"movn", p_movn_, 0},
   2417     {"movk", p_movk_, 0},
   2418     {"add", p_addsub_add, 0},
   2419     {"adds", p_addsub_adds, 0},
   2420     {"sub", p_addsub_sub, 0},
   2421     {"subs", p_addsub_subs, 0},
   2422     {"cmp", p_cmp_w, 0},
   2423     {"cmn", p_cmn_w, 0},
   2424     {"csel", p_csel_, 0},
   2425     {"csinc", p_csinc_, 0},
   2426     {"csinv", p_csinv_, 0},
   2427     {"csneg", p_csneg_, 0},
   2428     {"cset", p_cset_, 0},
   2429     {"csetm", p_csetm_, 0},
   2430     {"neg", p_neg_w, 0},
   2431     {"negs", p_negs_w, 0},
   2432     {"and", p_and_w, 0},
   2433     {"bic", p_bic_w, 0},
   2434     {"orr", p_orr_w, 0},
   2435     {"orn", p_orn_w, 0},
   2436     {"eor", p_eor_w, 0},
   2437     {"eon", p_eon_w, 0},
   2438     {"ands", p_ands_w, 0},
   2439     {"bics", p_bics_w, 0},
   2440     {"madd", p_madd, 0},
   2441     {"msub", p_msub, 0},
   2442     {"mul", p_mul_w, 0},
   2443     {"mneg", p_mneg_w, 0},
   2444     {"udiv", p_udiv_w, 0},
   2445     {"sdiv", p_sdiv_w, 0},
   2446     {"lslv", p_lslv_w, 0},
   2447     {"lsrv", p_lsrv_w, 0},
   2448     {"asrv", p_asrv_w, 0},
   2449     {"rorv", p_rorv_w, 0},
   2450     {"lsl", p_lsl_, 0},
   2451     {"lsr", p_lsr_, 0},
   2452     {"asr", p_asr_, 0},
   2453     {"b", p_b_, 0},
   2454     {"bl", p_bl_, 0},
   2455     {"cbz", p_cbz_, 0},
   2456     {"cbnz", p_cbnz_, 0},
   2457     {"svc", p_svc_, 0},
   2458     {"brk", p_brk_, 0},
   2459     {"hlt", p_hlt_, 0},
   2460     {"mrs", p_mrs_, 0},
   2461     {"msr", p_msr_, 0},
   2462     {"ldr", p_ldr_, 0},
   2463     {"str", p_str_, 0},
   2464     {"ldrb", p_ldrb, 0},
   2465     {"strb", p_strb, 0},
   2466     {"ldrh", p_ldrh, 0},
   2467     {"strh", p_strh, 0},
   2468     {"ldrsb", p_ldrsb, 0},
   2469     {"ldrsh", p_ldrsh, 0},
   2470     {"ldrsw", p_ldrsw, 0},
   2471     {"ldur", p_ldur_, 0},
   2472     {"stur", p_stur_, 0},
   2473     {"ldurb", p_ldurb, 0},
   2474     {"sturb", p_sturb, 0},
   2475     {"ldurh", p_ldurh, 0},
   2476     {"sturh", p_sturh, 0},
   2477     {"ldursb", p_ldursb, 0},
   2478     {"ldursh", p_ldursh, 0},
   2479     {"ldursw", p_ldursw, 0},
   2480     {"ldp", p_ldp_, 0},
   2481     {"stp", p_stp_, 0},
   2482     {"adr", p_adr_, 0},
   2483     {"adrp", p_adrp_, 0},
   2484     /* ---- atomics / exclusive ---- */
   2485     {"ldxr", p_ldxr_wx, 0},
   2486     {"ldxrb", p_ldxrb, 0},
   2487     {"ldxrh", p_ldxrh, 0},
   2488     {"ldaxr", p_ldaxr_wx, 0},
   2489     {"ldaxrb", p_ldaxrb, 0},
   2490     {"ldaxrh", p_ldaxrh, 0},
   2491     {"ldar", p_ldar_wx, 0},
   2492     {"ldarb", p_ldarb, 0},
   2493     {"ldarh", p_ldarh, 0},
   2494     {"stxr", p_stxr_wx, 0},
   2495     {"stxrb", p_stxrb_, 0},
   2496     {"stxrh", p_stxrh_, 0},
   2497     {"stlxr", p_stlxr_wx, 0},
   2498     {"stlxrb", p_stlxrb_, 0},
   2499     {"stlxrh", p_stlxrh_, 0},
   2500     {"stlr", p_stlr_wx, 0},
   2501     {"stlrb", p_stlrb_, 0},
   2502     {"stlrh", p_stlrh_, 0},
   2503     {"cas", p_cas_wx, 0},
   2504     {"casb", p_casb, 0},
   2505     {"cash", p_cash, 0},
   2506     {"casa", p_casa_wx, 0},
   2507     {"casab", p_casab, 0},
   2508     {"casah", p_casah, 0},
   2509     {"casl", p_casl_wx, 0},
   2510     {"caslb", p_caslb, 0},
   2511     {"caslh", p_caslh, 0},
   2512     {"casal", p_casal_wx, 0},
   2513     {"casalb", p_casalb, 0},
   2514     {"casalh", p_casalh, 0},
   2515     {"swp", p_swp_wx, 0},
   2516     {"swpb", p_swpb, 0},
   2517     {"swph", p_swph, 0},
   2518     {"swpa", p_swpa_wx, 0},
   2519     {"swpab", p_swpab, 0},
   2520     {"swpah", p_swpah, 0},
   2521     {"swpl", p_swpl_wx, 0},
   2522     {"swplb", p_swplb, 0},
   2523     {"swplh", p_swplh, 0},
   2524     {"swpal", p_swpal_wx, 0},
   2525     {"swpalb", p_swpalb, 0},
   2526     {"swpalh", p_swpalh, 0},
   2527     {"ldadd", p_ldadd_wx, 0},
   2528     {"ldaddb", p_ldaddb, 0},
   2529     {"ldaddh", p_ldaddh, 0},
   2530     {"ldadda", p_ldadda_wx, 0},
   2531     {"ldaddab", p_ldaddab, 0},
   2532     {"ldaddah", p_ldaddah, 0},
   2533     {"ldaddl", p_ldaddl_wx, 0},
   2534     {"ldaddlb", p_ldaddlb, 0},
   2535     {"ldaddlh", p_ldaddlh, 0},
   2536     {"ldaddal", p_ldaddal_wx, 0},
   2537     {"ldaddalb", p_ldaddalb, 0},
   2538     {"ldaddalh", p_ldaddalh, 0},
   2539     {"ldclr", p_ldclr_wx, 0},
   2540     {"ldclrb", p_ldclrb, 0},
   2541     {"ldclrh", p_ldclrh, 0},
   2542     {"ldclra", p_ldclra_wx, 0},
   2543     {"ldclrab", p_ldclrab, 0},
   2544     {"ldclrah", p_ldclrah, 0},
   2545     {"ldclrl", p_ldclrl_wx, 0},
   2546     {"ldclrlb", p_ldclrlb, 0},
   2547     {"ldclrlh", p_ldclrlh, 0},
   2548     {"ldclral", p_ldclral_wx, 0},
   2549     {"ldclralb", p_ldclralb, 0},
   2550     {"ldclralh", p_ldclralh, 0},
   2551     {"ldeor", p_ldeor_wx, 0},
   2552     {"ldeorb", p_ldeorb, 0},
   2553     {"ldeorh", p_ldeorh, 0},
   2554     {"ldeora", p_ldeora_wx, 0},
   2555     {"ldeorab", p_ldeorab, 0},
   2556     {"ldeorah", p_ldeorah, 0},
   2557     {"ldeorl", p_ldeorl_wx, 0},
   2558     {"ldeorlb", p_ldeorlb, 0},
   2559     {"ldeorlh", p_ldeorlh, 0},
   2560     {"ldeoral", p_ldeoral_wx, 0},
   2561     {"ldeoralb", p_ldeoralb, 0},
   2562     {"ldeoralh", p_ldeoralh, 0},
   2563     {"ldset", p_ldset_wx, 0},
   2564     {"ldsetb", p_ldsetb, 0},
   2565     {"ldseth", p_ldseth, 0},
   2566     {"ldseta", p_ldseta_wx, 0},
   2567     {"ldsetab", p_ldsetab, 0},
   2568     {"ldsetah", p_ldsetah, 0},
   2569     {"ldsetl", p_ldsetl_wx, 0},
   2570     {"ldsetlb", p_ldsetlb, 0},
   2571     {"ldsetlh", p_ldsetlh, 0},
   2572     {"ldsetal", p_ldsetal_wx, 0},
   2573     {"ldsetalb", p_ldsetalb, 0},
   2574     {"ldsetalh", p_ldsetalh, 0},
   2575     {"b.eq", p_b_eq, 0},
   2576     {"b.ne", p_b_ne, 0},
   2577     {"b.cs", p_b_cs, 0},
   2578     {"b.hs", p_b_hs, 0},
   2579     {"b.cc", p_b_cc, 0},
   2580     {"b.lo", p_b_lo, 0},
   2581     {"b.mi", p_b_mi, 0},
   2582     {"b.pl", p_b_pl, 0},
   2583     {"b.vs", p_b_vs, 0},
   2584     {"b.vc", p_b_vc, 0},
   2585     {"b.hi", p_b_hi, 0},
   2586     {"b.ls", p_b_ls, 0},
   2587     {"b.ge", p_b_ge, 0},
   2588     {"b.lt", p_b_lt, 0},
   2589     {"b.gt", p_b_gt, 0},
   2590     {"b.le", p_b_le, 0},
   2591     {"b.al", p_b_al, 0},
   2592     {NULL, NULL, 0},
   2593 };
   2594 
   2595 void aa64_asm_insn(AA64Asm* a, AsmDriver* d, Sym mnemonic) {
   2596   (void)a;
   2597   Slice msl = pool_slice(asm_driver_pool(d), mnemonic);
   2598   const char* mp = msl.s;
   2599   size_t mn = msl.len;
   2600   for (const AA64Mn* row = kTable; row->name; ++row) {
   2601     if (icase_eq(mp, mn, row->name)) {
   2602       row->fn(d);
   2603       return;
   2604     }
   2605   }
   2606   asm_driver_panic(d, "asm: unknown mnemonic");
   2607 }
   2608 
   2609 /* ---- inline-asm template walker (Phase 4b Track C) ---- */
   2610 
   2611 /* Per-call rendered-line buffer.  GCC's inline asm rarely emits more
   2612  * than a handful of instructions per block; one line of substituted
   2613  * text fits comfortably inside this. Truncation panics — the operator
   2614  * grammar should never grow a single line beyond this without a
   2615  * deliberate reason. */
   2616 #define AA64_INLINE_LINE_CAP 1024
   2617 
   2618 _Noreturn static void inline_panic(AA64Asm* a, const char* msg);
   2619 
   2620 /* Render a 5-bit register number into the StrBuf using the requested
   2621  * width form.  is64 picks x-form vs w-form; SP / ZR encode as
   2622  * register #31 and we render them as wzr/xzr or wsp/sp depending on
   2623  * caller intent — for inline-asm v1 the bound operand always names a
   2624  * GP register, never SP, so we emit wzr/xzr for #31. */
   2625 static void render_reg(StrBuf* sb, u32 reg, int is64) {
   2626   if (reg == 31u) {
   2627     strbuf_puts(sb, is64 ? "xzr" : "wzr");
   2628     return;
   2629   }
   2630   strbuf_putc(sb, is64 ? 'x' : 'w');
   2631   if (reg >= 10u) strbuf_putc(sb, (char)('0' + (reg / 10u)));
   2632   strbuf_putc(sb, (char)('0' + (reg % 10u)));
   2633 }
   2634 
   2635 static void render_fp_reg(StrBuf* sb, u32 reg, u32 nbytes) {
   2636   strbuf_putc(sb, nbytes <= 4u ? 's' : 'd');
   2637   if (reg >= 10u) strbuf_putc(sb, (char)('0' + (reg / 10u)));
   2638   strbuf_putc(sb, (char)('0' + (reg % 10u)));
   2639 }
   2640 
   2641 static u32 inline_op_size(AA64Asm* a, const Operand* op) {
   2642   if (!op->type) return 8u;
   2643   u64 n = cg_type_size(a->c, op->type);
   2644   if (!n) return 8u;
   2645   if (n > 16u) inline_panic(a, "inline asm operand is too large");
   2646   return (u32)n;
   2647 }
   2648 
   2649 static int inline_op_is_ptr(AA64Asm* a, const Operand* op) {
   2650   return op->type && cg_type_is_ptr(a->c, op->type);
   2651 }
   2652 
   2653 /* Render a signed 64-bit integer prefixed with '#'. */
   2654 static void render_imm(StrBuf* sb, i64 v) {
   2655   strbuf_putc(sb, '#');
   2656   strbuf_put_i64(sb, v);
   2657 }
   2658 
   2659 /* Render an addressing form `[xN, #ofs]` for OPK_INDIRECT. */
   2660 static void render_indirect(StrBuf* sb, Reg base, i32 ofs) {
   2661   strbuf_putc(sb, '[');
   2662   render_reg(sb, (u32)base, /*is64=*/1);
   2663   if (ofs != 0) {
   2664     strbuf_puts(sb, ", ");
   2665     render_imm(sb, (i64)ofs);
   2666   }
   2667   strbuf_putc(sb, ']');
   2668 }
   2669 
   2670 _Noreturn static void inline_panic(AA64Asm* a, const char* msg) {
   2671   SrcLoc loc = {0, 0, 0};
   2672   compiler_panic(a->c, loc, "inline asm: %.*s",
   2673                  SLICE_ARG(slice_from_cstr(msg)));
   2674 }
   2675 
   2676 /* Resolve operand index N → (kind=0 forced default, 1=force-w, 2=force-x,
   2677  * 3=address form `%aN`).  Renders into sb. */
   2678 static void render_operand(AA64Asm* a, StrBuf* sb, u32 idx, int form) {
   2679   u32 ntot = a->nout + a->nin;
   2680   if (idx >= ntot) inline_panic(a, "operand index out of range");
   2681   const Operand* op =
   2682       (idx < a->nout) ? &a->out_ops[idx] : &a->in_ops[idx - a->nout];
   2683   switch (form) {
   2684     case 1: /* %wN — force 32-bit register form */
   2685       if (op->kind != AA64_INLINE_OPK_REG ||
   2686           op->pad[0] != AA64_INLINE_OPCLS_INT)
   2687         inline_panic(a, "%w on non-integer-register operand");
   2688       render_reg(sb, (u32)op->v.local, 0);
   2689       return;
   2690     case 2: /* %xN — force 64-bit register form */
   2691       if (op->kind != AA64_INLINE_OPK_REG ||
   2692           op->pad[0] != AA64_INLINE_OPCLS_INT)
   2693         inline_panic(a, "%x on non-integer-register operand");
   2694       render_reg(sb, (u32)op->v.local, 1);
   2695       return;
   2696     case 3: /* %aN — memory addressing form */
   2697       if (op->kind != OPK_INDIRECT) inline_panic(a, "%a on non-memory operand");
   2698       /* Inline asm consumes a plain pointer-shaped address; the cg
   2699        * contract guarantees no EA index here. */
   2700       if (op->v.ind.index != REG_NONE)
   2701         inline_panic(a, "%a operand has unexpected EA index");
   2702       render_indirect(sb, op->v.ind.base, op->v.ind.ofs);
   2703       return;
   2704     default:
   2705       break;
   2706   }
   2707   /* Default rendering by operand kind. */
   2708   switch (op->kind) {
   2709     case AA64_INLINE_OPK_REG:
   2710       if (op->pad[0] == AA64_INLINE_OPCLS_FP) {
   2711         render_fp_reg(sb, (u32)op->v.local, inline_op_size(a, op));
   2712       } else {
   2713         render_reg(sb, (u32)op->v.local,
   2714                    inline_op_is_ptr(a, op) || inline_op_size(a, op) > 4u);
   2715       }
   2716       return;
   2717     case OPK_IMM:
   2718       render_imm(sb, op->v.imm);
   2719       return;
   2720     case OPK_INDIRECT:
   2721       if (op->v.ind.index != REG_NONE)
   2722         inline_panic(a, "inline-asm operand has unexpected EA index");
   2723       render_indirect(sb, op->v.ind.base, op->v.ind.ofs);
   2724       return;
   2725     default:
   2726       inline_panic(a, "unsupported operand kind for %N");
   2727   }
   2728 }
   2729 
   2730 /* Lex one line of substituted asm and dispatch via aa64_asm_insn. */
   2731 static void run_one_line(AA64Asm* a, MCEmitter* mc, const char* text,
   2732                          size_t len) {
   2733   /* Skip blank lines. */
   2734   size_t i;
   2735   for (i = 0; i < len; ++i) {
   2736     if (text[i] != ' ' && text[i] != '\t') break;
   2737   }
   2738   if (i == len) return;
   2739 
   2740   AsmLexer* lx = asm_lex_open_mem(a->c, "<inline-asm>", text, len);
   2741   AsmDriver* d = asm_driver_open_inline(a->c, mc, lx);
   2742 
   2743   /* The first non-trivial token must be the mnemonic identifier (or a
   2744    * `.directive`, but inline asm doesn't normally use directives — leave
   2745    * that path unsupported until needed). */
   2746   AsmTok t = asm_driver_peek(d);
   2747   while (t.kind == ASM_TOK_NEWLINE || t.kind == ASM_TOK_HASH) {
   2748     (void)asm_driver_next(d);
   2749     if (t.kind == ASM_TOK_HASH) {
   2750       /* Skip cpp linemarker rest of line. */
   2751       while (!asm_driver_at_eol(d)) (void)asm_driver_next(d);
   2752     }
   2753     t = asm_driver_peek(d);
   2754   }
   2755   if (t.kind == ASM_TOK_EOF) {
   2756     asm_driver_close_inline(d);
   2757     asm_lex_close(lx);
   2758     return;
   2759   }
   2760   if (t.kind != ASM_TOK_IDENT)
   2761     inline_panic(a, "expected mnemonic at start of inline asm line");
   2762   (void)asm_driver_next(d);
   2763   Sym mn = t.v.ident;
   2764   /* Compose `b.eq` etc. — same trick as the standalone driver. */
   2765   AsmTok dot = asm_driver_peek(d);
   2766   if (asm_driver_tok_is_punct(dot, '.')) {
   2767     (void)asm_driver_next(d);
   2768     AsmTok rest = asm_driver_next(d);
   2769     if (rest.kind != ASM_TOK_IDENT)
   2770       inline_panic(a, "composite mnemonic: expected ident after '.'");
   2771     Slice hsl = pool_slice(asm_driver_pool(d), mn);
   2772     Slice rsl = pool_slice(asm_driver_pool(d), rest.v.ident);
   2773     size_t hn = hsl.len, rn = rsl.len;
   2774     const char* hp = hsl.s;
   2775     const char* rp = rsl.s;
   2776     char buf[64];
   2777     if (hn + 1 + rn >= sizeof buf)
   2778       inline_panic(a, "composite mnemonic too long");
   2779     for (size_t k = 0; k < hn; ++k) buf[k] = hp[k];
   2780     buf[hn] = '.';
   2781     for (size_t k = 0; k < rn; ++k) buf[hn + 1 + k] = rp[k];
   2782     mn = pool_intern_slice(asm_driver_pool(d),
   2783                            (Slice){.s = buf, .len = hn + 1 + rn});
   2784   }
   2785   aa64_asm_insn(a, d, mn);
   2786   asm_driver_close_inline(d);
   2787   asm_lex_close(lx);
   2788 }
   2789 
   /* Substitute placeholders into one line's StrBuf, then dispatch.
    *
    * The input range is [start, end) inside the template text.  `sb` is
    * scratch storage that is reset on entry, so the caller does not need to
    * clear it between lines. */
   2794 static void render_and_run_line(AA64Asm* a, MCEmitter* mc, StrBuf* sb,
   2795                                 const char* start, const char* end) {
   2796   strbuf_reset(sb);
   2797   for (const char* p = start; p < end; ++p) {
   2798     char c = *p;
   2799     if (c != '%') {
   2800       strbuf_putc(sb, c);
   2801       continue;
   2802     }
   2803     /* Placeholder. */
   2804     if (p + 1 >= end) inline_panic(a, "trailing '%' in template");
   2805     char n = *(p + 1);
   2806     if (n == '%') {
   2807       strbuf_putc(sb, '%');
   2808       ++p;
   2809       continue;
   2810     }
   2811     if (n == '[') {
   2812       /* %[name] — scan to the closing ']' and resolve against
   2813        * AsmConstraint.name on the combined outs+ins list. Match by
   2814        * comparing the named-bracket contents against the interned name
   2815        * Sym stored on each constraint. */
   2816       const char* nbeg = p + 2;
   2817       const char* nend = nbeg;
   2818       while (nend < end && *nend != ']') ++nend;
   2819       if (nend == end) inline_panic(a, "unterminated %[name]");
   2820       size_t nlen = (size_t)(nend - nbeg);
   2821       Sym needle =
   2822           pool_intern_slice(a->c->global, (Slice){.s = nbeg, .len = nlen});
   2823       u32 idx = (u32)-1;
   2824       for (u32 k = 0; k < a->nout; ++k) {
   2825         if (a->outs[k].name == needle) {
   2826           idx = k;
   2827           break;
   2828         }
   2829       }
   2830       if (idx == (u32)-1) {
   2831         for (u32 k = 0; k < a->nin; ++k) {
   2832           if (a->ins[k].name == needle) {
   2833             idx = a->nout + k;
   2834             break;
   2835           }
   2836         }
   2837       }
   2838       if (idx == (u32)-1)
   2839         inline_panic(a, "%[name] does not match any constraint");
   2840       p = nend; /* loop's ++p steps past the ']' */
   2841       render_operand(a, sb, idx, 0);
   2842       continue;
   2843     }
   2844     int form = 0; /* 0=default, 1=w, 2=x, 3=a */
   2845     if (n == 'w' || n == 'x' || n == 'a') {
   2846       form = (n == 'w') ? 1 : (n == 'x') ? 2 : 3;
   2847       ++p;
   2848       if (p + 1 >= end) inline_panic(a, "trailing '%' modifier in template");
   2849       n = *(p + 1);
   2850     }
   2851     if (n == '[') {
   2852       /* %w[name] / %x[name] / %a[name] — width modifier + symbolic
   2853        * operand. Resolves the same way as %[name] but renders with the
   2854        * declared form. */
   2855       const char* nbeg = p + 2;
   2856       const char* nend = nbeg;
   2857       while (nend < end && *nend != ']') ++nend;
   2858       if (nend == end) inline_panic(a, "unterminated %[name]");
   2859       size_t nlen = (size_t)(nend - nbeg);
   2860       Sym needle =
   2861           pool_intern_slice(a->c->global, (Slice){.s = nbeg, .len = nlen});
   2862       u32 idx = (u32)-1;
   2863       for (u32 k = 0; k < a->nout; ++k) {
   2864         if (a->outs[k].name == needle) {
   2865           idx = k;
   2866           break;
   2867         }
   2868       }
   2869       if (idx == (u32)-1) {
   2870         for (u32 k = 0; k < a->nin; ++k) {
   2871           if (a->ins[k].name == needle) {
   2872             idx = a->nout + k;
   2873             break;
   2874           }
   2875         }
   2876       }
   2877       if (idx == (u32)-1)
   2878         inline_panic(a, "%[name] does not match any constraint");
   2879       p = nend; /* loop's ++p steps past the ']' */
   2880       render_operand(a, sb, idx, form);
   2881       continue;
   2882     }
   2883     if (n < '0' || n > '9') inline_panic(a, "expected digit after '%'");
   2884     u32 idx = (u32)(n - '0');
   2885     ++p;
   2886     /* GCC syntax permits up to two digits (%0..%99). */
   2887     if (p + 1 < end && *(p + 1) >= '0' && *(p + 1) <= '9') {
   2888       idx = idx * 10 + (u32)(*(p + 1) - '0');
   2889       ++p;
   2890     }
   2891     render_operand(a, sb, idx, form);
   2892   }
   2893   if (sb->truncated) inline_panic(a, "inline asm line buffer overflow");
   2894   run_one_line(a, mc, strbuf_cstr(sb), strbuf_len(sb));
   2895 }
   2896 
   2897 void aa64_asm_run_template(AA64Asm* a, MCEmitter* mc, const char* tmpl) {
   2898   if (!tmpl || !*tmpl) return;
   2899 
   2900   char buf[AA64_INLINE_LINE_CAP];
   2901   StrBuf sb;
   2902   strbuf_init(&sb, buf, sizeof buf);
   2903 
   2904   /* Walk tmpl, splitting on '\n' and ';' line terminators.  Track bracket
   2905    * depth and quote state so that a literal ';' inside `[ ... ]` or a
   2906    * quoted string is not mistaken for a statement separator. */
   2907   const char* line_start = tmpl;
   2908   int bracket = 0;
   2909   char quote = 0;
   2910   for (const char* p = tmpl;; ++p) {
   2911     char c = *p;
   2912     if (c == '\0') {
   2913       render_and_run_line(a, mc, &sb, line_start, p);
   2914       break;
   2915     }
   2916     if (quote) {
   2917       if (c == '\\' && *(p + 1)) {
   2918         ++p;
   2919         continue;
   2920       }
   2921       if (c == quote) quote = 0;
   2922       continue;
   2923     }
   2924     if (c == '"' || c == '\'') {
   2925       quote = c;
   2926       continue;
   2927     }
   2928     if (c == '[') {
   2929       ++bracket;
   2930       continue;
   2931     }
   2932     if (c == ']') {
   2933       if (bracket) --bracket;
   2934       continue;
   2935     }
   2936     if (bracket == 0 && (c == '\n' || c == ';')) {
   2937       render_and_run_line(a, mc, &sb, line_start, p);
   2938       line_start = p + 1;
   2939     }
   2940   }
   2941 }