kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

isa.h (61761B)


      1 #ifndef KIT_ARCH_AA64_ISA_H
      2 #define KIT_ARCH_AA64_ISA_H
      3 
      4 /* AArch64 ISA descriptors — single source of truth for every instruction
      5  * the encoder, decoder, and disassembler all need to agree on.
      6  *
      7  * Each format declares:
      8  *   - A field struct naming every encoded bitfield.
      9  *   - {pack, unpack} pure functions that round-trip through a u32 word.
     10  *   - A {family_match, family_mask} pair identifying the format.
     11  *   - Per-instruction inline wrappers that bake in the opc bits and
     12  *     return the encoded word; callers emit it via MCEmitter.
     13  *
     14  * A descriptor table at the bottom (aa64_insn_table) maps mnemonic →
     15  * (match, mask, AA64Format), so the disassembler matches a raw word with
     16  *   for (i=0; i<N; ++i) if ((word & desc[i].mask) == desc[i].match) ...
     17  * and then calls the format's unpack to recover the operand fields.
     18  *
     19  * Conventions:
     20  *   - sf = 0 selects the 32-bit (W) form, sf = 1 selects 64-bit (X).
     21  *   - Reg values are the raw 5-bit encoding (0..30 + 31 for ZR/SP).
 *   - All wrappers take Rd first, then Rn, Rm, Ra, matching the
 *     destination-first operand order of A64 assembly syntax in the Arm ARM.
     24  *
     25  * New instructions land as one entry in the table and (typically) one
     26  * inline wrapper in the relevant format section. */
     27 
     28 #include "core/core.h"
     29 #include "core/slice.h"
     30 #include "core/strbuf.h"
     31 
     32 /* ---- common register names ---- */
     33 #define AA64_ZR 31u /* WZR / XZR */
     34 #define AA64_SP 31u /* SP at Rd/Rn slot */
     35 #define AA64_LR 30u /* X30 / link register */
     36 
     37 /* ---- format kinds ---- */
     38 typedef enum AA64Format {
     39   AA64_FMT_MOVEWIDE,
     40   AA64_FMT_LOG_SR,      /* logical, shifted register */
     41   AA64_FMT_ADDSUB_SR,   /* add/sub, shifted register */
     42   AA64_FMT_DP3,         /* data-processing, 3 source */
     43   AA64_FMT_DP2,         /* data-processing, 2 source */
     44   AA64_FMT_CONDSEL,     /* conditional select (CSEL / CSINC / aliases) */
     45   AA64_FMT_BR_REG,      /* unconditional branch (register) */
     46   AA64_FMT_PCREL_ADR,   /* PC-relative ADR / ADRP */
     47   AA64_FMT_ADDSUB_IMM,  /* add/sub, immediate */
     48   AA64_FMT_LDST_UIMM,   /* load/store, unsigned 12-bit immediate offset */
     49   AA64_FMT_LDSTP_PRE,   /* load/store pair, pre-indexed */
     50   AA64_FMT_LDSTP_SOFF,  /* load/store pair, signed-offset */
     51   AA64_FMT_LDSTP_POST,  /* load/store pair, post-indexed */
     52   AA64_FMT_LDST_SIMM9,  /* load/store, unscaled 9-bit signed offset
     53                            (LDUR / STUR, V=0 and V=1) */
     54   AA64_FMT_BR_IMM,      /* unconditional branch (immediate) — B / BL */
     55   AA64_FMT_BR_COND,     /* B.cond (imm19) */
     56   AA64_FMT_CB,          /* compare-and-branch (CBZ / CBNZ) */
     57   AA64_FMT_EXCEPT,      /* exception generation (BRK / SVC / HVC / ...) */
     58   AA64_FMT_HINT,        /* hint (NOP / YIELD / ...) */
     59   AA64_FMT_BARRIER,     /* memory barrier (DMB / DSB / ISB / CLREX) */
     60   AA64_FMT_DP1,         /* data-processing, 1 source (RBIT/REV/REV16/CLZ) */
     61   AA64_FMT_BITFIELD,    /* bitfield move (SBFM / UBFM): Rd, Rn, #immr, #imms */
     62   AA64_FMT_LDST_REGOFF, /* load/store, register offset [Xn, Xm{, LSL #s}] */
     63   AA64_FMT_FP_DP2,      /* FP data-processing 2-source (FADD/FSUB/FMUL/FDIV) */
     64   AA64_FMT_FP_DP1,      /* FP data-processing 1-source (FMOV/FNEG/FABS/FSQRT) */
     65   AA64_FMT_FP_CMP,      /* FP compare (FCMP) */
     66   AA64_FMT_FP_CVT,      /* FP precision convert (FCVT single<->double) */
     67   AA64_FMT_FP_INT_CVT,  /* FP<->int convert + FMOV gpr<->fp
     68                          * (SCVTF/UCVTF/FCVTZS/FCVTZU/FMOV) */
     69   AA64_FMT_LDST_EXCL,   /* load/store exclusive + acquire/release ordered
     70                          * (LDXR/LDAXR/STXR/STLXR/LDAR/STLR + b/h) */
     71   AA64_FMT_LOG_IMM,     /* logical, immediate (AND/ORR/EOR/ANDS #bitmask) */
     72   AA64_FMT_SYSREG,      /* system-register move (MRS Xt,<reg> / MSR <reg>,Xt) */
     73 } AA64Format;
     74 
     75 /* ---- AsmFlags column on AA64InsnDesc ----
     76  *
     77  * Per-row metadata that varies across same-format members.  Most rows
     78  * carry 0.  When the disassembler matches a row whose ALIAS bit is set,
     79  * that's the spelling it prints; the assembler also accepts both the
     80  * alias and the canonical form because both rows live in the table. */
     81 #define AA64_ASMFL_ALIAS                                                      \
     82   0x01u                      /* row is an alias (e.g. MOV → ORR Rd, ZR, Rm) \
     83                               */
     84 #define AA64_ASMFL_SF1 0x02u /* 64-bit form only (sf hard-wired) */
     85 #define AA64_ASMFL_NORN \
     86   0x04u /* hide Rn operand in print (e.g. RET when Rn=30) */
     87 
     88 /* ====================================================================
     89  * Move-wide immediate (MOVN / MOVZ / MOVK)
     90  *   sf  opc(2)  100101  hw(2)  imm16(16)  Rd(5)
     91  *   31  30..29  28..23  22..21 20..5      4..0
     92  * ==================================================================== */
     93 
     94 #define AA64_MOVN_OPC 0u
     95 #define AA64_MOVZ_OPC 2u
     96 #define AA64_MOVK_OPC 3u
     97 
     98 #define AA64_MOVEWIDE_FAMILY_MATCH 0x12800000u
     99 #define AA64_MOVEWIDE_FAMILY_MASK 0x1F800000u /* bits 28:23 */
    100 
    101 typedef struct AA64MoveWide {
    102   u32 sf, opc, hw, imm16, Rd;
    103 } AA64MoveWide;
    104 
    105 static inline u32 aa64_movewide_pack(AA64MoveWide f) {
    106   return ((f.sf & 1u) << 31) | ((f.opc & 3u) << 29) |
    107          AA64_MOVEWIDE_FAMILY_MATCH | ((f.hw & 3u) << 21) |
    108          ((f.imm16 & 0xffffu) << 5) | (f.Rd & 0x1fu);
    109 }
    110 
    111 static inline AA64MoveWide aa64_movewide_unpack(u32 w) {
    112   AA64MoveWide f;
    113   f.sf = (w >> 31) & 1u;
    114   f.opc = (w >> 29) & 3u;
    115   f.hw = (w >> 21) & 3u;
    116   f.imm16 = (w >> 5) & 0xffffu;
    117   f.Rd = w & 0x1fu;
    118   return f;
    119 }
    120 
    121 static inline u32 aa64_movz(u32 sf, u32 Rd, u32 imm16, u32 hw) {
    122   return aa64_movewide_pack((AA64MoveWide){
    123       .sf = sf, .opc = AA64_MOVZ_OPC, .hw = hw, .imm16 = imm16, .Rd = Rd});
    124 }
    125 static inline u32 aa64_movn(u32 sf, u32 Rd, u32 imm16, u32 hw) {
    126   return aa64_movewide_pack((AA64MoveWide){
    127       .sf = sf, .opc = AA64_MOVN_OPC, .hw = hw, .imm16 = imm16, .Rd = Rd});
    128 }
    129 static inline u32 aa64_movk(u32 sf, u32 Rd, u32 imm16, u32 hw) {
    130   return aa64_movewide_pack((AA64MoveWide){
    131       .sf = sf, .opc = AA64_MOVK_OPC, .hw = hw, .imm16 = imm16, .Rd = Rd});
    132 }
    133 
    134 /* ====================================================================
    135  * Logical, shifted register (AND / ORR / EOR / ANDS, with N inverting
    136  * Rm to BIC / ORN / EON / BICS).
    137  *   sf  opc(2)  01010  shift(2)  N(1)  Rm(5)  imm6(6)  Rn(5)  Rd(5)
    138  *   31  30..29  28..24 23..22    21    20..16 15..10   9..5   4..0
    139  * ==================================================================== */
    140 
    141 #define AA64_LOG_AND_OPC 0u
    142 #define AA64_LOG_ORR_OPC 1u
    143 #define AA64_LOG_EOR_OPC 2u
    144 #define AA64_LOG_ANDS_OPC 3u
    145 
    146 #define AA64_LOGSR_FAMILY_MATCH 0x0A000000u
    147 #define AA64_LOGSR_FAMILY_MASK 0x1F000000u /* bits 28:24 */
    148 
    149 typedef struct AA64LogSR {
    150   u32 sf, opc, shift, N, Rm, imm6, Rn, Rd;
    151 } AA64LogSR;
    152 
    153 static inline u32 aa64_logsr_pack(AA64LogSR f) {
    154   return ((f.sf & 1u) << 31) | ((f.opc & 3u) << 29) | AA64_LOGSR_FAMILY_MATCH |
    155          ((f.shift & 3u) << 22) | ((f.N & 1u) << 21) | ((f.Rm & 0x1fu) << 16) |
    156          ((f.imm6 & 0x3fu) << 10) | ((f.Rn & 0x1fu) << 5) | (f.Rd & 0x1fu);
    157 }
    158 
    159 static inline AA64LogSR aa64_logsr_unpack(u32 w) {
    160   AA64LogSR f;
    161   f.sf = (w >> 31) & 1u;
    162   f.opc = (w >> 29) & 3u;
    163   f.shift = (w >> 22) & 3u;
    164   f.N = (w >> 21) & 1u;
    165   f.Rm = (w >> 16) & 0x1fu;
    166   f.imm6 = (w >> 10) & 0x3fu;
    167   f.Rn = (w >> 5) & 0x1fu;
    168   f.Rd = w & 0x1fu;
    169   return f;
    170 }
    171 
    172 static inline u32 aa64_and(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
    173   return aa64_logsr_pack((AA64LogSR){
    174       .sf = sf, .opc = AA64_LOG_AND_OPC, .Rm = Rm, .Rn = Rn, .Rd = Rd});
    175 }
    176 static inline u32 aa64_orr(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
    177   return aa64_logsr_pack((AA64LogSR){
    178       .sf = sf, .opc = AA64_LOG_ORR_OPC, .Rm = Rm, .Rn = Rn, .Rd = Rd});
    179 }
    180 static inline u32 aa64_eor(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
    181   return aa64_logsr_pack((AA64LogSR){
    182       .sf = sf, .opc = AA64_LOG_EOR_OPC, .Rm = Rm, .Rn = Rn, .Rd = Rd});
    183 }
    184 static inline u32 aa64_orn(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
    185   return aa64_logsr_pack((AA64LogSR){
    186       .sf = sf, .opc = AA64_LOG_ORR_OPC, .N = 1, .Rm = Rm, .Rn = Rn, .Rd = Rd});
    187 }
    188 
    189 /* MOV Wd, Wm  ≡  ORR Wd, WZR, Wm */
    190 static inline u32 aa64_mov_reg(u32 sf, u32 Rd, u32 Rm) {
    191   return aa64_orr(sf, Rd, AA64_ZR, Rm);
    192 }
    193 /* MVN Wd, Wm  ≡  ORN Wd, WZR, Wm */
    194 static inline u32 aa64_mvn(u32 sf, u32 Rd, u32 Rm) {
    195   return aa64_orn(sf, Rd, AA64_ZR, Rm);
    196 }
    197 
    198 /* ====================================================================
    199  * Logical, immediate (AND / ORR / EOR / ANDS, bitmask-imm form)
    200  *   sf  opc(2)  100100  N(1)  immr(6)  imms(6)  Rn(5)  Rd(5)
    201  *   31  30..29  28..23  22    21..16   15..10   9..5   4..0
    202  *
    203  * N:immr:imms encodes a repeated-pattern bitmask. The encoder
    204  * aa64_logimm_encode below computes those fields from a literal value;
    205  * this pack just lays the bits out. For 32-bit ops (sf=0), N must be 0;
    206  * for 64-bit ops N can be 0 or 1 and selects whether the pattern
    207  * element is 64 bits (N=1) or 2..32 bits (N=0).
    208  * ==================================================================== */
    209 
    210 #define AA64_LOGIMM_FAMILY_MATCH 0x12000000u
    211 #define AA64_LOGIMM_FAMILY_MASK 0x1F800000u /* bits 28:23 */
    212 
    213 typedef struct AA64LogImm {
    214   u32 sf, opc, N, immr, imms, Rn, Rd;
    215 } AA64LogImm;
    216 
    217 static inline u32 aa64_logimm_pack(AA64LogImm f) {
    218   return ((f.sf & 1u) << 31) | ((f.opc & 3u) << 29) | AA64_LOGIMM_FAMILY_MATCH |
    219          ((f.N & 1u) << 22) | ((f.immr & 0x3fu) << 16) |
    220          ((f.imms & 0x3fu) << 10) | ((f.Rn & 0x1fu) << 5) | (f.Rd & 0x1fu);
    221 }
    222 
    223 static inline u32 aa64_and_imm(u32 sf, u32 Rd, u32 Rn, u32 N, u32 immr,
    224                                u32 imms) {
    225   return aa64_logimm_pack((AA64LogImm){.sf = sf,
    226                                        .opc = AA64_LOG_AND_OPC,
    227                                        .N = N,
    228                                        .immr = immr,
    229                                        .imms = imms,
    230                                        .Rn = Rn,
    231                                        .Rd = Rd});
    232 }
    233 static inline u32 aa64_orr_imm(u32 sf, u32 Rd, u32 Rn, u32 N, u32 immr,
    234                                u32 imms) {
    235   return aa64_logimm_pack((AA64LogImm){.sf = sf,
    236                                        .opc = AA64_LOG_ORR_OPC,
    237                                        .N = N,
    238                                        .immr = immr,
    239                                        .imms = imms,
    240                                        .Rn = Rn,
    241                                        .Rd = Rd});
    242 }
    243 static inline u32 aa64_eor_imm(u32 sf, u32 Rd, u32 Rn, u32 N, u32 immr,
    244                                u32 imms) {
    245   return aa64_logimm_pack((AA64LogImm){.sf = sf,
    246                                        .opc = AA64_LOG_EOR_OPC,
    247                                        .N = N,
    248                                        .immr = immr,
    249                                        .imms = imms,
    250                                        .Rn = Rn,
    251                                        .Rd = Rd});
    252 }
    253 
/* Bitmask-immediate predicate + encoder. Returns 1 and writes N/immr/imms
 * if `imm` is encodable as an AArch64 logical immediate of width
 * (sf ? 64 : 32); returns 0 otherwise (caller materializes into a
 * scratch and uses the shifted-register form). The three outputs are
 * written only on success.
 *
 * Algorithm (inverse of ARM ARM "DecodeBitMasks"): an encodable value
 * is a non-zero, non-all-ones bitmask made of a repeated `size`-bit
 * element (size ∈ {2,4,8,16,32,64}); within one element the pattern is
 * a rotation of (0…0 1…1). Find size by detecting the smallest
 * repeating period; find the rotation that places the 1-run at the
 * LSB; encode size and ones-count into imms per the standard scheme
 * (top bits of imms inverted-encode size, low bits are ones-count-1). */
static inline int aa64_logimm_encode(u64 imm, u32 sf, u32* N_out, u32* immr_out,
                                     u32* imms_out) {
  if (!sf) {
    /* 32-bit form: the upper half must be zero or a copy of the lower
     * half. The low word is then replicated into both halves, so the
     * period search below always settles on size <= 32 and N ends up 0. */
    u64 lo = imm & 0xFFFFFFFFu;
    u64 hi = imm >> 32;
    if (hi != 0 && hi != lo) return 0;
    imm = lo | (lo << 32);
  }
  /* All-zeros and all-ones have no bitmask-immediate encoding. */
  if (imm == 0 || imm == ~(u64)0) return 0;

  /* Halve the candidate period while the low s bits equal the next s
   * bits. Each step relies on the previous (larger) period already
   * holding, so comparing only the two lowest chunks is sufficient. */
  u32 size = 64;
  for (u32 s = 32; s >= 2; s >>= 1) {
    u64 mask = ((u64)1 << s) - 1u;
    if ((imm & mask) != ((imm >> s) & mask)) break;
    size = s;
  }
  /* size == 64 is special-cased because shifting a u64 by 64 is undefined. */
  u64 elt_mask = (size == 64) ? ~(u64)0 : (((u64)1 << size) - 1u);
  u64 elt = imm & elt_mask;
  if (elt == 0 || elt == elt_mask) return 0;

  /* Population count of one element. */
  u32 ones = 0;
  for (u64 x = elt; x; x >>= 1) ones += (u32)(x & 1u);
  if (ones == 0 || ones >= size) return 0;

  /* `aligned` is the run of `ones` set bits anchored at bit 0. Search for
   * the right-rotation (within the element) that turns elt into it; if
   * none exists the set bits are not one contiguous (cyclic) run. */
  u64 aligned = ((u64)1 << ones) - 1u;
  u32 rotation = 0xFFFFFFFFu; /* sentinel: no rotation found */
  for (u32 r = 0; r < size; r++) {
    /* r == 0 is handled separately so that `elt << size` is never formed. */
    u64 rotated =
        r == 0 ? elt : (((elt >> r) | (elt << (size - r))) & elt_mask);
    if (rotated == aligned) {
      rotation = r;
      break;
    }
  }
  if (rotation == 0xFFFFFFFFu) return 0;

  if (size == 64) {
    *N_out = 1u;
    *imms_out = (ones - 1u) & 0x3Fu;
  } else {
    *N_out = 0u;
    /* High bits of imms mark the element size: (-size << 1) & 0x3F gives
     * 0b000000 for 32, 0b100000 for 16, ... 0b111100 for 2. */
    u32 neg_size_shl1 = ((u32)(-(i32)size) << 1) & 0x3Fu;
    *imms_out = neg_size_shl1 | ((ones - 1u) & 0x3Fu);
  }
  /* elt rotated right by `rotation` equals `aligned`, so `aligned` rotated
   * right by (size - rotation) mod size yields elt; that amount is immr. */
  *immr_out = rotation ? (size - rotation) & (size - 1u) : 0u;
  return 1;
}
    313 
    314 /* Shift-by-immediate field generators for LSL/LSR/ASR (encoded via
    315  * UBFM/SBFM). Predicate: shift < width. The aa64_ubfm / aa64_sbfm
    316  * encoders live in aarch64.c; callers pair these (immr, imms) with the
    317  * matching pack. */
    318 static inline int aa64_lsl_imm_fields(u32 shift, u32 sf, u32* immr_out,
    319                                       u32* imms_out) {
    320   u32 width = sf ? 64u : 32u;
    321   if (shift >= width) return 0;
    322   *immr_out = (width - shift) & (width - 1u);
    323   *imms_out = width - 1u - shift;
    324   return 1;
    325 }
    326 static inline int aa64_lsr_imm_fields(u32 shift, u32 sf, u32* immr_out,
    327                                       u32* imms_out) {
    328   u32 width = sf ? 64u : 32u;
    329   if (shift >= width) return 0;
    330   *immr_out = shift;
    331   *imms_out = width - 1u;
    332   return 1;
    333 }
    334 static inline int aa64_asr_imm_fields(u32 shift, u32 sf, u32* immr_out,
    335                                       u32* imms_out) {
    336   u32 width = sf ? 64u : 32u;
    337   if (shift >= width) return 0;
    338   *immr_out = shift;
    339   *imms_out = width - 1u;
    340   return 1;
    341 }
    342 
    343 /* ====================================================================
    344  * Add/Sub, shifted register (ADD / SUB / ADDS / SUBS)
    345  *   sf  op(1)  S(1)  01011  shift(2)  0  Rm(5)  imm6(6)  Rn(5)  Rd(5)
    346  *   31  30     29    28..24 23..22    21 20..16 15..10   9..5   4..0
    347  * ==================================================================== */
    348 
    349 #define AA64_ADDSUBSR_FAMILY_MATCH 0x0B000000u
    350 #define AA64_ADDSUBSR_FAMILY_MASK 0x1F200000u /* bits 28:24 + bit 21 */
    351 
    352 typedef struct AA64AddSubSR {
    353   u32 sf, op, S, shift, Rm, imm6, Rn, Rd;
    354 } AA64AddSubSR;
    355 
    356 static inline u32 aa64_addsubsr_pack(AA64AddSubSR f) {
    357   return ((f.sf & 1u) << 31) | ((f.op & 1u) << 30) | ((f.S & 1u) << 29) |
    358          AA64_ADDSUBSR_FAMILY_MATCH | ((f.shift & 3u) << 22) |
    359          ((f.Rm & 0x1fu) << 16) | ((f.imm6 & 0x3fu) << 10) |
    360          ((f.Rn & 0x1fu) << 5) | (f.Rd & 0x1fu);
    361 }
    362 
    363 static inline AA64AddSubSR aa64_addsubsr_unpack(u32 w) {
    364   AA64AddSubSR f;
    365   f.sf = (w >> 31) & 1u;
    366   f.op = (w >> 30) & 1u;
    367   f.S = (w >> 29) & 1u;
    368   f.shift = (w >> 22) & 3u;
    369   f.Rm = (w >> 16) & 0x1fu;
    370   f.imm6 = (w >> 10) & 0x3fu;
    371   f.Rn = (w >> 5) & 0x1fu;
    372   f.Rd = w & 0x1fu;
    373   return f;
    374 }
    375 
    376 static inline u32 aa64_add(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
    377   return aa64_addsubsr_pack(
    378       (AA64AddSubSR){.sf = sf, .op = 0, .Rm = Rm, .Rn = Rn, .Rd = Rd});
    379 }
    380 static inline u32 aa64_sub(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
    381   return aa64_addsubsr_pack(
    382       (AA64AddSubSR){.sf = sf, .op = 1, .Rm = Rm, .Rn = Rn, .Rd = Rd});
    383 }
    384 
    385 /* NEG Wd, Wm  ≡  SUB Wd, WZR, Wm */
    386 static inline u32 aa64_neg(u32 sf, u32 Rd, u32 Rm) {
    387   return aa64_sub(sf, Rd, AA64_ZR, Rm);
    388 }
    389 
    390 /* ====================================================================
    391  * Data-processing, 3-source (MADD / MSUB / SMULL / UMULL / ...)
    392  *   sf  op54(2)  11011  op31(3)  Rm(5)  o0(1)  Ra(5)  Rn(5)  Rd(5)
    393  *   31  30..29   28..24 23..21   20..16 15     14..10 9..5   4..0
    394  * ==================================================================== */
    395 
    396 #define AA64_DP3_FAMILY_MATCH 0x1B000000u
    397 #define AA64_DP3_FAMILY_MASK 0x1F000000u /* bits 28:24 */
    398 
    399 typedef struct AA64DP3 {
    400   u32 sf, op54, op31, Rm, o0, Ra, Rn, Rd;
    401 } AA64DP3;
    402 
    403 static inline u32 aa64_dp3_pack(AA64DP3 f) {
    404   return ((f.sf & 1u) << 31) | ((f.op54 & 3u) << 29) | AA64_DP3_FAMILY_MATCH |
    405          ((f.op31 & 7u) << 21) | ((f.Rm & 0x1fu) << 16) | ((f.o0 & 1u) << 15) |
    406          ((f.Ra & 0x1fu) << 10) | ((f.Rn & 0x1fu) << 5) | (f.Rd & 0x1fu);
    407 }
    408 
    409 static inline AA64DP3 aa64_dp3_unpack(u32 w) {
    410   AA64DP3 f;
    411   f.sf = (w >> 31) & 1u;
    412   f.op54 = (w >> 29) & 3u;
    413   f.op31 = (w >> 21) & 7u;
    414   f.Rm = (w >> 16) & 0x1fu;
    415   f.o0 = (w >> 15) & 1u;
    416   f.Ra = (w >> 10) & 0x1fu;
    417   f.Rn = (w >> 5) & 0x1fu;
    418   f.Rd = w & 0x1fu;
    419   return f;
    420 }
    421 
    422 static inline u32 aa64_madd(u32 sf, u32 Rd, u32 Rn, u32 Rm, u32 Ra) {
    423   return aa64_dp3_pack((AA64DP3){
    424       .sf = sf, .op31 = 0, .o0 = 0, .Rm = Rm, .Ra = Ra, .Rn = Rn, .Rd = Rd});
    425 }
    426 static inline u32 aa64_msub(u32 sf, u32 Rd, u32 Rn, u32 Rm, u32 Ra) {
    427   return aa64_dp3_pack((AA64DP3){
    428       .sf = sf, .op31 = 0, .o0 = 1, .Rm = Rm, .Ra = Ra, .Rn = Rn, .Rd = Rd});
    429 }
    430 /* MUL Wd, Wn, Wm  ≡  MADD Wd, Wn, Wm, WZR */
    431 static inline u32 aa64_mul(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
    432   return aa64_madd(sf, Rd, Rn, Rm, AA64_ZR);
    433 }
    434 
    435 /* ====================================================================
    436  * Data-processing, 2-source (UDIV / SDIV / LSLV / LSRV / ASRV / RORV)
    437  *   sf  0  S(1)  11010110  Rm(5)  opcode(6)  Rn(5)  Rd(5)
    438  *   31  30 29    28..21    20..16 15..10     9..5   4..0
    439  * ==================================================================== */
    440 
    441 #define AA64_DP2_UDIV_OP 0x02u
    442 #define AA64_DP2_SDIV_OP 0x03u
    443 #define AA64_DP2_LSLV_OP 0x08u
    444 #define AA64_DP2_LSRV_OP 0x09u
    445 #define AA64_DP2_ASRV_OP 0x0Au
    446 #define AA64_DP2_RORV_OP 0x0Bu
    447 
    448 #define AA64_DP2_FAMILY_MATCH 0x1AC00000u
    449 #define AA64_DP2_FAMILY_MASK 0x5FE00000u /* bit 30 + bits 28:21 */
    450 
    451 typedef struct AA64DP2 {
    452   u32 sf, S, opcode, Rm, Rn, Rd;
    453 } AA64DP2;
    454 
    455 static inline u32 aa64_dp2_pack(AA64DP2 f) {
    456   return ((f.sf & 1u) << 31) | ((f.S & 1u) << 29) | AA64_DP2_FAMILY_MATCH |
    457          ((f.Rm & 0x1fu) << 16) | ((f.opcode & 0x3fu) << 10) |
    458          ((f.Rn & 0x1fu) << 5) | (f.Rd & 0x1fu);
    459 }
    460 
    461 static inline AA64DP2 aa64_dp2_unpack(u32 w) {
    462   AA64DP2 f;
    463   f.sf = (w >> 31) & 1u;
    464   f.S = (w >> 29) & 1u;
    465   f.Rm = (w >> 16) & 0x1fu;
    466   f.opcode = (w >> 10) & 0x3fu;
    467   f.Rn = (w >> 5) & 0x1fu;
    468   f.Rd = w & 0x1fu;
    469   return f;
    470 }
    471 
    472 static inline u32 aa64_udiv(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
    473   return aa64_dp2_pack((AA64DP2){
    474       .sf = sf, .opcode = AA64_DP2_UDIV_OP, .Rm = Rm, .Rn = Rn, .Rd = Rd});
    475 }
    476 static inline u32 aa64_sdiv(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
    477   return aa64_dp2_pack((AA64DP2){
    478       .sf = sf, .opcode = AA64_DP2_SDIV_OP, .Rm = Rm, .Rn = Rn, .Rd = Rd});
    479 }
    480 static inline u32 aa64_lslv(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
    481   return aa64_dp2_pack((AA64DP2){
    482       .sf = sf, .opcode = AA64_DP2_LSLV_OP, .Rm = Rm, .Rn = Rn, .Rd = Rd});
    483 }
    484 static inline u32 aa64_lsrv(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
    485   return aa64_dp2_pack((AA64DP2){
    486       .sf = sf, .opcode = AA64_DP2_LSRV_OP, .Rm = Rm, .Rn = Rn, .Rd = Rd});
    487 }
    488 static inline u32 aa64_asrv(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
    489   return aa64_dp2_pack((AA64DP2){
    490       .sf = sf, .opcode = AA64_DP2_ASRV_OP, .Rm = Rm, .Rn = Rn, .Rd = Rd});
    491 }
    492 static inline u32 aa64_rorv(u32 sf, u32 Rd, u32 Rn, u32 Rm) {
    493   return aa64_dp2_pack((AA64DP2){
    494       .sf = sf, .opcode = AA64_DP2_RORV_OP, .Rm = Rm, .Rn = Rn, .Rd = Rd});
    495 }
    496 
    497 /* ====================================================================
    498  * Conditional select (CSEL / CSINC / CSINV / CSNEG)
    499  *   sf  op  S  11010100  Rm(5)  cond(4)  op2(2)  Rn(5)  Rd(5)
    500  *   31  30  29 28..21    20..16 15..12   11..10  9..5   4..0
    501  *
    502  * The integer forms this backend emits keep S=0.  Aliases such as CSET
    503  * are descriptor-table rows over this same encoding family. */
    504 
    505 #define AA64_CONDSEL_FAMILY_MATCH 0x1A800000u
    506 #define AA64_CONDSEL_FAMILY_MASK 0x1FE00000u /* bits 28:21 fixed */
    507 
    508 typedef struct AA64CondSel {
    509   u32 sf, op, S, Rm, cond, op2, Rn, Rd;
    510 } AA64CondSel;
    511 
    512 static inline u32 aa64_condsel_pack(AA64CondSel f) {
    513   return ((f.sf & 1u) << 31) | ((f.op & 1u) << 30) | ((f.S & 1u) << 29) |
    514          AA64_CONDSEL_FAMILY_MATCH | ((f.Rm & 0x1fu) << 16) |
    515          ((f.cond & 0xfu) << 12) | ((f.op2 & 3u) << 10) |
    516          ((f.Rn & 0x1fu) << 5) | (f.Rd & 0x1fu);
    517 }
    518 
    519 static inline AA64CondSel aa64_condsel_unpack(u32 w) {
    520   AA64CondSel f;
    521   f.sf = (w >> 31) & 1u;
    522   f.op = (w >> 30) & 1u;
    523   f.S = (w >> 29) & 1u;
    524   f.Rm = (w >> 16) & 0x1fu;
    525   f.cond = (w >> 12) & 0xfu;
    526   f.op2 = (w >> 10) & 3u;
    527   f.Rn = (w >> 5) & 0x1fu;
    528   f.Rd = w & 0x1fu;
    529   return f;
    530 }
    531 
    532 static inline u32 aa64_csel_enc(u32 sf, u32 Rd, u32 Rn, u32 Rm, u32 cond) {
    533   return aa64_condsel_pack((AA64CondSel){.sf = sf,
    534                                          .op = 0,
    535                                          .S = 0,
    536                                          .Rm = Rm,
    537                                          .cond = cond,
    538                                          .op2 = 0,
    539                                          .Rn = Rn,
    540                                          .Rd = Rd});
    541 }
    542 static inline u32 aa64_csinc_enc(u32 sf, u32 Rd, u32 Rn, u32 Rm, u32 cond) {
    543   return aa64_condsel_pack((AA64CondSel){.sf = sf,
    544                                          .op = 0,
    545                                          .S = 0,
    546                                          .Rm = Rm,
    547                                          .cond = cond,
    548                                          .op2 = 1,
    549                                          .Rn = Rn,
    550                                          .Rd = Rd});
    551 }
    552 static inline u32 aa64_csinv_enc(u32 sf, u32 Rd, u32 Rn, u32 Rm, u32 cond) {
    553   return aa64_condsel_pack((AA64CondSel){.sf = sf,
    554                                          .op = 1,
    555                                          .S = 0,
    556                                          .Rm = Rm,
    557                                          .cond = cond,
    558                                          .op2 = 0,
    559                                          .Rn = Rn,
    560                                          .Rd = Rd});
    561 }
    562 static inline u32 aa64_csneg_enc(u32 sf, u32 Rd, u32 Rn, u32 Rm, u32 cond) {
    563   return aa64_condsel_pack((AA64CondSel){.sf = sf,
    564                                          .op = 1,
    565                                          .S = 0,
    566                                          .Rm = Rm,
    567                                          .cond = cond,
    568                                          .op2 = 1,
    569                                          .Rn = Rn,
    570                                          .Rd = Rd});
    571 }
    572 
    573 /* ====================================================================
    574  * Unconditional branch (register) — BR / BLR / RET
    575  *   1101011  opc(4)  op2(5)=11111  op3(6)=000000  Rn(5)  op4(5)=00000
    576  *   31..25   24..21  20..16        15..10         9..5   4..0
    577  * ==================================================================== */
    578 
    579 #define AA64_BR_OP_BR 0u
    580 #define AA64_BR_OP_BLR 1u
    581 #define AA64_BR_OP_RET 2u
    582 
    583 #define AA64_BR_REG_FAMILY_MATCH 0xD61F0000u
    584 #define AA64_BR_REG_FAMILY_MASK \
    585   0xFE1FFC1Fu /* everything fixed except opc + Rn */
    586 
    587 typedef struct AA64BrReg {
    588   u32 opc, Rn;
    589 } AA64BrReg;
    590 
    591 static inline u32 aa64_brreg_pack(AA64BrReg f) {
    592   return AA64_BR_REG_FAMILY_MATCH | ((f.opc & 0xfu) << 21) |
    593          ((f.Rn & 0x1fu) << 5);
    594 }
    595 
    596 static inline AA64BrReg aa64_brreg_unpack(u32 w) {
    597   AA64BrReg f;
    598   f.opc = (w >> 21) & 0xfu;
    599   f.Rn = (w >> 5) & 0x1fu;
    600   return f;
    601 }
    602 
    603 static inline u32 aa64_br(u32 Rn) {
    604   return aa64_brreg_pack((AA64BrReg){.opc = AA64_BR_OP_BR, .Rn = Rn});
    605 }
    606 static inline u32 aa64_blr(u32 Rn) {
    607   return aa64_brreg_pack((AA64BrReg){.opc = AA64_BR_OP_BLR, .Rn = Rn});
    608 }
    609 static inline u32 aa64_ret(u32 Rn) {
    610   return aa64_brreg_pack((AA64BrReg){.opc = AA64_BR_OP_RET, .Rn = Rn});
    611 }
    612 
    613 /* ====================================================================
    614  * PC-relative addressing (ADR / ADRP)
    615  *   op(1)  immlo(2)  10000  immhi(19)  Rd(5)
    616  *   31     30..29    28..24 23..5      4..0
    617  *
    618  * op = 0 → ADR  (PC + sign_extend(immhi:immlo))
    619  * op = 1 → ADRP (page(PC) + sign_extend(immhi:immlo) << 12)
    620  *
    621  * The two immediate halves stay split because the linker's
    622  * R_AARCH64_ADR_PREL_PG_HI21 reloc patches them in place; keeping the
    623  * field layout symmetric with the encoded word lets reloc-apply code
    624  * reuse the same pack/unpack helpers.
    625  * ==================================================================== */
    626 
    627 #define AA64_ADR_OP_ADR 0u
    628 #define AA64_ADR_OP_ADRP 1u
    629 
    630 #define AA64_PCREL_ADR_FAMILY_MATCH 0x10000000u
    631 #define AA64_PCREL_ADR_FAMILY_MASK 0x1F000000u /* bits 28:24 */
    632 
    633 typedef struct AA64PCRelAdr {
    634   u32 op, immlo, immhi, Rd;
    635 } AA64PCRelAdr;
    636 
    637 static inline u32 aa64_pcrel_adr_pack(AA64PCRelAdr f) {
    638   return ((f.op & 1u) << 31) | ((f.immlo & 3u) << 29) |
    639          AA64_PCREL_ADR_FAMILY_MATCH | ((f.immhi & 0x7ffffu) << 5) |
    640          (f.Rd & 0x1fu);
    641 }
    642 
    643 static inline AA64PCRelAdr aa64_pcrel_adr_unpack(u32 w) {
    644   AA64PCRelAdr f;
    645   f.op = (w >> 31) & 1u;
    646   f.immlo = (w >> 29) & 3u;
    647   f.immhi = (w >> 5) & 0x7ffffu;
    648   f.Rd = w & 0x1fu;
    649   return f;
    650 }
    651 
    652 static inline u32 aa64_adrp(u32 Rd, u32 immlo, u32 immhi) {
    653   return aa64_pcrel_adr_pack((AA64PCRelAdr){
    654       .op = AA64_ADR_OP_ADRP, .immlo = immlo, .immhi = immhi, .Rd = Rd});
    655 }
    656 static inline u32 aa64_adr(u32 Rd, u32 immlo, u32 immhi) {
    657   return aa64_pcrel_adr_pack((AA64PCRelAdr){
    658       .op = AA64_ADR_OP_ADR, .immlo = immlo, .immhi = immhi, .Rd = Rd});
    659 }
    660 
    661 /* ====================================================================
    662  * Add/Sub, immediate (ADD / SUB / ADDS / SUBS, 12-bit imm with shift)
    663  *   sf  op(1)  S(1)  100010  sh(1)  imm12(12)  Rn(5)  Rd(5)
    664  *   31  30     29    28..23  22     21..10     9..5   4..0
    665  *
    666  * sh selects whether imm12 is left-shifted by 12.  Used by PLT entries
    667  * for `add x16, x16, #lo12(slot)` where sh=0 and imm12 = slot & 0xfff.
    668  * ==================================================================== */
    669 
    670 #define AA64_ADDSUBIMM_FAMILY_MATCH 0x11000000u
    671 #define AA64_ADDSUBIMM_FAMILY_MASK 0x1F000000u /* bits 28:24 */
    672 
    673 typedef struct AA64AddSubImm {
    674   u32 sf, op, S, sh, imm12, Rn, Rd;
    675 } AA64AddSubImm;
    676 
    677 static inline u32 aa64_addsubimm_pack(AA64AddSubImm f) {
    678   return ((f.sf & 1u) << 31) | ((f.op & 1u) << 30) | ((f.S & 1u) << 29) |
    679          AA64_ADDSUBIMM_FAMILY_MATCH | ((f.sh & 1u) << 22) |
    680          ((f.imm12 & 0xfffu) << 10) | ((f.Rn & 0x1fu) << 5) | (f.Rd & 0x1fu);
    681 }
    682 
    683 static inline AA64AddSubImm aa64_addsubimm_unpack(u32 w) {
    684   AA64AddSubImm f;
    685   f.sf = (w >> 31) & 1u;
    686   f.op = (w >> 30) & 1u;
    687   f.S = (w >> 29) & 1u;
    688   f.sh = (w >> 22) & 1u;
    689   f.imm12 = (w >> 10) & 0xfffu;
    690   f.Rn = (w >> 5) & 0x1fu;
    691   f.Rd = w & 0x1fu;
    692   return f;
    693 }
    694 
    695 static inline u32 aa64_add_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12, u32 sh) {
    696   return aa64_addsubimm_pack((AA64AddSubImm){
    697       .sf = sf, .op = 0, .sh = sh, .imm12 = imm12, .Rn = Rn, .Rd = Rd});
    698 }
    699 static inline u32 aa64_sub_imm(u32 sf, u32 Rd, u32 Rn, u32 imm12, u32 sh) {
    700   return aa64_addsubimm_pack((AA64AddSubImm){
    701       .sf = sf, .op = 1, .sh = sh, .imm12 = imm12, .Rn = Rn, .Rd = Rd});
    702 }
    703 /* SUBS imm — sets flags. Used for CMP imm (Rd=ZR) and for branchless
    704  * compares that feed CSET. The 12-bit-shifted form covers 0..0xFFFFF000
    705  * stepped by 0x1000; cg_fold collapses literal-only compares upstream,
    706  * so this encoder is reached for `x cmp const` and `if (x)` patterns. */
    707 static inline u32 aa64_subs_imm12(u32 sf, u32 Rd, u32 Rn, u32 imm12, u32 sh) {
    708   return aa64_addsubimm_pack((AA64AddSubImm){
    709       .sf = sf, .op = 1, .S = 1, .sh = sh, .imm12 = imm12, .Rn = Rn, .Rd = Rd});
    710 }
    711 
    712 /* Predicate: does `imm` fit ADD/SUB/CMP's 12-bit immediate (optionally
    713  * left-shifted by 12)? On success writes the encoded imm12 and sh and
    714  * returns 1; on failure returns 0 and leaves outputs untouched.
    715  *
    716  * The encoding admits 0..4095 directly (sh=0) and multiples of 4096 up
    717  * to 0xFFF000 (sh=1). Negative literals are rejected here — the caller
    718  * (e.g. opt's machinize, or a smarter cg) is free to swap ADD ↔ SUB and
    719  * retry with the negated literal; the bare predicate keeps the contract
    720  * narrow. */
    721 static inline int aa64_addsub_imm_fits(i64 imm, u32* imm12_out, u32* sh_out) {
    722   if (imm < 0) return 0;
    723   u64 u = (u64)imm;
    724   if (u <= 0xFFFu) {
    725     *imm12_out = (u32)u;
    726     *sh_out = 0;
    727     return 1;
    728   }
    729   if ((u & 0xFFFu) == 0 && (u >> 12) <= 0xFFFu) {
    730     *imm12_out = (u32)(u >> 12);
    731     *sh_out = 1;
    732     return 1;
    733   }
    734   return 0;
    735 }
    736 
    737 /* ====================================================================
    738  * Load/store, unsigned 12-bit immediate offset (LDR / STR, scaled)
    739  *   size(2)  111  V(1)  01  opc(2)  imm12(12)  Rn(5)  Rt(5)
    740  *   31..30   29..27 26   25..24 23..22 21..10  9..5   4..0
    741  *
    742  * size=11, V=0, opc=01 → LDR (64-bit, integer).  imm12 is the byte
    743  * offset divided by the access size (8 for LDR Xt), giving a 0..32760
    744  * byte range.
    745  *
    746  * Only the LDR Xt form is needed by the linker today (PLT loads through
    747  * x16/x17); the family encoders cover STR and the smaller widths so
    748  * future callers can drop in without touching this header.
    749  * ==================================================================== */
    750 
    751 #define AA64_LDST_SIZE_64 3u
    752 #define AA64_LDST_OPC_STR 0u
    753 #define AA64_LDST_OPC_LDR 1u
    754 
    755 #define AA64_LDST_UIMM_FAMILY_MATCH 0x39000000u
    756 #define AA64_LDST_UIMM_FAMILY_MASK 0x3B000000u /* bits 29:27 + bits 25:24 */
    757 
    758 typedef struct AA64LdStUimm {
    759   u32 size, V, opc, imm12, Rn, Rt;
    760 } AA64LdStUimm;
    761 
    762 static inline u32 aa64_ldst_uimm_pack(AA64LdStUimm f) {
    763   return ((f.size & 3u) << 30) | AA64_LDST_UIMM_FAMILY_MATCH |
    764          ((f.V & 1u) << 26) | ((f.opc & 3u) << 22) |
    765          ((f.imm12 & 0xfffu) << 10) | ((f.Rn & 0x1fu) << 5) | (f.Rt & 0x1fu);
    766 }
    767 
    768 static inline AA64LdStUimm aa64_ldst_uimm_unpack(u32 w) {
    769   AA64LdStUimm f;
    770   f.size = (w >> 30) & 3u;
    771   f.V = (w >> 26) & 1u;
    772   f.opc = (w >> 22) & 3u;
    773   f.imm12 = (w >> 10) & 0xfffu;
    774   f.Rn = (w >> 5) & 0x1fu;
    775   f.Rt = w & 0x1fu;
    776   return f;
    777 }
    778 
    779 /* LDR Xt, [Xn, #imm12_scaled].  imm12_scaled is the encoded field —
    780  * callers pass `byte_offset >> 3` for the 64-bit form. */
    781 static inline u32 aa64_ldr64_uimm12(u32 Rt, u32 Rn, u32 imm12_scaled) {
    782   return aa64_ldst_uimm_pack((AA64LdStUimm){.size = AA64_LDST_SIZE_64,
    783                                             .V = 0,
    784                                             .opc = AA64_LDST_OPC_LDR,
    785                                             .imm12 = imm12_scaled,
    786                                             .Rn = Rn,
    787                                             .Rt = Rt});
    788 }
    789 static inline u32 aa64_str64_uimm12(u32 Rt, u32 Rn, u32 imm12_scaled) {
    790   return aa64_ldst_uimm_pack((AA64LdStUimm){.size = AA64_LDST_SIZE_64,
    791                                             .V = 0,
    792                                             .opc = AA64_LDST_OPC_STR,
    793                                             .imm12 = imm12_scaled,
    794                                             .Rn = Rn,
    795                                             .Rt = Rt});
    796 }
    797 
    798 /* ---- Scalar floating-point encoders ----
    799  * ftype: 0=single (Sn), 1=double (Dn), 3=half (Hn). The bit layouts match the
    800  * FP_* decode rows in isa.c and the aa_* encoders in native.c, so encode and
    801  * decode round-trip. The DP2/DP1 `op` and the FP_INT_CVT `opcode` are the
    802  * named field values below. */
/* `op` values for aa64_fp_dp2, which ORs them into the word unshifted. */
#define AA64_FP_DP2_FMUL 0x0800u
#define AA64_FP_DP2_FDIV 0x1800u
#define AA64_FP_DP2_FADD 0x2800u
#define AA64_FP_DP2_FSUB 0x3800u
#define AA64_FP_DP2_FMAX 0x4800u
#define AA64_FP_DP2_FMIN 0x5800u
#define AA64_FP_DP2_FNMUL 0x8800u
/* `op` values for aa64_fp_dp1, which ORs them into the word unshifted. */
#define AA64_FP_DP1_FMOV 0x4000u
#define AA64_FP_DP1_FABS 0xC000u
#define AA64_FP_DP1_FNEG 0x14000u
#define AA64_FP_DP1_FSQRT 0x1C000u
/* `opcode` values for aa64_fp_int_cvt, which masks them to 5 bits and
 * places them at bits[20:16]. */
#define AA64_FP_ICVT_SCVTF 0x02u
#define AA64_FP_ICVT_UCVTF 0x03u
#define AA64_FP_ICVT_FCVTZS 0x18u
#define AA64_FP_ICVT_FCVTZU 0x19u
#define AA64_FP_ICVT_FMOV_TO_GPR 0x06u /* fmov Rd, Vn */
#define AA64_FP_ICVT_FMOV_TO_FP 0x07u  /* fmov Vd, Rn */
    820 
    821 static inline u32 aa64_fp_dp2(u32 ftype, u32 op, u32 Rd, u32 Rn, u32 Rm) {
    822   return 0x1E200000u | ((ftype & 3u) << 22) | op | ((Rm & 0x1fu) << 16) |
    823          ((Rn & 0x1fu) << 5) | (Rd & 0x1fu);
    824 }
    825 static inline u32 aa64_fp_dp1(u32 ftype, u32 op, u32 Rd, u32 Rn) {
    826   return 0x1E200000u | ((ftype & 3u) << 22) | op | ((Rn & 0x1fu) << 5) |
    827          (Rd & 0x1fu);
    828 }
    829 static inline u32 aa64_fcmp_reg(u32 ftype, u32 Rn, u32 Rm) {
    830   return 0x1E202000u | ((ftype & 3u) << 22) | ((Rm & 0x1fu) << 16) |
    831          ((Rn & 0x1fu) << 5);
    832 }
    833 static inline u32 aa64_fcvt_prec(u32 src_ftype, u32 dst_ftype, u32 Rd, u32 Rn) {
    834   return 0x1E204000u | ((src_ftype & 3u) << 22) | (1u << 17) |
    835          ((dst_ftype & 3u) << 15) | ((Rn & 0x1fu) << 5) | (Rd & 0x1fu);
    836 }
    837 static inline u32 aa64_fp_int_cvt(u32 sf, u32 ftype, u32 opcode, u32 Rd,
    838                                   u32 Rn) {
    839   return ((sf & 1u) << 31) | 0x1E200000u | ((ftype & 3u) << 22) |
    840          ((opcode & 0x1fu) << 16) | ((Rn & 0x1fu) << 5) | (Rd & 0x1fu);
    841 }
    842 
    843 /* Bitfield move (opc: 0=SBFM, 1=BFM, 2=UBFM). The N bit tracks sf for the
    844  * 32-/64-bit forms. Matches native.c aa_sbfm/aa_ubfm and the BITFIELD row. */
    845 static inline u32 aa64_bitfield(u32 sf, u32 opc, u32 immr, u32 imms, u32 Rd,
    846                                 u32 Rn) {
    847   return ((sf & 1u) << 31) | ((opc & 3u) << 29) | 0x13000000u |
    848          ((sf & 1u) << 22) | ((immr & 0x3fu) << 16) | ((imms & 0x3fu) << 10) |
    849          ((Rn & 0x1fu) << 5) | (Rd & 0x1fu);
    850 }
    851 
    852 /* Data-processing (1 source). opcode2 (bits[15:10]): RBIT=0, REV16=1,
    853  * REV(32)=2, REV(64)=3, CLZ=4. Matches native.c aa_clz/aa_rbit/aa_rev. */
    854 #define AA64_DP1_RBIT 0x00u
    855 #define AA64_DP1_REV16 0x01u
    856 #define AA64_DP1_REV32 0x02u
    857 #define AA64_DP1_REV64 0x03u
    858 #define AA64_DP1_CLZ 0x04u
    859 static inline u32 aa64_dp1(u32 sf, u32 opcode2, u32 Rd, u32 Rn) {
    860   return ((sf & 1u) << 31) | 0x5AC00000u | ((opcode2 & 0x3fu) << 10) |
    861          ((Rn & 0x1fu) << 5) | (Rd & 0x1fu);
    862 }
    863 
    864 /* ====================================================================
    865  * Load/store register pair, pre-indexed (STP / LDP, 64-bit form)
 *   opc(2)  101  V(1)  011  L(1)  imm7(7)  Rt2(5)  Rn(5)  Rt(5)
    867  *   31..30  29..27 26   25..23 22  21..15   14..10  9..5   4..0
    868  *
    869  * 64-bit integer form fixes opc=10, V=0.  L=0 → STP, L=1 → LDP.
    870  * imm7 is a signed 7-bit value scaled by 8 (for the 64-bit form): the
    871  * encoded field equals `byte_offset / 8`.  Callers pass the scaled
    872  * value already; the helper masks to 7 bits to handle negative inputs
    873  * sign-extended in i32.
    874  * ==================================================================== */
    875 
    876 #define AA64_LDSTP_PRE_FAMILY_MATCH 0x29800000u
    877 #define AA64_LDSTP_PRE_FAMILY_MASK 0x7FC00000u /* bits 30:23 */
    878 
    879 typedef struct AA64LdStPPre {
    880   u32 opc, V, L, imm7, Rt2, Rn, Rt;
    881 } AA64LdStPPre;
    882 
    883 static inline u32 aa64_ldstp_pre_pack(AA64LdStPPre f) {
    884   return ((f.opc & 3u) << 30) | AA64_LDSTP_PRE_FAMILY_MATCH |
    885          ((f.V & 1u) << 26) | ((f.L & 1u) << 22) | ((f.imm7 & 0x7fu) << 15) |
    886          ((f.Rt2 & 0x1fu) << 10) | ((f.Rn & 0x1fu) << 5) | (f.Rt & 0x1fu);
    887 }
    888 
    889 static inline AA64LdStPPre aa64_ldstp_pre_unpack(u32 w) {
    890   AA64LdStPPre f;
    891   f.opc = (w >> 30) & 3u;
    892   f.V = (w >> 26) & 1u;
    893   f.L = (w >> 22) & 1u;
    894   f.imm7 = (w >> 15) & 0x7fu;
    895   f.Rt2 = (w >> 10) & 0x1fu;
    896   f.Rn = (w >> 5) & 0x1fu;
    897   f.Rt = w & 0x1fu;
    898   return f;
    899 }
    900 
    901 /* STP Xt, Xt2, [Xn, #imm7_scaled]!  — opc=10 selects the 64-bit form.
    902  * imm7_scaled is `byte_offset / 8`; callers pass it pre-scaled (e.g.
    903  * -2 for [sp, #-16]!). */
    904 static inline u32 aa64_stp64_pre(u32 Rt, u32 Rt2, u32 Rn, i32 imm7_scaled) {
    905   return aa64_ldstp_pre_pack((AA64LdStPPre){.opc = 2,
    906                                             .V = 0,
    907                                             .L = 0,
    908                                             .imm7 = (u32)imm7_scaled & 0x7fu,
    909                                             .Rt2 = Rt2,
    910                                             .Rn = Rn,
    911                                             .Rt = Rt});
    912 }
    913 static inline u32 aa64_ldp64_pre(u32 Rt, u32 Rt2, u32 Rn, i32 imm7_scaled) {
    914   return aa64_ldstp_pre_pack((AA64LdStPPre){.opc = 2,
    915                                             .V = 0,
    916                                             .L = 1,
    917                                             .imm7 = (u32)imm7_scaled & 0x7fu,
    918                                             .Rt2 = Rt2,
    919                                             .Rn = Rn,
    920                                             .Rt = Rt});
    921 }
    922 
    923 /* Post-indexed STP/LDP — same field layout as the pre-indexed form, only
    924  * bits[25:23] differ (001 vs 011); reuse AA64LdStPPre. Used for the slim
    925  * prologue's epilogue restore: `ldp x29,x30,[sp],#16`. */
    926 #define AA64_LDSTP_POST_FAMILY_MATCH 0x28800000u
    927 #define AA64_LDSTP_POST_FAMILY_MASK 0x7FC00000u /* bits 30:23 */
    928 
    929 static inline u32 aa64_ldstp_post_pack(AA64LdStPPre f) {
    930   return ((f.opc & 3u) << 30) | AA64_LDSTP_POST_FAMILY_MATCH |
    931          ((f.V & 1u) << 26) | ((f.L & 1u) << 22) | ((f.imm7 & 0x7fu) << 15) |
    932          ((f.Rt2 & 0x1fu) << 10) | ((f.Rn & 0x1fu) << 5) | (f.Rt & 0x1fu);
    933 }
    934 
    935 static inline u32 aa64_stp64_post(u32 Rt, u32 Rt2, u32 Rn, i32 imm7_scaled) {
    936   return aa64_ldstp_post_pack((AA64LdStPPre){.opc = 2,
    937                                              .V = 0,
    938                                              .L = 0,
    939                                              .imm7 = (u32)imm7_scaled & 0x7fu,
    940                                              .Rt2 = Rt2,
    941                                              .Rn = Rn,
    942                                              .Rt = Rt});
    943 }
    944 static inline u32 aa64_ldp64_post(u32 Rt, u32 Rt2, u32 Rn, i32 imm7_scaled) {
    945   return aa64_ldstp_post_pack((AA64LdStPPre){.opc = 2,
    946                                              .V = 0,
    947                                              .L = 1,
    948                                              .imm7 = (u32)imm7_scaled & 0x7fu,
    949                                              .Rt2 = Rt2,
    950                                              .Rn = Rn,
    951                                              .Rt = Rt});
    952 }
    953 
    954 /* ====================================================================
    955  * Hint instructions (NOP / YIELD / WFE / WFI / SEV / SEVL)
    956  *   1101 0101 0000 0011 0010 CRm(4) op2(3) 11111
    957  *   31..16             15..12 11..8 7..5    4..0
    958  *
    959  * NOP encodes CRm=0, op2=0 → 0xD503201F.  The full hint family lives
    960  * inside the system-instruction space; we only expose NOP today since
    961  * that's the only slot the linker fills.
    962  * ==================================================================== */
    963 
    964 #define AA64_HINT_FAMILY_MATCH 0xD503201Fu
    965 #define AA64_HINT_FAMILY_MASK 0xFFFFF01Fu /* CRm + op2 vary */
    966 
    967 /* HINT #N with CRm=0: op2 selects the variant. */
    968 #define AA64_HINT_OP_NOP 0u   /* CRm=0, op2=0 */
    969 #define AA64_HINT_OP_YIELD 1u /* CRm=0, op2=1 */
    970 #define AA64_HINT_OP_WFE 2u   /* CRm=0, op2=2 */
    971 #define AA64_HINT_OP_WFI 3u   /* CRm=0, op2=3 */
    972 #define AA64_HINT_OP_SEV 4u   /* CRm=0, op2=4 */
    973 #define AA64_HINT_OP_SEVL 5u  /* CRm=0, op2=5 */
    974 
    975 typedef struct AA64Hint {
    976   u32 CRm, op2;
    977 } AA64Hint;
    978 
    979 static inline u32 aa64_hint_pack(AA64Hint f) {
    980   return AA64_HINT_FAMILY_MATCH | ((f.CRm & 0xfu) << 8) | ((f.op2 & 7u) << 5);
    981 }
    982 
    983 static inline u32 aa64_hint(u32 op2) {
    984   return aa64_hint_pack((AA64Hint){.CRm = 0, .op2 = op2});
    985 }
    986 
    987 static inline AA64Hint aa64_hint_unpack(u32 w) {
    988   AA64Hint f;
    989   f.CRm = (w >> 8) & 0xfu;
    990   f.op2 = (w >> 5) & 7u;
    991   return f;
    992 }
    993 
    994 static inline u32 aa64_nop(void) {
    995   return aa64_hint_pack((AA64Hint){.CRm = 0, .op2 = AA64_HINT_OP_NOP});
    996 }
    997 
    998 /* ====================================================================
    999  * Memory barriers (DMB / DSB / ISB / CLREX)
   1000  *   1101 0101 0000 0011 0011 CRm(4) op2(3) 11111
   1001  *   31..16             15..12 11..8 7..5    4..0
   1002  *
   1003  * Shared encoding family with HINT (which uses bits[15:12]=0010);
   1004  * barriers use bits[15:12]=0011. op2 selects the specific instruction:
   1005  *   CLREX=010  DSB=100  DMB=101  ISB=110
   1006  * CRm is the option / domain (SY=15, ISH=11, NSH=7, OSH=3, ...).
   1007  * ==================================================================== */
   1008 
   1009 #define AA64_BARRIER_FAMILY_MATCH 0xD503301Fu
   1010 #define AA64_BARRIER_FAMILY_MASK 0xFFFFF01Fu /* CRm + op2 vary */
   1011 
   1012 #define AA64_BARRIER_OP2_CLREX 2u
   1013 #define AA64_BARRIER_OP2_DSB 4u
   1014 #define AA64_BARRIER_OP2_DMB 5u
   1015 #define AA64_BARRIER_OP2_ISB 6u
   1016 
   1017 /* Common CRm option encodings (ARM ARM C5.1.42). */
   1018 #define AA64_BARRIER_OPT_OSHLD 1u
   1019 #define AA64_BARRIER_OPT_OSHST 2u
   1020 #define AA64_BARRIER_OPT_OSH 3u
   1021 #define AA64_BARRIER_OPT_NSHLD 5u
   1022 #define AA64_BARRIER_OPT_NSHST 6u
   1023 #define AA64_BARRIER_OPT_NSH 7u
   1024 #define AA64_BARRIER_OPT_ISHLD 9u
   1025 #define AA64_BARRIER_OPT_ISHST 10u
   1026 #define AA64_BARRIER_OPT_ISH 11u
   1027 #define AA64_BARRIER_OPT_LD 13u
   1028 #define AA64_BARRIER_OPT_ST 14u
   1029 #define AA64_BARRIER_OPT_SY 15u
   1030 
   1031 typedef struct AA64Barrier {
   1032   u32 CRm, op2;
   1033 } AA64Barrier;
   1034 
   1035 static inline u32 aa64_barrier_pack(AA64Barrier f) {
   1036   return AA64_BARRIER_FAMILY_MATCH | ((f.CRm & 0xfu) << 8) |
   1037          ((f.op2 & 7u) << 5);
   1038 }
   1039 
   1040 static inline AA64Barrier aa64_barrier_unpack(u32 w) {
   1041   AA64Barrier f;
   1042   f.CRm = (w >> 8) & 0xfu;
   1043   f.op2 = (w >> 5) & 7u;
   1044   return f;
   1045 }
   1046 
   1047 static inline u32 aa64_dmb(u32 opt) {
   1048   return aa64_barrier_pack(
   1049       (AA64Barrier){.CRm = opt, .op2 = AA64_BARRIER_OP2_DMB});
   1050 }
   1051 static inline u32 aa64_dsb(u32 opt) {
   1052   return aa64_barrier_pack(
   1053       (AA64Barrier){.CRm = opt, .op2 = AA64_BARRIER_OP2_DSB});
   1054 }
   1055 static inline u32 aa64_isb(u32 opt) {
   1056   return aa64_barrier_pack(
   1057       (AA64Barrier){.CRm = opt, .op2 = AA64_BARRIER_OP2_ISB});
   1058 }
   1059 static inline u32 aa64_clrex(u32 opt) {
   1060   return aa64_barrier_pack(
   1061       (AA64Barrier){.CRm = opt, .op2 = AA64_BARRIER_OP2_CLREX});
   1062 }
   1063 
   1064 /* ====================================================================
   1065  * Interrupt-mask (DAIF) system register access. Used by the IRQ-control
   1066  * intrinsics; privileged at EL0. Only the encodings the backend emits live
   1067  * here (they are not registered in the disassembler's mnemonic table).
   1068  *   MRS Xt, DAIF       : 1101 0101 0011 1011 0100 0010 000 Rt -> 0xD53B4200|Rt
   1069  *   MSR DAIF, Xt       : 1101 0101 0001 1011 0100 0010 000 Rt -> 0xD51B4200|Rt
   1070  *   MSR DAIFSet, #imm4 : op1=011, op2=110 -> 0xD50340DF | (imm4 << 8)
   1071  *   MSR DAIFClr, #imm4 : op1=011, op2=111 -> 0xD50340FF | (imm4 << 8)
   1072  * imm4 = 0xF masks/unmasks D,A,I,F together. ==================== */
   1073 #define AA64_DAIF_ALL 0xfu
   1074 
   1075 static inline u32 aa64_mrs_daif(u32 rt) { return 0xD53B4200u | (rt & 0x1fu); }
   1076 static inline u32 aa64_msr_daif(u32 rt) { return 0xD51B4200u | (rt & 0x1fu); }
   1077 static inline u32 aa64_msr_daifset(u32 imm4) {
   1078   return 0xD50340DFu | ((imm4 & 0xfu) << 8);
   1079 }
   1080 static inline u32 aa64_msr_daifclr(u32 imm4) {
   1081   return 0xD50340FFu | ((imm4 & 0xfu) << 8);
   1082 }
   1083 
   1084 /* ====================================================================
   1085  * Generic system-register move (MRS/MSR register form). A named system
   1086  * register is the 15-bit selector op0:op1:CRn:CRm:op2 (op0's high bit is
   1087  * fixed by the encoding, so only its low bit is a field).
   1088  *   MRS Xt, <sysreg> : 1101 0101 0 0 1 op0lo op1 CRn CRm op2 Rt  (read,  L=1)
   1089  *   MSR <sysreg>, Xt : 1101 0101 0 0 0 op0lo op1 CRn CRm op2 Rt  (write, L=0)
   1090  * e.g. TPIDR_EL0 = (op0=3,op1=3,CRn=13,CRm=0,op2=2):
   1091  *   MSR TPIDR_EL0, X0 -> 0xd51bd040 ; MRS X0, TPIDR_EL0 -> 0xd53bd040. */
   1092 static inline u32 aa64_sysreg_move(int is_read, u32 op0, u32 op1, u32 crn,
   1093                                    u32 crm, u32 op2, u32 rt) {
   1094   return 0xd5000000u | (is_read ? (1u << 21) : 0u) | ((op0 & 3u) << 19) |
   1095          ((op1 & 7u) << 16) | ((crn & 0xfu) << 12) | ((crm & 0xfu) << 8) |
   1096          ((op2 & 7u) << 5) | (rt & 0x1fu);
   1097 }
   1098 
/* System-register move encoding family: MRS (read, L=1) and MSR (write,
 * L=0). The disassembler matches these; the register selector op0:op1:CRn:
 * CRm:op2 and Rt are decoded from the word. op0's high bit is fixed (bit
 * 20), so the mask pins bits[31:20] and leaves op0lo/op1/CRn/CRm/op2/Rt. */
#define AA64_MRS_MATCH 0xd5300000u        /* bits[31:20] = 1101 0101 0011 */
#define AA64_MSR_MATCH 0xd5100000u        /* bits[31:20] = 1101 0101 0001 */
#define AA64_SYSREG_MOVE_MASK 0xfff00000u /* bits[31:20] */
   1106 
   1107 /* Shared system-register name table (single source for the assembler's
   1108  * name->selector parse and the disassembler's selector->name print). */
   1109 typedef struct AA64SysRegName {
   1110   const char* name;
   1111   u8 op0, op1, crn, crm, op2;
   1112 } AA64SysRegName;
   1113 
/* Resolve a system-register name to its five selector fields.
 *   s, n               : the name and its length; matching is
 *                        case-insensitive.
 *   op0/op1/crn/crm/op2: outputs for the selector fields.
 * Returns 1 on a hit, 0 otherwise. */
int aa64_sysreg_by_name(const char* s, size_t n, u32* op0, u32* op1, u32* crn,
                        u32* crm, u32* op2);
   1118 
/* Reverse lookup: map a selector (op0, op1, crn, crm, op2) to its canonical
 * lowercase name. Returns NULL when the selector is not in the table (the
 * caller prints the generic Sx_x_Cx_Cx_x spelling instead). */
const char* aa64_sysreg_name(u32 op0, u32 op1, u32 crn, u32 crm, u32 op2);
   1123 
   1124 /* ====================================================================
   1125  * Load/store pair, signed-offset (STP / LDP, no pre/post-increment).
   1126  *   opc(2) 101 V(1) 010 L(1) imm7 Rt2 Rn Rt          (bit 23 = 0)
   1127  *
   1128  * Mirrors the LDSTP_PRE format with bit 23 cleared; the field layout is
   1129  * otherwise identical and the pack/unpack helpers above are reused for
   1130  * pre/post/sign-offset via different family-match constants.  Codegen
   1131  * emits both X (opc=10) and FP-D (opc=01, V=1) variants for callee-save
   1132  * spill/reload (`stp x29,x30,[sp,#16]`, `stp d8,d9,[sp,#32]`). */
   1133 
   1134 #define AA64_LDSTP_SOFF_FAMILY_MATCH 0x29000000u
   1135 #define AA64_LDSTP_SOFF_FAMILY_MASK 0x7FC00000u /* bits 30:23 (bit 23 = 0) */
   1136 
   1137 typedef AA64LdStPPre AA64LdStPSOff;
   1138 
   1139 static inline u32 aa64_ldstp_soff_pack(AA64LdStPSOff f) {
   1140   return ((f.opc & 3u) << 30) | AA64_LDSTP_SOFF_FAMILY_MATCH |
   1141          ((f.V & 1u) << 26) | ((f.L & 1u) << 22) | ((f.imm7 & 0x7fu) << 15) |
   1142          ((f.Rt2 & 0x1fu) << 10) | ((f.Rn & 0x1fu) << 5) | (f.Rt & 0x1fu);
   1143 }
   1144 
   1145 static inline AA64LdStPSOff aa64_ldstp_soff_unpack(u32 w) {
   1146   AA64LdStPSOff f;
   1147   f.opc = (w >> 30) & 3u;
   1148   f.V = (w >> 26) & 1u;
   1149   f.L = (w >> 22) & 1u;
   1150   f.imm7 = (w >> 15) & 0x7fu;
   1151   f.Rt2 = (w >> 10) & 0x1fu;
   1152   f.Rn = (w >> 5) & 0x1fu;
   1153   f.Rt = w & 0x1fu;
   1154   return f;
   1155 }
   1156 
   1157 /* 64-bit integer STP/LDP, signed offset (no writeback). imm7_scaled is
   1158  * byte_offset / 8. Used for the prologue/epilogue frame record and callee-save
   1159  * pairs, which address off a fixed base (x17 / fp). */
   1160 static inline u32 aa64_stp64_soff(u32 Rt, u32 Rt2, u32 Rn, i32 imm7_scaled) {
   1161   return aa64_ldstp_soff_pack((AA64LdStPSOff){.opc = 2,
   1162                                               .V = 0,
   1163                                               .L = 0,
   1164                                               .imm7 = (u32)imm7_scaled & 0x7fu,
   1165                                               .Rt2 = Rt2,
   1166                                               .Rn = Rn,
   1167                                               .Rt = Rt});
   1168 }
   1169 static inline u32 aa64_ldp64_soff(u32 Rt, u32 Rt2, u32 Rn, i32 imm7_scaled) {
   1170   return aa64_ldstp_soff_pack((AA64LdStPSOff){.opc = 2,
   1171                                               .V = 0,
   1172                                               .L = 1,
   1173                                               .imm7 = (u32)imm7_scaled & 0x7fu,
   1174                                               .Rt2 = Rt2,
   1175                                               .Rn = Rn,
   1176                                               .Rt = Rt});
   1177 }
   1178 
   1179 /* ====================================================================
   1180  * Load/store, unscaled 9-bit signed offset (LDUR / STUR, V=0 and V=1).
   1181  *   size(2) 111 V(1) 00 opc(2) 0 imm9(9) 00 Rn(5) Rt(5)
   1182  *   31..30  29..27 26  25..24 23..22 21  20..12  11..10 9..5 4..0
   1183  *
   1184  * size: 00=B, 01=H, 10=W, 11=X (V=0) — D when V=1 selects FP/SIMD.
   1185  * opc: 00=STR, 01=LDR (sign-extension variants set opc bit 1 for the
   1186  *      smaller widths; not used by codegen today). */
   1187 
   1188 #define AA64_LDST_SIMM9_FAMILY_MATCH 0x38000000u
   1189 /* bits 29:27 (=111) + bits 25:24 (=00) + bits 11:10 (=00). size, V, opc,
   1190  * imm9, Rn, Rt all vary; bit 21 is fixed 0 for this variant. */
   1191 #define AA64_LDST_SIMM9_FAMILY_MASK 0x3B200C00u
   1192 
   1193 typedef struct AA64LdStSimm9 {
   1194   u32 size, V, opc, imm9, Rn, Rt;
   1195 } AA64LdStSimm9;
   1196 
   1197 static inline u32 aa64_ldst_simm9_pack(AA64LdStSimm9 f) {
   1198   return ((f.size & 3u) << 30) | AA64_LDST_SIMM9_FAMILY_MATCH |
   1199          ((f.V & 1u) << 26) | ((f.opc & 3u) << 22) | ((f.imm9 & 0x1ffu) << 12) |
   1200          ((f.Rn & 0x1fu) << 5) | (f.Rt & 0x1fu);
   1201 }
   1202 
   1203 static inline AA64LdStSimm9 aa64_ldst_simm9_unpack(u32 w) {
   1204   AA64LdStSimm9 f;
   1205   f.size = (w >> 30) & 3u;
   1206   f.V = (w >> 26) & 1u;
   1207   f.opc = (w >> 22) & 3u;
   1208   f.imm9 = (w >> 12) & 0x1ffu;
   1209   f.Rn = (w >> 5) & 0x1fu;
   1210   f.Rt = w & 0x1fu;
   1211   return f;
   1212 }
   1213 
   1214 /* ====================================================================
   1215  * Load/store, register offset (LDR/STR Rt,[Xn,Rm{,extend{#s}}]).
   1216  *   size(2) 111 V(1) 00 opc(2) 1 Rm(5) option(3) S(1) 10 Rn(5) Rt(5)
   1217  *   31..30  29..27 26 25..24 23..22 21 20..16 15..13  12  11..10 9..5 4..0
   1218  *
   1219  * option selects the index extend: 010=UXTW, 011=LSL/UXTX, 110=SXTW,
   1220  * 111=SXTX. S=1 scales the index by the access size (log2 = size); S=0
   1221  * leaves it unscaled. opc/size match the uimm12 form. */
   1222 
   1223 #define AA64_LDST_REGOFF_FAMILY_MATCH 0x38200800u
   1224 /* bits 29:27 (=111), 25:24 (=00), 21 (=1), 11:10 (=10). */
   1225 #define AA64_LDST_REGOFF_FAMILY_MASK 0x3B200C00u
   1226 
   1227 /* Index-extend option encodings. */
   1228 #define AA64_LDST_OPTION_UXTW 2u
   1229 #define AA64_LDST_OPTION_LSL 3u /* a.k.a. UXTX for 64-bit index */
   1230 #define AA64_LDST_OPTION_SXTW 6u
   1231 #define AA64_LDST_OPTION_SXTX 7u
   1232 
   1233 typedef struct AA64LdStRegOff {
   1234   u32 size, V, opc, Rm, option, S, Rn, Rt;
   1235 } AA64LdStRegOff;
   1236 
   1237 static inline u32 aa64_ldst_regoff_pack(AA64LdStRegOff f) {
   1238   return ((f.size & 3u) << 30) | AA64_LDST_REGOFF_FAMILY_MATCH |
   1239          ((f.V & 1u) << 26) | ((f.opc & 3u) << 22) | ((f.Rm & 0x1fu) << 16) |
   1240          ((f.option & 7u) << 13) | ((f.S & 1u) << 12) | ((f.Rn & 0x1fu) << 5) |
   1241          (f.Rt & 0x1fu);
   1242 }
   1243 
   1244 static inline AA64LdStRegOff aa64_ldst_regoff_unpack(u32 w) {
   1245   AA64LdStRegOff f;
   1246   f.size = (w >> 30) & 3u;
   1247   f.V = (w >> 26) & 1u;
   1248   f.opc = (w >> 22) & 3u;
   1249   f.Rm = (w >> 16) & 0x1fu;
   1250   f.option = (w >> 13) & 7u;
   1251   f.S = (w >> 12) & 1u;
   1252   f.Rn = (w >> 5) & 0x1fu;
   1253   f.Rt = w & 0x1fu;
   1254   return f;
   1255 }
   1256 
   1257 /* ====================================================================
   1258  * Load/store, immediate pre/post-index (writeback).
   1259  *   size(2) 111 V(1) 00 opc(2) 0 imm9(9) idx(2) Rn(5) Rt(5)
   1260  *   31..30  29..27 26 25..24 23..22 21 20..12 11..10 9..5 4..0
   1261  *
   1262  * idx (bits[11:10]) selects: 00=unscaled (LDUR, no writeback — see the
   1263  * SIMM9 helpers above), 01=post-index, 11=pre-index. imm9 is the
   1264  * unscaled signed byte offset (-256..255). */
   1265 
   1266 #define AA64_LDST_IDX_POST 1u
   1267 #define AA64_LDST_IDX_PRE 3u
   1268 
   1269 typedef struct AA64LdStWBack {
   1270   u32 size, V, opc, imm9, idx, Rn, Rt;
   1271 } AA64LdStWBack;
   1272 
   1273 static inline u32 aa64_ldst_wback_pack(AA64LdStWBack f) {
   1274   return ((f.size & 3u) << 30) | AA64_LDST_SIMM9_FAMILY_MATCH |
   1275          ((f.V & 1u) << 26) | ((f.opc & 3u) << 22) | ((f.imm9 & 0x1ffu) << 12) |
   1276          ((f.idx & 3u) << 10) | ((f.Rn & 0x1fu) << 5) | (f.Rt & 0x1fu);
   1277 }
   1278 
   1279 static inline AA64LdStWBack aa64_ldst_wback_unpack(u32 w) {
   1280   AA64LdStWBack f;
   1281   f.size = (w >> 30) & 3u;
   1282   f.V = (w >> 26) & 1u;
   1283   f.opc = (w >> 22) & 3u;
   1284   f.imm9 = (w >> 12) & 0x1ffu;
   1285   f.idx = (w >> 10) & 3u;
   1286   f.Rn = (w >> 5) & 0x1fu;
   1287   f.Rt = w & 0x1fu;
   1288   return f;
   1289 }
   1290 
   1291 /* ====================================================================
   1292  * Load/store exclusive (LDXR/STXR + acquire/release variants).
   1293  *   size(2) 001000 o2(1) L(1) o1(1) Rs(5) o0(1) Rt2(5) Rn(5) Rt(5)
   1294  *   31..30  29..24  23    22   21   20..16 15   14..10 9..5  4..0
   1295  *
   1296  * size: 00=byte,01=half,10=word,11=dword.  o1=0 for the LDXR/STXR
   1297  * single-register family (CAS sets o1=1 via the CAS pack below).
   1298  *   LDXR:  L=1 o0=0 o2=0      STXR:  L=0 o0=0 o2=0
   1299  *   LDAXR: L=1 o0=1 o2=0      STLXR: L=0 o0=1 o2=0
   1300  *   LDAR:  L=1 o0=1 o2=1      STLR:  L=0 o0=1 o2=1
   1301  * For LDXR/LDAXR/LDAR/STLR, Rs and Rt2 are unused (encode 11111). */
   1302 
   1303 #define AA64_LDSTEX_FAMILY_MATCH 0x08000000u
   1304 /* bits 29:24 (=001000). */
   1305 #define AA64_LDSTEX_FAMILY_MASK 0x3F000000u
   1306 
   1307 typedef struct AA64LdStEx {
   1308   u32 size, o2, L, o1, Rs, o0, Rt2, Rn, Rt;
   1309 } AA64LdStEx;
   1310 
   1311 static inline u32 aa64_ldstex_pack(AA64LdStEx f) {
   1312   return ((f.size & 3u) << 30) | AA64_LDSTEX_FAMILY_MATCH |
   1313          ((f.o2 & 1u) << 23) | ((f.L & 1u) << 22) | ((f.o1 & 1u) << 21) |
   1314          ((f.Rs & 0x1fu) << 16) | ((f.o0 & 1u) << 15) |
   1315          ((f.Rt2 & 0x1fu) << 10) | ((f.Rn & 0x1fu) << 5) | (f.Rt & 0x1fu);
   1316 }
   1317 
   1318 static inline AA64LdStEx aa64_ldstex_unpack(u32 w) {
   1319   AA64LdStEx f;
   1320   f.size = (w >> 30) & 3u;
   1321   f.o2 = (w >> 23) & 1u;
   1322   f.L = (w >> 22) & 1u;
   1323   f.o1 = (w >> 21) & 1u;
   1324   f.Rs = (w >> 16) & 0x1fu;
   1325   f.o0 = (w >> 15) & 1u;
   1326   f.Rt2 = (w >> 10) & 0x1fu;
   1327   f.Rn = (w >> 5) & 0x1fu;
   1328   f.Rt = w & 0x1fu;
   1329   return f;
   1330 }
   1331 
   1332 /* ====================================================================
   1333  * Compare and swap (CAS / CASA / CASL / CASAL + b/h variants, LSE).
   1334  *   size(2) 001000 1 L(1) 1 Rs(5) o0(1) 11111 Rn(5) Rt(5)
   1335  *   31..30  29..23 . 22  . 20..16 15    14..10 9..5  4..0
   1336  *
   1337  *   CAS:  L=0 o0=0    CASA:  L=1 o0=0
   1338  *   CASL: L=0 o0=1    CASAL: L=1 o0=1
   1339  * Rt2 (bits[14:10]) is fixed at 11111. */
   1340 
   1341 #define AA64_CAS_FAMILY_MATCH 0x08a07c00u
   1342 /* bits 29:24 (=001000), 23 (=1), 21 (=1), 14:10 (=11111). */
   1343 #define AA64_CAS_FAMILY_MASK 0x3Fa0fc00u
   1344 
   1345 typedef struct AA64Cas {
   1346   u32 size, L, Rs, o0, Rn, Rt;
   1347 } AA64Cas;
   1348 
   1349 static inline u32 aa64_cas_pack(AA64Cas f) {
   1350   return ((f.size & 3u) << 30) | AA64_CAS_FAMILY_MATCH | ((f.L & 1u) << 22) |
   1351          ((f.Rs & 0x1fu) << 16) | ((f.o0 & 1u) << 15) | ((f.Rn & 0x1fu) << 5) |
   1352          (f.Rt & 0x1fu);
   1353 }
   1354 
   1355 static inline AA64Cas aa64_cas_unpack(u32 w) {
   1356   AA64Cas f;
   1357   f.size = (w >> 30) & 3u;
   1358   f.L = (w >> 22) & 1u;
   1359   f.Rs = (w >> 16) & 0x1fu;
   1360   f.o0 = (w >> 15) & 1u;
   1361   f.Rn = (w >> 5) & 0x1fu;
   1362   f.Rt = w & 0x1fu;
   1363   return f;
   1364 }
   1365 
   1366 /* ====================================================================
   1367  * LSE atomic memory operations (SWP / LDADD / LDCLR / LDEOR / LDSET +
   1368  * acquire/release variants and b/h widths).
   1369  *   size(2) 111 V(1) 00 A(1) R(1) 1 Rs(5) o3(1) opc(3) 00 Rn(5) Rt(5)
   1370  *   31..30  29..27 26 25..24 23  22  21 20..16 15   14..12 11..10 9..5 4..0
   1371  *
   1372  * A=acquire (a-suffix), R=release (l-suffix).  o3=1 selects SWP (opc=000);
   1373  * o3=0 with opc in {000=LDADD,001=LDCLR,010=LDEOR,011=LDSET}. */
   1374 
   1375 #define AA64_LSE_ATOMIC_FAMILY_MATCH 0x38200000u
   1376 /* bits 29:27 (=111), 25:24 (=00), 21 (=1), 11:10 (=00). */
   1377 #define AA64_LSE_ATOMIC_FAMILY_MASK 0x3B200C00u
   1378 
   1379 #define AA64_LSE_OPC_LDADD 0u
   1380 #define AA64_LSE_OPC_LDCLR 1u
   1381 #define AA64_LSE_OPC_LDEOR 2u
   1382 #define AA64_LSE_OPC_LDSET 3u
   1383 #define AA64_LSE_OPC_SWP 0u /* paired with o3=1 */
   1384 
   1385 typedef struct AA64LseAtomic {
   1386   u32 size, A, R, Rs, o3, opc, Rn, Rt;
   1387 } AA64LseAtomic;
   1388 
   1389 static inline u32 aa64_lse_atomic_pack(AA64LseAtomic f) {
   1390   return ((f.size & 3u) << 30) | AA64_LSE_ATOMIC_FAMILY_MATCH |
   1391          ((f.A & 1u) << 23) | ((f.R & 1u) << 22) | ((f.Rs & 0x1fu) << 16) |
   1392          ((f.o3 & 1u) << 15) | ((f.opc & 7u) << 12) | ((f.Rn & 0x1fu) << 5) |
   1393          (f.Rt & 0x1fu);
   1394 }
   1395 
   1396 static inline AA64LseAtomic aa64_lse_atomic_unpack(u32 w) {
   1397   AA64LseAtomic f;
   1398   f.size = (w >> 30) & 3u;
   1399   f.A = (w >> 23) & 1u;
   1400   f.R = (w >> 22) & 1u;
   1401   f.Rs = (w >> 16) & 0x1fu;
   1402   f.o3 = (w >> 15) & 1u;
   1403   f.opc = (w >> 12) & 7u;
   1404   f.Rn = (w >> 5) & 0x1fu;
   1405   f.Rt = w & 0x1fu;
   1406   return f;
   1407 }
   1408 
   1409 /* ====================================================================
   1410  * Unconditional branch (immediate) — B / BL
   1411  *   op(1) 00101 imm26(26)
   1412  *   31    30..26 25..0
   1413  *
   1414  * op=0 → B, op=1 → BL.  imm26 is a signed 26-bit word displacement
   1415  * (multiply by 4 to get byte offset).  Codegen emits with imm26=0 paired
   1416  * with a JUMP26 / CALL26 relocation. */
   1417 
   1418 #define AA64_BR_IMM_FAMILY_MATCH 0x14000000u
   1419 #define AA64_BR_IMM_FAMILY_MASK 0x7C000000u /* bits 30:26 (=00101) */
   1420 
   1421 typedef struct AA64BrImm {
   1422   u32 op, imm26;
   1423 } AA64BrImm;
   1424 
   1425 static inline u32 aa64_brimm_pack(AA64BrImm f) {
   1426   return ((f.op & 1u) << 31) | AA64_BR_IMM_FAMILY_MATCH |
   1427          (f.imm26 & 0x3ffffffu);
   1428 }
   1429 
   1430 static inline AA64BrImm aa64_brimm_unpack(u32 w) {
   1431   AA64BrImm f;
   1432   f.op = (w >> 31) & 1u;
   1433   f.imm26 = w & 0x3ffffffu;
   1434   return f;
   1435 }
   1436 
   1437 static inline u32 aa64_b(u32 imm26) {
   1438   return aa64_brimm_pack((AA64BrImm){.op = 0, .imm26 = imm26});
   1439 }
   1440 static inline u32 aa64_bl(u32 imm26) {
   1441   return aa64_brimm_pack((AA64BrImm){.op = 1, .imm26 = imm26});
   1442 }
   1443 
   1444 /* ====================================================================
   1445  * Conditional branch (immediate) — B.cond
   1446  *   0101 0100 imm19(19) 0 cond(4)
   1447  *   31..24    23..5     4 3..0
   1448  *
   1449  * imm19 is a signed 19-bit word displacement; cond is the 4-bit ARM
   1450  * condition code (EQ=0, NE=1, ...). */
   1451 
   1452 #define AA64_BR_COND_FAMILY_MATCH 0x54000000u
   1453 #define AA64_BR_COND_FAMILY_MASK              \
   1454   0xFF000010u /* bits 31:24 fixed + bit 4 = 0 \
   1455                */
   1456 
   1457 typedef struct AA64BrCond {
   1458   u32 imm19, cond;
   1459 } AA64BrCond;
   1460 
   1461 static inline u32 aa64_brcond_pack(AA64BrCond f) {
   1462   return AA64_BR_COND_FAMILY_MATCH | ((f.imm19 & 0x7ffffu) << 5) |
   1463          (f.cond & 0xfu);
   1464 }
   1465 
   1466 static inline AA64BrCond aa64_brcond_unpack(u32 w) {
   1467   AA64BrCond f;
   1468   f.imm19 = (w >> 5) & 0x7ffffu;
   1469   f.cond = w & 0xfu;
   1470   return f;
   1471 }
   1472 
   1473 /* ====================================================================
   1474  * Compare-and-branch — CBZ / CBNZ
   1475  *   sf 011010 op(1) imm19(19) Rt(5)
   1476  *   31 30..25 24    23..5     4..0
   1477  *
   1478  * op=0 → CBZ (branch if zero), op=1 → CBNZ. */
   1479 
   1480 #define AA64_CB_FAMILY_MATCH 0x34000000u
   1481 #define AA64_CB_FAMILY_MASK 0x7E000000u /* bits 30:25 (=011010) */
   1482 
   1483 typedef struct AA64CB {
   1484   u32 sf, op, imm19, Rt;
   1485 } AA64CB;
   1486 
   1487 static inline u32 aa64_cb_pack(AA64CB f) {
   1488   return ((f.sf & 1u) << 31) | AA64_CB_FAMILY_MATCH | ((f.op & 1u) << 24) |
   1489          ((f.imm19 & 0x7ffffu) << 5) | (f.Rt & 0x1fu);
   1490 }
   1491 
   1492 static inline AA64CB aa64_cb_unpack(u32 w) {
   1493   AA64CB f;
   1494   f.sf = (w >> 31) & 1u;
   1495   f.op = (w >> 24) & 1u;
   1496   f.imm19 = (w >> 5) & 0x7ffffu;
   1497   f.Rt = w & 0x1fu;
   1498   return f;
   1499 }
   1500 
   1501 static inline u32 aa64_cbz(u32 sf, u32 Rt, u32 imm19) {
   1502   return aa64_cb_pack((AA64CB){.sf = sf, .op = 0, .imm19 = imm19, .Rt = Rt});
   1503 }
   1504 static inline u32 aa64_cbnz_imm(u32 sf, u32 Rt, u32 imm19) {
   1505   return aa64_cb_pack((AA64CB){.sf = sf, .op = 1, .imm19 = imm19, .Rt = Rt});
   1506 }
   1507 
   1508 /* ====================================================================
   1509  * Exception generation — BRK / SVC / HVC / SMC / HLT / UDF aliases.
   1510  *   1101 0100 opc(3) imm16(16) op2(3) LL(2)
   1511  *   31..24    23..21 20..5     4..2   1..0
   1512  *
   1513  * SVC: opc=000, LL=01.  BRK: opc=001, LL=00.  HVC/SMC/HLT/...: other
   1514  * combos.  Codegen emits BRK today. */
   1515 
   1516 #define AA64_EXCEPT_FAMILY_MATCH 0xD4000000u
   1517 #define AA64_EXCEPT_FAMILY_MASK 0xFF000000u /* bits 31:24 */
   1518 
   1519 typedef struct AA64Except {
   1520   u32 opc, imm16, op2, LL;
   1521 } AA64Except;
   1522 
   1523 static inline u32 aa64_except_pack(AA64Except f) {
   1524   return AA64_EXCEPT_FAMILY_MATCH | ((f.opc & 7u) << 21) |
   1525          ((f.imm16 & 0xffffu) << 5) | ((f.op2 & 7u) << 2) | (f.LL & 3u);
   1526 }
   1527 
   1528 static inline AA64Except aa64_except_unpack(u32 w) {
   1529   AA64Except f;
   1530   f.opc = (w >> 21) & 7u;
   1531   f.imm16 = (w >> 5) & 0xffffu;
   1532   f.op2 = (w >> 2) & 7u;
   1533   f.LL = w & 3u;
   1534   return f;
   1535 }
   1536 
   1537 static inline u32 aa64_brk(u32 imm16) {
   1538   return aa64_except_pack(
   1539       (AA64Except){.opc = 1, .imm16 = imm16, .op2 = 0, .LL = 0});
   1540 }
   1541 static inline u32 aa64_svc(u32 imm16) {
   1542   return aa64_except_pack(
   1543       (AA64Except){.opc = 0, .imm16 = imm16, .op2 = 0, .LL = 1});
   1544 }
   1545 
   1546 /* ====================================================================
   1547  * Disassembler descriptor table.
   1548  * ==================================================================== */
   1549 
   1550 typedef struct AA64InsnDesc {
   1551   Slice mnemonic;
   1552   u32 match;
   1553   u32 mask;
   1554   u8 fmt;   /* AA64Format */
   1555   u8 flags; /* AA64_ASMFL_* */
   1556   u8 pad[2];
   1557 } AA64InsnDesc;
   1558 
   1559 extern const AA64InsnDesc aa64_insn_table[];
   1560 extern const u32 aa64_insn_table_n;
   1561 
   1562 /* Linear-scan lookup. Returns the matching descriptor or NULL. First
   1563  * match wins; ordering in aa64_insn_table.c puts more-specific entries
   1564  * before broader ones (so aliases like MOV/MUL/NEG win over their
   1565  * canonical ORR/MADD/SUB forms). */
   1566 const AA64InsnDesc* aa64_disasm_find(u32 word);
   1567 
   1568 /* ====================================================================
   1569  * Operand print / parse — one entry per AA64Format.
   1570  *
   1571  * aa64_print_operands renders the operand text (everything after the
   1572  * mnemonic) for `word` into `sb`, using `desc->fmt` to dispatch.
   1573  * Mnemonic itself is in `desc->mnemonic`; the caller writes it before
   1574  * calling this helper.  `vaddr` is the instruction's virtual address
   1575  * for PC-relative formats; pass 0 if not known.
   1576  *
   1577  * aa64_parse_operands is the dual: read the operand grammar for the
   1578  * format from `tok` (opaque to phase 2 — declared but unimplemented)
   1579  * and fill `fields_out` (a pointer to the format's field struct).
   1580  * Phase 3 wires `tok` up; for now the function is a forward declaration
   1581  * the assembler can resolve once it's in place. */
   1582 
   1583 struct AA64AsmTok; /* opaque, defined by the phase-3 asm parser */
   1584 
   1585 void aa64_print_operands(StrBuf* sb, const AA64InsnDesc* desc, u32 word,
   1586                          u64 vaddr);
   1587 
   1588 /* If `word` is an SBFM/UBFM that has a preferred shift-alias disassembly,
   1589  * return its mnemonic ("lsl"/"lsr"/"asr") and write the shift amount to
   1590  * *shift; return NULL otherwise. Shared by the disassembler's mnemonic and
   1591  * operand printers so the alias decision lives in one place. */
   1592 const char* aa64_bitfield_shift_alias(u32 word, u32* shift);
   1593 
   1594 /* Preferred SBFM/UBFM extension aliases: sxtb/sxth/sxtw/uxtb/uxth.
   1595  * Returns NULL when the bitfield does not match one of those forms. */
   1596 const char* aa64_bitfield_extend_alias(u32 word);
   1597 
   1598 /* Preferred SBFM/UBFM extract aliases: sbfx/ubfx. Writes the least-significant
   1599  * source bit and extracted width when an alias is available. */
   1600 const char* aa64_bitfield_extract_alias(u32 word, u32* lsb, u32* width);
   1601 
   1602 /* Returns 1 on success, 0 on parse error.  Phase 2 stub returns 0 for
   1603  * every format; phase 3 fills in the bodies. */
   1604 int aa64_parse_operands(struct AA64AsmTok* tok, const AA64InsnDesc* desc,
   1605                         void* fields_out);
   1606 
   1607 #endif