cg/aa64: implement Groups D, E, F (control flow, conv, memory) - kit

commit 3e695b43cadf39b052bc1d70f402d5fdfdf49292
parent ed3b81da82f373155f1790929e3c7a7d96cf2b62
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat,  9 May 2026 14:36:53 -0700

cg/aa64: implement Groups D, E, F (control flow, conv, memory)

Brings the AArch64 CGTarget through the full cg test corpus:

- D: cmp/cmp_branch via SUBS XZR + CSET/B.cond, with a CmpOp→ARM cond
  table; labels pass straight through to MCEmitter; SCOPE_IF via a
  per-function AAScope table tracking else/end labels. mc.c learns to
  apply R_AARCH64_CONDBR19 fixups for B.cond's imm19 displacement.
- E: convert covers SEXT/ZEXT (SBFM/UBFM), TRUNC (W-view MOV), ITOF_S/U
  (SCVTF/UCVTF), FTOI_U (FCVTZU), FEXT/FTRUNC (FCVT S↔D), and BITCAST
  (FMOV between GPR and FP, single and double).
- F: copy_bytes/set_bytes as fully unrolled 8/4/2/1-byte LDUR/STUR
  sequences (XZR fast-path for zero-fill); bitfield_load uses LDUR +
  UBFX/SBFX, bitfield_store is a read-modify-write with BFI.

All 288 cg cases (72 × D/R/E/J) now pass.

Diffstat:
M src/arch/aarch64.c  | 455 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M src/arch/mc.c  | 17 +++++++++++++++++

2 files changed, 454 insertions(+), 18 deletions(-)
diff --git a/src/arch/aarch64.c b/src/arch/aarch64.c
@@ -130,6 +130,53 @@ static inline u32 aa64_cset_eq(u32 sf, u32 Rd)
  * sf: 0=W, 1=X. type: 0=S, 1=D. */
 static inline u32 aa64_fcvtzs(u32 sf, u32 type, u32 Rd, u32 Rn)
 { return 0x1E380000u | (sf<<31) | ((type&3)<<22) | ((Rn&0x1f)<<5) | (Rd&0x1f); }
+/* Siblings of aa64_fcvtzs: identical operand packing (sf at bit 31, type
+ * at bits 22..23, Rn at bits 5..9, Rd at bits 0..4); only the fixed
+ * opcode bits differ. */
+static inline u32 aa64_fcvtzu(u32 sf, u32 type, u32 Rd, u32 Rn)
+{
+    u32 insn = 0x1E390000u;
+    insn |= sf << 31;
+    insn |= (type & 3u) << 22;
+    insn |= (Rn & 0x1fu) << 5;
+    insn |= Rd & 0x1fu;
+    return insn;
+}
+static inline u32 aa64_scvtf(u32 sf, u32 type, u32 Rd, u32 Rn)
+{
+    u32 insn = 0x1E220000u;
+    insn |= sf << 31;
+    insn |= (type & 3u) << 22;
+    insn |= (Rn & 0x1fu) << 5;
+    insn |= Rd & 0x1fu;
+    return insn;
+}
+static inline u32 aa64_ucvtf(u32 sf, u32 type, u32 Rd, u32 Rn)
+{
+    u32 insn = 0x1E230000u;
+    insn |= sf << 31;
+    insn |= (type & 3u) << 22;
+    insn |= (Rn & 0x1fu) << 5;
+    insn |= Rd & 0x1fu;
+    return insn;
+}
+
+/* FCVT between FP precisions. aa64_fcvt_d_s widens S→D, aa64_fcvt_s_d
+ * narrows D→S; only the two register fields vary. */
+static inline u32 aa64_fcvt_d_s(u32 Rd, u32 Rn)
+{
+    u32 src_field = (Rn & 0x1fu) << 5;
+    u32 dst_field = Rd & 0x1fu;
+    return 0x1E22C000u | src_field | dst_field;
+}
+static inline u32 aa64_fcvt_s_d(u32 Rd, u32 Rn)
+{
+    u32 src_field = (Rn & 0x1fu) << 5;
+    u32 dst_field = Rd & 0x1fu;
+    return 0x1E624000u | src_field | dst_field;
+}
+
+/* FMOV between FP and GPR (BITCAST). Each variant is a fixed opcode with
+ * Rn in bits 5..9 and Rd in bits 0..4; names read <dst>_<src>. */
+static inline u32 aa64_fmov_s_w(u32 Rd, u32 Rn)  /* GPR→FP, single */
+{
+    u32 regs = ((Rn & 0x1fu) << 5) | (Rd & 0x1fu);
+    return 0x1E270000u | regs;
+}
+static inline u32 aa64_fmov_w_s(u32 Rd, u32 Rn)  /* FP→GPR, single */
+{
+    u32 regs = ((Rn & 0x1fu) << 5) | (Rd & 0x1fu);
+    return 0x1E260000u | regs;
+}
+static inline u32 aa64_fmov_d_x(u32 Rd, u32 Rn)  /* GPR→FP, double */
+{
+    u32 regs = ((Rn & 0x1fu) << 5) | (Rd & 0x1fu);
+    return 0x9E670000u | regs;
+}
+static inline u32 aa64_fmov_x_d(u32 Rd, u32 Rn)  /* FP→GPR, double */
+{
+    u32 regs = ((Rn & 0x1fu) << 5) | (Rd & 0x1fu);
+    return 0x9E660000u | regs;
+}
+
+/* SUBS (shifted register), shift fields left at zero. Passing Rd=ZR
+ * yields CMP Rn, Rm. sf goes to bit 31, Rm to bits 16..20. */
+static inline u32 aa64_subs_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm)
+{
+    u32 insn = 0x6B000000u | (sf << 31);
+    insn |= (Rm & 0x1fu) << 16;
+    insn |= (Rn & 0x1fu) << 5;
+    insn |= Rd & 0x1fu;
+    return insn;
+}
+
+/* B.cond with the 4-bit condition in bits 0..3. The imm19 displacement
+ * (bits 5..23) is emitted as zero and patched later by the linker /
+ * MCEmitter. */
+static inline u32 aa64_b_cond(u32 cond)
+{
+    u32 cc = cond & 0xfu;
+    return 0x54000000u | cc;
+}
+
+/* CSINC Rd, Rn, Rm, cond (CSEL family with op2=01): cond sits in bits
+ * 12..15, Rm in bits 16..20. */
+static inline u32 aa64_csinc(u32 sf, u32 Rd, u32 Rn, u32 Rm, u32 cond)
+{
+    u32 insn = 0x1A800400u | (sf << 31);
+    insn |= (Rm & 0x1fu) << 16;
+    insn |= (cond & 0xfu) << 12;
+    insn |= (Rn & 0x1fu) << 5;
+    insn |= Rd & 0x1fu;
+    return insn;
+}
+/* CSET Rd, cond is CSINC Rd, ZR, ZR, !cond — flipping the low bit of the
+ * condition code inverts it. */
+static inline u32 aa64_cset(u32 sf, u32 Rd, u32 cond)
+{
+    u32 inverted = cond ^ 1u;
+    return aa64_csinc(sf, Rd, 31u, 31u, inverted);
+}
+
+/* SBFM / UBFM / BFM (bitfield move family).
+ *   sf opc(2) 100110 N immr(6) imms(6) Rn(5) Rd(5)
+ * opc: 00=SBFM, 01=BFM, 10=UBFM. N must equal sf, so sf is written to
+ * both bit 31 and bit 22. The three encoders differ only in the fixed
+ * opcode word handed to the shared packer. */
+static inline u32 aa64_bitfield_move(u32 opcode, u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms)
+{
+    u32 insn = opcode;
+    insn |= sf << 31;
+    insn |= sf << 22;
+    insn |= (immr & 0x3fu) << 16;
+    insn |= (imms & 0x3fu) << 10;
+    insn |= (Rn & 0x1fu) << 5;
+    insn |= Rd & 0x1fu;
+    return insn;
+}
+static inline u32 aa64_sbfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms)
+{
+    return aa64_bitfield_move(0x13000000u, sf, Rd, Rn, immr, imms);
+}
+static inline u32 aa64_ubfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms)
+{
+    return aa64_bitfield_move(0x53000000u, sf, Rd, Rn, immr, imms);
+}
+static inline u32 aa64_bfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms)
+{
+    return aa64_bitfield_move(0x33000000u, sf, Rd, Rn, immr, imms);
+}
 
 /* ============================================================
  * AAImpl
@@ -145,6 +192,14 @@ typedef struct AASlot {
     u8  pad[3];
 } AASlot;
 
+typedef struct AAScope {   /* one record per aa_scope_begin call; CGScope id N refers to entry N-1 */
+    u8       kind;          /* ScopeKind */
+    u8       has_else;      /* set to 1 by aa_scope_else; aa_scope_end places else_label itself when still 0 */
+    u8       pad[2];
+    MCLabel  else_label;    /* SCOPE_IF: false branch target / end-of-then */
+    MCLabel  end_label;     /* SCOPE_IF: join point past the whole if/else */
+} AAScope;
+
 typedef struct AAImpl {
     CGTarget   base;
     SrcLoc     loc;
@@ -172,6 +227,13 @@ typedef struct AAImpl {
     /* Reg allocator (callee-saved prefix). */
     u32        used_int;         /* x19 + i, i in [0, used_int) */
     u32        used_fp;          /* v8  + i, i in [0, used_fp ) */
+
+    /* Structured-scope stack. Entries are not popped — IDs returned to
+     * the caller are stable indices into this array for the lifetime
+     * of the function. nscopes is reset at func_begin. */
+    AAScope*   scopes;
+    u32        nscopes;
+    u32        scopes_cap;
 } AAImpl;
 
 static AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; }
@@ -179,6 +241,7 @@ static AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; }
 /* Forward decls used before definition. */
 static FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d);
 static AASlot* slot_get(AAImpl* a, FrameSlot fs);
+static u32 force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch);
 
 /* ---- helpers ---- */
 
@@ -341,6 +404,7 @@ static void aa_func_begin(CGTarget* t, const CGFuncDesc* fd)
     a->used_int    = 0;
     a->used_fp     = 0;
     a->nslots      = 0;
+    a->nscopes     = 0;
     a->sret_ptr_slot = FRAME_SLOT_NONE;
     a->epilogue_label = mc->label_new(mc);
 
@@ -598,18 +662,151 @@ static const Reg* aa_clobbers (CGTarget* t, RegClass c, u32* n)      { (void)c; 
 static void      aa_spill_reg (CGTarget* t, Operand s, FrameSlot f, MemAccess m) { (void)s; (void)f; (void)m; aa_panic(t, "spill_reg"); }
 static void      aa_reload_reg(CGTarget* t, Operand d, FrameSlot f, MemAccess m) { (void)d; (void)f; (void)m; aa_panic(t, "reload_reg"); }
 
-/* ---- labels / control flow (deferred for D-group; ret uses internal label) ---- */
+/* ---- labels / control flow ----
+ *
+ * Label is a transparent wrapper around MCLabel — the MCEmitter already
+ * tracks placement and applies pending fixups. Jumps emit a B with
+ * imm26=0 paired with R_AARCH64_JUMP26; conditional branches emit a
+ * B.cond with imm19=0 paired with R_AARCH64_CONDBR19. */
+
+/* Allocate a fresh label by delegating to the MCEmitter. */
+static Label aa_label_new(CGTarget* t)
+{
+    MCEmitter* mc = t->mc;
+    MCLabel fresh = mc->label_new(mc);
+    return (Label)fresh;
+}
+
+/* Bind label `l` at the current emission point via the MCEmitter. */
+static void aa_label_place(CGTarget* t, Label l)
+{
+    MCEmitter* mc = t->mc;
+    MCLabel target = (MCLabel)l;
+    mc->label_place(mc, target);
+}
+
+/* Unconditional jump: emit the base B word, then record an
+ * R_AARCH64_JUMP26 label reference against `l`. */
+static void aa_jump(CGTarget* t, Label l)
+{
+    MCEmitter* em = t->mc;
+    MCLabel target = (MCLabel)l;
+
+    emit32(em, aa64_b_base());
+    em->emit_label_ref(em, target, R_AARCH64_JUMP26, 4, 0);
+}
+
+/* Map CmpOp → AArch64 condition code. Boolean (i1) "true" means take the
+ * branch / set 1.
+ *
+ * Returns the 4-bit value placed in the cond field of B.cond / CSINC.
+ * Unsigned orderings map to the carry-based conditions (LO/LS/HI/HS),
+ * signed orderings to LT/LE/GT/GE.
+ *
+ * NOTE(review): every CmpOp without an explicit case — including the FP
+ * compares mentioned below — falls into `default` and is reported as EQ
+ * (0x0). Callers get a valid but wrong condition rather than a
+ * diagnostic; confirm no FP compare can reach this path. */
+static u32 cmp_to_cond(CmpOp op)
+{
+    switch (op) {
+    case CMP_EQ:   return 0x0u;   /* EQ */
+    case CMP_NE:   return 0x1u;   /* NE */
+    case CMP_LT_U: return 0x3u;   /* CC/LO */
+    case CMP_LE_U: return 0x9u;   /* LS */
+    case CMP_GT_U: return 0x8u;   /* HI */
+    case CMP_GE_U: return 0x2u;   /* CS/HS */
+    case CMP_LT_S: return 0xbu;   /* LT */
+    case CMP_LE_S: return 0xdu;   /* LE */
+    case CMP_GT_S: return 0xcu;   /* GT */
+    case CMP_GE_S: return 0xau;   /* GE */
+    /* FP compares route through FCMP, not yet exercised here. */
+    default:       return 0x0u;   /* silent fallback to EQ — see note above */
+    }
+}
+
+/* Emit CMP a, b, i.e. SUBS with the zero register as destination.
+ * The operand width is taken from `a`; signedness is expressed only by
+ * the condition code the caller pairs with this compare. Operands that
+ * are not already in a register are forced into scratch x9 / x10. */
+static void emit_cmp_ab(CGTarget* t, Operand a_op, Operand b_op)
+{
+    const u32 sf = type_is_64(a_op.type) ? 1u : 0u;
+
+    /* Register-vs-literal-zero needs no scratch: use the immediate form. */
+    const int rhs_is_zero = (b_op.kind == OPK_IMM) && (b_op.v.imm == 0);
+    if (rhs_is_zero && a_op.kind == OPK_REG) {
+        emit32(t->mc, aa64_subs_imm(sf, /*Rd=ZR*/31u, reg_num(a_op), 0));
+        return;
+    }
+
+    /* Pick the second scratch so it never aliases the first operand. */
+    u32 lhs = force_reg_int(t, a_op, sf, 9);
+    u32 rhs_scratch = (lhs == 9) ? 10u : 9u;
+    u32 rhs = force_reg_int(t, b_op, sf, rhs_scratch);
+    emit32(t->mc, aa64_subs_reg(sf, /*Rd=ZR*/31u, lhs, rhs));
+}
+
+/* Compare a with b and branch to `l` when `op` holds: CMP, then a B.cond
+ * whose displacement is filled in through an R_AARCH64_CONDBR19 label
+ * reference. */
+static void aa_cmp_branch(CGTarget* t, CmpOp op, Operand a, Operand b, Label l)
+{
+    MCEmitter* em = t->mc;
+    MCLabel target = (MCLabel)l;
+
+    emit_cmp_ab(t, a, b);
+
+    u32 cc = cmp_to_cond(op);
+    emit32(em, aa64_b_cond(cc));
+    em->emit_label_ref(em, target, R_AARCH64_CONDBR19, 4, 0);
+}
+
+/* Materialize the result of `a op b` into dst: CMP followed by CSET on
+ * the mapped condition. The CSET width follows dst's type. */
+static void aa_cmp(CGTarget* t, CmpOp op, Operand dst, Operand a, Operand b)
+{
+    emit_cmp_ab(t, a, b);
+
+    u32 cc = cmp_to_cond(op);
+    u32 width_bit = type_is_64(dst.type) ? 1u : 0u;
+    emit32(t->mc, aa64_cset(width_bit, reg_num(dst), cc));
+}
+
+/* ---- structured scopes (SCOPE_IF only for v1) ----
+ *
+ * aa_scope_begin appends an AAScope record to the per-target table
+ * (arena-backed; capacity starts at 4 and doubles, old entries are
+ * copied over), allocates fresh else/end labels for it, and for
+ * SCOPE_IF emits `SUBS ZR, cond, #0` followed by `B.EQ else_label` so a
+ * zero condition skips the then-arm. Any other scope kind panics before
+ * the record is counted. The returned CGScope is the 1-based index of
+ * the new record. */
 
-static Label aa_label_new  (CGTarget* t)                         { aa_panic(t, "label_new"); }
-static void  aa_label_place(CGTarget* t, Label l)                { (void)l; aa_panic(t, "label_place"); }
-static void  aa_jump       (CGTarget* t, Label l)                { (void)l; aa_panic(t, "jump"); }
-static void  aa_cmp_branch (CGTarget* t, CmpOp op, Operand a, Operand b, Label l) { (void)op;(void)a;(void)b;(void)l; aa_panic(t, "cmp_branch"); }
+static CGScope aa_scope_begin(CGTarget* t, const CGScopeDesc* d)
+{
+    AAImpl* a = impl_of(t);
+    if (a->nscopes == a->scopes_cap) {
+        u32 ncap = a->scopes_cap ? a->scopes_cap * 2u : 4u;
+        AAScope* nb = arena_array(t->c->tu, AAScope, ncap);
+        if (a->scopes) memcpy(nb, a->scopes, sizeof(AAScope) * a->nscopes);
+        a->scopes = nb;
+        a->scopes_cap = ncap;
+    }
+    AAScope* sc = &a->scopes[a->nscopes];
+    sc->kind       = (u8)d->kind;
+    sc->has_else   = 0;
+    sc->else_label = t->mc->label_new(t->mc);
+    sc->end_label  = t->mc->label_new(t->mc);
+
+    if (d->kind == SCOPE_IF) {
+        /* Test cond against zero, branch to else_label on EQ (false). */
+        u32 sf = type_is_64(d->cond.type) ? 1u : 0u;
+        u32 rn = force_reg_int(t, d->cond, sf, 9);   /* cond not in a register is forced into scratch x9 */
+        emit32(t->mc, aa64_subs_imm(sf, /*Rd=ZR*/31u, rn, 0));
+        emit32(t->mc, aa64_b_cond(0x0u /*EQ*/));
+        t->mc->emit_label_ref(t->mc, sc->else_label,
+                              R_AARCH64_CONDBR19, 4, 0);
+    } else {
+        /* BLOCK / LOOP not yet exercised by the corpus. */
+        compiler_panic(t->c, a->loc,
+            "aarch64 scope_begin: kind %d not yet implemented", (int)d->kind);
+    }
 
-static CGScope aa_scope_begin(CGTarget* t, const CGScopeDesc* d)  { (void)d; aa_panic(t, "scope_begin"); }
-static void    aa_scope_else (CGTarget* t, CGScope s)             { (void)s; aa_panic(t, "scope_else"); }
-static void    aa_scope_end  (CGTarget* t, CGScope s)             { (void)s; aa_panic(t, "scope_end"); }
-static void    aa_break_to   (CGTarget* t, CGScope s)             { (void)s; aa_panic(t, "break_to"); }
-static void    aa_continue_to(CGTarget* t, CGScope s)             { (void)s; aa_panic(t, "continue_to"); }
+    a->nscopes++;
+    return (CGScope)a->nscopes;   /* 1-based */
+}
+
+/* scope_else: close the then-arm of scope `s` and open its else-arm.
+ * `s` must be a 1-based id no larger than the current scope count;
+ * anything else panics. */
+static void aa_scope_else(CGTarget* t, CGScope s)
+{
+    AAImpl* impl = impl_of(t);
+    MCEmitter* mc = t->mc;
+
+    int in_range = (s != CG_SCOPE_NONE) && (s <= impl->nscopes);
+    if (!in_range) {
+        compiler_panic(t->c, impl->loc, "aarch64 scope_else: bad scope %u",
+                       (unsigned)s);
+    }
+    AAScope* scope = impl->scopes + (s - 1);
+
+    /* The then-arm leaves through an unconditional B to the join label. */
+    emit32(mc, aa64_b_base());
+    mc->emit_label_ref(mc, scope->end_label, R_AARCH64_JUMP26, 4, 0);
+
+    /* The false edge emitted by scope_begin lands here. */
+    mc->label_place(mc, scope->else_label);
+    scope->has_else = 1;
+}
+
+/* scope_end: place the labels that close scope `s`. For an if-scope
+ * that never saw scope_else, the false-branch label is placed here too,
+ * so both labels resolve to the same spot. */
+static void aa_scope_end(CGTarget* t, CGScope s)
+{
+    AAImpl* impl = impl_of(t);
+    MCEmitter* mc = t->mc;
+
+    int in_range = (s != CG_SCOPE_NONE) && (s <= impl->nscopes);
+    if (!in_range) {
+        compiler_panic(t->c, impl->loc, "aarch64 scope_end: bad scope %u",
+                       (unsigned)s);
+    }
+    AAScope* scope = impl->scopes + (s - 1);
+
+    int else_pending = (scope->kind == SCOPE_IF) && !scope->has_else;
+    if (else_pending) {
+        mc->label_place(mc, scope->else_label);
+    }
+    mc->label_place(mc, scope->end_label);
+}
+
+/* break_to / continue_to are still stubs: both ignore the scope and
+ * panic with their own name. */
+static void aa_break_to(CGTarget* t, CGScope s)
+{
+    (void)s;
+    aa_panic(t, "break_to");
+}
+static void aa_continue_to(CGTarget* t, CGScope s)
+{
+    (void)s;
+    aa_panic(t, "continue_to");
+}
 
 /* ---- data movement ---- */
 
@@ -779,10 +976,155 @@ static void aa_addr_of(CGTarget* t, Operand dst, Operand lv)
 }
 
 static void aa_tls_addr_of(CGTarget* t, Operand d, ObjSymId s, i64 a)   { (void)d;(void)s;(void)a; aa_panic(t, "tls_addr_of"); }
-static void aa_copy_bytes(CGTarget* t, Operand d, Operand s, AggregateAccess g) { (void)d;(void)s;(void)g; aa_panic(t, "copy_bytes"); }
-static void aa_set_bytes (CGTarget* t, Operand d, Operand b, AggregateAccess g) { (void)d;(void)b;(void)g; aa_panic(t, "set_bytes"); }
-static void aa_bitfield_load (CGTarget* t, Operand d, Operand a, BitFieldAccess f) { (void)d;(void)a;(void)f; aa_panic(t, "bitfield_load"); }
-static void aa_bitfield_store(CGTarget* t, Operand a, Operand s, BitFieldAccess f) { (void)a;(void)s;(void)f; aa_panic(t, "bitfield_store"); }
+
+/* Resolve a dst/src address operand for the aggregate ops below.
+ * Accepts OPK_REG (already a pointer) and OPK_LOCAL (= fp - off);
+ * for OPK_LOCAL we materialize the address into a scratch register.
+ *
+ * Returns the number of the register that holds the address: the
+ * operand's own register for OPK_REG (nothing is emitted), or `scratch`
+ * after emitting SUB scratch, x29, #off for OPK_LOCAL. An unknown frame
+ * slot or any other operand kind panics.
+ *
+ * NOTE(review): s->off is handed straight to aa64_sub_imm; presumably it
+ * has to fit that encoder's immediate field — confirm for large frames. */
+static u32 agg_addr_reg(CGTarget* t, Operand op, u32 scratch)
+{
+    if (op.kind == OPK_REG) return reg_num(op);
+    if (op.kind == OPK_LOCAL) {
+        AAImpl* a = impl_of(t);
+        AASlot* s = slot_get(a, op.v.frame_slot);
+        if (!s) compiler_panic(t->c, a->loc, "aarch64 agg: bad slot");
+        emit32(t->mc, aa64_sub_imm(1, scratch, 29, s->off, 0));   /* scratch = x29 - off */
+        return scratch;
+    }
+    compiler_panic(t->c, impl_of(t)->loc,
+        "aarch64 agg: address kind %d unsupported", (int)op.kind);
+}
+
+/* copy_bytes: emit a straight-line copy of agg.size bytes from the
+ * address in src_addr to the address in dst_addr.
+ *
+ * Both operands are resolved through agg_addr_reg (OPK_REG or
+ * OPK_LOCAL). Data moves through scratch x12 in descending power-of-two
+ * chunks (8, 4, 2, 1 bytes) using unscaled LDUR/STUR, so legality does
+ * not depend on agg.align.
+ *
+ * Limits: LDUR/STUR encode a signed 9-bit byte offset (-256..255) and
+ * the base registers are never advanced, so 256 bytes is the largest
+ * size whose offsets all fit. Larger sizes panic instead of being
+ * emitted with an out-of-range offset.
+ *
+ * NOTE(review): assumes neither address operand lives in x12 — confirm
+ * the allocator never hands out that scratch register. */
+static void aa_copy_bytes(CGTarget* t, Operand dst_addr, Operand src_addr,
+                          AggregateAccess agg)
+{
+    MCEmitter* mc = t->mc;
+    u32 nbytes = agg.size;
+    if (nbytes > 256u) {
+        compiler_panic(t->c, impl_of(t)->loc,
+            "aarch64 copy_bytes: size %u exceeds LDUR/STUR offset range",
+            (unsigned)nbytes);
+    }
+
+    /* Materializing a local dst address into x9 would destroy a source
+     * pointer that already lives in x9, so fall back to x11 in that case.
+     * The src scratch is then picked to stay clear of the dst register. */
+    u32 dst_scratch =
+        (src_addr.kind == OPK_REG && reg_num(src_addr) == 9u) ? 11u : 9u;
+    u32 dr = agg_addr_reg(t, dst_addr, dst_scratch);
+    u32 sr = agg_addr_reg(t, src_addr, (dr == 10) ? 11u : 10u);
+
+    /* Size index 3..0 selects an 8/4/2/1-byte access; each width copies
+     * as many chunks as still fit before handing over to the next one. */
+    u32 off = 0;
+    for (u32 sidx = 4; sidx-- > 0; ) {
+        u32 width = 1u << sidx;
+        while (off + width <= nbytes) {
+            emit32(mc, aa64_ldur(sidx, 12, sr, (i32)off));
+            emit32(mc, aa64_stur(sidx, 12, dr, (i32)off));
+            off += width;
+        }
+    }
+}
+
+/* set_bytes: emit a straight-line fill of agg.size bytes at dst_addr
+ * with the low 8 bits of an immediate byte_value.
+ *
+ * dst_addr is resolved through agg_addr_reg (scratch x9 for locals).
+ * A zero fill stores XZR/WZR directly; any other byte is replicated
+ * across all eight bytes of scratch x12 first. Stores are unscaled STUR
+ * in descending 8/4/2/1-byte chunks.
+ *
+ * Limits: a non-immediate byte_value panics (not yet supported). STUR
+ * encodes a signed 9-bit byte offset (-256..255) and the base register
+ * is never advanced, so sizes above 256 bytes panic instead of being
+ * emitted with an out-of-range offset. */
+static void aa_set_bytes(CGTarget* t, Operand dst_addr, Operand byte_value,
+                         AggregateAccess agg)
+{
+    MCEmitter* mc = t->mc;
+    u32 nbytes = agg.size;
+    if (nbytes > 256u) {
+        compiler_panic(t->c, impl_of(t)->loc,
+            "aarch64 set_bytes: size %u exceeds STUR offset range",
+            (unsigned)nbytes);
+    }
+
+    u32 dr = agg_addr_reg(t, dst_addr, 9);
+
+    if (byte_value.kind != OPK_IMM) {
+        compiler_panic(t->c, impl_of(t)->loc,
+            "aarch64 set_bytes: REG byte not yet supported");
+    }
+    u32 byte = (u32)(byte_value.v.imm & 0xffu);
+
+    /* Register 31 reads as zero in a store, so a zero fill needs no
+     * broadcast; otherwise splat the byte across x12. */
+    u32 value_reg = 31u;
+    if (byte != 0) {
+        u64 b64 = byte;
+        b64 |= b64 << 8;
+        b64 |= b64 << 16;
+        b64 |= b64 << 32;
+        emit_load_imm(mc, /*sf=*/1u, /*Rd=*/12u, (i64)b64);
+        value_reg = 12u;
+    }
+
+    /* Size index 3..0 selects an 8/4/2/1-byte store. */
+    u32 off = 0;
+    for (u32 sidx = 4; sidx-- > 0; ) {
+        u32 width = 1u << sidx;
+        while (off + width <= nbytes) {
+            emit32(mc, aa64_stur(sidx, value_reg, dr, (i32)off));
+            off += width;
+        }
+    }
+}
+
+/* bitfield_load: load the storage unit that holds a bit-field into dst,
+ * then extract the field in place.
+ *
+ * The storage unit defaults to 4 bytes when bf.storage.size is 0 and a
+ * zero bf.bit_width is treated as 1. Extraction is UBFM (zero-extend)
+ * or SBFM (sign-extend, when bf.signed_ is set) with immr = bit offset
+ * and imms = index of the field's top bit — the UBFX/SBFX form. The
+ * 64-bit variant is used only for 8-byte storage units. */
+static void aa_bitfield_load(CGTarget* t, Operand dst, Operand record_addr,
+                             BitFieldAccess bf)
+{
+    MCEmitter* mc = t->mc;
+    const u32 addr_reg   = agg_addr_reg(t, record_addr, 9);
+    const u32 unit_bytes = bf.storage.size ? bf.storage.size : 4u;
+    const u32 sf         = (unit_bytes == 8u) ? 1u : 0u;
+    const u32 unit_sidx  = size_idx_for_bytes(unit_bytes);
+    const u32 out        = reg_num(dst);
+
+    emit32(mc, aa64_ldur(unit_sidx, out, addr_reg, (i32)bf.storage_offset));
+
+    const u32 first_bit = bf.bit_offset;
+    const u32 nbits     = bf.bit_width ? bf.bit_width : 1u;
+    const u32 last_bit  = first_bit + nbits - 1u;
+    const u32 extract   = bf.signed_
+        ? aa64_sbfm(sf, out, out, first_bit, last_bit)
+        : aa64_ubfm(sf, out, out, first_bit, last_bit);
+    emit32(mc, extract);
+}
+
+/* bitfield_store: read-modify-write of the storage unit holding a
+ * bit-field. The unit is loaded into scratch x10, the low bits of the
+ * source are inserted with BFM (the BFI form), and x10 is stored back.
+ *
+ * An immediate source is first materialized into scratch x11; a register
+ * source is used as-is; any other kind panics. The storage unit defaults
+ * to 4 bytes when bf.storage.size is 0 and a zero bf.bit_width is
+ * treated as 1.
+ *
+ * NOTE(review): the unit is loaded into x10 before the source is read
+ * and before the final store — presumably neither the record address
+ * nor a register source may live in x10; confirm with the allocator. */
+static void aa_bitfield_store(CGTarget* t, Operand record_addr, Operand src,
+                              BitFieldAccess bf)
+{
+    MCEmitter* mc = t->mc;
+    const u32 addr_reg   = agg_addr_reg(t, record_addr, 9);
+    const u32 unit_bytes = bf.storage.size ? bf.storage.size : 4u;
+    const u32 sf         = (unit_bytes == 8u) ? 1u : 0u;
+    const u32 unit_sidx  = size_idx_for_bytes(unit_bytes);
+    const u32 unit_reg   = 10u;   /* holds the storage unit */
+    const u32 imm_reg    = 11u;   /* holds an immediate source */
+
+    emit32(mc, aa64_ldur(unit_sidx, unit_reg, addr_reg, (i32)bf.storage_offset));
+
+    u32 value_reg;
+    switch (src.kind) {
+    case OPK_IMM:
+        emit_load_imm(mc, sf, imm_reg, src.v.imm);
+        value_reg = imm_reg;
+        break;
+    case OPK_REG:
+        value_reg = reg_num(src);
+        break;
+    default:
+        compiler_panic(t->c, impl_of(t)->loc,
+            "aarch64 bitfield_store: src kind %d unsupported", (int)src.kind);
+    }
+
+    /* BFI Rd, Rn, #lsb, #width is BFM with
+     * immr = (RegSize - lsb) mod RegSize and imms = width - 1. */
+    const u32 reg_bits  = sf ? 64u : 32u;
+    const u32 first_bit = bf.bit_offset;
+    const u32 nbits     = bf.bit_width ? bf.bit_width : 1u;
+    const u32 rotate    = (reg_bits - first_bit) % reg_bits;
+    emit32(mc, aa64_bfm(sf, unit_reg, value_reg, rotate, nbits - 1u));
+
+    emit32(mc, aa64_stur(unit_sidx, unit_reg, addr_reg, (i32)bf.storage_offset));
+}
 
 /* ---- arithmetic ---- */
 
@@ -865,20 +1207,97 @@ static void aa_unop(CGTarget* t, UnOp op, Operand dst, Operand a_op)
     emit32(mc, word);
 }
 
-static void aa_cmp(CGTarget* t, CmpOp op, Operand d, Operand a, Operand b)
-{ (void)op;(void)d;(void)a;(void)b; aa_panic(t, "cmp"); }
-
 static void aa_convert(CGTarget* t, ConvKind k, Operand dst, Operand src)
 {
     AAImpl* a = impl_of(t);
+    MCEmitter* mc = t->mc;
+    u32 rd = reg_num(dst);
+    u32 rn = reg_num(src);
+
     switch (k) {
+    case CV_SEXT: {
+        if (src.cls != RC_INT || dst.cls != RC_INT) {
+            compiler_panic(t->c, a->loc, "aarch64 convert SEXT: bad classes");
+        }
+        u32 src_bits = type_byte_size(src.type) * 8u;
+        u32 sf_dst = type_is_64(dst.type) ? 1u : 0u;
+        emit32(mc, aa64_sbfm(sf_dst, rd, rn, /*immr=*/0, /*imms=*/src_bits - 1u));
+        return;
+    }
+    case CV_ZEXT: {
+        if (src.cls != RC_INT || dst.cls != RC_INT) {
+            compiler_panic(t->c, a->loc, "aarch64 convert ZEXT: bad classes");
+        }
+        u32 src_bits = type_byte_size(src.type) * 8u;
+        if (src_bits == 32u) {
+            /* MOV Wd, Wn auto-zero-extends into the X register. */
+            emit32(mc, aa64_mov_reg(0, rd, rn));
+        } else {
+            emit32(mc, aa64_ubfm(0, rd, rn, /*immr=*/0, /*imms=*/src_bits - 1u));
+        }
+        return;
+    }
+    case CV_TRUNC: {
+        /* Reading the W view of any X register zeros the upper 32 bits.
+         * For narrower truncations the consumer (store / ret) selects
+         * the byte width — leaving extra high bits is harmless. */
+        emit32(mc, aa64_mov_reg(0, rd, rn));
+        return;
+    }
+    case CV_ITOF_S: {
+        u32 sf_src = type_is_64(src.type) ? 1u : 0u;
+        u32 type   = type_is_fp_double(dst.type) ? 1u : 0u;
+        emit32(mc, aa64_scvtf(sf_src, type, rd, rn));
+        return;
+    }
+    case CV_ITOF_U: {
+        u32 sf_src = type_is_64(src.type) ? 1u : 0u;
+        u32 type   = type_is_fp_double(dst.type) ? 1u : 0u;
+        emit32(mc, aa64_ucvtf(sf_src, type, rd, rn));
+        return;
+    }
     case CV_FTOI_S: {
         if (src.cls != RC_FP || dst.cls != RC_INT) {
             compiler_panic(t->c, a->loc, "aarch64 convert FTOI_S: bad classes");
         }
         u32 sf   = type_is_64(dst.type) ? 1u : 0u;
         u32 type = type_is_fp_double(src.type) ? 1u : 0u;
-        emit32(t->mc, aa64_fcvtzs(sf, type, reg_num(dst), reg_num(src)));
+        emit32(mc, aa64_fcvtzs(sf, type, rd, rn));
+        return;
+    }
+    case CV_FTOI_U: {
+        if (src.cls != RC_FP || dst.cls != RC_INT) {
+            compiler_panic(t->c, a->loc, "aarch64 convert FTOI_U: bad classes");
+        }
+        u32 sf   = type_is_64(dst.type) ? 1u : 0u;
+        u32 type = type_is_fp_double(src.type) ? 1u : 0u;
+        emit32(mc, aa64_fcvtzu(sf, type, rd, rn));
+        return;
+    }
+    case CV_FEXT: {
+        /* float (S) → double (D). */
+        emit32(mc, aa64_fcvt_d_s(rd, rn));
+        return;
+    }
+    case CV_FTRUNC: {
+        /* double (D) → float (S). */
+        emit32(mc, aa64_fcvt_s_d(rd, rn));
+        return;
+    }
+    case CV_BITCAST: {
+        /* Same-size cross-class reinterpret (i32↔f32, i64↔f64). */
+        if (src.cls == RC_INT && dst.cls == RC_FP) {
+            u32 sz = type_byte_size(dst.type);
+            emit32(mc, sz == 8 ? aa64_fmov_d_x(rd, rn)
+                               : aa64_fmov_s_w(rd, rn));
+        } else if (src.cls == RC_FP && dst.cls == RC_INT) {
+            u32 sz = type_byte_size(src.type);
+            emit32(mc, sz == 8 ? aa64_fmov_x_d(rd, rn)
+                               : aa64_fmov_w_s(rd, rn));
+        } else {
+            compiler_panic(t->c, a->loc,
+                "aarch64 convert BITCAST: same-class not yet supported");
+        }
         return;
     }
     default:
diff --git a/src/arch/mc.c b/src/arch/mc.c
@@ -119,6 +119,23 @@ static void apply_fixup(MCImpl* mc, const MCFixup* fx, u32 target_offset)
         obj_patch(mc->base.obj, fx->sec_id, fx->offset, cur, 4);
         break;
     }
+    case R_AARCH64_CONDBR19: {
+        /* imm19 at bits 5..23 of B.cond; word-aligned displacement. */
+        i64 idisp = disp >> 2;
+        u32 imm19 = (u32)(idisp & 0x7ffffu);
+        const Section* s = obj_section_get(mc->base.obj, fx->sec_id);
+        if (!s) break;
+        u8 cur[4];
+        buf_read(&s->bytes, fx->offset, cur, 4);
+        u32 word = (u32)cur[0] | ((u32)cur[1] << 8) | ((u32)cur[2] << 16) | ((u32)cur[3] << 24);
+        word = (word & ~(0x7ffffu << 5)) | (imm19 << 5);
+        cur[0] = (u8)(word        & 0xff);
+        cur[1] = (u8)((word >> 8) & 0xff);
+        cur[2] = (u8)((word >> 16)& 0xff);
+        cur[3] = (u8)((word >> 24)& 0xff);
+        obj_patch(mc->base.obj, fx->sec_id, fx->offset, cur, 4);
+        break;
+    }
     default:
         compiler_panic(mc->base.c, mc->loc,
                        "MCEmitter: unsupported label-ref reloc kind %d",

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	src/arch/aarch64.c	\|	455	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M	src/arch/mc.c	\|	17	+++++++++++++++++