kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 5ae1703a745e575657fee65c946f9e1f241d4a3a
parent b871959ddf3e4188f507041b2f9f7181d2662750
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Sat,  9 May 2026 15:17:20 -0700

cg/aa64: implement Groups G, H, I (calls, control flow, alloca)

- Calls: FP binops (FADD/FSUB/FMUL/FDIV), addr_of for OPK_GLOBAL via
  ADRP+ADD, indirect calls (BLR Xn), multi-part returns (HFA into
  LOCAL storage), and memcpy-on-entry for ABI_ARG_INDIRECT params.
- Control flow: SCOPE_LOOP / SCOPE_BLOCK as bookkeeping over the
  caller-driven label_place/jump, with break_to/continue_to forwarding
  to the recorded labels.
- Alloca: SUB SP by an aligned const or runtime size, return
  SP + max_outgoing via a placeholder ADD patched at func_end. For
  has_alloca functions, restore SP from FP at the epilogue.

Diffstat:
M src/arch/aarch64.c | 328 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 279 insertions(+), 49 deletions(-)

diff --git a/src/arch/aarch64.c b/src/arch/aarch64.c @@ -153,6 +153,12 @@ static inline u32 aa64_fmov_d_x(u32 Rd, u32 Rn) /* GPR→FP, double */ static inline u32 aa64_fmov_x_d(u32 Rd, u32 Rn) /* FP→GPR, double */ { return 0x9E660000u | ((Rn&0x1f)<<5) | (Rd&0x1f); } +/* SUB (extended register), 64-bit, UXTX, shift 0. Unlike SUB shifted-reg + * (where Rd=31 means ZR), this form treats Rd/Rn=31 as SP — needed to + * decrement SP by a register amount during alloca. */ +static inline u32 aa64_sub_extreg_x_uxtx(u32 Rd, u32 Rn, u32 Rm) +{ return 0xCB206000u | ((Rm&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rd&0x1f); } + /* SUBS shifted register (Rd=ZR encodes CMP). */ static inline u32 aa64_subs_reg(u32 sf, u32 Rd, u32 Rn, u32 Rm) { return 0x6B000000u | (sf<<31) | ((Rm&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rd&0x1f); } @@ -168,6 +174,16 @@ static inline u32 aa64_csinc(u32 sf, u32 Rd, u32 Rn, u32 Rm, u32 cond) static inline u32 aa64_cset(u32 sf, u32 Rd, u32 cond) { return aa64_csinc(sf, Rd, 31u, 31u, cond ^ 1u); } +/* FADD / FSUB / FMUL / FDIV (scalar). type: 0=S (float), 1=D (double). */ +static inline u32 aa64_fadd(u32 type, u32 Rd, u32 Rn, u32 Rm) +{ return 0x1E202800u | ((type&3)<<22) | ((Rm&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rd&0x1f); } +static inline u32 aa64_fsub(u32 type, u32 Rd, u32 Rn, u32 Rm) +{ return 0x1E203800u | ((type&3)<<22) | ((Rm&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rd&0x1f); } +static inline u32 aa64_fmul(u32 type, u32 Rd, u32 Rn, u32 Rm) +{ return 0x1E200800u | ((type&3)<<22) | ((Rm&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rd&0x1f); } +static inline u32 aa64_fdiv(u32 type, u32 Rd, u32 Rn, u32 Rm) +{ return 0x1E201800u | ((type&3)<<22) | ((Rm&0x1f)<<16) | ((Rn&0x1f)<<5) | (Rd&0x1f); } + /* SBFM / UBFM / BFM (bitfield move family). * sf opc(2) 100110 N immr(6) imms(6) Rn(5) Rd(5) * opc: 00=SBFM, 01=BFM, 10=UBFM. N must equal sf. 
*/ @@ -198,6 +214,8 @@ typedef struct AAScope { u8 pad[2]; MCLabel else_label; /* SCOPE_IF: false branch target / end-of-then */ MCLabel end_label; /* SCOPE_IF: join point past the whole if/else */ + Label break_label; /* SCOPE_LOOP/BLOCK: explicit break target */ + Label continue_label;/* SCOPE_LOOP: explicit continue target */ } AAScope; typedef struct AAImpl { @@ -234,6 +252,15 @@ typedef struct AAImpl { AAScope* scopes; u32 nscopes; u32 scopes_cap; + + /* alloca: each call emits an `ADD result, SP, #0` placeholder; at + * func_end the imm12 is patched with the final max_outgoing. Tracks + * (instruction pos, dst reg) for each placeholder. has_alloca also + * triggers SP-from-FP restoration in the epilogue. */ + u8 has_alloca; + struct AAAllocaPatch { u32 pos; u32 dst_reg; }* add_patches; + u32 nadd_patches; + u32 add_patches_cap; } AAImpl; static AAImpl* impl_of(CGTarget* t) { return (AAImpl*)t; } @@ -405,6 +432,8 @@ static void aa_func_begin(CGTarget* t, const CGFuncDesc* fd) a->used_fp = 0; a->nslots = 0; a->nscopes = 0; + a->has_alloca = 0; + a->nadd_patches= 0; a->sret_ptr_slot = FRAME_SLOT_NONE; a->epilogue_label = mc->label_new(mc); @@ -454,6 +483,19 @@ static void aa_func_end(CGTarget* t) * branches land here. */ mc->label_place(mc, a->epilogue_label); + /* If the body called alloca, SP may sit below the locals area. + * Restore SP from FP before reloading callee-saves, since those use + * SP-relative offsets. */ + if (a->has_alloca) { + if (fp_lr_off <= 0xfff) { + emit32(mc, aa64_sub_imm(1, /*Rd=SP*/31, /*Rn=*/29, fp_lr_off, 0)); + } else { + compiler_panic(t->c, a->loc, + "aarch64: has_alloca + fp_lr_off %u out of imm12 range", + fp_lr_off); + } + } + /* Restore FP saves, then INT saves, then fp/lr, then add sp + ret. 
*/ for (i32 i = (i32)n_fp_pairs - 1; i >= 0; --i) { u32 r0 = 8u + (u32)i * 2u; @@ -526,6 +568,19 @@ overflow: patch32(obj, sec, pos + i*4u, words[i]); } + /* Patch each alloca's `ADD dst, SP, #0` placeholder with the final + * max_outgoing offset, now that the high-water mark is known. */ + if (a->max_outgoing > 0xfff) { + compiler_panic(t->c, a->loc, + "aarch64: max_outgoing %u out of imm12 range for alloca patch", + a->max_outgoing); + } + for (u32 i = 0; i < a->nadd_patches; ++i) { + u32 dr = a->add_patches[i].dst_reg; + u32 word = aa64_add_imm(1, dr, /*Rn=SP*/31, a->max_outgoing, 0); + patch32(obj, sec, a->add_patches[i].pos, word); + } + /* Define the function symbol. */ u32 end = mc->pos(mc); obj_symbol_define(obj, a->fd->sym, sec, @@ -608,18 +663,40 @@ static void aa_param(CGTarget* t, const CGParamDesc* p) if (ai->kind == ABI_ARG_IGNORE) return; if (ai->kind == ABI_ARG_INDIRECT) { - /* Caller passes a pointer to the data. Pointer comes in next - * INT arg reg; store it into the home slot (which holds the - * pointer-sized address). */ + /* Caller passes a pointer to a copy. Materialize that pointer + * into a scratch reg, then memcpy `s->size` bytes from there + * into the slot — so subsequent LOCAL_op(slot) reads/writes the + * struct contents directly, not the pointer. */ + u32 ptr_reg; if (a->next_param_int < 8) { - u32 reg = a->next_param_int++; - emit32(t->mc, aa64_stur(3, reg, 29, -(i32)s->off)); + ptr_reg = a->next_param_int++; } else { - /* Pointer on stack — load and store. 
*/ u32 caller_off = a->next_param_stack; a->next_param_stack += 8; emit32(t->mc, aa64_ldur(3, 9, 29, (i32)(16 + caller_off))); - emit32(t->mc, aa64_stur(3, 9, 29, -(i32)s->off)); + ptr_reg = 9; + } + u32 nbytes = s->size; + u32 i = 0; + while (i + 8 <= nbytes) { + emit32(t->mc, aa64_ldur(3, 10, ptr_reg, (i32)i)); + emit32(t->mc, aa64_stur(3, 10, 29, -(i32)s->off + (i32)i)); + i += 8; + } + while (i + 4 <= nbytes) { + emit32(t->mc, aa64_ldur(2, 10, ptr_reg, (i32)i)); + emit32(t->mc, aa64_stur(2, 10, 29, -(i32)s->off + (i32)i)); + i += 4; + } + while (i + 2 <= nbytes) { + emit32(t->mc, aa64_ldur(1, 10, ptr_reg, (i32)i)); + emit32(t->mc, aa64_stur(1, 10, 29, -(i32)s->off + (i32)i)); + i += 2; + } + while (i < nbytes) { + emit32(t->mc, aa64_ldur(0, 10, ptr_reg, (i32)i)); + emit32(t->mc, aa64_stur(0, 10, 29, -(i32)s->off + (i32)i)); + i += 1; } return; } @@ -751,12 +828,16 @@ static CGScope aa_scope_begin(CGTarget* t, const CGScopeDesc* d) a->scopes_cap = ncap; } AAScope* sc = &a->scopes[a->nscopes]; - sc->kind = (u8)d->kind; - sc->has_else = 0; - sc->else_label = t->mc->label_new(t->mc); - sc->end_label = t->mc->label_new(t->mc); + sc->kind = (u8)d->kind; + sc->has_else = 0; + sc->else_label = 0; + sc->end_label = 0; + sc->break_label = d->break_label; + sc->continue_label = d->continue_label; if (d->kind == SCOPE_IF) { + sc->else_label = t->mc->label_new(t->mc); + sc->end_label = t->mc->label_new(t->mc); /* Test cond against zero, branch to else_label on EQ (false). */ u32 sf = type_is_64(d->cond.type) ? 1u : 0u; u32 rn = force_reg_int(t, d->cond, sf, 9); @@ -764,8 +845,11 @@ static CGScope aa_scope_begin(CGTarget* t, const CGScopeDesc* d) emit32(t->mc, aa64_b_cond(0x0u /*EQ*/)); t->mc->emit_label_ref(t->mc, sc->else_label, R_AARCH64_CONDBR19, 4, 0); + } else if (d->kind == SCOPE_LOOP || d->kind == SCOPE_BLOCK) { + /* Structured loop/block: bookkeep only. The caller drives + * label_place + jump itself; break_to/continue_to forward to the + * recorded labels. 
No instructions emitted here. */ } else { - /* BLOCK / LOOP not yet exercised by the corpus. */ compiler_panic(t->c, a->loc, "aarch64 scope_begin: kind %d not yet implemented", (int)d->kind); } @@ -798,15 +882,37 @@ static void aa_scope_end(CGTarget* t, CGScope s) (unsigned)s); } AAScope* sc = &a->scopes[s - 1]; - if (sc->kind == SCOPE_IF && !sc->has_else) { - /* No else body — false-branch lands at scope_end. */ - t->mc->label_place(t->mc, sc->else_label); + if (sc->kind == SCOPE_IF) { + if (!sc->has_else) { + /* No else body — false-branch lands at scope_end. */ + t->mc->label_place(t->mc, sc->else_label); + } + t->mc->label_place(t->mc, sc->end_label); + } + /* SCOPE_LOOP / SCOPE_BLOCK: caller has already placed the break_label. */ +} + +static void aa_break_to(CGTarget* t, CGScope s) +{ + AAImpl* a = impl_of(t); + if (s == CG_SCOPE_NONE || s > a->nscopes) { + compiler_panic(t->c, a->loc, "aarch64 break_to: bad scope %u", + (unsigned)s); } - t->mc->label_place(t->mc, sc->end_label); + AAScope* sc = &a->scopes[s - 1]; + aa_jump(t, sc->break_label); } -static void aa_break_to (CGTarget* t, CGScope s) { (void)s; aa_panic(t, "break_to"); } -static void aa_continue_to(CGTarget* t, CGScope s) { (void)s; aa_panic(t, "continue_to"); } +static void aa_continue_to(CGTarget* t, CGScope s) +{ + AAImpl* a = impl_of(t); + if (s == CG_SCOPE_NONE || s > a->nscopes) { + compiler_panic(t->c, a->loc, "aarch64 continue_to: bad scope %u", + (unsigned)s); + } + AAScope* sc = &a->scopes[s - 1]; + aa_jump(t, sc->continue_label); +} /* ---- data movement ---- */ @@ -972,6 +1078,23 @@ static void aa_addr_of(CGTarget* t, Operand dst, Operand lv) } return; } + if (lv.kind == OPK_GLOBAL) { + /* ADRP Xd, sym ; ADD Xd, Xd, #:lo12:sym (with addend baked into both + * relocations). Used to materialize a function or data pointer. 
*/ + u32 rd = reg_num(dst); + u32 sec = t->mc->section_id; + u32 adrp_pos = t->mc->pos(t->mc); + emit32(t->mc, aa64_adrp_base(rd)); + t->mc->emit_reloc_at(t->mc, sec, adrp_pos, + R_AARCH64_ADR_PREL_PG_HI21, + lv.v.global.sym, lv.v.global.addend, 0, 0); + u32 add_pos = t->mc->pos(t->mc); + emit32(t->mc, aa64_add_imm(1, rd, rd, 0, 0)); + t->mc->emit_reloc_at(t->mc, sec, add_pos, + R_AARCH64_ADD_ABS_LO12_NC, + lv.v.global.sym, lv.v.global.addend, 0, 0); + return; + } aa_panic(t, "addr_of"); } @@ -1144,6 +1267,29 @@ static u32 force_reg_int(CGTarget* t, Operand op, u32 sf, u32 scratch) static void aa_binop(CGTarget* t, BinOp op, Operand dst, Operand a_op, Operand b_op) { MCEmitter* mc = t->mc; + + /* FP binops route through scalar FADD/FSUB/FMUL/FDIV. */ + if (op == BO_FADD || op == BO_FSUB || op == BO_FMUL || op == BO_FDIV) { + if (a_op.kind != OPK_REG || b_op.kind != OPK_REG || dst.cls != RC_FP) { + compiler_panic(t->c, impl_of(t)->loc, + "aarch64 binop: FP op requires REG operands"); + } + u32 type = type_is_fp_double(dst.type) ? 1u : 0u; + u32 rd = reg_num(dst); + u32 rn = reg_num(a_op); + u32 rm = reg_num(b_op); + u32 w; + switch (op) { + case BO_FADD: w = aa64_fadd(type, rd, rn, rm); break; + case BO_FSUB: w = aa64_fsub(type, rd, rn, rm); break; + case BO_FMUL: w = aa64_fmul(type, rd, rn, rm); break; + case BO_FDIV: w = aa64_fdiv(type, rd, rn, rm); break; + default: w = 0; break; /* unreachable */ + } + emit32(mc, w); + return; + } + u32 sf = type_is_64(dst.type) ? 1u : 0u; u32 rd = reg_num(dst); u32 rn = force_reg_int(t, a_op, sf, 9); @@ -1434,16 +1580,19 @@ static void aa_call(CGTarget* t, const CGCallDesc* d) u32 needed = (stack_off + 15u) & ~15u; if (needed > a->max_outgoing) a->max_outgoing = needed; - /* BL <callee> — direct only. */ - if (d->callee.kind != OPK_GLOBAL) { + /* Direct (BL <sym>) vs. indirect (BLR Xn). 
*/ + if (d->callee.kind == OPK_GLOBAL) { + u32 bl_pos = mc->pos(mc); + emit32(mc, aa64_bl_base()); + mc->emit_reloc_at(mc, mc->section_id, bl_pos, + R_AARCH64_CALL26, d->callee.v.global.sym, + d->callee.v.global.addend, 0, 0); + } else if (d->callee.kind == OPK_REG) { + emit32(mc, aa64_blr(reg_num(d->callee))); + } else { compiler_panic(t->c, a->loc, - "aarch64 call: indirect call not yet supported"); + "aarch64 call: callee kind %d unsupported", (int)d->callee.kind); } - u32 bl_pos = mc->pos(mc); - emit32(mc, aa64_bl_base()); - mc->emit_reloc_at(mc, mc->section_id, bl_pos, - R_AARCH64_CALL26, d->callee.v.global.sym, - d->callee.v.global.addend, 0, 0); /* Receive return value. */ const ABIArgInfo* ri = &d->abi->ret; @@ -1451,37 +1600,54 @@ static void aa_call(CGTarget* t, const CGCallDesc* d) /* Nothing to copy — sret was placed directly into the dst slot. */ return; } - /* DIRECT scalar in our coverage: a single INT or FP part placed in - * x0 / v0. Move into ret_storage. */ if (ri->nparts == 0) return; - const ABIArgPart* p0 = &ri->parts[0]; + Operand rs = d->ret.storage; - if (p0->cls == ABI_CLASS_INT) { - u32 sf = (p0->size == 8) ? 1u : 0u; - if (rs.kind == OPK_REG) { - emit32(mc, aa64_mov_reg(sf, reg_num(rs), 0)); - } else if (rs.kind == OPK_LOCAL) { - AASlot* s = slot_get(a, rs.v.frame_slot); - if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot"); - u32 sidx = size_idx_for_bytes(p0->size); - emit32(mc, aa64_stur(sidx, 0, 29, -(i32)s->off)); + /* Walk parts; INT parts come from x0, x1, ...; FP parts from v0, v1, .... */ + u32 next_int_ret = 0, next_fp_ret = 0; + for (u16 i = 0; i < ri->nparts; ++i) { + const ABIArgPart* p = &ri->parts[i]; + u32 src_reg; + if (p->cls == ABI_CLASS_INT) { + src_reg = next_int_ret++; + } else if (p->cls == ABI_CLASS_FP) { + src_reg = next_fp_ret++; + } else { + compiler_panic(t->c, a->loc, + "aarch64 call: ret part cls %d unimpl", (int)p->cls); } - } else if (p0->cls == ABI_CLASS_FP) { - u32 type = (p0->size == 8) ? 
1u : 0u; + if (rs.kind == OPK_REG) { - emit32(mc, aa64_fmov_reg(type, reg_num(rs), 0)); + if (ri->nparts != 1) { + compiler_panic(t->c, a->loc, + "aarch64 call: REG ret_storage with %u parts", (unsigned)ri->nparts); + } + if (p->cls == ABI_CLASS_INT) { + u32 sf = (p->size == 8) ? 1u : 0u; + emit32(mc, aa64_mov_reg(sf, reg_num(rs), src_reg)); + } else { + u32 type = (p->size == 8) ? 1u : 0u; + emit32(mc, aa64_fmov_reg(type, reg_num(rs), src_reg)); + } } else if (rs.kind == OPK_LOCAL) { AASlot* s = slot_get(a, rs.v.frame_slot); if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad ret slot"); - u32 sidx = size_idx_for_bytes(p0->size); - emit32(mc, aa64_stur_fp(sidx, 0, 29, -(i32)s->off)); + u32 sidx = size_idx_for_bytes(p->size); + i32 off = -(i32)s->off + (i32)p->src_offset; + if (p->cls == ABI_CLASS_INT) { + emit32(mc, aa64_stur(sidx, src_reg, 29, off)); + } else { + emit32(mc, aa64_stur_fp(sidx, src_reg, 29, off)); + } + } else if (rs.kind == OPK_IMM && rs.type + && rs.type->kind == TY_VOID) { + /* Void return placeholder — nothing to do. */ + } else { + compiler_panic(t->c, a->loc, + "aarch64 call: ret_storage kind %d unsupported", + (int)rs.kind); } } - /* Multi-part returns: not exercised yet. */ - if (ri->nparts > 1) { - compiler_panic(t->c, a->loc, - "aarch64 call: multi-part return not yet supported"); - } } /* Materialize the return value, then branch to the function epilogue. */ @@ -1571,7 +1737,71 @@ static void aa_ret(CGTarget* t, const CGABIValue* val) (void)bpos; } -static void aa_alloca_ (CGTarget* t, Operand d, Operand s, u32 a) { (void)d;(void)s;(void)a; aa_panic(t, "alloca"); } +/* Dynamic stack allocation. Layout: outgoing-args (max_outgoing bytes, + * 16-aligned) sit at the bottom of SP; the alloca block goes immediately + * above. After lowering SP by an aligned size, the new block's address is + * (SP + max_outgoing). 
max_outgoing is only known at func_end, so each + * alloca emits a placeholder `ADD dst, SP, #0` and registers a patch site; + * func_end rewrites the imm12 with the final max_outgoing. */ +static void aa_alloca_(CGTarget* t, Operand d, Operand sz, u32 align) +{ + AAImpl* a = impl_of(t); + MCEmitter* mc = t->mc; + + if (d.kind != OPK_REG) { + compiler_panic(t->c, a->loc, "aarch64 alloca: dst must be REG"); + } + /* SP is 16-aligned and we lower it by an aligned amount, so result + * inherits 16-byte alignment. Larger requests would need an + * additional mask on the result; reject so we notice. */ + if (align > 16) { + compiler_panic(t->c, a->loc, + "aarch64 alloca: align %u > 16 not yet supported", align); + } + + if (sz.kind == OPK_IMM) { + i64 v = sz.v.imm; + if (v < 0) { + compiler_panic(t->c, a->loc, "aarch64 alloca: negative size"); + } + u64 aligned = ((u64)v + 15u) & ~(u64)15u; + if (aligned == 0) aligned = 16; /* keep SP changing */ + if (aligned > 0xfffu) { + compiler_panic(t->c, a->loc, + "aarch64 alloca: const size %llu too large for v1", + (unsigned long long)aligned); + } + emit32(mc, aa64_sub_imm(1, /*Rd=SP*/31, /*Rn=SP*/31, (u32)aligned, 0)); + } else if (sz.kind == OPK_REG) { + /* Round size up to a 16-byte multiple, then `sub sp, sp, x9` + * (extended-register form so Rd/Rn=SP work). */ + u32 sz_reg = reg_num(sz); + emit32(mc, aa64_add_imm(1, 9, sz_reg, 15u, 0)); /* x9 = size+15 */ + emit32(mc, aa64_ubfm(1, 9, 9, 4, 63)); /* lsr x9, x9, #4 */ + emit32(mc, aa64_ubfm(1, 9, 9, 60, 59)); /* lsl x9, x9, #4 */ + emit32(mc, aa64_sub_extreg_x_uxtx(/*SP*/31, /*SP*/31, 9)); + } else { + compiler_panic(t->c, a->loc, + "aarch64 alloca: size kind %d unsupported", (int)sz.kind); + } + + /* Placeholder ADD dst, SP, #<max_outgoing>. Patched at func_end. */ + if (a->nadd_patches == a->add_patches_cap) { + u32 ncap = a->add_patches_cap ? 
a->add_patches_cap * 2 : 4; + struct AAAllocaPatch* nb = arena_array(t->c->tu, + struct AAAllocaPatch, ncap); + if (a->add_patches) memcpy(nb, a->add_patches, + sizeof(*nb) * a->nadd_patches); + a->add_patches = nb; + a->add_patches_cap = ncap; + } + u32 dst_reg = reg_num(d); + a->add_patches[a->nadd_patches].pos = mc->pos(mc); + a->add_patches[a->nadd_patches].dst_reg = dst_reg; + a->nadd_patches++; + emit32(mc, aa64_add_imm(1, dst_reg, /*Rn=SP*/31, 0, 0)); + a->has_alloca = 1; +} static void aa_va_start_(CGTarget* t, Operand a) { (void)a; aa_panic(t, "va_start"); } static void aa_va_arg_ (CGTarget* t, Operand d, Operand a, const Type* ty) { (void)d;(void)a;(void)ty; aa_panic(t, "va_arg"); } static void aa_va_end_ (CGTarget* t, Operand a) { (void)a; aa_panic(t, "va_end"); }