kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit d33227bb2f34d7317626dd64e31ce3e96f455bf8
parent 9f00fa4fc9d80f10ce4bcc2c7bff5e8d32d59631
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed, 27 May 2026 05:44:09 -0700

opt: aggregate ABI lowering on optimizer path (partial)

Progress toward routing aggregate-typed functions through the optimizer
instead of the direct-replay bypass:

- Force aggregate / >8-byte locals to frame storage in cg_ir_lower (they
  cannot live in a single PReg).
- Type each ABI part by its own width in aa_plan_call/aa_plan_ret direct
  return paths (was using the whole aggregate type, producing truncating
  32-bit moves for i64 fields).
- emit_call / emit_ret: copy aggregate / oversized results via copy_bytes
  (and hand plan_ret the aggregate's memory location directly) instead of a
  scalar move that exceeds register width.

Removes the "scalar too large" panics on sret returns; aggregate result
values are no longer truncated. Default test path remains green (408/408);
remaining aggregate value-correctness bugs (130/124/36/37) still under
investigation. No change to scalar codegen.

Diffstat:
M src/arch/aa64/native.c     | 52 ++++++++++++++++++++++++++++++++++++++--------------
M src/opt/cg_ir_lower.c      |  6 +++++-
M src/opt/pass_native_emit.c | 47 +++++++++++++++++++++++++++++++++++++----------
3 files changed, 80 insertions(+), 25 deletions(-)

diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -1637,6 +1637,25 @@ static u32 aa_part_stack_align(const ABIArgPart* part) { return al; } +/* The scalar type used to move one ABI part through a register. Aggregate + * args/results are split into parts; each part must move at its own width, not + * the (possibly >8-byte) aggregate width. */ +static CfreeCgTypeId aa_part_scalar_type(const ABIArgPart* part) { + if (part->cls == ABI_CLASS_FP) + return part->size <= 4u ? builtin_id(CFREE_CG_BUILTIN_F32) + : builtin_id(CFREE_CG_BUILTIN_F64); + switch (part->size) { + case 1u: + return builtin_id(CFREE_CG_BUILTIN_I8); + case 2u: + return builtin_id(CFREE_CG_BUILTIN_I16); + case 4u: + return builtin_id(CFREE_CG_BUILTIN_I32); + default: + return builtin_id(CFREE_CG_BUILTIN_I64); + } +} + static u32 aa_class_stack_size(const ABIArgInfo* ai) { u32 total = 0; if (!ai || ai->kind == ABI_ARG_IGNORE) return 0; @@ -1779,18 +1798,20 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc, const ABIArgPart* part = &abi->ret.parts[p]; NativeAllocClass cls = part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; - rets[nr].src = aa_reg_loc(desc->results[0].type, cls, - cls == NATIVE_REG_FP ? nf++ : ni++); + CfreeCgTypeId pty = aa_part_scalar_type(part); + rets[nr].src = aa_reg_loc(pty, cls, cls == NATIVE_REG_FP ? 
nf++ : ni++); rets[nr].dst = desc->results[0]; if (rets[nr].dst.kind == NATIVE_LOC_FRAME) rets[nr].dst = - aa_stack_loc(desc->results[0].type, desc->results[0].v.frame, - (i32)part->src_offset); - else if (rets[nr].dst.kind == NATIVE_LOC_STACK) + aa_stack_loc(pty, desc->results[0].v.frame, (i32)part->src_offset); + else if (rets[nr].dst.kind == NATIVE_LOC_STACK) { rets[nr].dst.v.stack.offset += (i32)part->src_offset; - else if (rets[nr].dst.kind == NATIVE_LOC_ADDR) + rets[nr].dst.type = pty; + } else if (rets[nr].dst.kind == NATIVE_LOC_ADDR) { rets[nr].dst.v.addr.offset += (i32)part->src_offset; - rets[nr].mem = aa_mem_for_type(t, desc->results[0].type, part->size); + rets[nr].dst.type = pty; + } + rets[nr].mem = aa_mem_for_type(t, pty, part->size); nr++; } plan->nrets = nr; @@ -1887,17 +1908,20 @@ static void aa_plan_ret(NativeTarget* t, const CGFuncDesc* fd, const ABIArgPart* part = &abi->ret.parts[p]; NativeAllocClass cls = part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT; + CfreeCgTypeId pty = aa_part_scalar_type(part); rets[nr].src = values[0]; if (rets[nr].src.kind == NATIVE_LOC_FRAME) - rets[nr].src = aa_stack_loc(values[0].type, values[0].v.frame, - (i32)part->src_offset); - else if (rets[nr].src.kind == NATIVE_LOC_STACK) + rets[nr].src = + aa_stack_loc(pty, values[0].v.frame, (i32)part->src_offset); + else if (rets[nr].src.kind == NATIVE_LOC_STACK) { rets[nr].src.v.stack.offset += (i32)part->src_offset; - else if (rets[nr].src.kind == NATIVE_LOC_ADDR) + rets[nr].src.type = pty; + } else if (rets[nr].src.kind == NATIVE_LOC_ADDR) { rets[nr].src.v.addr.offset += (i32)part->src_offset; - rets[nr].dst = - aa_reg_loc(values[0].type, cls, cls == NATIVE_REG_FP ? nf++ : ni++); - rets[nr].mem = aa_mem_for_type(t, values[0].type, part->size); + rets[nr].src.type = pty; + } + rets[nr].dst = aa_reg_loc(pty, cls, cls == NATIVE_REG_FP ? 
nf++ : ni++); + rets[nr].mem = aa_mem_for_type(t, pty, part->size); nr++; } } else if (nvalues) { diff --git a/src/opt/cg_ir_lower.c b/src/opt/cg_ir_lower.c @@ -156,8 +156,12 @@ static void lower_locals(CgIrLower* l) { m->size = in->desc.size; m->align = in->desc.align; m->cls = local_reg_class(l->c, in->desc.type); + /* Aggregates and oversized scalars cannot live in a single PReg; they need + * a memory home regardless of whether their address is taken. */ m->address_taken = - local_needs_home(in) || local_address_used_in_cg_ir(l->src, in->id); + local_needs_home(in) || local_address_used_in_cg_ir(l->src, in->id) || + cg_type_is_aggregate(l->c, in->desc.type) || + cg_type_size(l->c, in->desc.type) > 8u; PReg r = ir_alloc_preg(l->f, in->desc.type, m->cls); if (m->address_taken) { diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c @@ -645,10 +645,29 @@ static void emit_call(NativeEmitCtx* e, Inst* in) { for (u32 i = 0; i < plan.nrets; ++i) write_loc(e, plan.rets[i].dst, plan.rets[i].src, plan.rets[i].mem, in->loc); if (result_slot && final_result.kind != NATIVE_LOC_NONE) { - NativeLoc tmp = loc_frame( - aux->desc.ret.type, class_for_type(e, aux->desc.ret.type), result_slot); - result_mem = mem_for_type(e->c, aux->desc.ret.type); - write_loc(e, final_result, tmp, result_mem, in->loc); + CfreeCgTypeId rty = aux->desc.ret.type; + NativeLoc tmp = + loc_frame(rty, class_for_type(e, rty), result_slot); + result_mem = mem_for_type(e->c, rty); + if (final_result.kind != NATIVE_LOC_REG && + (cg_type_is_aggregate(e->c, rty) || + type_size_or(e->c, rty, 8u) > 8u)) { + /* Aggregate / oversized result: move bytes rather than a scalar copy + * (which would exceed the single-register width). The result was either + * written in parts by plan_call's rets, or by the callee via the sret + * pointer; either way it now lives in the temp slot. 
*/ + AggregateAccess acc; + NativeAddr da = addr_from_loc(e, final_result, in->loc); + NativeAddr sa = addr_from_loc(e, tmp, in->loc); + memset(&acc, 0, sizeof acc); + acc.type = rty; + acc.size = type_size_or(e->c, rty, 8u); + acc.align = type_align_or(e->c, rty, 8u); + acc.mem = result_mem; + e->target->copy_bytes(e->target, da, sa, acc); + } else { + write_loc(e, final_result, tmp, result_mem, in->loc); + } } } @@ -660,12 +679,20 @@ static void emit_ret(NativeEmitCtx* e, Inst* in, const CGFuncDesc* fd) { u32 nrets = 0; if (aux && aux->present) { NativeLoc final = abi_storage_loc(e, &aux->val, in->loc); - NativeFrameSlot slot = - temp_slot(e, aux->val.type, in->loc, NATIVE_FRAME_SLOT_SPILL); - NativeLoc frame = - loc_frame(aux->val.type, class_for_type(e, aux->val.type), slot); - write_loc(e, frame, final, mem_for_type(e->c, aux->val.type), in->loc); - value = frame; + CfreeCgTypeId vty = aux->val.type; + if (cg_type_is_aggregate(e->c, vty) || type_size_or(e->c, vty, 8u) > 8u) { + /* Aggregate / oversized return: hand plan_ret the value's memory + * location directly. It copies to the sret pointer (indirect) or reads + * the parts into the return registers (direct); a scalar copy through a + * temp would exceed the single-register width. */ + value = final; + } else { + NativeFrameSlot slot = + temp_slot(e, vty, in->loc, NATIVE_FRAME_SLOT_SPILL); + NativeLoc frame = loc_frame(vty, class_for_type(e, vty), slot); + write_loc(e, frame, final, mem_for_type(e->c, vty), in->loc); + value = frame; + } values = &value; } e->target->plan_ret(e->target, fd, values, values ? 1u : 0u, &rets, &nrets);