commit d33227bb2f34d7317626dd64e31ce3e96f455bf8
parent 9f00fa4fc9d80f10ce4bcc2c7bff5e8d32d59631
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Wed, 27 May 2026 05:44:09 -0700
opt: aggregate ABI lowering on optimizer path (partial)
Progress toward routing aggregate-typed functions through the optimizer
instead of the direct-replay bypass:
- Force aggregate / >8-byte locals to frame storage in cg_ir_lower (they
cannot live in a single PReg).
- Type each ABI part by its own width in aa_plan_call/aa_plan_ret direct
return paths (was using the whole aggregate type, producing truncating
32-bit moves for i64 fields).
- emit_call / emit_ret: copy aggregate / oversized results via copy_bytes
(and hand plan_ret the aggregate's memory location directly) instead of a
scalar move that exceeds register width.
Removes the "scalar too large" panics on sret returns; aggregate result
values are no longer truncated. Default test path remains green (408/408);
remaining aggregate value-correctness bugs (130/124/36/37) still under
investigation. No change to scalar codegen.
Diffstat:
3 files changed, 80 insertions(+), 25 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -1637,6 +1637,25 @@ static u32 aa_part_stack_align(const ABIArgPart* part) {
return al;
}
+/* Scalar type for moving one ABI part through a register: FP-class parts map
+ * to F32 (size <= 4) or F64; all others to I8/I16/I32 by exact size, else I64.
+ * Parts move at their own width, not the (possibly >8-byte) aggregate width. */
+static CfreeCgTypeId aa_part_scalar_type(const ABIArgPart* part) {
+ if (part->cls == ABI_CLASS_FP)
+ return part->size <= 4u ? builtin_id(CFREE_CG_BUILTIN_F32)
+ : builtin_id(CFREE_CG_BUILTIN_F64);
+ switch (part->size) { /* every non-FP class is typed as an integer */
+ case 1u:
+ return builtin_id(CFREE_CG_BUILTIN_I8);
+ case 2u:
+ return builtin_id(CFREE_CG_BUILTIN_I16);
+ case 4u:
+ return builtin_id(CFREE_CG_BUILTIN_I32);
+ default: /* 8 and every other size (0, 3, 5-7, >8) -- NOTE(review): confirm */
+ return builtin_id(CFREE_CG_BUILTIN_I64);
+ }
+}
+
static u32 aa_class_stack_size(const ABIArgInfo* ai) {
u32 total = 0;
if (!ai || ai->kind == ABI_ARG_IGNORE) return 0;
@@ -1779,18 +1798,20 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
const ABIArgPart* part = &abi->ret.parts[p];
NativeAllocClass cls =
part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
- rets[nr].src = aa_reg_loc(desc->results[0].type, cls,
- cls == NATIVE_REG_FP ? nf++ : ni++);
+ CfreeCgTypeId pty = aa_part_scalar_type(part);
+ rets[nr].src = aa_reg_loc(pty, cls, cls == NATIVE_REG_FP ? nf++ : ni++);
rets[nr].dst = desc->results[0];
if (rets[nr].dst.kind == NATIVE_LOC_FRAME)
rets[nr].dst =
- aa_stack_loc(desc->results[0].type, desc->results[0].v.frame,
- (i32)part->src_offset);
- else if (rets[nr].dst.kind == NATIVE_LOC_STACK)
+ aa_stack_loc(pty, desc->results[0].v.frame, (i32)part->src_offset);
+ else if (rets[nr].dst.kind == NATIVE_LOC_STACK) {
rets[nr].dst.v.stack.offset += (i32)part->src_offset;
- else if (rets[nr].dst.kind == NATIVE_LOC_ADDR)
+ rets[nr].dst.type = pty;
+ } else if (rets[nr].dst.kind == NATIVE_LOC_ADDR) {
rets[nr].dst.v.addr.offset += (i32)part->src_offset;
- rets[nr].mem = aa_mem_for_type(t, desc->results[0].type, part->size);
+ rets[nr].dst.type = pty;
+ }
+ rets[nr].mem = aa_mem_for_type(t, pty, part->size);
nr++;
}
plan->nrets = nr;
@@ -1887,17 +1908,20 @@ static void aa_plan_ret(NativeTarget* t, const CGFuncDesc* fd,
const ABIArgPart* part = &abi->ret.parts[p];
NativeAllocClass cls =
part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
+ CfreeCgTypeId pty = aa_part_scalar_type(part);
rets[nr].src = values[0];
if (rets[nr].src.kind == NATIVE_LOC_FRAME)
- rets[nr].src = aa_stack_loc(values[0].type, values[0].v.frame,
- (i32)part->src_offset);
- else if (rets[nr].src.kind == NATIVE_LOC_STACK)
+ rets[nr].src =
+ aa_stack_loc(pty, values[0].v.frame, (i32)part->src_offset);
+ else if (rets[nr].src.kind == NATIVE_LOC_STACK) {
rets[nr].src.v.stack.offset += (i32)part->src_offset;
- else if (rets[nr].src.kind == NATIVE_LOC_ADDR)
+ rets[nr].src.type = pty;
+ } else if (rets[nr].src.kind == NATIVE_LOC_ADDR) {
rets[nr].src.v.addr.offset += (i32)part->src_offset;
- rets[nr].dst =
- aa_reg_loc(values[0].type, cls, cls == NATIVE_REG_FP ? nf++ : ni++);
- rets[nr].mem = aa_mem_for_type(t, values[0].type, part->size);
+ rets[nr].src.type = pty;
+ }
+ rets[nr].dst = aa_reg_loc(pty, cls, cls == NATIVE_REG_FP ? nf++ : ni++);
+ rets[nr].mem = aa_mem_for_type(t, pty, part->size);
nr++;
}
} else if (nvalues) {
diff --git a/src/opt/cg_ir_lower.c b/src/opt/cg_ir_lower.c
@@ -156,8 +156,12 @@ static void lower_locals(CgIrLower* l) {
m->size = in->desc.size;
m->align = in->desc.align;
m->cls = local_reg_class(l->c, in->desc.type);
+ /* Aggregates and oversized scalars cannot live in a single PReg; they need
+ * a memory home regardless of whether their address is taken. */
m->address_taken =
- local_needs_home(in) || local_address_used_in_cg_ir(l->src, in->id);
+ local_needs_home(in) || local_address_used_in_cg_ir(l->src, in->id) ||
+ cg_type_is_aggregate(l->c, in->desc.type) ||
+ cg_type_size(l->c, in->desc.type) > 8u;
PReg r = ir_alloc_preg(l->f, in->desc.type, m->cls);
if (m->address_taken) {
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -645,10 +645,29 @@ static void emit_call(NativeEmitCtx* e, Inst* in) {
for (u32 i = 0; i < plan.nrets; ++i)
write_loc(e, plan.rets[i].dst, plan.rets[i].src, plan.rets[i].mem, in->loc);
if (result_slot && final_result.kind != NATIVE_LOC_NONE) {
- NativeLoc tmp = loc_frame(
- aux->desc.ret.type, class_for_type(e, aux->desc.ret.type), result_slot);
- result_mem = mem_for_type(e->c, aux->desc.ret.type);
- write_loc(e, final_result, tmp, result_mem, in->loc);
+ CfreeCgTypeId rty = aux->desc.ret.type;
+ NativeLoc tmp =
+ loc_frame(rty, class_for_type(e, rty), result_slot);
+ result_mem = mem_for_type(e->c, rty);
+ if (final_result.kind != NATIVE_LOC_REG &&
+ (cg_type_is_aggregate(e->c, rty) ||
+ type_size_or(e->c, rty, 8u) > 8u)) {
+ /* Aggregate / oversized result: move bytes rather than a scalar copy
+ * (which would exceed the single-register width). The result was either
+ * written in parts by plan_call's rets, or by the callee via the sret
+ * pointer; either way it now lives in the temp slot. */
+ AggregateAccess acc;
+ NativeAddr da = addr_from_loc(e, final_result, in->loc);
+ NativeAddr sa = addr_from_loc(e, tmp, in->loc);
+ memset(&acc, 0, sizeof acc);
+ acc.type = rty;
+ acc.size = type_size_or(e->c, rty, 8u);
+ acc.align = type_align_or(e->c, rty, 8u);
+ acc.mem = result_mem;
+ e->target->copy_bytes(e->target, da, sa, acc);
+ } else {
+ write_loc(e, final_result, tmp, result_mem, in->loc);
+ }
}
}
@@ -660,12 +679,20 @@ static void emit_ret(NativeEmitCtx* e, Inst* in, const CGFuncDesc* fd) {
u32 nrets = 0;
if (aux && aux->present) {
NativeLoc final = abi_storage_loc(e, &aux->val, in->loc);
- NativeFrameSlot slot =
- temp_slot(e, aux->val.type, in->loc, NATIVE_FRAME_SLOT_SPILL);
- NativeLoc frame =
- loc_frame(aux->val.type, class_for_type(e, aux->val.type), slot);
- write_loc(e, frame, final, mem_for_type(e->c, aux->val.type), in->loc);
- value = frame;
+ CfreeCgTypeId vty = aux->val.type;
+ if (cg_type_is_aggregate(e->c, vty) || type_size_or(e->c, vty, 8u) > 8u) {
+ /* Aggregate / oversized return: hand plan_ret the value's memory
+ * location directly. It copies to the sret pointer (indirect) or reads
+ * the parts into the return registers (direct); a scalar copy through a
+ * temp would exceed the single-register width. */
+ value = final;
+ } else {
+ NativeFrameSlot slot =
+ temp_slot(e, vty, in->loc, NATIVE_FRAME_SLOT_SPILL);
+ NativeLoc frame = loc_frame(vty, class_for_type(e, vty), slot);
+ write_loc(e, frame, final, mem_for_type(e->c, vty), in->loc);
+ value = frame;
+ }
values = &value;
}
e->target->plan_ret(e->target, fd, values, values ? 1u : 0u, &rets, &nrets);