opt: aggregate ABI lowering on optimizer path (partial) - kit

commit d33227bb2f34d7317626dd64e31ce3e96f455bf8
parent 9f00fa4fc9d80f10ce4bcc2c7bff5e8d32d59631
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Wed, 27 May 2026 05:44:09 -0700

opt: aggregate ABI lowering on optimizer path (partial)

Progress toward routing aggregate-typed functions through the optimizer
instead of the direct-replay bypass:

- Force aggregate / >8-byte locals to frame storage in cg_ir_lower (they
  cannot live in a single PReg).
- Type each ABI part by its own width in aa_plan_call/aa_plan_ret direct
  return paths (was using the whole aggregate type, producing truncating
  32-bit moves for i64 fields).
- emit_call / emit_ret: copy aggregate / oversized results via copy_bytes
  (and hand plan_ret the aggregate's memory location directly) instead of a
  scalar move that exceeds register width.

Removes the "scalar too large" panics on sret returns; aggregate result
values are no longer truncated. Default test path remains green (408/408);
remaining aggregate value-correctness bugs (130/124/36/37) still under
investigation. No change to scalar codegen.

Diffstat:
M src/arch/aa64/native.c  | 52 ++++++++++++++++++++++++++++++++++++++--------------
M src/opt/cg_ir_lower.c  | 6 +++++-
M src/opt/pass_native_emit.c  | 47 +++++++++++++++++++++++++++++++++++++----------

3 files changed, 80 insertions(+), 25 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -1637,6 +1637,25 @@ static u32 aa_part_stack_align(const ABIArgPart* part) {
   return al;
 }
 
+/* Scalar type for moving one ABI part through a register, at the part's own
+ * width rather than the (possibly >8-byte) aggregate's: FP parts -> F32 (size
+ * <= 4) or F64; other parts -> I8/I16/I32 for sizes 1/2/4, I64 otherwise. */
+static CfreeCgTypeId aa_part_scalar_type(const ABIArgPart* part) {
+  if (part->cls == ABI_CLASS_FP)
+    return part->size <= 4u ? builtin_id(CFREE_CG_BUILTIN_F32)
+                            : builtin_id(CFREE_CG_BUILTIN_F64);
+  switch (part->size) {
+    case 1u:
+      return builtin_id(CFREE_CG_BUILTIN_I8);
+    case 2u:
+      return builtin_id(CFREE_CG_BUILTIN_I16);
+    case 4u:
+      return builtin_id(CFREE_CG_BUILTIN_I32);
+    default:
+      return builtin_id(CFREE_CG_BUILTIN_I64);
+  }
+}
+
 static u32 aa_class_stack_size(const ABIArgInfo* ai) {
   u32 total = 0;
   if (!ai || ai->kind == ABI_ARG_IGNORE) return 0;
@@ -1779,18 +1798,20 @@ static void aa_plan_call(NativeTarget* t, const NativeCallDesc* desc,
       const ABIArgPart* part = &abi->ret.parts[p];
       NativeAllocClass cls =
           part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
-      rets[nr].src = aa_reg_loc(desc->results[0].type, cls,
-                                cls == NATIVE_REG_FP ? nf++ : ni++);
+      CfreeCgTypeId pty = aa_part_scalar_type(part);
+      rets[nr].src = aa_reg_loc(pty, cls, cls == NATIVE_REG_FP ? nf++ : ni++);
       rets[nr].dst = desc->results[0];
       if (rets[nr].dst.kind == NATIVE_LOC_FRAME)
         rets[nr].dst =
-            aa_stack_loc(desc->results[0].type, desc->results[0].v.frame,
-                         (i32)part->src_offset);
-      else if (rets[nr].dst.kind == NATIVE_LOC_STACK)
+            aa_stack_loc(pty, desc->results[0].v.frame, (i32)part->src_offset);
+      else if (rets[nr].dst.kind == NATIVE_LOC_STACK) {
         rets[nr].dst.v.stack.offset += (i32)part->src_offset;
-      else if (rets[nr].dst.kind == NATIVE_LOC_ADDR)
+        rets[nr].dst.type = pty;
+      } else if (rets[nr].dst.kind == NATIVE_LOC_ADDR) {
         rets[nr].dst.v.addr.offset += (i32)part->src_offset;
-      rets[nr].mem = aa_mem_for_type(t, desc->results[0].type, part->size);
+        rets[nr].dst.type = pty;
+      }
+      rets[nr].mem = aa_mem_for_type(t, pty, part->size);
       nr++;
     }
     plan->nrets = nr;
@@ -1887,17 +1908,20 @@ static void aa_plan_ret(NativeTarget* t, const CGFuncDesc* fd,
       const ABIArgPart* part = &abi->ret.parts[p];
       NativeAllocClass cls =
           part->cls == ABI_CLASS_FP ? NATIVE_REG_FP : NATIVE_REG_INT;
+      CfreeCgTypeId pty = aa_part_scalar_type(part);
       rets[nr].src = values[0];
       if (rets[nr].src.kind == NATIVE_LOC_FRAME)
-        rets[nr].src = aa_stack_loc(values[0].type, values[0].v.frame,
-                                    (i32)part->src_offset);
-      else if (rets[nr].src.kind == NATIVE_LOC_STACK)
+        rets[nr].src =
+            aa_stack_loc(pty, values[0].v.frame, (i32)part->src_offset);
+      else if (rets[nr].src.kind == NATIVE_LOC_STACK) {
         rets[nr].src.v.stack.offset += (i32)part->src_offset;
-      else if (rets[nr].src.kind == NATIVE_LOC_ADDR)
+        rets[nr].src.type = pty;
+      } else if (rets[nr].src.kind == NATIVE_LOC_ADDR) {
         rets[nr].src.v.addr.offset += (i32)part->src_offset;
-      rets[nr].dst =
-          aa_reg_loc(values[0].type, cls, cls == NATIVE_REG_FP ? nf++ : ni++);
-      rets[nr].mem = aa_mem_for_type(t, values[0].type, part->size);
+        rets[nr].src.type = pty;
+      }
+      rets[nr].dst = aa_reg_loc(pty, cls, cls == NATIVE_REG_FP ? nf++ : ni++);
+      rets[nr].mem = aa_mem_for_type(t, pty, part->size);
       nr++;
     }
   } else if (nvalues) {
diff --git a/src/opt/cg_ir_lower.c b/src/opt/cg_ir_lower.c
@@ -156,8 +156,12 @@ static void lower_locals(CgIrLower* l) {
     m->size = in->desc.size;
     m->align = in->desc.align;
     m->cls = local_reg_class(l->c, in->desc.type);
+    /* Aggregates and oversized scalars cannot live in a single PReg; they need
+     * a memory home regardless of whether their address is taken. */
     m->address_taken =
-        local_needs_home(in) || local_address_used_in_cg_ir(l->src, in->id);
+        local_needs_home(in) || local_address_used_in_cg_ir(l->src, in->id) ||
+        cg_type_is_aggregate(l->c, in->desc.type) ||
+        cg_type_size(l->c, in->desc.type) > 8u;
 
     PReg r = ir_alloc_preg(l->f, in->desc.type, m->cls);
     if (m->address_taken) {
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -645,10 +645,29 @@ static void emit_call(NativeEmitCtx* e, Inst* in) {
   for (u32 i = 0; i < plan.nrets; ++i)
     write_loc(e, plan.rets[i].dst, plan.rets[i].src, plan.rets[i].mem, in->loc);
   if (result_slot && final_result.kind != NATIVE_LOC_NONE) {
-    NativeLoc tmp = loc_frame(
-        aux->desc.ret.type, class_for_type(e, aux->desc.ret.type), result_slot);
-    result_mem = mem_for_type(e->c, aux->desc.ret.type);
-    write_loc(e, final_result, tmp, result_mem, in->loc);
+    CfreeCgTypeId rty = aux->desc.ret.type;
+    NativeLoc tmp =
+        loc_frame(rty, class_for_type(e, rty), result_slot);
+    result_mem = mem_for_type(e->c, rty);
+    if (final_result.kind != NATIVE_LOC_REG &&
+        (cg_type_is_aggregate(e->c, rty) ||
+         type_size_or(e->c, rty, 8u) > 8u)) {
+      /* Aggregate / oversized result: move bytes rather than a scalar copy
+       * (which would exceed the single-register width). The result was either
+       * written in parts by plan_call's rets, or by the callee via the sret
+       * pointer; either way it now lives in the temp slot. */
+      AggregateAccess acc;
+      NativeAddr da = addr_from_loc(e, final_result, in->loc);
+      NativeAddr sa = addr_from_loc(e, tmp, in->loc);
+      memset(&acc, 0, sizeof acc);
+      acc.type = rty;
+      acc.size = type_size_or(e->c, rty, 8u);
+      acc.align = type_align_or(e->c, rty, 8u);
+      acc.mem = result_mem;
+      e->target->copy_bytes(e->target, da, sa, acc);
+    } else {
+      write_loc(e, final_result, tmp, result_mem, in->loc);
+    }
   }
 }
 
@@ -660,12 +679,20 @@ static void emit_ret(NativeEmitCtx* e, Inst* in, const CGFuncDesc* fd) {
   u32 nrets = 0;
   if (aux && aux->present) {
     NativeLoc final = abi_storage_loc(e, &aux->val, in->loc);
-    NativeFrameSlot slot =
-        temp_slot(e, aux->val.type, in->loc, NATIVE_FRAME_SLOT_SPILL);
-    NativeLoc frame =
-        loc_frame(aux->val.type, class_for_type(e, aux->val.type), slot);
-    write_loc(e, frame, final, mem_for_type(e->c, aux->val.type), in->loc);
-    value = frame;
+    CfreeCgTypeId vty = aux->val.type;
+    if (cg_type_is_aggregate(e->c, vty) || type_size_or(e->c, vty, 8u) > 8u) {
+      /* Aggregate / oversized return: hand plan_ret the value's memory
+       * location directly. It copies to the sret pointer (indirect) or reads
+       * the parts into the return registers (direct); a scalar copy through a
+       * temp would exceed the single-register width. */
+      value = final;
+    } else {
+      NativeFrameSlot slot =
+          temp_slot(e, vty, in->loc, NATIVE_FRAME_SLOT_SPILL);
+      NativeLoc frame = loc_frame(vty, class_for_type(e, vty), slot);
+      write_loc(e, frame, final, mem_for_type(e->c, vty), in->loc);
+      value = frame;
+    }
     values = &value;
   }
   e->target->plan_ret(e->target, fd, values, values ? 1u : 0u, &rets, &nrets);

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log | Files | Refs | README

M	src/arch/aa64/native.c	|	52	++++++++++++++++++++++++++++++++++++++--------------
M	src/opt/cg_ir_lower.c	|	6	+++++-
M	src/opt/pass_native_emit.c	|	47	+++++++++++++++++++++++++++++++++++++----------