opt/aa64: fix five O1 codegen bugs (i128/ldbl/variadic/asm) - kit

commit 0dc9d9b9d6fd9100c28f165cfa7e71e3033a16cf
parent 7eca5a4d48b4d1fc043f5cd81bd010fbd1785b1f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 28 May 2026 14:27:56 -0700

opt/aa64: fix five O1 codegen bugs (i128/ldbl/variadic/asm)

All five are pre-existing miscompiles on the O1 optimizer path (one of them
shared with O0), surfaced by test-parse J-path runs:

1. pass_native_emit's pointer_addr_from_operand loaded every OPT_OPK_LOCAL
   operand as a pointer; for an agg_copy of a by-value 16-byte slot
   (__int128) it dereferenced the value as an address. Branch on
   cg_type_is_ptr, as the single-pass nd_addr_pointer does. Fixes the
   i128/ldbl128 SIGABRTs.

2. aa_move selected fmov s/d for FP register moves, truncating a 16-byte
   long double to single. Add a 128-bit mov vd.16b, vn.16b.

3. aa_va_start_core set __stack = fp + next_param_stack, omitting the
   saved-pair offset aa_fp_off_in_arg adds (and bind_param uses), so
   va_arg's overflow path read into the saved fp/lr. Use aa_fp_off_in_arg.

4. rec_file_scope_asm was a no-op, dropping file-scope __asm__ blocks on
   the optimizer path. Capture them on CgIrModule and replay in
   opt_on_finalize.

5. The O1 inline-asm hook panicked on a register-constrained input that
   arrived in a frame slot (an address-taken local that is also an asm
   output). Materialize non-register integer 'r' inputs into a scratch
   register, as the direct path does.

test-parse: 3720 pass, 0 fail (J path 930/0). test-toy, test-opt,
test-aa64-inline, test-cg-api green.

Diffstat:
M src/arch/aa64/native.c  | 45 ++++++++++++++++++++++++++++++++++++++++-----
M src/cg/ir.c  | 27 +++++++++++++++++++++++++++
M src/cg/ir.h  | 13 +++++++++++++
M src/cg/ir_recorder.c  | 5 ++---
M src/opt/opt.c  | 9 ++++++++-
M src/opt/pass_native_emit.c  | 18 ++++++++++++++++--

6 files changed, 106 insertions(+), 11 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -454,6 +454,14 @@ static u32 aa_fmov_fp(u32 is_double, u32 rd, u32 rn) {
          (rd & 0x1fu);
 }
 
+/* MOV Vd.16B, Vn.16B (alias of ORR Vd.16B, Vn.16B, Vn.16B): a full 128-bit
+ * SIMD register copy. Used to move binary128 / long double values, which fmov
+ * (scalar, max 64-bit) would truncate. */
+static u32 aa_mov_vec16(u32 rd, u32 rn) {
+  return 0x4ea01c00u | ((rn & 0x1fu) << 16) | ((rn & 0x1fu) << 5) |
+         (rd & 0x1fu);
+}
+
 static u32 aa_scvtf(u32 is_double_dst, u32 is64_src, u32 fd, u32 rn) {
   return (is64_src ? 0x9e220000u : 0x1e220000u) |
          (is_double_dst ? 0x00400000u : 0) | ((rn & 0x1fu) << 5) | (fd & 0x1fu);
@@ -1584,8 +1592,11 @@ static void aa_move(NativeTarget* t, NativeLoc dst, NativeLoc src) {
       loc_is_fp(dst) == loc_is_fp(src) && dst.v.reg == src.v.reg)
     return;
   if (loc_is_fp(dst) && loc_is_fp(src)) {
-    aa_emit32(t->mc, aa_fmov_fp(type_size32(t, dst.type) == 8u, loc_reg(dst),
-                                loc_reg(src)));
+    if (type_size32(t, dst.type) == 16u)
+      aa_emit32(t->mc, aa_mov_vec16(loc_reg(dst), loc_reg(src)));
+    else
+      aa_emit32(t->mc, aa_fmov_fp(type_size32(t, dst.type) == 8u, loc_reg(dst),
+                                  loc_reg(src)));
   } else if (loc_is_fp(dst)) {
     aa_emit32(t->mc,
               aa_fmov_gpr_to_fp(loc_is_64(t, src), loc_reg(dst), loc_reg(src)));
@@ -3717,7 +3728,10 @@ static void aa_va_start_core(AANativeTarget* a, NativeAddr ap) {
                                                        : vai.gp_reg_count;
     u32 used_vr = a->next_param_fp < vai.fp_reg_count ? a->next_param_fp
                                                       : vai.fp_reg_count;
-    aa_emit_add_imm(a, AA_TMP0, AA_FP, (i32)a->next_param_stack);
+    /* __stack points at the incoming stack args, which sit above the saved
+     * fp/lr pair — the same address bind_param uses (aa_fp_off_in_arg), not the
+     * raw next_param_stack cursor. */
+    aa_emit_add_imm(a, AA_TMP0, AA_FP, aa_fp_off_in_arg(a->next_param_stack));
     aa_emit_mem(a, 0, ptr, aa_reg_addr(ptr.type, base, (i32)vai.stack_offset),
                 ptr_mem);
     aa_emit_add_imm(a, AA_TMP0, AA_FP,
@@ -4367,8 +4381,29 @@ static void aa_asm_block_native(NativeTarget* t, const char* tmpl,
       continue;
     }
     type = ins[i].type ? ins[i].type : in_locs[i].type;
-    aa_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, in_locs[i],
-                       &ntmp);
+    {
+      const char* in_body = aa_asm_constraint_body(ins[i].str);
+      NativeLoc inloc = in_locs[i];
+      /* A register-constrained input whose value is an address-taken local
+       * arrives in a frame slot: the optimizer cannot keep an address-taken
+       * local live in a register across the block, so the "inputs are already
+       * in registers" contract does not hold for it. Load it into a reserved
+       * scratch register (as the direct path does) before binding. Only the
+       * integer 'r' form is handled here — 'w' would need an FP scratch, which
+       * isn't reserved; an address-taken FP input still falls to the panic. */
+      if (in_body[0] == 'r' && inloc.kind != NATIVE_LOC_REG) {
+        Reg r;
+        if (ntmp >= 2u)
+          aa_asm_panic_at(c, loc, "too many memory asm operands");
+        r = (ntmp == 0u) ? AA_TMP0 : AA_TMP1;
+        ntmp++;
+        inloc = aa_reg_loc(type, NATIVE_REG_INT, r);
+        aa_emit_mem(a, 1, inloc, aa_asm_loc_to_addr(a, loc, in_locs[i]),
+                    aa_mem_for_type(t, type, type_size32(t, type)));
+      }
+      aa_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc,
+                         &ntmp);
+    }
   }
 
   saved = aa_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved);
diff --git a/src/cg/ir.c b/src/cg/ir.c
@@ -46,6 +46,21 @@ static void module_alias_grow(CgIrModule* m, u32 want) {
   m->aliases_cap = cap;
 }
 
+static void module_file_scope_asm_grow(CgIrModule* m, u32 want) {
+  CgIrFileScopeAsm* next;
+  u32 cap;
+  if (m->file_scope_asms_cap >= want) return;
+  cap = m->file_scope_asms_cap ? m->file_scope_asms_cap : 4u;
+  while (cap < want) cap *= 2u;
+  next = ir_zalloc_or_panic(m->c, m->arena, sizeof(*next) * cap,
+                            _Alignof(CgIrFileScopeAsm));
+  if (m->file_scope_asms)
+    memcpy(next, m->file_scope_asms,
+           sizeof(*next) * m->nfile_scope_asms);
+  m->file_scope_asms = next;
+  m->file_scope_asms_cap = cap;
+}
+
 CgIrModule* cg_ir_module_new(Compiler* c) {
   CgIrModule* m = arena_znew(c->tu, CgIrModule);
   if (!m) return NULL;
@@ -72,6 +87,18 @@ void cg_ir_module_add_alias(CgIrModule* m, ObjSymId alias_sym,
   a->type = type;
 }
 
+void cg_ir_module_add_file_scope_asm(CgIrModule* m, const char* src,
+                                     size_t len) {
+  CgIrFileScopeAsm* e;
+  if (!m || !src) return;
+  module_file_scope_asm_grow(m, m->nfile_scope_asms + 1u);
+  e = &m->file_scope_asms[m->nfile_scope_asms++];
+  /* Copy the source: the parser's buffer does not outlive recording, but the
+   * block is replayed at finalize. */
+  e->src = arena_strdup(m->arena, src, len);
+  e->len = len;
+}
+
 static CGFuncDesc dup_func_desc(Arena* a, const CGFuncDesc* in) {
   CGFuncDesc out = *in;
   if (in->nresults) {
diff --git a/src/cg/ir.h b/src/cg/ir.h
@@ -227,6 +227,15 @@ typedef struct CgIrAlias {
   CfreeCgTypeId type;
 } CgIrAlias;
 
+/* A file-scope `__asm__(...)` block, captured verbatim for replay. The
+ * single-pass target emits it inline during recording; the optimizer path has
+ * no live target then, so the module retains it for opt_on_finalize to replay
+ * (see cg_ir_module_add_file_scope_asm). */
+typedef struct CgIrFileScopeAsm {
+  const char* src;
+  size_t len;
+} CgIrFileScopeAsm;
+
 typedef struct CgIrModule {
   Arena* arena;
   Compiler* c;
@@ -236,6 +245,9 @@ typedef struct CgIrModule {
   CgIrAlias* aliases;
   u32 naliases;
   u32 aliases_cap;
+  CgIrFileScopeAsm* file_scope_asms;
+  u32 nfile_scope_asms;
+  u32 file_scope_asms_cap;
 } CgIrModule;
 
 CgIrModule* cg_ir_module_new(Compiler*);
@@ -243,6 +255,7 @@ CgIrFunc* cg_ir_func_new(Compiler*, const CGFuncDesc*);
 void cg_ir_module_add_func(CgIrModule*, CgIrFunc*);
 void cg_ir_module_add_alias(CgIrModule*, ObjSymId alias_sym,
                             ObjSymId target_sym, CfreeCgTypeId type);
+void cg_ir_module_add_file_scope_asm(CgIrModule*, const char* src, size_t len);
 
 CGLocal cg_ir_func_add_local(CgIrFunc*, const CGLocalDesc*, int is_param,
                              u32 param_index);
diff --git a/src/cg/ir_recorder.c b/src/cg/ir_recorder.c
@@ -540,9 +540,8 @@ static void rec_asm_block(CgTarget* t, const char* tmpl,
 }
 
 static void rec_file_scope_asm(CgTarget* t, const char* src, size_t len) {
-  (void)t;
-  (void)src;
-  (void)len;
+  CgIrRecorder* r = rec_of(t);
+  cg_ir_module_add_file_scope_asm(r->module, src, len);
 }
 
 static void rec_set_loc(CgTarget* t, SrcLoc loc) { rec_of(t)->loc = loc; }
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -246,7 +246,14 @@ static void opt_on_func(void* user, CgIrFunc* cg_func) {
 
 static void opt_on_finalize(void* user, const CgIrModule* module) {
   OptImpl* o = (OptImpl*)user;
-  (void)module;
+  /* File-scope asm blocks are captured during recording (no live target then)
+   * and replayed here, before finalize. Emission order relative to functions
+   * does not matter: each block selects its own sections (.data/.text/...). */
+  if (o->native && o->native->file_scope_asm && module) {
+    for (u32 i = 0; i < module->nfile_scope_asms; ++i)
+      o->native->file_scope_asm(o->native, module->file_scope_asms[i].src,
+                                module->file_scope_asms[i].len);
+  }
   if (o->native && o->native->finalize) o->native->finalize(o->native);
 }
 
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -324,8 +324,22 @@ static NativeAddr pointer_addr_from_operand(NativeEmitCtx* e,
     case OPT_OPK_LOCAL: {
       NativeAddr frame;
       NativeLoc dst;
-      NativeAllocClass cls = class_for_type(e, op->type);
-      Reg r = scratch_reg(e, cls, avoid_a, avoid_b, loc);
+      NativeAllocClass cls;
+      Reg r;
+      /* An OPK_LOCAL in a pointer-address position is ambiguous. When the
+       * operand's type is a pointer, the local *holds* the pointer value and
+       * must be loaded to get the address. Otherwise the local *is* the
+       * aggregate storage and its frame home is the address directly — loading
+       * it would dereference the aggregate's first 8 bytes as a pointer (e.g.
+       * an `__int128` call result copied by `agg_copy`). Mirrors the
+       * single-pass path's nd_addr_pointer. */
+      if (!cg_type_is_ptr(e->c, op->type)) {
+        addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+        addr.base.frame = map_slot(e, op->v.frame_slot, loc);
+        return addr;
+      }
+      cls = class_for_type(e, op->type);
+      r = scratch_reg(e, cls, avoid_a, avoid_b, loc);
       memset(&frame, 0, sizeof frame);
       frame.base_kind = NATIVE_ADDR_BASE_FRAME;
       frame.base.frame = map_slot(e, op->v.frame_slot, loc);

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log | Files | Refs | README

M	src/arch/aa64/native.c	|	45	++++++++++++++++++++++++++++++++++++++++-----
M	src/cg/ir.c	|	27	+++++++++++++++++++++++++++
M	src/cg/ir.h	|	13	+++++++++++++
M	src/cg/ir_recorder.c	|	5	++---
M	src/opt/opt.c	|	9	++++++++-
M	src/opt/pass_native_emit.c	|	18	++++++++++++++++--