commit 0dc9d9b9d6fd9100c28f165cfa7e71e3033a16cf
parent 7eca5a4d48b4d1fc043f5cd81bd010fbd1785b1f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 28 May 2026 14:27:56 -0700
opt/aa64: fix five O1 codegen bugs (i128/ldbl/variadic/asm)
All pre-existing miscompiles on the O1 optimizer path (plus one shared
with O0), surfaced by test-parse J-path runs:
1. pass_native_emit pointer_addr_from_operand loaded every OPK_LOCAL as a
pointer; for an agg_copy of a by-value 16-byte slot (__int128) it
dereferenced the value as an address. Branch on cg_type_is_ptr like the
single-pass nd_addr_pointer. Fixes the i128/ldbl128 SIGABRTs.
2. aa_move selected fmov s/d for FP register moves, truncating a 16-byte
long double to single. Add a 128-bit mov vd.16b, vn.16b.
3. aa_va_start_core set __stack = fp + next_param_stack, omitting the
saved-pair offset aa_fp_off_in_arg adds (and bind_param uses), so
va_arg's overflow path read into the saved fp/lr. Use aa_fp_off_in_arg.
4. rec_file_scope_asm was a no-op, dropping file-scope __asm__ blocks on
the optimizer path. Capture them on CgIrModule and replay in
opt_on_finalize.
5. The O1 inline-asm hook panicked on a register-constrained input that
arrived in a frame slot (an address-taken local that is also an asm
output). Materialize non-register integer 'r' inputs into a scratch
register, as the direct path does.
test-parse: 3720 pass, 0 fail (J path 930/0). test-toy, test-opt,
test-aa64-inline, test-cg-api green.
Diffstat:
6 files changed, 106 insertions(+), 11 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -454,6 +454,14 @@ static u32 aa_fmov_fp(u32 is_double, u32 rd, u32 rn) {
(rd & 0x1fu);
}
+/* MOV Vd.16B, Vn.16B (alias of ORR Vd.16B, Vn.16B, Vn.16B): a full 128-bit
+ * SIMD register copy. Used to move binary128 / long double values, which fmov
+ * (scalar, max 64-bit) would truncate.
+ *
+ * Returns the 32-bit instruction word. rn is written to both source fields
+ * (bits 20:16 and bits 9:5), which is what makes the ORR act as a plain
+ * move; rd goes in bits 4:0. Register numbers are masked to 5 bits. */
+static u32 aa_mov_vec16(u32 rd, u32 rn) {
+ return 0x4ea01c00u | ((rn & 0x1fu) << 16) | ((rn & 0x1fu) << 5) |
+ (rd & 0x1fu);
+}
+
static u32 aa_scvtf(u32 is_double_dst, u32 is64_src, u32 fd, u32 rn) {
return (is64_src ? 0x9e220000u : 0x1e220000u) |
(is_double_dst ? 0x00400000u : 0) | ((rn & 0x1fu) << 5) | (fd & 0x1fu);
@@ -1584,8 +1592,11 @@ static void aa_move(NativeTarget* t, NativeLoc dst, NativeLoc src) {
loc_is_fp(dst) == loc_is_fp(src) && dst.v.reg == src.v.reg)
return;
if (loc_is_fp(dst) && loc_is_fp(src)) {
- aa_emit32(t->mc, aa_fmov_fp(type_size32(t, dst.type) == 8u, loc_reg(dst),
- loc_reg(src)));
+ if (type_size32(t, dst.type) == 16u)
+ aa_emit32(t->mc, aa_mov_vec16(loc_reg(dst), loc_reg(src)));
+ else
+ aa_emit32(t->mc, aa_fmov_fp(type_size32(t, dst.type) == 8u, loc_reg(dst),
+ loc_reg(src)));
} else if (loc_is_fp(dst)) {
aa_emit32(t->mc,
aa_fmov_gpr_to_fp(loc_is_64(t, src), loc_reg(dst), loc_reg(src)));
@@ -3717,7 +3728,10 @@ static void aa_va_start_core(AANativeTarget* a, NativeAddr ap) {
: vai.gp_reg_count;
u32 used_vr = a->next_param_fp < vai.fp_reg_count ? a->next_param_fp
: vai.fp_reg_count;
- aa_emit_add_imm(a, AA_TMP0, AA_FP, (i32)a->next_param_stack);
+ /* __stack points at the incoming stack args, which sit above the saved
+ * fp/lr pair — the same address bind_param uses (aa_fp_off_in_arg), not the
+ * raw next_param_stack cursor. */
+ aa_emit_add_imm(a, AA_TMP0, AA_FP, aa_fp_off_in_arg(a->next_param_stack));
aa_emit_mem(a, 0, ptr, aa_reg_addr(ptr.type, base, (i32)vai.stack_offset),
ptr_mem);
aa_emit_add_imm(a, AA_TMP0, AA_FP,
@@ -4367,8 +4381,29 @@ static void aa_asm_block_native(NativeTarget* t, const char* tmpl,
continue;
}
type = ins[i].type ? ins[i].type : in_locs[i].type;
- aa_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, in_locs[i],
- &ntmp);
+ {
+ const char* in_body = aa_asm_constraint_body(ins[i].str);
+ NativeLoc inloc = in_locs[i];
+ /* A register-constrained input whose value is an address-taken local
+ * arrives in a frame slot: the optimizer cannot keep an address-taken
+ * local live in a register across the block, so the "inputs are already
+ * in registers" contract does not hold for it. Load it into a reserved
+ * scratch register (as the direct path does) before binding. Only the
+ * integer 'r' form is handled here — 'w' would need an FP scratch, which
+ * isn't reserved; an address-taken FP input still falls to the panic. */
+ if (in_body[0] == 'r' && inloc.kind != NATIVE_LOC_REG) {
+ Reg r;
+ if (ntmp >= 2u)
+ aa_asm_panic_at(c, loc, "too many memory asm operands");
+ r = (ntmp == 0u) ? AA_TMP0 : AA_TMP1;
+ ntmp++;
+ inloc = aa_reg_loc(type, NATIVE_REG_INT, r);
+ aa_emit_mem(a, 1, inloc, aa_asm_loc_to_addr(a, loc, in_locs[i]),
+ aa_mem_for_type(t, type, type_size32(t, type)));
+ }
+ aa_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc,
+ &ntmp);
+ }
}
saved = aa_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved);
diff --git a/src/cg/ir.c b/src/cg/ir.c
@@ -46,6 +46,21 @@ static void module_alias_grow(CgIrModule* m, u32 want) {
m->aliases_cap = cap;
}
+static void module_file_scope_asm_grow(CgIrModule* m, u32 want) { /* ensure room for at least `want` file-scope asm entries */
+ CgIrFileScopeAsm* next;
+ u32 cap;
+ if (m->file_scope_asms_cap >= want) return; /* current capacity already suffices */
+ cap = m->file_scope_asms_cap ? m->file_scope_asms_cap : 4u; /* first growth starts at 4 entries */
+ while (cap < want) cap *= 2u; /* double until `want` fits */
+ next = ir_zalloc_or_panic(m->c, m->arena, sizeof(*next) * cap,
+ _Alignof(CgIrFileScopeAsm));
+ if (m->file_scope_asms) /* carry over the entries recorded so far */
+ memcpy(next, m->file_scope_asms,
+ sizeof(*next) * m->nfile_scope_asms);
+ m->file_scope_asms = next; /* the previous array is not freed here */
+ m->file_scope_asms_cap = cap;
+}
+
CgIrModule* cg_ir_module_new(Compiler* c) {
CgIrModule* m = arena_znew(c->tu, CgIrModule);
if (!m) return NULL;
@@ -72,6 +87,18 @@ void cg_ir_module_add_alias(CgIrModule* m, ObjSymId alias_sym,
a->type = type;
}
+void cg_ir_module_add_file_scope_asm(CgIrModule* m, const char* src,
+ size_t len) { /* append one file-scope asm block (src, len) to m */
+ CgIrFileScopeAsm* e;
+ if (!m || !src) return; /* nothing is recorded without a module or source text */
+ module_file_scope_asm_grow(m, m->nfile_scope_asms + 1u); /* make room for one more entry */
+ e = &m->file_scope_asms[m->nfile_scope_asms++]; /* claim the next slot; entries keep insertion order */
+ /* Copy the source: the parser's buffer does not outlive recording, but the
+ * block is replayed at finalize. */
+ e->src = arena_strdup(m->arena, src, len);
+ e->len = len; /* caller-supplied length, stored alongside the copy */
+}
+
static CGFuncDesc dup_func_desc(Arena* a, const CGFuncDesc* in) {
CGFuncDesc out = *in;
if (in->nresults) {
diff --git a/src/cg/ir.h b/src/cg/ir.h
@@ -227,6 +227,15 @@ typedef struct CgIrAlias {
CfreeCgTypeId type;
} CgIrAlias;
+/* A file-scope `__asm__(...)` block, captured verbatim for replay. The
+ * single-pass target emits it inline during recording; the optimizer path has
+ * no live target then, so the module retains it for opt_on_finalize to replay
+ * (see cg_ir_module_add_file_scope_asm). */
+typedef struct CgIrFileScopeAsm {
+ const char* src; /* the block's text; handed back unchanged to the replay hook */
+ size_t len; /* length recorded with src (presumably in bytes — confirm) */
+} CgIrFileScopeAsm;
+
typedef struct CgIrModule {
Arena* arena;
Compiler* c;
@@ -236,6 +245,9 @@ typedef struct CgIrModule {
CgIrAlias* aliases;
u32 naliases;
u32 aliases_cap;
+ CgIrFileScopeAsm* file_scope_asms;
+ u32 nfile_scope_asms;
+ u32 file_scope_asms_cap;
} CgIrModule;
CgIrModule* cg_ir_module_new(Compiler*);
@@ -243,6 +255,7 @@ CgIrFunc* cg_ir_func_new(Compiler*, const CGFuncDesc*);
void cg_ir_module_add_func(CgIrModule*, CgIrFunc*);
void cg_ir_module_add_alias(CgIrModule*, ObjSymId alias_sym,
ObjSymId target_sym, CfreeCgTypeId type);
+void cg_ir_module_add_file_scope_asm(CgIrModule*, const char* src, size_t len);
CGLocal cg_ir_func_add_local(CgIrFunc*, const CGLocalDesc*, int is_param,
u32 param_index);
diff --git a/src/cg/ir_recorder.c b/src/cg/ir_recorder.c
@@ -540,9 +540,8 @@ static void rec_asm_block(CgTarget* t, const char* tmpl,
}
static void rec_file_scope_asm(CgTarget* t, const char* src, size_t len) {
- (void)t;
- (void)src;
- (void)len;
+ CgIrRecorder* r = rec_of(t); /* recorder state behind this target */
+ cg_ir_module_add_file_scope_asm(r->module, src, len); /* keep the block on the module instead of dropping it */
}
static void rec_set_loc(CgTarget* t, SrcLoc loc) { rec_of(t)->loc = loc; }
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -246,7 +246,14 @@ static void opt_on_func(void* user, CgIrFunc* cg_func) {
static void opt_on_finalize(void* user, const CgIrModule* module) {
OptImpl* o = (OptImpl*)user;
- (void)module;
+ /* File-scope asm blocks are captured during recording (no live target then)
+ * and replayed here, before finalize. Emission order relative to functions
+ * does not matter: each block selects its own sections (.data/.text/...).
+ * Blocks are replayed in the order they were added to the module, and only
+ * when a native target with a file_scope_asm hook exists and a module was
+ * passed in.
+ * NOTE(review): the order-independence claim assumes every block carries
+ * its own section directive; a block without one would land in whatever
+ * section is current at this point — confirm. */
+ if (o->native && o->native->file_scope_asm && module) {
+ for (u32 i = 0; i < module->nfile_scope_asms; ++i)
+ o->native->file_scope_asm(o->native, module->file_scope_asms[i].src,
+ module->file_scope_asms[i].len);
+ }
if (o->native && o->native->finalize) o->native->finalize(o->native);
}
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -324,8 +324,22 @@ static NativeAddr pointer_addr_from_operand(NativeEmitCtx* e,
case OPT_OPK_LOCAL: {
NativeAddr frame;
NativeLoc dst;
- NativeAllocClass cls = class_for_type(e, op->type);
- Reg r = scratch_reg(e, cls, avoid_a, avoid_b, loc);
+ NativeAllocClass cls;
+ Reg r;
+ /* An OPK_LOCAL in a pointer-address position is ambiguous. When the
+ * operand's type is a pointer, the local *holds* the pointer value and
+ * must be loaded to get the address. Otherwise the local *is* the
+ * aggregate storage and its frame home is the address directly — loading
+ * it would dereference the aggregate's first 8 bytes as a pointer (e.g.
+ * an `__int128` call result copied by `agg_copy`). Mirrors the
+ * single-pass path's nd_addr_pointer. */
+ if (!cg_type_is_ptr(e->c, op->type)) {
+ addr.base_kind = NATIVE_ADDR_BASE_FRAME;
+ addr.base.frame = map_slot(e, op->v.frame_slot, loc);
+ return addr;
+ }
+ cls = class_for_type(e, op->type);
+ r = scratch_reg(e, cls, avoid_a, avoid_b, loc);
memset(&frame, 0, sizeof frame);
frame.base_kind = NATIVE_ADDR_BASE_FRAME;
frame.base.frame = map_slot(e, op->v.frame_slot, loc);