kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit 0dc9d9b9d6fd9100c28f165cfa7e71e3033a16cf
parent 7eca5a4d48b4d1fc043f5cd81bd010fbd1785b1f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 28 May 2026 14:27:56 -0700

opt/aa64: fix five O1 codegen bugs (i128/ldbl/variadic/asm)

All pre-existing miscompiles on the O1 optimizer path (plus one shared
with O0), surfaced by test-parse J-path runs:

1. pass_native_emit pointer_addr_from_operand loaded every OPK_LOCAL as a
   pointer; for an agg_copy of a by-value 16-byte slot (__int128) it
   dereferenced the value as an address. Branch on cg_type_is_ptr like the
   single-pass nd_addr_pointer. Fixes the i128/ldbl128 SIGABRTs.

2. aa_move selected fmov s/d for FP register moves, truncating a 16-byte
   long double to single. Add a 128-bit mov vd.16b, vn.16b.

3. aa_va_start_core set __stack = fp + next_param_stack, omitting the
   saved fp/lr pair offset that aa_fp_off_in_arg adds (and that
   bind_param relies on), so va_arg's overflow path read into the saved
   fp/lr. Use aa_fp_off_in_arg.

4. rec_file_scope_asm was a no-op, dropping file-scope __asm__ blocks on
   the optimizer path. Capture them on CgIrModule and replay in
   opt_on_finalize.

5. The O1 inline-asm hook panicked on a register-constrained input that
   arrived in a frame slot (an address-taken local that is also an asm
   output). Materialize non-register integer 'r' inputs into a scratch
   register, as the direct path does.

test-parse: 3720 pass, 0 fail (J path 930/0). test-toy, test-opt,
test-aa64-inline, test-cg-api green.

Diffstat:
M src/arch/aa64/native.c | 45 ++++++++++++++++++++++++++++++++++++++++-----
M src/cg/ir.c | 27 +++++++++++++++++++++++++++
M src/cg/ir.h | 13 +++++++++++++
M src/cg/ir_recorder.c | 5 ++---
M src/opt/opt.c | 9 ++++++++-
M src/opt/pass_native_emit.c | 18 ++++++++++++++++--
6 files changed, 106 insertions(+), 11 deletions(-)

diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c @@ -454,6 +454,14 @@ static u32 aa_fmov_fp(u32 is_double, u32 rd, u32 rn) { (rd & 0x1fu); } +/* MOV Vd.16B, Vn.16B (alias of ORR Vd.16B, Vn.16B, Vn.16B): a full 128-bit + * SIMD register copy. Used to move binary128 / long double values, which fmov + * (scalar, max 64-bit) would truncate. */ +static u32 aa_mov_vec16(u32 rd, u32 rn) { + return 0x4ea01c00u | ((rn & 0x1fu) << 16) | ((rn & 0x1fu) << 5) | + (rd & 0x1fu); +} + static u32 aa_scvtf(u32 is_double_dst, u32 is64_src, u32 fd, u32 rn) { return (is64_src ? 0x9e220000u : 0x1e220000u) | (is_double_dst ? 0x00400000u : 0) | ((rn & 0x1fu) << 5) | (fd & 0x1fu); @@ -1584,8 +1592,11 @@ static void aa_move(NativeTarget* t, NativeLoc dst, NativeLoc src) { loc_is_fp(dst) == loc_is_fp(src) && dst.v.reg == src.v.reg) return; if (loc_is_fp(dst) && loc_is_fp(src)) { - aa_emit32(t->mc, aa_fmov_fp(type_size32(t, dst.type) == 8u, loc_reg(dst), - loc_reg(src))); + if (type_size32(t, dst.type) == 16u) + aa_emit32(t->mc, aa_mov_vec16(loc_reg(dst), loc_reg(src))); + else + aa_emit32(t->mc, aa_fmov_fp(type_size32(t, dst.type) == 8u, loc_reg(dst), + loc_reg(src))); } else if (loc_is_fp(dst)) { aa_emit32(t->mc, aa_fmov_gpr_to_fp(loc_is_64(t, src), loc_reg(dst), loc_reg(src))); @@ -3717,7 +3728,10 @@ static void aa_va_start_core(AANativeTarget* a, NativeAddr ap) { : vai.gp_reg_count; u32 used_vr = a->next_param_fp < vai.fp_reg_count ? a->next_param_fp : vai.fp_reg_count; - aa_emit_add_imm(a, AA_TMP0, AA_FP, (i32)a->next_param_stack); + /* __stack points at the incoming stack args, which sit above the saved + * fp/lr pair — the same address bind_param uses (aa_fp_off_in_arg), not the + * raw next_param_stack cursor. 
*/ + aa_emit_add_imm(a, AA_TMP0, AA_FP, aa_fp_off_in_arg(a->next_param_stack)); aa_emit_mem(a, 0, ptr, aa_reg_addr(ptr.type, base, (i32)vai.stack_offset), ptr_mem); aa_emit_add_imm(a, AA_TMP0, AA_FP, @@ -4367,8 +4381,29 @@ static void aa_asm_block_native(NativeTarget* t, const char* tmpl, continue; } type = ins[i].type ? ins[i].type : in_locs[i].type; - aa_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, in_locs[i], - &ntmp); + { + const char* in_body = aa_asm_constraint_body(ins[i].str); + NativeLoc inloc = in_locs[i]; + /* A register-constrained input whose value is an address-taken local + * arrives in a frame slot: the optimizer cannot keep an address-taken + * local live in a register across the block, so the "inputs are already + * in registers" contract does not hold for it. Load it into a reserved + * scratch register (as the direct path does) before binding. Only the + * integer 'r' form is handled here — 'w' would need an FP scratch, which + * isn't reserved; an address-taken FP input still falls to the panic. */ + if (in_body[0] == 'r' && inloc.kind != NATIVE_LOC_REG) { + Reg r; + if (ntmp >= 2u) + aa_asm_panic_at(c, loc, "too many memory asm operands"); + r = (ntmp == 0u) ? AA_TMP0 : AA_TMP1; + ntmp++; + inloc = aa_reg_loc(type, NATIVE_REG_INT, r); + aa_emit_mem(a, 1, inloc, aa_asm_loc_to_addr(a, loc, in_locs[i]), + aa_mem_for_type(t, type, type_size32(t, type))); + } + aa_asm_bind_native(a, loc, &bound_ins[i], ins[i].str, type, inloc, + &ntmp); + } } saved = aa_asm_save_callee_clobbers(a, clob_int, clob_fp, &nsaved); diff --git a/src/cg/ir.c b/src/cg/ir.c @@ -46,6 +46,21 @@ static void module_alias_grow(CgIrModule* m, u32 want) { m->aliases_cap = cap; } +static void module_file_scope_asm_grow(CgIrModule* m, u32 want) { + CgIrFileScopeAsm* next; + u32 cap; + if (m->file_scope_asms_cap >= want) return; + cap = m->file_scope_asms_cap ? 
m->file_scope_asms_cap : 4u; + while (cap < want) cap *= 2u; + next = ir_zalloc_or_panic(m->c, m->arena, sizeof(*next) * cap, + _Alignof(CgIrFileScopeAsm)); + if (m->file_scope_asms) + memcpy(next, m->file_scope_asms, + sizeof(*next) * m->nfile_scope_asms); + m->file_scope_asms = next; + m->file_scope_asms_cap = cap; +} + CgIrModule* cg_ir_module_new(Compiler* c) { CgIrModule* m = arena_znew(c->tu, CgIrModule); if (!m) return NULL; @@ -72,6 +87,18 @@ void cg_ir_module_add_alias(CgIrModule* m, ObjSymId alias_sym, a->type = type; } +void cg_ir_module_add_file_scope_asm(CgIrModule* m, const char* src, + size_t len) { + CgIrFileScopeAsm* e; + if (!m || !src) return; + module_file_scope_asm_grow(m, m->nfile_scope_asms + 1u); + e = &m->file_scope_asms[m->nfile_scope_asms++]; + /* Copy the source: the parser's buffer does not outlive recording, but the + * block is replayed at finalize. */ + e->src = arena_strdup(m->arena, src, len); + e->len = len; +} + static CGFuncDesc dup_func_desc(Arena* a, const CGFuncDesc* in) { CGFuncDesc out = *in; if (in->nresults) { diff --git a/src/cg/ir.h b/src/cg/ir.h @@ -227,6 +227,15 @@ typedef struct CgIrAlias { CfreeCgTypeId type; } CgIrAlias; +/* A file-scope `__asm__(...)` block, captured verbatim for replay. The + * single-pass target emits it inline during recording; the optimizer path has + * no live target then, so the module retains it for opt_on_finalize to replay + * (see cg_ir_module_add_file_scope_asm). 
*/ +typedef struct CgIrFileScopeAsm { + const char* src; + size_t len; +} CgIrFileScopeAsm; + typedef struct CgIrModule { Arena* arena; Compiler* c; @@ -236,6 +245,9 @@ typedef struct CgIrModule { CgIrAlias* aliases; u32 naliases; u32 aliases_cap; + CgIrFileScopeAsm* file_scope_asms; + u32 nfile_scope_asms; + u32 file_scope_asms_cap; } CgIrModule; CgIrModule* cg_ir_module_new(Compiler*); @@ -243,6 +255,7 @@ CgIrFunc* cg_ir_func_new(Compiler*, const CGFuncDesc*); void cg_ir_module_add_func(CgIrModule*, CgIrFunc*); void cg_ir_module_add_alias(CgIrModule*, ObjSymId alias_sym, ObjSymId target_sym, CfreeCgTypeId type); +void cg_ir_module_add_file_scope_asm(CgIrModule*, const char* src, size_t len); CGLocal cg_ir_func_add_local(CgIrFunc*, const CGLocalDesc*, int is_param, u32 param_index); diff --git a/src/cg/ir_recorder.c b/src/cg/ir_recorder.c @@ -540,9 +540,8 @@ static void rec_asm_block(CgTarget* t, const char* tmpl, } static void rec_file_scope_asm(CgTarget* t, const char* src, size_t len) { - (void)t; - (void)src; - (void)len; + CgIrRecorder* r = rec_of(t); + cg_ir_module_add_file_scope_asm(r->module, src, len); } static void rec_set_loc(CgTarget* t, SrcLoc loc) { rec_of(t)->loc = loc; } diff --git a/src/opt/opt.c b/src/opt/opt.c @@ -246,7 +246,14 @@ static void opt_on_func(void* user, CgIrFunc* cg_func) { static void opt_on_finalize(void* user, const CgIrModule* module) { OptImpl* o = (OptImpl*)user; - (void)module; + /* File-scope asm blocks are captured during recording (no live target then) + * and replayed here, before finalize. Emission order relative to functions + * does not matter: each block selects its own sections (.data/.text/...). 
*/ + if (o->native && o->native->file_scope_asm && module) { + for (u32 i = 0; i < module->nfile_scope_asms; ++i) + o->native->file_scope_asm(o->native, module->file_scope_asms[i].src, + module->file_scope_asms[i].len); + } if (o->native && o->native->finalize) o->native->finalize(o->native); } diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c @@ -324,8 +324,22 @@ static NativeAddr pointer_addr_from_operand(NativeEmitCtx* e, case OPT_OPK_LOCAL: { NativeAddr frame; NativeLoc dst; - NativeAllocClass cls = class_for_type(e, op->type); - Reg r = scratch_reg(e, cls, avoid_a, avoid_b, loc); + NativeAllocClass cls; + Reg r; + /* An OPK_LOCAL in a pointer-address position is ambiguous. When the + * operand's type is a pointer, the local *holds* the pointer value and + * must be loaded to get the address. Otherwise the local *is* the + * aggregate storage and its frame home is the address directly — loading + * it would dereference the aggregate's first 8 bytes as a pointer (e.g. + * an `__int128` call result copied by `agg_copy`). Mirrors the + * single-pass path's nd_addr_pointer. */ + if (!cg_type_is_ptr(e->c, op->type)) { + addr.base_kind = NATIVE_ADDR_BASE_FRAME; + addr.base.frame = map_slot(e, op->v.frame_slot, loc); + return addr; + } + cls = class_for_type(e, op->type); + r = scratch_reg(e, cls, avoid_a, avoid_b, loc); memset(&frame, 0, sizeof frame); frame.base_kind = NATIVE_ADDR_BASE_FRAME; frame.base.frame = map_slot(e, op->v.frame_slot, loc);