kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit a7442212be4d2e3a72cad198038e1f75103ac0bb
parent e3cc40502d041ed5d803683ac0afc4edf7b641ef
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 15 May 2026 14:51:25 -0700

Fix AArch64 large frame offset emission

Diffstat:
M doc/OPT1.md | 24 +++++++++---------------
M src/arch/aa64/emit.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M src/arch/aa64/internal.h | 13 +++++++++++--
M src/arch/aa64/ops.c | 30 +++++++++++++++++++-----------
M test/opt/phase0_guardrails.sh | 40 ++++++++++++++++++++++++++++++++++++++++
5 files changed, 183 insertions(+), 53 deletions(-)

diff --git a/doc/OPT1.md b/doc/OPT1.md @@ -287,18 +287,13 @@ mov w19, w21 MIR's O1 path suggests these high-value local cleanups that still fit cfree's fast tier: -1. Reduce and fix the spill-heavy JIT/runtime crash. - The nonconstant wide-local spill probe returned correctly for many sizes but - segfaulted at nearby large sizes. Isolate this to a small parse or CG API - testcase before doing more spill-pressure perf work. - -2. Clean up local branch layout artifacts. +1. Clean up local branch layout artifacts. MIR's full jump optimizer is O2-only, but its cheap pieces are appropriate for O1: delete branches to immediate fallthrough blocks, forward branch-to-branch targets, and invert a branch when it removes an unconditional jump. Avoid full CFG layout work. -3. Promote remaining scalar entry slots before backend allocation. +2. Promote remaining scalar entry slots before backend allocation. MIR's C frontend represents normal scalar block locals as MIR registers and leaves stack slots for aggregates, forced-stack cases, and address-taken values. O1 now keeps simple loop locals in registers in the probe, but still @@ -306,33 +301,32 @@ fast tier: pass should promote remaining integer/pointer scalars whose address does not escape, starting with parameters and single-entry structured control flow. -4. Avoid unnecessary callee-save traffic. +3. Avoid unnecessary callee-save traffic. Reserve and preserve only hard registers that survive final post-rewrite cleanup, and consider caller-saved registers for values that are not live across calls. This would make small leaf functions much closer to expected O1 output without requiring global optimization. -5. Continue tightening post-rewrite DCE and copy cleanup. +4. Continue tightening post-rewrite DCE and copy cleanup. Model hard-register call arguments and call clobbers precisely enough to delete dead caller-saved defs before calls without removing required ABI traffic. 
Also fold single-use arithmetic temporaries into their destination when target constraints allow it. -6. Keep compare-branch fusion covered by tests. +5. Keep compare-branch fusion covered by tests. The current probes show direct `cmp` plus branch shapes for branch-consuming compares on AArch64. Add focused regression coverage so the old `cmp; cset; cmp #0; b.cond` bridge does not return. -7. Keep tiny local constant simplification bounded. +6. Keep tiny local constant simplification bounded. The vstack constfold path now removes obvious immediate and straight-line scalar-local code before allocation. Keep it basic-block-local; broader propagation belongs in the O2 SSA/value optimizer. -8. Watch spill-pressure regalloc slope. +7. Watch spill-pressure regalloc slope. Normal-path scaling is linear, but heavy spill pressure still bends slightly - in regalloc time. After the correctness crash is fixed, rerun the - nonconstant wide-local ladder and decide whether interval probing or stack - slot assignment needs another narrow cleanup. + in regalloc time. Rerun the nonconstant wide-local ladder and decide whether + interval probing or stack slot assignment needs another narrow cleanup. - Keep `opt_combine` legality target-aware. Existing one-use copy/immediate/convert folds should stay conservative. 
New diff --git a/src/arch/aa64/emit.c b/src/arch/aa64/emit.c @@ -287,9 +287,34 @@ static u32 aa_build_prologue(CGTarget *t, u32 *words, u32 cap, u32 frame_size, words[wi++] = aa64_sub_imm(1, 31, 31, (frame_size >> 12) & 0xfff, 1); words[wi++] = aa64_sub_imm(1, 31, 31, frame_size & 0xfff, 0); } - if (wi + 2 > cap) goto overflow; - words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off); - words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0); + if (fp_lr_off <= 504u) { + if (wi >= cap) goto overflow; + words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off); + } else { + if (wi + 2 > cap) goto overflow; + words[wi++] = aa64_str_uimm(3, 29, 31, fp_lr_off); + words[wi++] = aa64_str_uimm(3, 30, 31, fp_lr_off + 8u); + } + if (wi >= cap) goto overflow; + if (fp_lr_off <= 0xfffu) { + words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0); + } else if ((fp_lr_off >> 24) == 0) { + u32 hi = (fp_lr_off >> 12) & 0xfffu; + u32 lo = fp_lr_off & 0xfffu; + if (hi) { + words[wi++] = aa64_add_imm(1, 29, 31, hi, 1); + if (lo) { + if (wi >= cap) goto overflow; + words[wi++] = aa64_add_imm(1, 29, 29, lo, 0); + } + } else if (lo) { + words[wi++] = aa64_add_imm(1, 29, 31, lo, 0); + } + } else { + compiler_panic(t->c, a->loc, + "aarch64: fp/lr offset %u out of prologue range", + fp_lr_off); + } if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) { AASlot *s = aa64_slot_get(a, a->sret_ptr_slot); if (s) { @@ -395,7 +420,12 @@ void aa_func_end(CGTarget *t) { u32 r0 = int_regs[i]; aa64_emit32(mc, aa64_ldr_uimm(3, r0, 31, int_save_off + (u32)i * 8u)); } - aa64_emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off)); + if (fp_lr_off <= 504u) { + aa64_emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off)); + } else { + aa64_emit32(mc, aa64_ldr_uimm(3, 29, 31, fp_lr_off)); + aa64_emit32(mc, aa64_ldr_uimm(3, 30, 31, fp_lr_off + 8u)); + } emit_sp_add(mc, frame_size); aa64_emit32(mc, aa64_ret(AA64_LR)); @@ -487,29 +517,34 @@ void aa_param(CGTarget *t, const CGParamDesc *p) { } else { u32 caller_off = 
a->next_param_stack; a->next_param_stack += 8; - aa64_emit32(t->mc, aa64_ldur(3, AA_TMP0, 29, (i32)(16 + caller_off))); + aa64_emit_ldur_off(t->mc, 3, AA_TMP0, 29, (i32)(16 + caller_off), + AA_TMP0); ptr_reg = AA_TMP0; } u32 nbytes = s->size; u32 i = 0; while (i + 8 <= nbytes) { - aa64_emit32(t->mc, aa64_ldur(3, AA_TMP1, ptr_reg, (i32)i)); - aa64_emit32(t->mc, aa64_stur(3, AA_TMP1, 29, -(i32)s->off + (i32)i)); + aa64_emit_ldur_off(t->mc, 3, AA_TMP1, ptr_reg, (i32)i, AA_TMP2); + aa64_emit_stur_off(t->mc, 3, AA_TMP1, 29, -(i32)s->off + (i32)i, + AA_TMP2); i += 8; } while (i + 4 <= nbytes) { - aa64_emit32(t->mc, aa64_ldur(2, AA_TMP1, ptr_reg, (i32)i)); - aa64_emit32(t->mc, aa64_stur(2, AA_TMP1, 29, -(i32)s->off + (i32)i)); + aa64_emit_ldur_off(t->mc, 2, AA_TMP1, ptr_reg, (i32)i, AA_TMP2); + aa64_emit_stur_off(t->mc, 2, AA_TMP1, 29, -(i32)s->off + (i32)i, + AA_TMP2); i += 4; } while (i + 2 <= nbytes) { - aa64_emit32(t->mc, aa64_ldur(1, AA_TMP1, ptr_reg, (i32)i)); - aa64_emit32(t->mc, aa64_stur(1, AA_TMP1, 29, -(i32)s->off + (i32)i)); + aa64_emit_ldur_off(t->mc, 1, AA_TMP1, ptr_reg, (i32)i, AA_TMP2); + aa64_emit_stur_off(t->mc, 1, AA_TMP1, 29, -(i32)s->off + (i32)i, + AA_TMP2); i += 2; } while (i < nbytes) { - aa64_emit32(t->mc, aa64_ldur(0, AA_TMP1, ptr_reg, (i32)i)); - aa64_emit32(t->mc, aa64_stur(0, AA_TMP1, 29, -(i32)s->off + (i32)i)); + aa64_emit_ldur_off(t->mc, 0, AA_TMP1, ptr_reg, (i32)i, AA_TMP2); + aa64_emit_stur_off(t->mc, 0, AA_TMP1, 29, -(i32)s->off + (i32)i, + AA_TMP2); i += 1; } return; @@ -523,28 +558,28 @@ void aa_param(CGTarget *t, const CGParamDesc *p) { if (pt->cls == ABI_CLASS_INT) { if (a->next_param_int < 8) { u32 reg = a->next_param_int++; - aa64_emit32(t->mc, - aa64_stur(sidx, reg, 29, -(i32)s->off + (i32)part_off)); + aa64_emit_stur_off(t->mc, sidx, reg, 29, + -(i32)s->off + (i32)part_off, AA_TMP0); } else { u32 caller_off = a->next_param_stack; a->next_param_stack += 8; - aa64_emit32(t->mc, - aa64_ldur(sidx, AA_TMP0, 29, (i32)(16 + 
caller_off))); - aa64_emit32(t->mc, - aa64_stur(sidx, AA_TMP0, 29, -(i32)s->off + (i32)part_off)); + aa64_emit_ldur_off(t->mc, sidx, AA_TMP0, 29, + (i32)(16 + caller_off), AA_TMP0); + aa64_emit_stur_off(t->mc, sidx, AA_TMP0, 29, + -(i32)s->off + (i32)part_off, AA_TMP1); } } else if (pt->cls == ABI_CLASS_FP) { if (a->next_param_fp < 8) { u32 reg = a->next_param_fp++; - aa64_emit32(t->mc, - aa64_stur_fp(sidx, reg, 29, -(i32)s->off + (i32)part_off)); + aa64_emit_stur_fp_off(t->mc, sidx, reg, 29, + -(i32)s->off + (i32)part_off, AA_TMP0); } else { u32 caller_off = a->next_param_stack; a->next_param_stack += 8; - aa64_emit32(t->mc, - aa64_ldur_fp(sidx, AA_FP_TMP0, 29, (i32)(16 + caller_off))); - aa64_emit32(t->mc, aa64_stur_fp(sidx, AA_FP_TMP0, 29, - -(i32)s->off + (i32)part_off)); + aa64_emit_ldur_fp_off(t->mc, sidx, AA_FP_TMP0, 29, + (i32)(16 + caller_off), AA_TMP0); + aa64_emit_stur_fp_off(t->mc, sidx, AA_FP_TMP0, 29, + -(i32)s->off + (i32)part_off, AA_TMP0); } } else { compiler_panic(t->c, a->loc, "aarch64 param: ABI class %d unimpl", @@ -623,3 +658,47 @@ void aa64_emit_addr_adjust(MCEmitter *mc, u32 Rd, u32 base, i32 off) { aa64_emit_load_imm(mc, 1, Rd, off); aa64_emit32(mc, aa64_add(1, Rd, base, Rd)); } + +static int aa64_simm9_fits(i32 off) { + return off >= -256 && off <= 255; +} + +void aa64_emit_ldur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, + u32 tmp) { + if (aa64_simm9_fits(off)) { + aa64_emit32(mc, aa64_ldur(size, Rt, Rn, off)); + return; + } + aa64_emit_addr_adjust(mc, tmp, Rn, off); + aa64_emit32(mc, aa64_ldur(size, Rt, tmp, 0)); +} + +void aa64_emit_stur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, + u32 tmp) { + if (aa64_simm9_fits(off)) { + aa64_emit32(mc, aa64_stur(size, Rt, Rn, off)); + return; + } + aa64_emit_addr_adjust(mc, tmp, Rn, off); + aa64_emit32(mc, aa64_stur(size, Rt, tmp, 0)); +} + +void aa64_emit_ldur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, + u32 tmp) { + if (aa64_simm9_fits(off)) { + aa64_emit32(mc, 
aa64_ldur_fp(size, Rt, Rn, off)); + return; + } + aa64_emit_addr_adjust(mc, tmp, Rn, off); + aa64_emit32(mc, aa64_ldur_fp(size, Rt, tmp, 0)); +} + +void aa64_emit_stur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, + u32 tmp) { + if (aa64_simm9_fits(off)) { + aa64_emit32(mc, aa64_stur_fp(size, Rt, Rn, off)); + return; + } + aa64_emit_addr_adjust(mc, tmp, Rn, off); + aa64_emit32(mc, aa64_stur_fp(size, Rt, tmp, 0)); +} diff --git a/src/arch/aa64/internal.h b/src/arch/aa64/internal.h @@ -202,8 +202,9 @@ static inline u32 aa64_bfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) { * ============================================================ */ #define AA_PROLOGUE_WORDS \ - 23u /* worst case: sub sp + stp/add fp + sret + 10 int + 8 fp saves */ -#define AA_PROLOGUE_FRAME_WORDS 4u /* worst-case frame adjust + stp/add fp */ + 25u /* worst case: sub sp + str/str/add-add fp + sret + 10 int + 8 fp */ +#define AA_PROLOGUE_FRAME_WORDS \ + 6u /* worst-case frame adjust + split fp/lr saves + add-add fp */ typedef struct AASlot { u32 off; @@ -281,6 +282,14 @@ void aa64_patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word); void aa64_emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm); void emit_sp_add(MCEmitter* mc, u32 imm); void aa64_emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off); +void aa64_emit_ldur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, + u32 tmp); +void aa64_emit_stur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, + u32 tmp); +void aa64_emit_ldur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, + u32 tmp); +void aa64_emit_stur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off, + u32 tmp); void aa64_emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym); void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend); diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c @@ -839,7 +839,7 @@ static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi, AASlot* s = aa64_slot_get(a, 
av->storage.v.frame_slot); if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad arg slot"); i32 off = -(i32)s->off + (i32)pt->src_offset; - aa64_emit32(t->mc, aa64_ldur(sidx, dst_reg, 29, off)); + aa64_emit_ldur_off(t->mc, sidx, dst_reg, 29, off, dst_reg); break; } case OPK_INDIRECT: { @@ -1095,22 +1095,26 @@ static void aa_ret(CGTarget* t, const CGABIValue* val) { u32 nbytes = s->size; u32 i = 0; while (i + 8 <= nbytes) { - aa64_emit32(mc, aa64_ldur(3, AA_TMP0, 29, -(i32)s->off + (i32)i)); + aa64_emit_ldur_off(mc, 3, AA_TMP0, 29, -(i32)s->off + (i32)i, + AA_TMP0); aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i)); i += 8; } while (i + 4 <= nbytes) { - aa64_emit32(mc, aa64_ldur(2, AA_TMP0, 29, -(i32)s->off + (i32)i)); + aa64_emit_ldur_off(mc, 2, AA_TMP0, 29, -(i32)s->off + (i32)i, + AA_TMP0); aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i)); i += 4; } while (i + 2 <= nbytes) { - aa64_emit32(mc, aa64_ldur(1, AA_TMP0, 29, -(i32)s->off + (i32)i)); + aa64_emit_ldur_off(mc, 1, AA_TMP0, 29, -(i32)s->off + (i32)i, + AA_TMP0); aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i)); i += 2; } while (i < nbytes) { - aa64_emit32(mc, aa64_ldur(0, AA_TMP0, 29, -(i32)s->off + (i32)i)); + aa64_emit_ldur_off(mc, 0, AA_TMP0, 29, -(i32)s->off + (i32)i, + AA_TMP0); aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i)); i += 1; } @@ -1128,22 +1132,26 @@ static void aa_ret(CGTarget* t, const CGABIValue* val) { i32 base_off = val->storage.v.ind.ofs; u32 i = 0; while (i + 8 <= nbytes) { - aa64_emit32(mc, aa64_ldur(3, AA_TMP0, base_reg, base_off + (i32)i)); + aa64_emit_ldur_off(mc, 3, AA_TMP0, base_reg, base_off + (i32)i, + AA_TMP0); aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i)); i += 8; } while (i + 4 <= nbytes) { - aa64_emit32(mc, aa64_ldur(2, AA_TMP0, base_reg, base_off + (i32)i)); + aa64_emit_ldur_off(mc, 2, AA_TMP0, base_reg, base_off + (i32)i, + AA_TMP0); aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i)); i += 4; } while (i + 2 <= nbytes) { - aa64_emit32(mc, aa64_ldur(1, AA_TMP0, base_reg, 
base_off + (i32)i)); + aa64_emit_ldur_off(mc, 1, AA_TMP0, base_reg, base_off + (i32)i, + AA_TMP0); aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i)); i += 2; } while (i < nbytes) { - aa64_emit32(mc, aa64_ldur(0, AA_TMP0, base_reg, base_off + (i32)i)); + aa64_emit_ldur_off(mc, 0, AA_TMP0, base_reg, base_off + (i32)i, + AA_TMP0); aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i)); i += 1; } @@ -1182,9 +1190,9 @@ static void aa_ret(CGTarget* t, const CGABIValue* val) { u32 sidx = size_idx_for_bytes(pt->size); i32 off = base_off + (i32)pt->src_offset; if (pt->cls == ABI_CLASS_INT) { - aa64_emit32(mc, aa64_ldur(sidx, /*Rt=*/i, base_reg, off)); + aa64_emit_ldur_off(mc, sidx, /*Rt=*/i, base_reg, off, AA_TMP0); } else if (pt->cls == ABI_CLASS_FP) { - aa64_emit32(mc, aa64_ldur_fp(sidx, /*Rt=*/i, base_reg, off)); + aa64_emit_ldur_fp_off(mc, sidx, /*Rt=*/i, base_reg, off, AA_TMP0); } else { compiler_panic(t->c, a->loc, "aarch64 ret: ret part cls %d unimpl", (int)pt->cls); diff --git a/test/opt/phase0_guardrails.sh b/test/opt/phase0_guardrails.sh @@ -100,6 +100,37 @@ write_large_straight_line() { } >"$TMP/large_straight_line.c" } +write_many_stack_args_o1() { + local nargs=96 + local expected=0 + { + printf 'static int sink_many(' + for i in $(seq 0 $((nargs - 1))); do + if [ "$i" -gt 0 ]; then printf ','; fi + printf 'int a%d' "$i" + done + printf ') {\n int s = 0;\n' + for i in $(seq 0 $((nargs - 1))); do + local p=$(( (i * 17 + 3) % 101 + 1 )) + expected=$(( (expected + ((5 + i) & 255) * p) & 255 )) + printf ' s = (s + a%d * %d) & 255;\n' "$i" "$p" + done + printf ' return s;\n}\n' + printf 'int main() {\n' + for i in $(seq 0 $((nargs - 1))); do + printf ' int v%d = (5 + %d) & 255;\n' "$i" "$i" + done + printf ' int s = sink_many(' + for i in $(seq 0 $((nargs - 1))); do + if [ "$i" -gt 0 ]; then printf ','; fi + printf 'v%d' "$i" + done + printf ');\n' + printf ' return s == %d ? 
0 : 1;\n' "$expected" + printf '}\n' + } >"$TMP/many_stack_args_o1.c" +} + run_case() { local name="$1" local src="$2" @@ -109,6 +140,13 @@ run_case() { printf 'phase0 %-24s O0/O1 OK\n' "$name" } +run_o1_case() { + local name="$1" + local src="$2" + "$BIN" run -O1 "$src" >/dev/null + printf 'phase0 %-24s O1 OK\n' "$name" +} + check_metrics() { local src="$TMP/branch_liveness.c" local err="$TMP/metrics.err" @@ -160,6 +198,7 @@ write_late_addrof_join write_spills write_many_small_functions write_large_straight_line +write_many_stack_args_o1 run_case branch_liveness "$TMP/branch_liveness.c" run_case call_clobber "$TMP/call_clobber.c" @@ -168,6 +207,7 @@ run_case late_addrof_join "$TMP/late_addrof_join.c" run_case spills "$TMP/spills.c" run_case many_small_functions "$TMP/many_small_functions.c" run_case large_straight_line "$TMP/large_straight_line.c" +run_o1_case many_stack_args "$TMP/many_stack_args_o1.c" check_metrics printf 'phase0 identified inline-asm stress: test/parse/cases/asm_01_grammar.c\n'