commit a7442212be4d2e3a72cad198038e1f75103ac0bb
parent e3cc40502d041ed5d803683ac0afc4edf7b641ef
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 15 May 2026 14:51:25 -0700
Fix AArch64 large frame offset emission
Diffstat:
5 files changed, 183 insertions(+), 53 deletions(-)
diff --git a/doc/OPT1.md b/doc/OPT1.md
@@ -287,18 +287,13 @@ mov w19, w21
MIR's O1 path suggests these high-value local cleanups that still fit cfree's
fast tier:
-1. Reduce and fix the spill-heavy JIT/runtime crash.
- The nonconstant wide-local spill probe returned correctly for many sizes but
- segfaulted at nearby large sizes. Isolate this to a small parse or CG API
- testcase before doing more spill-pressure perf work.
-
-2. Clean up local branch layout artifacts.
+1. Clean up local branch layout artifacts.
MIR's full jump optimizer is O2-only, but its cheap pieces are appropriate
for O1: delete branches to immediate fallthrough blocks, forward
branch-to-branch targets, and invert a branch when it removes an
unconditional jump. Avoid full CFG layout work.
-3. Promote remaining scalar entry slots before backend allocation.
+2. Promote remaining scalar entry slots before backend allocation.
MIR's C frontend represents normal scalar block locals as MIR registers and
leaves stack slots for aggregates, forced-stack cases, and address-taken
values. O1 now keeps simple loop locals in registers in the probe, but still
@@ -306,33 +301,32 @@ fast tier:
pass should promote remaining integer/pointer scalars whose address does not
escape, starting with parameters and single-entry structured control flow.
-4. Avoid unnecessary callee-save traffic.
+3. Avoid unnecessary callee-save traffic.
Reserve and preserve only hard registers that survive final post-rewrite
cleanup, and consider caller-saved registers for values that are not live
across calls. This would make small leaf functions much closer to expected
O1 output without requiring global optimization.
-5. Continue tightening post-rewrite DCE and copy cleanup.
+4. Continue tightening post-rewrite DCE and copy cleanup.
Model hard-register call arguments and call clobbers precisely enough to
delete dead caller-saved defs before calls without removing required ABI
traffic. Also fold single-use arithmetic temporaries into their destination
when target constraints allow it.
-6. Keep compare-branch fusion covered by tests.
+5. Keep compare-branch fusion covered by tests.
The current probes show direct `cmp` plus branch shapes for branch-consuming
compares on AArch64. Add focused regression coverage so the old
`cmp; cset; cmp #0; b.cond` bridge does not return.
-7. Keep tiny local constant simplification bounded.
+6. Keep tiny local constant simplification bounded.
The vstack constfold path now removes obvious immediate and straight-line
scalar-local code before allocation. Keep it basic-block-local; broader
propagation belongs in the O2 SSA/value optimizer.
-8. Watch spill-pressure regalloc slope.
+7. Watch spill-pressure regalloc slope.
Normal-path scaling is linear, but heavy spill pressure still bends slightly
- in regalloc time. After the correctness crash is fixed, rerun the
- nonconstant wide-local ladder and decide whether interval probing or stack
- slot assignment needs another narrow cleanup.
+ in regalloc time. Rerun the nonconstant wide-local ladder and decide whether
+ interval probing or stack slot assignment needs another narrow cleanup.
- Keep `opt_combine` legality target-aware.
Existing one-use copy/immediate/convert folds should stay conservative. New
diff --git a/src/arch/aa64/emit.c b/src/arch/aa64/emit.c
@@ -287,9 +287,34 @@ static u32 aa_build_prologue(CGTarget *t, u32 *words, u32 cap, u32 frame_size,
words[wi++] = aa64_sub_imm(1, 31, 31, (frame_size >> 12) & 0xfff, 1);
words[wi++] = aa64_sub_imm(1, 31, 31, frame_size & 0xfff, 0);
}
- if (wi + 2 > cap) goto overflow;
- words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off);
- words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0);
+ if (fp_lr_off <= 504u) {
+ if (wi >= cap) goto overflow;
+ words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off);
+ } else {
+ if (wi + 2 > cap) goto overflow;
+ words[wi++] = aa64_str_uimm(3, 29, 31, fp_lr_off);
+ words[wi++] = aa64_str_uimm(3, 30, 31, fp_lr_off + 8u);
+ }
+ if (wi >= cap) goto overflow;
+ if (fp_lr_off <= 0xfffu) {
+ words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0);
+ } else if ((fp_lr_off >> 24) == 0) {
+ u32 hi = (fp_lr_off >> 12) & 0xfffu;
+ u32 lo = fp_lr_off & 0xfffu;
+ if (hi) {
+ words[wi++] = aa64_add_imm(1, 29, 31, hi, 1);
+ if (lo) {
+ if (wi >= cap) goto overflow;
+ words[wi++] = aa64_add_imm(1, 29, 29, lo, 0);
+ }
+ } else if (lo) {
+ words[wi++] = aa64_add_imm(1, 29, 31, lo, 0);
+ }
+ } else {
+ compiler_panic(t->c, a->loc,
+ "aarch64: fp/lr offset %u out of prologue range",
+ fp_lr_off);
+ }
if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) {
AASlot *s = aa64_slot_get(a, a->sret_ptr_slot);
if (s) {
@@ -395,7 +420,12 @@ void aa_func_end(CGTarget *t) {
u32 r0 = int_regs[i];
aa64_emit32(mc, aa64_ldr_uimm(3, r0, 31, int_save_off + (u32)i * 8u));
}
- aa64_emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off));
+ if (fp_lr_off <= 504u) {
+ aa64_emit32(mc, aa64_ldp_x(29, 30, 31, (i32)fp_lr_off));
+ } else {
+ aa64_emit32(mc, aa64_ldr_uimm(3, 29, 31, fp_lr_off));
+ aa64_emit32(mc, aa64_ldr_uimm(3, 30, 31, fp_lr_off + 8u));
+ }
emit_sp_add(mc, frame_size);
aa64_emit32(mc, aa64_ret(AA64_LR));
@@ -487,29 +517,34 @@ void aa_param(CGTarget *t, const CGParamDesc *p) {
} else {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
- aa64_emit32(t->mc, aa64_ldur(3, AA_TMP0, 29, (i32)(16 + caller_off)));
+ aa64_emit_ldur_off(t->mc, 3, AA_TMP0, 29, (i32)(16 + caller_off),
+ AA_TMP0);
ptr_reg = AA_TMP0;
}
u32 nbytes = s->size;
u32 i = 0;
while (i + 8 <= nbytes) {
- aa64_emit32(t->mc, aa64_ldur(3, AA_TMP1, ptr_reg, (i32)i));
- aa64_emit32(t->mc, aa64_stur(3, AA_TMP1, 29, -(i32)s->off + (i32)i));
+ aa64_emit_ldur_off(t->mc, 3, AA_TMP1, ptr_reg, (i32)i, AA_TMP2);
+ aa64_emit_stur_off(t->mc, 3, AA_TMP1, 29, -(i32)s->off + (i32)i,
+ AA_TMP2);
i += 8;
}
while (i + 4 <= nbytes) {
- aa64_emit32(t->mc, aa64_ldur(2, AA_TMP1, ptr_reg, (i32)i));
- aa64_emit32(t->mc, aa64_stur(2, AA_TMP1, 29, -(i32)s->off + (i32)i));
+ aa64_emit_ldur_off(t->mc, 2, AA_TMP1, ptr_reg, (i32)i, AA_TMP2);
+ aa64_emit_stur_off(t->mc, 2, AA_TMP1, 29, -(i32)s->off + (i32)i,
+ AA_TMP2);
i += 4;
}
while (i + 2 <= nbytes) {
- aa64_emit32(t->mc, aa64_ldur(1, AA_TMP1, ptr_reg, (i32)i));
- aa64_emit32(t->mc, aa64_stur(1, AA_TMP1, 29, -(i32)s->off + (i32)i));
+ aa64_emit_ldur_off(t->mc, 1, AA_TMP1, ptr_reg, (i32)i, AA_TMP2);
+ aa64_emit_stur_off(t->mc, 1, AA_TMP1, 29, -(i32)s->off + (i32)i,
+ AA_TMP2);
i += 2;
}
while (i < nbytes) {
- aa64_emit32(t->mc, aa64_ldur(0, AA_TMP1, ptr_reg, (i32)i));
- aa64_emit32(t->mc, aa64_stur(0, AA_TMP1, 29, -(i32)s->off + (i32)i));
+ aa64_emit_ldur_off(t->mc, 0, AA_TMP1, ptr_reg, (i32)i, AA_TMP2);
+ aa64_emit_stur_off(t->mc, 0, AA_TMP1, 29, -(i32)s->off + (i32)i,
+ AA_TMP2);
i += 1;
}
return;
@@ -523,28 +558,28 @@ void aa_param(CGTarget *t, const CGParamDesc *p) {
if (pt->cls == ABI_CLASS_INT) {
if (a->next_param_int < 8) {
u32 reg = a->next_param_int++;
- aa64_emit32(t->mc,
- aa64_stur(sidx, reg, 29, -(i32)s->off + (i32)part_off));
+ aa64_emit_stur_off(t->mc, sidx, reg, 29,
+ -(i32)s->off + (i32)part_off, AA_TMP0);
} else {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
- aa64_emit32(t->mc,
- aa64_ldur(sidx, AA_TMP0, 29, (i32)(16 + caller_off)));
- aa64_emit32(t->mc,
- aa64_stur(sidx, AA_TMP0, 29, -(i32)s->off + (i32)part_off));
+ aa64_emit_ldur_off(t->mc, sidx, AA_TMP0, 29,
+ (i32)(16 + caller_off), AA_TMP0);
+ aa64_emit_stur_off(t->mc, sidx, AA_TMP0, 29,
+ -(i32)s->off + (i32)part_off, AA_TMP1);
}
} else if (pt->cls == ABI_CLASS_FP) {
if (a->next_param_fp < 8) {
u32 reg = a->next_param_fp++;
- aa64_emit32(t->mc,
- aa64_stur_fp(sidx, reg, 29, -(i32)s->off + (i32)part_off));
+ aa64_emit_stur_fp_off(t->mc, sidx, reg, 29,
+ -(i32)s->off + (i32)part_off, AA_TMP0);
} else {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
- aa64_emit32(t->mc,
- aa64_ldur_fp(sidx, AA_FP_TMP0, 29, (i32)(16 + caller_off)));
- aa64_emit32(t->mc, aa64_stur_fp(sidx, AA_FP_TMP0, 29,
- -(i32)s->off + (i32)part_off));
+ aa64_emit_ldur_fp_off(t->mc, sidx, AA_FP_TMP0, 29,
+ (i32)(16 + caller_off), AA_TMP0);
+ aa64_emit_stur_fp_off(t->mc, sidx, AA_FP_TMP0, 29,
+ -(i32)s->off + (i32)part_off, AA_TMP0);
}
} else {
compiler_panic(t->c, a->loc, "aarch64 param: ABI class %d unimpl",
@@ -623,3 +658,47 @@ void aa64_emit_addr_adjust(MCEmitter *mc, u32 Rd, u32 base, i32 off) {
aa64_emit_load_imm(mc, 1, Rd, off);
aa64_emit32(mc, aa64_add(1, Rd, base, Rd));
}
+
+static int aa64_simm9_fits(i32 off) { /* nonzero iff off lies in the signed 9-bit range [-256, 255] */
+ return off >= -256 && off <= 255;
+}
+
+void aa64_emit_ldur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
+ u32 tmp) { /* emit an aa64_ldur of Rt from Rn + off; tmp is only written when off does not fit */
+ if (aa64_simm9_fits(off)) { /* in range: one instruction, tmp untouched */
+ aa64_emit32(mc, aa64_ldur(size, Rt, Rn, off));
+ return;
+ }
+ aa64_emit_addr_adjust(mc, tmp, Rn, off); /* callers pass tmp == Rt here; NOTE(review): tmp == Rn looks unsafe, confirm */
+ aa64_emit32(mc, aa64_ldur(size, Rt, tmp, 0)); /* access through tmp with a zero offset */
+}
+
+void aa64_emit_stur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
+ u32 tmp) { /* emit an aa64_stur of Rt to Rn + off; tmp is only written when off does not fit */
+ if (aa64_simm9_fits(off)) { /* in range: one instruction, tmp untouched */
+ aa64_emit32(mc, aa64_stur(size, Rt, Rn, off));
+ return;
+ }
+ aa64_emit_addr_adjust(mc, tmp, Rn, off); /* NOTE(review): tmp is written before Rt is stored, so tmp must not alias Rt */
+ aa64_emit32(mc, aa64_stur(size, Rt, tmp, 0)); /* access through tmp with a zero offset */
+}
+
+void aa64_emit_ldur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
+ u32 tmp) { /* aa64_ldur_fp of Rt from Rn + off; presumably Rt is an FP register number - confirm */
+ if (aa64_simm9_fits(off)) { /* in range: one instruction, tmp untouched */
+ aa64_emit32(mc, aa64_ldur_fp(size, Rt, Rn, off));
+ return;
+ }
+ aa64_emit_addr_adjust(mc, tmp, Rn, off); /* tmp receives the adjusted address and becomes the base below */
+ aa64_emit32(mc, aa64_ldur_fp(size, Rt, tmp, 0)); /* access through tmp with a zero offset */
+}
+
+void aa64_emit_stur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
+ u32 tmp) { /* aa64_stur_fp of Rt to Rn + off; presumably Rt is an FP register number - confirm */
+ if (aa64_simm9_fits(off)) { /* in range: one instruction, tmp untouched */
+ aa64_emit32(mc, aa64_stur_fp(size, Rt, Rn, off));
+ return;
+ }
+ aa64_emit_addr_adjust(mc, tmp, Rn, off); /* tmp receives the adjusted address and becomes the base below */
+ aa64_emit32(mc, aa64_stur_fp(size, Rt, tmp, 0)); /* access through tmp with a zero offset */
+}
diff --git a/src/arch/aa64/internal.h b/src/arch/aa64/internal.h
@@ -202,8 +202,9 @@ static inline u32 aa64_bfm(u32 sf, u32 Rd, u32 Rn, u32 immr, u32 imms) {
* ============================================================ */
#define AA_PROLOGUE_WORDS \
- 23u /* worst case: sub sp + stp/add fp + sret + 10 int + 8 fp saves */
-#define AA_PROLOGUE_FRAME_WORDS 4u /* worst-case frame adjust + stp/add fp */
+ 25u /* worst case: sub sp + str/str/add-add fp + sret + 10 int + 8 fp */
+#define AA_PROLOGUE_FRAME_WORDS \
+ 6u /* worst-case frame adjust + split fp/lr saves + add-add fp */
typedef struct AASlot {
u32 off;
@@ -281,6 +282,14 @@ void aa64_patch32(ObjBuilder* obj, u32 sec_id, u32 ofs, u32 word);
void aa64_emit_load_imm(MCEmitter* mc, u32 sf, u32 Rd, i64 imm);
void emit_sp_add(MCEmitter* mc, u32 imm);
void aa64_emit_addr_adjust(MCEmitter* mc, u32 Rd, u32 base, i32 off);
+void aa64_emit_ldur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
+ u32 tmp);
+void aa64_emit_stur_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
+ u32 tmp);
+void aa64_emit_ldur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
+ u32 tmp);
+void aa64_emit_stur_fp_off(MCEmitter* mc, u32 size, u32 Rt, u32 Rn, i32 off,
+ u32 tmp);
void aa64_emit_got_load_addr(CGTarget* t, u32 dst_reg, ObjSymId sym);
void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend);
diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c
@@ -839,7 +839,7 @@ static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi,
AASlot* s = aa64_slot_get(a, av->storage.v.frame_slot);
if (!s) compiler_panic(t->c, a->loc, "aarch64 call: bad arg slot");
i32 off = -(i32)s->off + (i32)pt->src_offset;
- aa64_emit32(t->mc, aa64_ldur(sidx, dst_reg, 29, off));
+ aa64_emit_ldur_off(t->mc, sidx, dst_reg, 29, off, dst_reg);
break;
}
case OPK_INDIRECT: {
@@ -1095,22 +1095,26 @@ static void aa_ret(CGTarget* t, const CGABIValue* val) {
u32 nbytes = s->size;
u32 i = 0;
while (i + 8 <= nbytes) {
- aa64_emit32(mc, aa64_ldur(3, AA_TMP0, 29, -(i32)s->off + (i32)i));
+ aa64_emit_ldur_off(mc, 3, AA_TMP0, 29, -(i32)s->off + (i32)i,
+ AA_TMP0);
aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i));
i += 8;
}
while (i + 4 <= nbytes) {
- aa64_emit32(mc, aa64_ldur(2, AA_TMP0, 29, -(i32)s->off + (i32)i));
+ aa64_emit_ldur_off(mc, 2, AA_TMP0, 29, -(i32)s->off + (i32)i,
+ AA_TMP0);
aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i));
i += 4;
}
while (i + 2 <= nbytes) {
- aa64_emit32(mc, aa64_ldur(1, AA_TMP0, 29, -(i32)s->off + (i32)i));
+ aa64_emit_ldur_off(mc, 1, AA_TMP0, 29, -(i32)s->off + (i32)i,
+ AA_TMP0);
aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i));
i += 2;
}
while (i < nbytes) {
- aa64_emit32(mc, aa64_ldur(0, AA_TMP0, 29, -(i32)s->off + (i32)i));
+ aa64_emit_ldur_off(mc, 0, AA_TMP0, 29, -(i32)s->off + (i32)i,
+ AA_TMP0);
aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i));
i += 1;
}
@@ -1128,22 +1132,26 @@ static void aa_ret(CGTarget* t, const CGABIValue* val) {
i32 base_off = val->storage.v.ind.ofs;
u32 i = 0;
while (i + 8 <= nbytes) {
- aa64_emit32(mc, aa64_ldur(3, AA_TMP0, base_reg, base_off + (i32)i));
+ aa64_emit_ldur_off(mc, 3, AA_TMP0, base_reg, base_off + (i32)i,
+ AA_TMP0);
aa64_emit32(mc, aa64_str_uimm(3, AA_TMP0, 8, i));
i += 8;
}
while (i + 4 <= nbytes) {
- aa64_emit32(mc, aa64_ldur(2, AA_TMP0, base_reg, base_off + (i32)i));
+ aa64_emit_ldur_off(mc, 2, AA_TMP0, base_reg, base_off + (i32)i,
+ AA_TMP0);
aa64_emit32(mc, aa64_str_uimm(2, AA_TMP0, 8, i));
i += 4;
}
while (i + 2 <= nbytes) {
- aa64_emit32(mc, aa64_ldur(1, AA_TMP0, base_reg, base_off + (i32)i));
+ aa64_emit_ldur_off(mc, 1, AA_TMP0, base_reg, base_off + (i32)i,
+ AA_TMP0);
aa64_emit32(mc, aa64_str_uimm(1, AA_TMP0, 8, i));
i += 2;
}
while (i < nbytes) {
- aa64_emit32(mc, aa64_ldur(0, AA_TMP0, base_reg, base_off + (i32)i));
+ aa64_emit_ldur_off(mc, 0, AA_TMP0, base_reg, base_off + (i32)i,
+ AA_TMP0);
aa64_emit32(mc, aa64_str_uimm(0, AA_TMP0, 8, i));
i += 1;
}
@@ -1182,9 +1190,9 @@ static void aa_ret(CGTarget* t, const CGABIValue* val) {
u32 sidx = size_idx_for_bytes(pt->size);
i32 off = base_off + (i32)pt->src_offset;
if (pt->cls == ABI_CLASS_INT) {
- aa64_emit32(mc, aa64_ldur(sidx, /*Rt=*/i, base_reg, off));
+ aa64_emit_ldur_off(mc, sidx, /*Rt=*/i, base_reg, off, AA_TMP0);
} else if (pt->cls == ABI_CLASS_FP) {
- aa64_emit32(mc, aa64_ldur_fp(sidx, /*Rt=*/i, base_reg, off));
+ aa64_emit_ldur_fp_off(mc, sidx, /*Rt=*/i, base_reg, off, AA_TMP0);
} else {
compiler_panic(t->c, a->loc, "aarch64 ret: ret part cls %d unimpl",
(int)pt->cls);
diff --git a/test/opt/phase0_guardrails.sh b/test/opt/phase0_guardrails.sh
@@ -100,6 +100,37 @@ write_large_straight_line() {
} >"$TMP/large_straight_line.c"
}
+write_many_stack_args_o1() { # writes $TMP/many_stack_args_o1.c: a 96-int-argument call whose result main checks
+ local nargs=96
+ local expected=0 # shell-side running value of what sink_many should return
+ {
+ printf 'static int sink_many('
+ for i in $(seq 0 $((nargs - 1))); do
+ if [ "$i" -gt 0 ]; then printf ','; fi
+ printf 'int a%d' "$i"
+ done
+ printf ') {\n int s = 0;\n'
+ for i in $(seq 0 $((nargs - 1))); do
+ local p=$(( (i * 17 + 3) % 101 + 1 )) # per-argument multiplier, always in 1..101
+ expected=$(( (expected + ((5 + i) & 255) * p) & 255 )) # same update the generated C applies to s
+ printf ' s = (s + a%d * %d) & 255;\n' "$i" "$p"
+ done
+ printf ' return s;\n}\n'
+ printf 'int main() {\n'
+ for i in $(seq 0 $((nargs - 1))); do
+ printf ' int v%d = (5 + %d) & 255;\n' "$i" "$i"
+ done
+ printf ' int s = sink_many('
+ for i in $(seq 0 $((nargs - 1))); do
+ if [ "$i" -gt 0 ]; then printf ','; fi
+ printf 'v%d' "$i"
+ done
+ printf ');\n'
+ printf ' return s == %d ? 0 : 1;\n' "$expected" # generated main returns 0 only when s matches
+ printf '}\n'
+ } >"$TMP/many_stack_args_o1.c"
+}
+
run_case() {
local name="$1"
local src="$2"
@@ -109,6 +140,13 @@ run_case() {
printf 'phase0 %-24s O0/O1 OK\n' "$name"
}
+run_o1_case() { # runs one source through "$BIN" run -O1 only, then prints a status line
+ local name="$1" # label shown in the status line
+ local src="$2" # path of the source file to run
+ "$BIN" run -O1 "$src" >/dev/null # stdout discarded; NOTE(review): failure only stops the script if errexit is on - confirm
+ printf 'phase0 %-24s O1 OK\n' "$name"
+}
+
check_metrics() {
local src="$TMP/branch_liveness.c"
local err="$TMP/metrics.err"
@@ -160,6 +198,7 @@ write_late_addrof_join
write_spills
write_many_small_functions
write_large_straight_line
+write_many_stack_args_o1
run_case branch_liveness "$TMP/branch_liveness.c"
run_case call_clobber "$TMP/call_clobber.c"
@@ -168,6 +207,7 @@ run_case late_addrof_join "$TMP/late_addrof_join.c"
run_case spills "$TMP/spills.c"
run_case many_small_functions "$TMP/many_small_functions.c"
run_case large_straight_line "$TMP/large_straight_line.c"
+run_o1_case many_stack_args "$TMP/many_stack_args_o1.c"
check_metrics
printf 'phase0 identified inline-asm stress: test/parse/cases/asm_01_grammar.c\n'