commit 38f83b0b93daa7403a21625ead1ffcaf1da37f20
parent 00b2d7e9dc31fd67c6fd1f2e6728ecbebfb0df21
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 29 May 2026 13:48:55 -0700
O1 aa64: fix two regalloc interference bugs + add third int scratch reg
Three independent -O1 (bootstrap-release) codegen bugs on aa64, found while
bringing up the -O1 self-build. Together they take `make bootstrap-release`
from failing on the first stage2 TU (src/api/asm_emit.c) to compiling and
linking a complete stage2 cfree that runs. -O0 still reproduces (stage2 ==
stage3) and toy is 1034/0.
1. opt_ranges_overlap_kind (pass_coalesce.c): measure overlap on the exact
raw_start/raw_end points, not the compressed start/end. range_compress_points
only keeps points that are a range boundary, so an interior instruction point
shared by two live values gets dropped — collapsing a genuine 2-point overlap
into a single compressed point that masquerades as the benign unit-overlap of
a coalescable move. This let the O1 hint fallback in opt_assign_ranges place a
live call result and a later x0-bound copy into the same hard reg (src/cg/control.c
block 18: `call def=v44` then `copy v46=v1`, both x0). The COPY/swap pattern the
unit-overlap is meant to permit is genuinely one *raw* point wide, so raw points
distinguish the two cases with no false positives.
2. apply_abi_aliasing_hints / opt_assign_ranges (pass_lower.c): never place a
value that is live across a call into the out-of-allocable-set hint reg when
that reg is caller-saved. The +1000 caller-save penalty in hard_reg_alloc_score
only deflects the hint when a cheaper reg is *found*; under high register
pressure (found == 0) the fallback took the hint reg regardless, parking a
cross-call value (x0-hinted via a copy chain from an earlier call result) in x0
where it collided with the next call's result (src/api/asm_emit.c: v38, live
across two calls, used in a successor block).
3. aa_int_scratch (aa64/native.c): three int scratch regs (x9/x10/x11), not two.
A 3-operand op whose dst and both sources all spill (e.g. `binop dst, a, b` with
a non-encodable immediate operand, or `store [base+index], value`) needs three
distinct scratch regs at emit time — the IR spill-rewrite round-robins operands
across this pool and the native emitter materializes each into one. With only two,
an all-spilled binop's immediate operand had nowhere to land (src/arch/mc.c,
src/link/link_reloc_layout.c). x11 was an allocable caller-saved temp; it is now reserved.
Remaining: the stage2 (-O1-self-compiled) cfree segfaults compiling stage3's
abi.c — a runtime -O1 miscompile, tracked separately.
Diffstat:
3 files changed, 34 insertions(+), 9 deletions(-)
diff --git a/src/arch/aa64/native.c b/src/arch/aa64/native.c
@@ -3315,7 +3315,14 @@ static void aa_bind_native_param(NativeTarget* t, const CGParamDesc* p,
* under register pressure, after which the prologue saves/restores them. */
static const Reg aa_int_allocable[] = {8u, 11u, 12u, 13u, 14u, 15u, 19u, 20u,
21u, 22u, 23u, 24u, 25u, 26u, 27u, 28u};
-static const Reg aa_int_scratch[] = {9u, 10u};
+/* Three int scratch registers, not two: a 3-operand op (e.g. `binop dst, a, b`
+ * or `store [base+index], value`) whose dst/sources all spill needs three
+ * distinct scratch regs at emit time — the IR spill-rewrite round-robins
+ * operands across this pool and the native emitter materializes each into one.
+ * With only two, an immediate operand of an all-spilled binop had nowhere to
+ * land. x9/x10/x11 are caller-saved temporaries, reserved in aa_int_phys
+ * below. NOTE(review): aa_int_allocable above still lists 11u — confirm. */
+static const Reg aa_int_scratch[] = {9u, 10u, 11u};
static const Reg aa_fp_allocable[] = {18u, 19u, 8u, 9u, 10u,
11u, 12u, 13u, 14u, 15u};
static const Reg aa_fp_scratch[] = {20u, 21u};
@@ -3363,7 +3370,7 @@ static const NativePhysRegInfo aa_int_phys[] = {
AA_PHYS_INT_ARG(4u), AA_PHYS_INT_ARG(5u),
AA_PHYS_INT_ARG(6u), AA_PHYS_INT_ARG(7u),
AA_PHYS_INT_ALLOC(8u), AA_PHYS_INT_RESERVED(9u),
- AA_PHYS_INT_RESERVED(10u), AA_PHYS_INT_ALLOC(11u),
+ AA_PHYS_INT_RESERVED(10u), AA_PHYS_INT_RESERVED(11u),
AA_PHYS_INT_ALLOC(12u), AA_PHYS_INT_ALLOC(13u),
AA_PHYS_INT_ALLOC(14u), AA_PHYS_INT_ALLOC(15u),
AA_PHYS_INT_RESERVED(16u), AA_PHYS_INT_RESERVED(17u),
diff --git a/src/opt/pass_coalesce.c b/src/opt/pass_coalesce.c
@@ -67,6 +67,16 @@ int opt_ranges_overlap_kind(const OptLiveRangeSet* ranges, PReg a, PReg b) {
* dst has multiple non-SSA defs whose live ranges each clip src — those
* are real conflicts and must block coalescing, otherwise we'd allocate
* dst into src's hard register and clobber src at the extra def. */
+ /* Overlap must be measured on the *raw* (pre-compression) point numbers,
+ * not the compressed start/end. range_compress_points only keeps points
+ * that are some range's boundary, so an interior instruction point shared
+ * by two live values can be dropped — collapsing a genuine multi-point
+ * overlap down to a single compressed point and masquerading as the benign
+ * unit-overlap of a coalescable move. The benign move/swap pattern
+ * (`dst = COPY src`, or `sub x0, x21, x0`) is genuinely one *raw* point
+ * wide: src is used and dst is defined at the same instruction. A value
+ * that stays live across the def of an unrelated value spans two or more
+ * raw points and is a real conflict. */
u32 unit_count = 0;
for (u32 ar = ranges->first_range_by_preg[a]; ar != OPT_RANGE_NONE;
ar = ranges->ranges[ar].next) {
@@ -74,9 +84,10 @@ int opt_ranges_overlap_kind(const OptLiveRangeSet* ranges, PReg a, PReg b) {
for (u32 br = ranges->first_range_by_preg[b]; br != OPT_RANGE_NONE;
br = ranges->ranges[br].next) {
const OptLiveRange* rb = &ranges->ranges[br];
- if (ra->start < rb->end && rb->start < ra->end) {
- u32 start = ra->start > rb->start ? ra->start : rb->start;
- u32 end = ra->end < rb->end ? ra->end : rb->end;
+ if (ra->raw_start < rb->raw_end && rb->raw_start < ra->raw_end) {
+ u32 start =
+ ra->raw_start > rb->raw_start ? ra->raw_start : rb->raw_start;
+ u32 end = ra->raw_end < rb->raw_end ? ra->raw_end : rb->raw_end;
if (end > start + 1u) return 2;
if (++unit_count > 1u) return 2;
}
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -1083,10 +1083,17 @@ static void opt_assign_ranges(Func* f, const OptLiveRangeSet* ranges,
* allocable set (e.g. x0 on aa64: reserved as the ABI ret reg, not in
* aa_int_allocable). Used by apply_abi_aliasing_hints to let an IR_CALL
* result PReg or IR_RET value PReg land directly in x0, eliding the
- * materialization move emit_call/emit_ret would otherwise emit. The
- * existing caller-save penalty in hard_reg_alloc_score (+1000) keeps
- * cross-call values away from x0; only short-lived PRegs benefit. */
- if (vi->preferred_hard_reg >= 0) {
+ * materialization move emit_call/emit_ret would otherwise emit. Only
+ * short-lived PRegs benefit: a value live across a call cannot survive in
+ * a caller-saved hint reg, and this is the *only* path that can reach an
+ * out-of-allocable-set reg, so guard it explicitly. The +1000 caller-save
+ * penalty in hard_reg_alloc_score only deflects the hint when a cheaper
+ * reg is found; under high register pressure (found == 0) the fallback
+ * below would otherwise take the hint reg regardless of score, parking a
+ * cross-call value in x0 where it collides with the next call's result. */
+ if (vi->preferred_hard_reg >= 0 &&
+ !(vi->live_across_call_freq &&
+ is_caller_saved(f, cls, (Reg)vi->preferred_hard_reg))) {
Reg hint = (Reg)vi->preferred_hard_reg;
int already_tried = 0;
for (u32 r = 0; r < f->opt_hard_reg_count[cls]; ++r) {