commit d6525a7acdd2895269b80ec6084546262d3e2a34
parent 6da4e5eab3c298143aed3153325ca03e27c19d4a
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 29 May 2026 19:07:06 -0700
Fix O1 bootstrap scratch register hazards
Diffstat:
3 files changed, 160 insertions(+), 185 deletions(-)
diff --git a/doc/BOOTSTRAP_O1.md b/doc/BOOTSTRAP_O1.md
@@ -1,202 +1,163 @@
-# 3-Stage Bootstrap: state & open bugs
+# 3-Stage Bootstrap: O0/O1 Fixed Point
-The bootstrap compiles cfree with itself three times and checks the result is a
-fixed point:
+The bootstrap builds cfree with itself three times and requires the last two
+stages to be byte-identical:
-- **stage1** = the host (clang/asan) build of cfree (`build/cfree`, copied to
- `build/<mode>/bootstrap/stage1`).
-- **stage1 compiles stage2**, **stage2 compiles stage3**, then `cmp stage2 stage3`.
+- **stage1** = the host-built cfree copied into `build/<mode>/bootstrap/stage1`.
+- **stage2** = stage1 compiling the full tree.
+- **stage3** = stage2 compiling the full tree.
+- The fixed-point check is `cmp stage2/cfree stage3/cfree`.
-```
-make bootstrap-debug # -O0 path (HOST_OPTFLAGS=-O0)
-make bootstrap-release # -O1 path (Makefile sets HOST_OPTFLAGS=-O1 inside RELEASE=1)
-make bootstrap-test-toy # bootstrap-debug, then run test/toy against stage3
-```
+On aarch64-macos, both the debug/O0 and release/O1 bootstrap paths now reach
+the fixed point, and both bootstrapped compilers run the full Toy corpus.
-Host = aarch64-macos (aa64 backend, the `-O1` native-emit path:
-`src/opt/pass_native_emit.c` + `src/arch/aa64/native.c`).
+## Current Status
----
+Commands verified:
-## -O0 — DONE (reproduces)
-
-`make bootstrap-debug` reproduces: `stage2 == stage3` (identical sha256), toy
-**1034 pass / 0 fail / 8 skip**.
+```sh
+gmake -s bootstrap-debug
+gmake -s bootstrap-release
+gmake -s bootstrap-test-toy
+CFREE=$(pwd)/build/release/bootstrap/stage3/cfree test/toy/run.sh
+```
-Earlier `-O0` bugs (weak syms, return coercion, sret reg-clobber — commit
-`826fa2a`; and a `type_qualified` struct-init-padding self-miscompile — commit
-`85446c9`) are fixed. The `type_qualified` one is worth remembering as a class of
-hazard: cfree lowers aggregate **initialization** field-by-field and does **not**
-copy inter-field padding, whereas aggregate **assignment** uses a full-width
-`copy_bytes`. Any `memcmp` over a struct value built by aggregate *init* can see
-uninitialized padding. Sibling `memcmp`-on-struct sites still exist
-(`src/cg/type.c` ~329/347 over `attrs`); the safe idiom is to build the template
-by plain assignment (`Type tmpl; tmpl = *base;`).
+Stage executable hashes:
----
+```text
+62e17394d27b5f69678abf7a65c74fec2954132341b3a2b01bb539d91d77ea83 build/release/bootstrap/stage2/cfree
+62e17394d27b5f69678abf7a65c74fec2954132341b3a2b01bb539d91d77ea83 build/release/bootstrap/stage3/cfree
+a90c6c56281856e6963745a14b0c0ac1779583dc1c5199fc80e22fb513f7a72d build/debug/bootstrap/stage2/cfree
+a90c6c56281856e6963745a14b0c0ac1779583dc1c5199fc80e22fb513f7a72d build/debug/bootstrap/stage3/cfree
+```
-## -O1 — three aa64 codegen bugs fixed (commit `b520142`)
+The per-object fixed-point check also has zero differences in both modes:
+
+```sh
+find build/release/bootstrap/stage2 build/release/bootstrap/stage3 -type f -name '*.o' |
+ sed 's#build/release/bootstrap/stage[23]/##' | sort -u |
+ while read f; do
+ cmp -s "build/release/bootstrap/stage2/$f" \
+ "build/release/bootstrap/stage3/$f" || echo "$f"
+ done
+
+find build/debug/bootstrap/stage2 build/debug/bootstrap/stage3 -type f -name '*.o' |
+ sed 's#build/debug/bootstrap/stage[23]/##' | sort -u |
+ while read f; do
+ cmp -s "build/debug/bootstrap/stage2/$f" \
+ "build/debug/bootstrap/stage3/$f" || echo "$f"
+ done
+```
-Before this commit `make bootstrap-release` died on the **first** stage2 TU
-(`src/api/asm_emit.c`). After it, the `-O1` self-build compiles **and links a
-complete, runnable stage2**. `-O0` still reproduces and toy is still 1034/0.
+Both commands print nothing. The same check piped to `wc -l` prints `0`.
-Per-file reproduction used throughout:
+Toy results:
-```
-build/cfree cc -O1 -DNDEBUG -ffreestanding -nostdinc -Irt/include \
- -fvisibility=hidden -Iinclude -Isrc -c <file> -o /tmp/x.o
+```text
+debug stage3: 1034 pass, 0 fail, 8 skip
+release stage3: 1034 pass, 0 fail, 8 skip
```
-### 1. `opt_ranges_overlap_kind` must use raw, not compressed, points
-`src/opt/pass_coalesce.c`. `range_compress_points` only keeps points that are a
-range boundary, so an interior instruction point shared by two live values gets
-dropped — collapsing a genuine 2-point overlap into a single compressed point
-that masquerades as the benign unit-overlap of a coalescable move. This let the
-O1 hint fallback in `opt_assign_ranges` place a live call result and a later
-x0-bound copy into the **same** hard reg (`src/cg/control.c` block 18:
-`call def=v44` then `copy v46=v1`, both x0). The COPY/swap pattern the
-unit-overlap is meant to permit is genuinely one *raw* point wide, so raw points
-distinguish the two cases with no false positives. **Fix:** iterate
-`raw_start`/`raw_end` instead of `start`/`end`.
-
-### 2. Never park a live-across-call value in the caller-saved hint reg
-`src/opt/pass_lower.c`, `opt_assign_ranges` hint fallback. The +1000 caller-save
-penalty in `hard_reg_alloc_score` only deflects the out-of-allocable-set hint reg
-(e.g. x0 on aa64) when a cheaper reg is *found*; under high register pressure
-(`found == 0`) the fallback took the hint reg regardless, parking a cross-call
-value (x0-hinted via a copy chain from an earlier call result) in x0 where it
-collided with the next call's result (`src/api/asm_emit.c`: v38, live across two
-calls, used in a successor block). **Fix:** guard the hint-reg branch with
-`!(vi->live_across_call_freq && is_caller_saved(f, cls, hint))`.
-
-### 3. aa64 needs three int scratch registers, not two
-`src/arch/aa64/native.c`. A 3-operand op whose dst and both sources all spill
-(e.g. `binop dst, a, b` with a non-encodable immediate operand, or
-`store [base+index], value`) needs three distinct scratch regs at emit time — the
-IR spill-rewrite round-robins operands across this pool and the native emitter
-materializes each into one. Two left an all-spilled binop's immediate operand
-with nowhere to land (`src/arch/mc.c`, `src/link/link_reloc_layout.c`). **Fix:**
-`aa_int_scratch = {x9, x10, x11}`; x11 moved from allocable to reserved in
-`aa_int_phys`. (Note `aa_int_allocable[]` still lists x11; that array feeds only
-the **-O0** `NativeDirectTarget`, which uses its own x16/x17 temps, so there is no
-conflict — the two emitters keep independent register models.)
-
----
-
-## -O1 — deref-through-frame-pointer-local miscompile fixed (`src/opt/cg_ir_lower.c`)
-
-This fix took the `-O1` self-build from *crashing while compiling stage3's
-`src/abi/abi.c`* to *compiling all of stage3*. toy is still **1034/0/8 at both
-`-O0` and `-O1`** (toy runs `R`/`L` at both opt levels, so the suite exercises
-this `-O1` path). `-O0` still reproduces.
-
-> ⚠️ The earlier "`cc_alloc_arrays` … missing x19 reload" root cause in this doc
-> was a **`-g` artifact**: compiling the suspect TU with `-g` to get a named
-> backtrace *perturbs codegen* and shifts/creates a *different* bug. The real
-> stage2 crash compiling `abi.c` was in **`cc_fill_c_opts`** (`driver/cc.c`),
-> and `cc_alloc_arrays` is in fact compiled correctly. **Reproduce and inspect
-> without `-g`** (it changes the object: e.g. `driver/cc.o` sha differs with vs
-> without `-g`). The clang build is deterministic across host opt level
-> (`build/cfree` `-O0`+asan == `build/release/cfree` `-O1`), so `build/cfree cc
-> -O1 …` reproduces exactly what stage1 feeds into stage2.
-
-### Root cause (two coordinated defects, both about pointer locals)
-`cc_fill_c_opts(const CcOptions* o, …, CfreeCCompileOptions* copts)` does
-`*copts = zero;` then a run of `copts->code.field = …;`. `copts` is a pointer
-param whose **address is never taken** in C, so it should stay in a register.
-
-1. **AGG_COPY/AGG_SET force-homed the pointer.** `*copts = zero` lowers to an
- `agg_copy` whose dest operand is `copts`. The aggregate ops take their operands
- as pointer *values* (the emitter derefs an `OPK_LOCAL` pointer operand via
- `pointer_addr_from_operand`), but `lower_one_inst` ran them through
- `lower_addr_value_ops(naddr=1)`, so the dest went through `lower_operand_addr`,
- which **force-creates a frame home** for any `OPK_LOCAL`. `local_address_used_in_cg_ir`
- *also* flagged the same operand. Both together marked `copts` `address_taken`
- ⇒ `CG_LOCAL_STORAGE_FRAME`.
-
-2. **A FRAME pointer local can't be an indirect base.** The scalar stores
- `copts->field` lower to `store [copts+off]` (`OPK_INDIRECT`, base `copts`).
- `lower_operand_addr`'s `OPK_INDIRECT` case set `out.v.ind.base =
- base->storage.v.reg` — but `storage.v` is a **union** of `reg`/`frame_slot`, so
- for a FRAME local `.v.reg` is garbage. The store base register was therefore
- *never defined* (read uninitialized; flaky NULL / leftover path string ⇒
- segfault). `-O0`'s `nd_addr_pointer` handles this with a `FRAME_VALUE` base;
- `-O1` did not.
-
-### Fix
-- **(B)** Lower `AGG_COPY`/`AGG_SET` operands as **values** (`lower_use_ops`), and
- only count a *non-pointer* agg operand as address-taken
- (`operand_uses_local_agg_addr`) — a pointer operand uses the local's value, not
- its slot. This keeps `copts` in a register.
-- **(A)** `prematerialize_indirect_bases` runs before each instruction is emitted
- and, for every `OPK_INDIRECT` base/index that *is* a FRAME local, emits a load
- of the pointer from its home into a fresh PReg (or `addr_of` for a non-pointer
- aggregate base — defensive, not yet observed); `resolve_indirect_base_reg` then
- returns that PReg. This makes a genuinely address-taken pointer local
- (`int **q = &p; p->f = …`) work too, not just the `copts` false-positive.
-
-Minimal repro of the genuine (A) case (was: undefined `x8` base; now: `ldr x8,
-[home]` before each deref):
-```c
-typedef struct { int a,b; long d; } S;
-void g(S* p, S*** out){ *out=&p; p->a=1; p->b=2; p->d=3; }
-```
+The Toy runner used `PATHS=RLCW` and `OPT_LEVELS=0 1`, so this covers run,
+link/native, C backend, and Wasm paths where supported, at both Toy opt levels.
+
+## Final O1 Bugs Closed
----
+### Native emitter clobbered the RHS location
-## -O1 — STILL OPEN: stage2 emits malformed objects ⇒ stage3 **link** fails
+`src/opt/pass_native_emit.c` materialized the left-hand operand of `IR_BINOP`,
+`IR_CMP`, and `IR_CMP_BRANCH` without first protecting a right-hand operand that
+was already in a target scratch register.
-With the fix above, stage2 compiles every stage3 TU, but the **final stage3 link
-segfaults** (`Makefile:424`). The objects themselves are malformed — *not* a
-linker bug:
+The concrete failure was in `src/obj/macho/read.c` while evaluating:
+```c
+1u << (m->align_log2 & 31)
```
-# the clang-built (correct) linker rejects stage2-compiled stage3 objects too:
-build/release/cfree cc <captured stage3 link args, output to build/tmp>
- → fatal: link: LDST32_ABS_LO12_NC misaligned address (kind=27 S=0x10048ea26 A=0 …)
-# stage2's own ld is flaky on the same input: usually SIGSEGV, sometimes
-# "link_emit_macho: entry symbol below __TEXT base" (src/obj/macho/link.c:2410)
+
+The RHS shift count had been loaded into `w9`, then the LHS constant `1` was
+also materialized into `w9`, so the shift used `1 << 1`. Stage2's object reader
+therefore reported many Mach-O section alignments as `2**1`; `ld -r` propagated
+those bad alignments into `libcfree.o`, and the later stage3 link failed with
+Mach-O relocation alignment and entry-address errors.
+
+The fix is to compute the RHS `NativeLoc` first and pass `loc_avoid_reg(b)` when
+materializing the LHS for binops and comparisons. This prevents the LHS
+materialization from choosing a scratch register that already holds the RHS.
+
+### MIR combine propagated target scratch registers into calls
+
+`src/opt/pass_combine.c` treated backend scratch registers like ordinary hard
+registers during copy propagation. That is invalid after native lowering starts
+using those registers as transient emit-time temporaries.
+
+The concrete failure was the 9-argument call to `declare_function` in
+`parse_external_decl`. The `out_decl_flags` stack argument was first protected
+in an allocable register, but combine rewrote the call aux argument back to
+scratch `x9`. Before `aa_plan_call` stored the stack argument, another operand
+materialization reused `x9`, so the stack slot received the unrelated `dattrs`
+value.
+
+That bad argument sometimes gave `metrics_sink` the `CFREE_CG_INLINE_ALWAYS`
+flag, which made stage2 and stage3 emit different objects even after the link
+alignment bug was fixed.
+
+The fix is to reject copy propagation from any register listed in
+`f->opt_scratch_regs[cls]`. Scratch registers may appear in lowered MIR, but
+they must not be extended across later instructions by combine.
+
+## Investigation Harness Notes
+
+The useful oracle was not just "does stage3 link"; it was whether stage2, when
+used as a compiler, reproduced the same objects as the host-built compiler.
+That allowed the search to separate malformed-object bugs from link-driver
+symptoms.
+
+Effective checks:
+
+```sh
+# Compare a specific stage2-compiled object with the host-built compiler's output.
+build/release/cfree cc --support-dir . -O1 -DNDEBUG -ffunction-sections \
+ -fdata-sections -std=c11 -Wpedantic -Wall -Wextra -Werror \
+ -ffreestanding -nostdinc -Irt/include -fvisibility=hidden \
+ -Iinclude -Ilang/cpp -Ilang/c -MMD -MP \
+ -c lang/c/parse/parse.c -o /tmp/parse-host.o
+
+build/release/bootstrap/stage2/cfree cc --support-dir . -O1 -DNDEBUG \
+ -ffunction-sections -fdata-sections -std=c11 -Wpedantic -Wall -Wextra \
+ -Werror -ffreestanding -nostdinc -Irt/include -fvisibility=hidden \
+ -Iinclude -Ilang/cpp -Ilang/c -MMD -MP \
+ -c lang/c/parse/parse.c -o /tmp/parse-stage2.o
+
+cmp /tmp/parse-host.o /tmp/parse-stage2.o
```
-An `LDST32_ABS_LO12_NC` reloc (32-bit ldr/str `:lo12:`) targets a **2-aligned**
-symbol — the scaled imm12 can't encode a non-4-aligned offset. So a 32-bit access
-references a symbol cfree placed at a misaligned address.
-
-### What is known
-- This is a **separate, latent `-O1` codegen bug** (not A/B); it was masked by the
- earlier `abi.c` crash. toy does **not** trigger it — the triggering C pattern is
- in the compiler's own sources, not in toy programs.
-- It is a **stage2 self-miscompile**: cfree's `-O1` codegen is deterministic
- across host opt (`build/cfree` == `build/release/cfree` per-TU), and
- `build/release/cfree`(=stage1) output == `stage2/lib/*.o`. But **stage2 (cfree
- built by stage1) produces different output than the reference** for some TUs —
- i.e. cfree's `-O1` machine code for *some codegen function* is wrong, so stage2
- miscompiles. Per-object check `build/cfree cc -O1 … -c <src>` vs
- `stage3/lib/<src>.o`: **DIFFERS** for `cg/data.c`, `cg/type.c`, `cg/value.c`;
- **SAME** for `core/arena.c`, `core/vec.c`, `cg/local.c`. So it is one (or few)
- codegen function(s) hit by a specific pattern, not a universal break.
-- Bisection so far (swap stage2 codegen TUs to `-O0`, then test
- `stage2(cg/data.c) == build/cfree(cg/data.c)`): **not** fixed by `opt/*` at
- `-O0`, **not** by `arch/aa64/* + arch/mc.c + arch/cgtarget.c` at `-O0`. The
- buggy TU is therefore elsewhere — likely `cg/*` or `obj/*`.
-
-### Harnesses (left in `/tmp` from this session; regenerate from a fresh log)
-- `bisect2.sh <src…>`: recompile listed TUs at `-O0` into stage2, relink stage2
- (`s2_ldr.sh`,`s2_ar.sh`,`s2_link.sh`), then compare `stage2 cc -O1 -c
- src/cg/data.c` to `build/cfree`'s output. Prints `FIXED`/`STILL WRONG`. This is
- the right oracle — the **clang-built `build/release/cfree` linker reproduces the
- malformed-object rejection deterministically**, which is far cleaner than
- stage2's flaky `ld`.
-- For function-level IR inspection, re-add a `CFREE_DUMPFN=<substr>` filter to
- `opt_dbg_dump`/`opt_dbg_dump_cg` in `src/opt/opt.c` (resolve the func name via
- `pool_slice(o->c->global, obj_symbol_get(o->target->obj, f->desc.sym)->name)`)
- and dump at tag `pre-emit` (final MIR) / `entry` (clean IR). Invaluable; it is
- how `cc_fill_c_opts`'s use-before-def of the base PReg was found.
-
-### Old bisection harness notes (still apply)
-Replay each stage2 TU's exact compile command, swapping `-O1`→`-O0` (flags **must**
-match or `.build-config` changes force a full recompile). The release stage2 links
-only via cfree's own `ld`; but for triage prefer the clang-built
-`build/release/cfree` as the linker — it rejects the malformed objects with a
-precise message instead of crashing. Avoid `-g` for codegen triage — it perturbs
-the output.
+
+Hybrid rebuilds were also effective: relink stage2 after replacing one suspect
+TU, or one piece of a split TU, with a clang-built object, then use stage2 to
+compile the known-differing target object. This found that the malformed-link
+failure was not in the linker itself, then narrowed the final object divergence
+to codegen around C parsing and call lowering.
+
+For MIR-level inspection, a temporary filtered dump around the target symbol
+was enough. The decisive dump was the call to `_declare_function` after lowering
+and combine: before the fix, the call aux stack argument referenced scratch
+`x9`; after the fix, it references a non-scratch allocable register and survives
+the later materialization that also uses `x9`.
+
+Avoid `-g` while triaging O1 codegen. It changes object layout and can create or
+hide different bugs.
+
+## Earlier O1 Fixes Still Relevant
+
+These previous fixes are part of the same O1 bootstrap path and should stay in
+mind if this regresses:
+
+- `src/opt/pass_coalesce.c`: overlap checks must use raw range points, not
+ compressed points.
+- `src/opt/pass_lower.c`: the hint fallback must not put live-across-call values
+ into caller-saved hint registers.
+- `src/arch/aa64/native.c`: aa64 native emit needs three integer scratch
+ registers for all-spilled three-operand operations.
+- `src/opt/cg_ir_lower.c`: aggregate copy/set operands that are pointer values
+ must not force-home the pointer local; real frame-backed pointer locals need
+ prematerialized indirect bases.
diff --git a/src/opt/pass_combine.c b/src/opt/pass_combine.c
@@ -742,6 +742,14 @@ static int ret_scalar_storage(CGABIValue* v, Operand** out) {
return 1;
}
+static int is_target_scratch_reg(Func* f, u8 cls, Reg reg) {
+ if (!f || cls >= OPT_REG_CLASSES) return 0;
+ for (u32 i = 0; i < f->opt_scratch_reg_count[cls]; ++i) {
+ if (f->opt_scratch_regs[cls][i] == reg) return 1;
+ }
+ return 0;
+}
+
/* ---- Rewrite 1: substitute producer source into uses ---- */
/* Set the indirect's base or index to `new_reg`. */
@@ -856,6 +864,9 @@ static int try_substitute_for_reg(CombineCtx* ctx, Inst* in, i32 i, u8 cls,
if (ctx_def_changed_since(ctx, prod->opnds[1].cls, prod->opnds[1].v.reg,
prod_idx))
return 0;
+ if (is_target_scratch_reg(ctx->f, prod->opnds[1].cls,
+ prod->opnds[1].v.reg))
+ return 0;
kind = SK_REG;
src_op = prod->opnds[1];
} else if (pop == IR_LOAD_IMM) {
diff --git a/src/opt/pass_native_emit.c b/src/opt/pass_native_emit.c
@@ -909,9 +909,10 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in,
case IR_BINOP:
dst = loc_from_operand(e, &in->opnds[0], in->loc);
dst_reg = dst.kind == NATIVE_LOC_REG ? dst.v.reg : REG_NONE;
+ b = loc_from_operand(e, &in->opnds[2], in->loc);
a = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
class_for_type(e, in->opnds[1].type), in->opnds[1].type,
- dst_reg, REG_NONE, in->loc);
+ dst_reg, loc_avoid_reg(b), in->loc);
b = operand_imm_or_reg(e, &in->opnds[2], NATIVE_IMM_BINOP,
(u32)in->extra.imm, a.v.reg, dst_reg, in->loc);
if (dst.kind != NATIVE_LOC_REG)
@@ -941,9 +942,10 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in,
case IR_CMP:
dst = loc_from_operand(e, &in->opnds[0], in->loc);
dst_reg = dst.kind == NATIVE_LOC_REG ? dst.v.reg : REG_NONE;
+ b = loc_from_operand(e, &in->opnds[2], in->loc);
a = materialize(e, loc_from_operand(e, &in->opnds[1], in->loc),
class_for_type(e, in->opnds[1].type), in->opnds[1].type,
- dst_reg, REG_NONE, in->loc);
+ dst_reg, loc_avoid_reg(b), in->loc);
b = operand_imm_or_reg(e, &in->opnds[2], NATIVE_IMM_CMP,
(u32)in->extra.imm, a.v.reg, dst_reg, in->loc);
if (dst.kind != NATIVE_LOC_REG)
@@ -981,9 +983,10 @@ static void emit_inst(NativeEmitCtx* e, u32 block, u32 order_index, Inst* in,
u32 next = order_index + 1u < e->f->emit_order_n
? e->f->emit_order[order_index + 1u]
: UINT32_MAX;
+ b = loc_from_operand(e, &in->opnds[1], in->loc);
a = materialize(e, loc_from_operand(e, &in->opnds[0], in->loc),
class_for_type(e, in->opnds[0].type), in->opnds[0].type,
- REG_NONE, REG_NONE, in->loc);
+ REG_NONE, loc_avoid_reg(b), in->loc);
b = operand_imm_or_reg(e, &in->opnds[1], NATIVE_IMM_CMP,
(u32)in->extra.imm, a.v.reg, REG_NONE, in->loc);
e->target->cmp_branch(