kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit ed078ac80320bb5d7b8e4aa8c4e780d0aec8fdb8
parent a3da4df2317690b3bff662fffd9fd1f3455e580f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 28 May 2026 18:20:10 -0700

opt: fold const convert-of-load_imm and thread branch targets through pass-through blocks"

Diffstat:
Mdoc/PERCALL.md | 28++++++++++++++++++----------
Msrc/opt/pass_combine.c | 112+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Msrc/opt/pass_jump.c | 67+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
Mtest/opt/cg_ir_lower_test.c | 63+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atest/opt/zero_arg.sh | 61+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mtest/test.mk | 7++++++-
6 files changed, 323 insertions(+), 15 deletions(-)

diff --git a/doc/PERCALL.md b/doc/PERCALL.md @@ -63,8 +63,7 @@ cfree puts the frame record above the callee-saves (`add x29, sp, #16`), so the sp-decrement and the fp/lr save cannot fold into a single pre-indexed `stp`. Moving the frame record to the bottom of the frame (fp-at-bottom, `mov x29, sp`) unlocks the pre-indexed entry `stp …, [sp, #-N]!` and post-indexed exit -`ldp …, [sp], #N`. This is [OPT_O1_PERF_TODO.md](OPT_O1_PERF_TODO.md) item 2, -the largest open per-call win. +`ldp …, [sp], #N`. (An aside: omitting the frame pointer entirely — packing a callee-save with lr, e.g. `stp x19, x30, [sp, #-N]!`, no `x29` at all — would save one more insn, but @@ -75,14 +74,22 @@ planned.) Two small constant adders beyond the prologue, both call-shaped: -1. **Zero through a temp** (item 3). `BottomUpTree`'s leaf `NewTreeNode(NULL, - NULL)` materializes each null arg via a scratch: +1. **Zero through a temp** (item 3) — **FIXED**. `BottomUpTree`'s leaf + `NewTreeNode(NULL, NULL)` used to materialize each null arg via a scratch: ``` movz x8, 0x0 ; mov x0, x8 ; movz x8, 0x0 ; mov x1, x8 ; 4 insns ``` - Optimal is `mov x0, #0 ; mov x1, #0` (2 insns). **−2** on that path. - `IR_LOAD_IMM` sources don't participate in the call-arg register-hint - propagation in `pass_lower.c`. + It now emits the optimal `movz x0, 0x0 ; movz x1, 0x0` (2 insns), **−2** on + that path. The trigger was a *pointer-typed* null (`(void*)0` / `NULL` from + headers, not an integer `0`): the cast inserts an int→ptr `IR_CONVERT` + between the `load_imm` and the call arg. `apply_abi_aliasing_hints` hints the + arg PReg to x0/x1 and `propagate_hint_through_copies` carries it backward — + but only across `IR_COPY`, so the convert broke the chain and its `load_imm` + source landed in the x8 scratch. Fixed by `try_fold_const_convert` in + `pass_combine.c`, which constant-folds `load_imm rT,k ; convert rD,rT` into a + single `load_imm rD,k'` (the now-dead `load_imm` is removed by DCE); the + folded `load_imm` inherits the convert's hinted dst directly. Covered by + `test/opt/zero_arg.sh`. 2. **Redundant branch chain** (item 4). `DeleteTree`'s if/else merge emits `b A; A: b B` — a conditional branch to a label that just unconditionally @@ -96,12 +103,13 @@ Two small constant adders beyond the prologue, both call-shaped: | --- | ---: | ---: | --- | | NewTreeNode | 14 | 16 | +2 prologue | | ItemCheck | 20 | 21 | +2 prologue | -| BottomUpTree | 25 | 24 | +2 prologue, +2 zero-temp | +| BottomUpTree | 23 | 24 | +2 prologue (zero-temp fixed) | | DeleteTree | 20 | 18 | +2 prologue, +1 branch | cfree already beats gcc -O0 on the malloc-heavy `NewTreeNode` (args stay in -registers vs gcc's 8 stack shuffles) and ties `ItemCheck`. It loses -`BottomUpTree`/`DeleteTree` purely on the fixed overhead above. +registers vs gcc's 8 stack shuffles), ties `ItemCheck`, and — with the +zero-temp fix — now edges ahead on `BottomUpTree`. `DeleteTree` remains the only +loss, on the fixed prologue overhead plus the item-4 branch chain. ## Optimal target diff --git a/src/opt/pass_combine.c b/src/opt/pass_combine.c @@ -1,6 +1,8 @@ #include <stdint.h> #include <string.h> +#include <cfree/cg.h> + #include "core/arena.h" #include "opt/opt_internal.h" @@ -594,6 +596,56 @@ static int ext_params(const Inst* in, u32* src_bytes_out, u32* dst_bytes_out, return 1; } +/* Width in bytes of a scalar integer or pointer type (1..8), else 0. Mirrors + * pass_simplify's simplify_width: builtin ints decode without the compiler, + * pointers fall back to the type-size query. */ +static u32 combine_scalar_width_bytes(Func* f, CfreeCgTypeId t) { + u32 b = builtin_int_bytes(t); + if (b) return b > 8u ? 0u : b; + if (f->c && cfree_cg_type_kind((CfreeCompiler*)f->c, t) == CFREE_CG_TYPE_PTR) { + u64 sz = cfree_cg_type_size((CfreeCompiler*)f->c, t); + if (sz && sz <= 8u) return (u32)sz; + } + return 0; +} + +static u64 combine_width_mask(u32 bytes) { + return bytes >= 8u ? UINT64_MAX : ((1ull << (bytes * 8u)) - 1ull); +} + +/* Compute the constant produced by applying an integer/pointer convert `k` + * (src width `sb`, dst width `db`, both bytes) to immediate `imm`. Returns 0 + * for kinds that aren't a bit-preserving integer/pointer move (the float + * conversions reinterpret the value and must not be folded this way). The + * materialized-constant model: a load_imm puts (u64)imm into a register, so a + * widening move/zext keeps the low `sb` bits and a trunc/narrowing keeps the + * low `db` bits. */ +static int const_convert_value(ConvKind k, i64 imm, u32 sb, u32 db, i64* out) { + if (!sb || !db) return 0; + u64 src_mask = combine_width_mask(sb); + u64 dst_mask = combine_width_mask(db); + u64 v = (u64)imm; + switch (k) { + case CV_BITCAST: + case CV_ZEXT: + *out = (i64)((v & src_mask) & dst_mask); + return 1; + case CV_TRUNC: + *out = (i64)(v & dst_mask); + return 1; + case CV_SEXT: { + u64 low = v & src_mask; + u32 sbits = sb * 8u; + if (sbits < 64u && (low & (1ull << (sbits - 1u)))) + low |= ~src_mask; /* replicate the sign bit into the high bits */ + *out = (i64)(low & dst_mask); + return 1; + } + default: + return 0; /* CV_ITOF_*, CV_FTOI_*, CV_FEXT, CV_FTRUNC */ + } +} + /* ---- producer-retarget legality (for sink rewrite) ---- */ static int binop_is_commutative(BinOp op) { @@ -1230,6 +1282,65 @@ static int try_ret_retarget(Func* f, Block* bl, i32 i) { return 1; } +/* ---- Rewrite 5: constant-fold a convert of a load_imm into a load_imm ---- + * + * `load_imm rT,k ; convert rD,rT` where the convert is a bit-preserving + * integer/pointer move collapses to `load_imm rD,k'` (k' = convert applied to + * k). The original load_imm, now dead, is removed by post-combine DCE. + * + * This is the convert-shaped sibling of the load_imm-into-copy fold (see + * try_substitute / combine_subst_slot). It matters for pointer-typed constant + * call args (e.g. NewTreeNode((void*)0, ...)): the arg-register hint reaches + * the convert's def but not its load_imm source, so without this the source + * lands in a scratch and the convert emits a `mov rD, scratch`. Folding lets + * the load_imm inherit the convert's (hinted) dst directly — `movz x0, 0`. */ +static int try_fold_const_convert(CombineCtx* ctx, Inst* in, i32 i) { + if ((IROp)in->op != IR_CONVERT || in->nopnds < 2) return 0; + if (in->opnds[0].kind != OPK_REG || in->opnds[1].kind != OPK_REG) return 0; + /* Integer/pointer domain only: a load_imm produces an int-class value, and + * the bit-preserving convert kinds we fold (BITCAST/ZEXT/SEXT/TRUNC) keep it + * int-class. A class change (RC_INT->RC_FP, e.g. an int->float bitcast) is + * not bit-preserving and must not collapse to a load_imm. */ + if (in->opnds[0].cls != RC_INT || in->opnds[1].cls != RC_INT) return 0; + + i32 prod_idx = ctx_producer_of(ctx, in->opnds[1].cls, in->opnds[1].v.reg); + if (prod_idx < 0 || prod_idx >= i) return 0; + Inst* prod = &ctx->bl->insts[prod_idx]; + if ((IROp)prod->op != IR_LOAD_IMM || prod->nopnds < 1 || + prod->opnds[0].kind != OPK_REG || + !same_phys_reg(&prod->opnds[0], &in->opnds[1])) + return 0; + + u32 sb = combine_scalar_width_bytes(ctx->f, in->opnds[1].type); + u32 db = combine_scalar_width_bytes(ctx->f, in->opnds[0].type); + i64 value = 0; + if (!const_convert_value((ConvKind)in->extra.imm, prod->extra.imm, sb, db, + &value)) + return 0; + + /* Single-use + not-live-out gate on the load_imm dst, mirroring the SK_IMM + * gate in try_substitute_for_reg: only fold when this convert is the sole + * use, so we replace rather than duplicate the materialization. */ + Operand src_def = prod->opnds[0]; + int killed = 0; + if (count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &src_def, &killed) != + 1) + return 0; + if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live, + ctx->bl->id, &src_def)) + return 0; + + /* Rewrite the convert in place into a load_imm of the converted constant. + * The dst operand (opnds[0]) is the hinted arg/return reg and is kept; the + * source operand is dropped. The dead load_imm at prod_idx is left for DCE. */ + in->op = (u16)IR_LOAD_IMM; + in->type = in->opnds[0].type; + in->nopnds = 1; + in->extra.imm = value; + ctx->block_change_p = 1; + return 1; +} + /* ---- per-BB driver ---- */ static int opt_combine_fold_block(Func* f, Block* bl, @@ -1273,6 +1384,7 @@ static int opt_combine_fold_block(Func* f, Block* bl, } if (enable_o1_combine_rewrites) { + try_fold_const_convert(&ctx, in, i); try_combine_exts(&ctx, in, i); try_substitute(&ctx, in, i); try_addr_synth(&ctx, in, i); diff --git a/src/opt/pass_jump.c b/src/opt/pass_jump.c @@ -9,6 +9,7 @@ typedef struct JumpCleanupCtx { u32* emit_index_by_block; u32* forwarded_target_by_block; u32* forward_path; + u8* has_label_addr_ref; } JumpCleanupCtx; static u32 emit_order_index(const JumpCleanupCtx* c, u32 block) { @@ -22,6 +23,35 @@ static u32 next_emit_block(const JumpCleanupCtx* c, u32 block) { return c->f->emit_order[idx + 1u]; } +static void mark_label_addr_refs(Func* f, u8* refs) { + if (!f || !refs) return; + for (u32 b = 0; b < f->nblocks; ++b) { + Block* bl = &f->blocks[b]; + for (u32 i = 0; i < bl->ninsts; ++i) { + Inst* in = &bl->insts[i]; + switch ((IROp)in->op) { + case IR_LOAD_LABEL_ADDR: + if ((u32)in->extra.imm < f->nblocks) refs[(u32)in->extra.imm] = 1; + break; + case IR_LOCAL_STATIC_DATA_LABEL_ADDR: { + CgIrLocalStaticLabelAux* aux = + (CgIrLocalStaticLabelAux*)in->extra.aux; + if (aux && (u32)aux->target < f->nblocks) + refs[(u32)aux->target] = 1; + break; + } + default: + break; + } + } + } +} + +static int block_has_label_addr_ref(const JumpCleanupCtx* c, u32 block) { + if (!c || block >= c->f->nblocks) return 0; + return c->has_label_addr_ref && c->has_label_addr_ref[block]; +} + static JumpCleanupCtx jump_cleanup_ctx(Func* f) { JumpCleanupCtx c; c.f = f; @@ -30,6 +60,8 @@ static JumpCleanupCtx jump_cleanup_ctx(Func* f) { c.forwarded_target_by_block = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u); c.forward_path = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u); + c.has_label_addr_ref = + arena_zarray(f->arena, u8, f->nblocks ? f->nblocks : 1u); for (u32 b = 0; b < f->nblocks; ++b) c.emit_index_by_block[b] = BLOCK_NONE; for (u32 b = 0; b < f->nblocks; ++b) c.forwarded_target_by_block[b] = BLOCK_NONE; @@ -37,6 +69,7 @@ static JumpCleanupCtx jump_cleanup_ctx(Func* f) { u32 b = f->emit_order[i]; if (b < f->nblocks) c.emit_index_by_block[b] = i; } + mark_label_addr_refs(f, c.has_label_addr_ref); return c; } @@ -45,7 +78,7 @@ static int single_jump_block(const JumpCleanupCtx* c, u32 block, Func* f = c->f; if (block >= f->nblocks) return 0; Block* bl = &f->blocks[block]; - if (bl->mc_label != MC_LABEL_NONE) return 0; + if (block_has_label_addr_ref(c, block)) return 0; if (bl->ninsts != 1 || bl->nsucc != 1) return 0; if ((IROp)bl->insts[0].op != IR_BR) return 0; if (target_out) *target_out = bl->succ[0]; @@ -57,7 +90,7 @@ static int empty_fallthrough_block(const JumpCleanupCtx* c, u32 block, Func* f = c->f; if (block >= f->nblocks || block == f->entry) return 0; Block* bl = &f->blocks[block]; - if (bl->mc_label != MC_LABEL_NONE) return 0; + if (block_has_label_addr_ref(c, block)) return 0; if (bl->ninsts != 0 || bl->nsucc != 0) return 0; u32 idx = emit_order_index(c, block); if (idx == BLOCK_NONE || idx + 1u >= f->emit_order_n) return 0; @@ -67,6 +100,30 @@ static int empty_fallthrough_block(const JumpCleanupCtx* c, u32 block, return 1; } +static int passthrough_succ_block(const JumpCleanupCtx* c, u32 block, + u32* target_out) { + Func* f = c->f; + if (block >= f->nblocks || block == f->entry) return 0; + Block* bl = &f->blocks[block]; + if (block_has_label_addr_ref(c, block)) return 0; + if (bl->nsucc != 1) return 0; + if (bl->succ[0] == block) return 0; + if (bl->ninsts != 0) { + if (bl->ninsts != 1) return 0; + switch ((IROp)bl->insts[0].op) { + case IR_NOP: + case IR_SCOPE_BEGIN: + case IR_SCOPE_ELSE: + case IR_SCOPE_END: + break; + default: + return 0; + } + } + if (target_out) *target_out = bl->succ[0]; + return 1; +} + static u32 forward_jump_target_ex(JumpCleanupCtx* c, u32 target, int allow_empty_fallthrough) { u32 cur = target; @@ -84,7 +141,9 @@ static u32 forward_jump_target_ex(JumpCleanupCtx* c, u32 target, break; } if (!single_jump_block(c, cur, &next) && - (!allow_empty_fallthrough || !empty_fallthrough_block(c, cur, &next))) { + (!allow_empty_fallthrough || + (!passthrough_succ_block(c, cur, &next) && + !empty_fallthrough_block(c, cur, &next)))) { result = cur; c->forwarded_target_by_block[cur] = result; break; @@ -169,7 +228,7 @@ static void cleanup_branch_targets(JumpCleanupCtx* c) { } for (u32 s = 0; s < nsucc; ++s) { u32 target = bl->succ[s]; - u32 forwarded = forward_jump_target(c, target); + u32 forwarded = forward_jump_target_ex(c, target, 1); if (forwarded < f->nblocks) bl->succ[s] = forwarded; } } diff --git a/test/opt/cg_ir_lower_test.c b/test/opt/cg_ir_lower_test.c @@ -188,8 +188,71 @@ static void converter_builds_cfg_and_pregs(void) { tc_fini(&tc); } +static void jump_cleanup_threads_empty_fallthrough_target(void) { + TestCtx tc; + tc_init(&tc); + + Func f; + memset(&f, 0, sizeof f); + f.c = tc.c; + f.arena = tc.c->tu; + f.entry = ir_block_new(&f); + u32 then_block = ir_block_new(&f); + u32 scope_block = ir_block_new(&f); + u32 empty_block = ir_block_new(&f); + u32 merge_block = ir_block_new(&f); + ir_note_emit(&f, f.entry); + ir_note_emit(&f, then_block); + ir_note_emit(&f, scope_block); + ir_note_emit(&f, empty_block); + ir_note_emit(&f, merge_block); + + Inst* br = ir_emit(&f, f.entry, IR_CMP_BRANCH); + br->extra.imm = CMP_EQ; + f.blocks[f.entry].succ[0] = scope_block; + f.blocks[f.entry].succ[1] = then_block; + f.blocks[f.entry].nsucc = 2; + + Inst* then_body = ir_emit(&f, then_block, IR_LOAD_IMM); + (void)then_body; + Inst* then_br = ir_emit(&f, then_block, IR_BR); + (void)then_br; + f.blocks[then_block].succ[0] = merge_block; + f.blocks[then_block].nsucc = 1; + + Inst* scope_end = ir_emit(&f, scope_block, IR_SCOPE_END); + (void)scope_end; + f.blocks[scope_block].succ[0] = empty_block; + f.blocks[scope_block].nsucc = 1; + + f.blocks[empty_block].succ[0] = merge_block; + f.blocks[empty_block].nsucc = 1; + + Inst* ret = ir_emit(&f, merge_block, IR_RET); + (void)ret; + + opt_build_cfg(&f); + EXPECT(f.blocks[f.entry].succ[0] == scope_block, + "precondition: branch should target scope block"); + EXPECT(f.blocks[scope_block].npreds == 1, + "precondition: scope block should have one predecessor"); + + opt_jump_cleanup(&f, OPT_JUMP_CLEANUP_CFG); + opt_build_cfg(&f); + + EXPECT(f.blocks[f.entry].succ[0] == merge_block, + "branch target should forward through empty fallthrough block"); + EXPECT(f.blocks[scope_block].npreds == 0 && f.blocks[scope_block].nsucc == 0, + "scope block should become unreachable after forwarding"); + EXPECT(f.blocks[empty_block].npreds == 0 && f.blocks[empty_block].nsucc == 0, + "empty block should become unreachable after forwarding"); + + tc_fini(&tc); +} + int main(void) { converter_builds_cfg_and_pregs(); + jump_cleanup_threads_empty_fallthrough_target(); if (g_fails) { fprintf(stderr, "cg-ir-lower: %d/%d failed\n", g_fails, g_checks); return 1; diff --git a/test/opt/zero_arg.sh b/test/opt/zero_arg.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash +# Behavioral check for the O1 "zero through a temp" call-arg wart (PERCALL.md +# item 3). A pointer-typed null argument (`(void*)0`) introduces an int->ptr +# IR_CONVERT between the constant's `load_imm` and the call arg. At -O1 the +# arg-register hint reaches the convert's def but not its load_imm source, so +# the source lands in a scratch (x8) and the convert emits `mov x0, x8` -- +# routing each zero through a temp: +# +# movz x8, 0x0 ; mov x0, x8 ; movz x8, 0x0 ; mov x1, x8 (4 insns) +# +# Optimal is `movz x0, 0x0 ; movz x1, 0x0` (2 insns, no temp). This test +# asserts the args are materialized directly into the arg registers. +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/../.." && pwd)" +CFREE="${CFREE:-$ROOT/build/cfree}" +WORK="$ROOT/build/test/opt/zero_arg" +mkdir -p "$WORK" + +SRC="$WORK/leaf.c" +cat > "$SRC" <<'EOF' +#define NULL ((void*)0) +typedef struct tn { struct tn* l; struct tn* r; } treeNode; +treeNode* NewTreeNode(treeNode* left, treeNode* right); +treeNode* leaf(void) { return NewTreeNode(NULL, NULL); } +EOF + +"$CFREE" cc -target aarch64-linux-gnu -O1 -std=c99 -c "$SRC" \ + -o "$WORK/leaf.o" > "$WORK/cc.out" 2>&1 +"$CFREE" objdump -d "$WORK/leaf.o" > "$WORK/dis.out" 2>&1 + +# Slice out the leaf() function body. +awk ' + /^[0-9a-f]+ <leaf>:/ { in_fn = 1; next } + /^[0-9a-f]+ </ { in_fn = 0 } + in_fn { print } +' "$WORK/dis.out" > "$WORK/leaf.dis" + +fail() { + printf 'zero-arg check FAILED: %s\n' "$1" >&2 + sed 's/^/ | /' "$WORK/leaf.dis" >&2 + exit 1 +} + +# The wart: a register-to-register move into an arg register (the zero routed +# through a scratch temp). Optimal codegen materializes the constant straight +# into the arg reg, so there must be no `mov x0, x<n>` / `mov x1, x<n>`. +# (`movz x0, ...` is a separate mnemonic and is not matched by `\bmov x`.) +if grep -Eq '\bmov[[:space:]]+x[01],[[:space:]]*x[0-9]' "$WORK/leaf.dis"; then + fail 'arg zero routed through a scratch temp (mov x0/x1, x<n>)' +fi + +# Both args must be loaded directly with movz into x0 and x1. +if ! grep -Eq '\bmovz[[:space:]]+x0,' "$WORK/leaf.dis"; then + fail 'arg x0 not materialized directly (movz x0, ...)' +fi +if ! grep -Eq '\bmovz[[:space:]]+x1,' "$WORK/leaf.dis"; then + fail 'arg x1 not materialized directly (movz x1, ...)' +fi + +printf 'zero-arg: ok\n' diff --git a/test/test.mk b/test/test.mk @@ -509,7 +509,7 @@ test-macho: lib $(TEST_RT_DEP) $(ROUNDTRIP_BIN_MACHO) $(LINK_EXE_RUNNER) $(JIT_R OPT_TEST_BIN = build/test/cg_ir_lower_test TINY_INLINE_TEST_BIN = build/test/tiny_inline_test -test-opt: bin $(OPT_TEST_BIN) test-opt-tiny-inline test-opt-inline +test-opt: bin $(OPT_TEST_BIN) test-opt-tiny-inline test-opt-inline test-opt-zero-arg $(OPT_TEST_BIN) $(OPT_TEST_BIN): test/opt/cg_ir_lower_test.c $(LIB_OBJS) @@ -527,6 +527,11 @@ $(TINY_INLINE_TEST_BIN): test/opt/tiny_inline_test.c $(LIB_OBJS) test-opt-inline: bin @CFREE=$(abspath $(BIN)) bash test/opt/run.sh +# Behavioral disasm check: a pointer-typed null call arg is not routed through +# a scratch temp at -O1 (PERCALL.md item 3, "zero through a temp"). +test-opt-zero-arg: bin + @CFREE=$(abspath $(BIN)) bash test/opt/zero_arg.sh + test-parse: test-parse-ok test-parse-err test-parse-ok: lib $(TEST_RT_DEP) $(PARSE_RUNNER) $(ROUNDTRIP_BIN) $(LINK_EXE_RUNNER) $(JIT_RUNNER)