opt: fold const convert-of-load_imm and thread branch targets through pass-through blocks" - kit

commit ed078ac80320bb5d7b8e4aa8c4e780d0aec8fdb8
parent a3da4df2317690b3bff662fffd9fd1f3455e580f
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Thu, 28 May 2026 18:20:10 -0700

opt: fold const convert-of-load_imm and thread branch targets through pass-through blocks"

Diffstat:
M doc/PERCALL.md  | 28 ++++++++++++++++++----------
M src/opt/pass_combine.c  | 112 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M src/opt/pass_jump.c  | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M test/opt/cg_ir_lower_test.c  | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A test/opt/zero_arg.sh  | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M test/test.mk  | 7 ++++++-

6 files changed, 323 insertions(+), 15 deletions(-)
diff --git a/doc/PERCALL.md b/doc/PERCALL.md
@@ -63,8 +63,7 @@ cfree puts the frame record above the callee-saves (`add x29, sp, #16`), so the
 sp-decrement and the fp/lr save cannot fold into a single pre-indexed `stp`.
 Moving the frame record to the bottom of the frame (fp-at-bottom, `mov x29, sp`)
 unlocks the pre-indexed entry `stp …, [sp, #-N]!` and post-indexed exit
-`ldp …, [sp], #N`. This is [OPT_O1_PERF_TODO.md](OPT_O1_PERF_TODO.md) item 2,
-the largest open per-call win.
+`ldp …, [sp], #N`.
 
 (An aside: omitting the frame pointer entirely — packing a callee-save with lr,
 e.g. `stp x19, x30, [sp, #-N]!`, no `x29` at all — would save one more insn, but
@@ -75,14 +74,22 @@ planned.)
 
 Two small constant adders beyond the prologue, both call-shaped:
 
-1. **Zero through a temp** (item 3). `BottomUpTree`'s leaf `NewTreeNode(NULL,
-   NULL)` materializes each null arg via a scratch:
+1. **Zero through a temp** (item 3) — **FIXED**. `BottomUpTree`'s leaf
+   `NewTreeNode(NULL, NULL)` used to materialize each null arg via a scratch:
    ```
    movz x8, 0x0 ; mov x0, x8 ; movz x8, 0x0 ; mov x1, x8     ; 4 insns
    ```
-   Optimal is `mov x0, #0 ; mov x1, #0` (2 insns). **−2** on that path.
-   `IR_LOAD_IMM` sources don't participate in the call-arg register-hint
-   propagation in `pass_lower.c`.
+   It now emits the optimal `movz x0, 0x0 ; movz x1, 0x0` (2 insns), **−2** on
+   that path. The trigger was a *pointer-typed* null (`(void*)0` / `NULL` from
+   headers, not an integer `0`): the cast inserts an int→ptr `IR_CONVERT`
+   between the `load_imm` and the call arg. `apply_abi_aliasing_hints` hints the
+   arg PReg to x0/x1 and `propagate_hint_through_copies` carries it backward —
+   but only across `IR_COPY`, so the convert broke the chain and its `load_imm`
+   source landed in the x8 scratch. Fixed by `try_fold_const_convert` in
+   `pass_combine.c`, which constant-folds `load_imm rT,k ; convert rD,rT` into a
+   single `load_imm rD,k'` (the now-dead `load_imm` is removed by DCE); the
+   folded `load_imm` inherits the convert's hinted dst directly. Covered by
+   `test/opt/zero_arg.sh`.
 
 2. **Redundant branch chain** (item 4). `DeleteTree`'s if/else merge emits
    `b A; A: b B` — a conditional branch to a label that just unconditionally
@@ -96,12 +103,13 @@ Two small constant adders beyond the prologue, both call-shaped:
 | --- | ---: | ---: | --- |
 | NewTreeNode | 14 | 16 | +2 prologue |
 | ItemCheck | 20 | 21 | +2 prologue |
-| BottomUpTree | 25 | 24 | +2 prologue, +2 zero-temp |
+| BottomUpTree | 23 | 24 | +2 prologue (zero-temp fixed) |
 | DeleteTree | 20 | 18 | +2 prologue, +1 branch |
 
 cfree already beats gcc -O0 on the malloc-heavy `NewTreeNode` (args stay in
-registers vs gcc's 8 stack shuffles) and ties `ItemCheck`. It loses
-`BottomUpTree`/`DeleteTree` purely on the fixed overhead above.
+registers vs gcc's 8 stack shuffles), ties `ItemCheck`, and — with the
+zero-temp fix — now edges ahead on `BottomUpTree`. `DeleteTree` remains the only
+loss, on the fixed prologue overhead plus the item-4 branch chain.
 
 ## Optimal target
 
diff --git a/src/opt/pass_combine.c b/src/opt/pass_combine.c
@@ -1,6 +1,8 @@
 #include <stdint.h>
 #include <string.h>
 
+#include <cfree/cg.h>
+
 #include "core/arena.h"
 #include "opt/opt_internal.h"
 
@@ -594,6 +596,56 @@ static int ext_params(const Inst* in, u32* src_bytes_out, u32* dst_bytes_out,
   return 1;
 }
 
+/* Width in bytes of a scalar integer or pointer type (1..8), else 0. Mirrors
+ * pass_simplify's simplify_width: builtin ints decode without the compiler,
+ * pointers fall back to the type-size query. */
+static u32 combine_scalar_width_bytes(Func* f, CfreeCgTypeId t) {
+  u32 b = builtin_int_bytes(t);
+  if (b) return b > 8u ? 0u : b;
+  if (f->c && cfree_cg_type_kind((CfreeCompiler*)f->c, t) == CFREE_CG_TYPE_PTR) {
+    u64 sz = cfree_cg_type_size((CfreeCompiler*)f->c, t);
+    if (sz && sz <= 8u) return (u32)sz;
+  }
+  return 0;
+}
+
+static u64 combine_width_mask(u32 bytes) {
+  return bytes >= 8u ? UINT64_MAX : ((1ull << (bytes * 8u)) - 1ull);
+}
+
+/* Compute the constant produced by applying an integer/pointer convert `k`
+ * (src width `sb`, dst width `db`, both bytes) to immediate `imm`. Returns 0
+ * for kinds that aren't a bit-preserving integer/pointer move (the float
+ * conversions reinterpret the value and must not be folded this way). The
+ * materialized-constant model: a load_imm puts (u64)imm into a register, so a
+ * widening move/zext keeps the low `sb` bits and a trunc/narrowing keeps the
+ * low `db` bits. */
+static int const_convert_value(ConvKind k, i64 imm, u32 sb, u32 db, i64* out) {
+  if (!sb || !db) return 0;
+  u64 src_mask = combine_width_mask(sb);
+  u64 dst_mask = combine_width_mask(db);
+  u64 v = (u64)imm;
+  switch (k) {
+    case CV_BITCAST:
+    case CV_ZEXT:
+      *out = (i64)((v & src_mask) & dst_mask);
+      return 1;
+    case CV_TRUNC:
+      *out = (i64)(v & dst_mask);
+      return 1;
+    case CV_SEXT: {
+      u64 low = v & src_mask;
+      u32 sbits = sb * 8u;
+      if (sbits < 64u && (low & (1ull << (sbits - 1u))))
+        low |= ~src_mask; /* replicate the sign bit into the high bits */
+      *out = (i64)(low & dst_mask);
+      return 1;
+    }
+    default:
+      return 0; /* CV_ITOF_*, CV_FTOI_*, CV_FEXT, CV_FTRUNC */
+  }
+}
+
 /* ---- producer-retarget legality (for sink rewrite) ---- */
 
 static int binop_is_commutative(BinOp op) {
@@ -1230,6 +1282,65 @@ static int try_ret_retarget(Func* f, Block* bl, i32 i) {
   return 1;
 }
 
+/* ---- Rewrite 5: constant-fold a convert of a load_imm into a load_imm ----
+ *
+ * `load_imm rT,k ; convert rD,rT` where the convert is a bit-preserving
+ * integer/pointer move collapses to `load_imm rD,k'` (k' = convert applied to
+ * k). The original load_imm, now dead, is removed by post-combine DCE.
+ *
+ * This is the convert-shaped sibling of the load_imm-into-copy fold (see
+ * try_substitute / combine_subst_slot). It matters for pointer-typed constant
+ * call args (e.g. NewTreeNode((void*)0, ...)): the arg-register hint reaches
+ * the convert's def but not its load_imm source, so without this the source
+ * lands in a scratch and the convert emits a `mov rD, scratch`. Folding lets
+ * the load_imm inherit the convert's (hinted) dst directly — `movz x0, 0`. */
+static int try_fold_const_convert(CombineCtx* ctx, Inst* in, i32 i) {
+  if ((IROp)in->op != IR_CONVERT || in->nopnds < 2) return 0;
+  if (in->opnds[0].kind != OPK_REG || in->opnds[1].kind != OPK_REG) return 0;
+  /* Integer/pointer domain only: a load_imm produces an int-class value, and
+   * the bit-preserving convert kinds we fold (BITCAST/ZEXT/SEXT/TRUNC) keep it
+   * int-class. A class change (RC_INT->RC_FP, e.g. an int->float bitcast) is
+   * not bit-preserving and must not collapse to a load_imm. */
+  if (in->opnds[0].cls != RC_INT || in->opnds[1].cls != RC_INT) return 0;
+
+  i32 prod_idx = ctx_producer_of(ctx, in->opnds[1].cls, in->opnds[1].v.reg);
+  if (prod_idx < 0 || prod_idx >= i) return 0;
+  Inst* prod = &ctx->bl->insts[prod_idx];
+  if ((IROp)prod->op != IR_LOAD_IMM || prod->nopnds < 1 ||
+      prod->opnds[0].kind != OPK_REG ||
+      !same_phys_reg(&prod->opnds[0], &in->opnds[1]))
+    return 0;
+
+  u32 sb = combine_scalar_width_bytes(ctx->f, in->opnds[1].type);
+  u32 db = combine_scalar_width_bytes(ctx->f, in->opnds[0].type);
+  i64 value = 0;
+  if (!const_convert_value((ConvKind)in->extra.imm, prod->extra.imm, sb, db,
+                           &value))
+    return 0;
+
+  /* Single-use + not-live-out gate on the load_imm dst, mirroring the SK_IMM
+   * gate in try_substitute_for_reg: only fold when this convert is the sole
+   * use, so we replace rather than duplicate the materialization. */
+  Operand src_def = prod->opnds[0];
+  int killed = 0;
+  if (count_uses_in_live_range(ctx->f, ctx->bl, prod_idx, &src_def, &killed) !=
+      1)
+    return 0;
+  if (!killed && opt_block_live_out_has_phys_reg(ctx->f, ctx->hard_live,
+                                                 ctx->bl->id, &src_def))
+    return 0;
+
+  /* Rewrite the convert in place into a load_imm of the converted constant.
+   * The dst operand (opnds[0]) is the hinted arg/return reg and is kept; the
+   * source operand is dropped. The dead load_imm at prod_idx is left for DCE. */
+  in->op = (u16)IR_LOAD_IMM;
+  in->type = in->opnds[0].type;
+  in->nopnds = 1;
+  in->extra.imm = value;
+  ctx->block_change_p = 1;
+  return 1;
+}
+
 /* ---- per-BB driver ---- */
 
 static int opt_combine_fold_block(Func* f, Block* bl,
@@ -1273,6 +1384,7 @@ static int opt_combine_fold_block(Func* f, Block* bl,
     }
 
     if (enable_o1_combine_rewrites) {
+      try_fold_const_convert(&ctx, in, i);
       try_combine_exts(&ctx, in, i);
       try_substitute(&ctx, in, i);
       try_addr_synth(&ctx, in, i);
diff --git a/src/opt/pass_jump.c b/src/opt/pass_jump.c
@@ -9,6 +9,7 @@ typedef struct JumpCleanupCtx {
   u32* emit_index_by_block;
   u32* forwarded_target_by_block;
   u32* forward_path;
+  u8* has_label_addr_ref;
 } JumpCleanupCtx;
 
 static u32 emit_order_index(const JumpCleanupCtx* c, u32 block) {
@@ -22,6 +23,35 @@ static u32 next_emit_block(const JumpCleanupCtx* c, u32 block) {
   return c->f->emit_order[idx + 1u];
 }
 
+static void mark_label_addr_refs(Func* f, u8* refs) {
+  if (!f || !refs) return;
+  for (u32 b = 0; b < f->nblocks; ++b) {
+    Block* bl = &f->blocks[b];
+    for (u32 i = 0; i < bl->ninsts; ++i) {
+      Inst* in = &bl->insts[i];
+      switch ((IROp)in->op) {
+        case IR_LOAD_LABEL_ADDR:
+          if ((u32)in->extra.imm < f->nblocks) refs[(u32)in->extra.imm] = 1;
+          break;
+        case IR_LOCAL_STATIC_DATA_LABEL_ADDR: {
+          CgIrLocalStaticLabelAux* aux =
+              (CgIrLocalStaticLabelAux*)in->extra.aux;
+          if (aux && (u32)aux->target < f->nblocks)
+            refs[(u32)aux->target] = 1;
+          break;
+        }
+        default:
+          break;
+      }
+    }
+  }
+}
+
+static int block_has_label_addr_ref(const JumpCleanupCtx* c, u32 block) {
+  if (!c || block >= c->f->nblocks) return 0;
+  return c->has_label_addr_ref && c->has_label_addr_ref[block];
+}
+
 static JumpCleanupCtx jump_cleanup_ctx(Func* f) {
   JumpCleanupCtx c;
   c.f = f;
@@ -30,6 +60,8 @@ static JumpCleanupCtx jump_cleanup_ctx(Func* f) {
   c.forwarded_target_by_block =
       arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u);
   c.forward_path = arena_array(f->arena, u32, f->nblocks ? f->nblocks : 1u);
+  c.has_label_addr_ref =
+      arena_zarray(f->arena, u8, f->nblocks ? f->nblocks : 1u);
   for (u32 b = 0; b < f->nblocks; ++b) c.emit_index_by_block[b] = BLOCK_NONE;
   for (u32 b = 0; b < f->nblocks; ++b)
     c.forwarded_target_by_block[b] = BLOCK_NONE;
@@ -37,6 +69,7 @@ static JumpCleanupCtx jump_cleanup_ctx(Func* f) {
     u32 b = f->emit_order[i];
     if (b < f->nblocks) c.emit_index_by_block[b] = i;
   }
+  mark_label_addr_refs(f, c.has_label_addr_ref);
   return c;
 }
 
@@ -45,7 +78,7 @@ static int single_jump_block(const JumpCleanupCtx* c, u32 block,
   Func* f = c->f;
   if (block >= f->nblocks) return 0;
   Block* bl = &f->blocks[block];
-  if (bl->mc_label != MC_LABEL_NONE) return 0;
+  if (block_has_label_addr_ref(c, block)) return 0;
   if (bl->ninsts != 1 || bl->nsucc != 1) return 0;
   if ((IROp)bl->insts[0].op != IR_BR) return 0;
   if (target_out) *target_out = bl->succ[0];
@@ -57,7 +90,7 @@ static int empty_fallthrough_block(const JumpCleanupCtx* c, u32 block,
   Func* f = c->f;
   if (block >= f->nblocks || block == f->entry) return 0;
   Block* bl = &f->blocks[block];
-  if (bl->mc_label != MC_LABEL_NONE) return 0;
+  if (block_has_label_addr_ref(c, block)) return 0;
   if (bl->ninsts != 0 || bl->nsucc != 0) return 0;
   u32 idx = emit_order_index(c, block);
   if (idx == BLOCK_NONE || idx + 1u >= f->emit_order_n) return 0;
@@ -67,6 +100,30 @@ static int empty_fallthrough_block(const JumpCleanupCtx* c, u32 block,
   return 1;
 }
 
+static int passthrough_succ_block(const JumpCleanupCtx* c, u32 block,
+                                  u32* target_out) {
+  Func* f = c->f;
+  if (block >= f->nblocks || block == f->entry) return 0;
+  Block* bl = &f->blocks[block];
+  if (block_has_label_addr_ref(c, block)) return 0;
+  if (bl->nsucc != 1) return 0;
+  if (bl->succ[0] == block) return 0;
+  if (bl->ninsts != 0) {
+    if (bl->ninsts != 1) return 0;
+    switch ((IROp)bl->insts[0].op) {
+      case IR_NOP:
+      case IR_SCOPE_BEGIN:
+      case IR_SCOPE_ELSE:
+      case IR_SCOPE_END:
+        break;
+      default:
+        return 0;
+    }
+  }
+  if (target_out) *target_out = bl->succ[0];
+  return 1;
+}
+
 static u32 forward_jump_target_ex(JumpCleanupCtx* c, u32 target,
                                   int allow_empty_fallthrough) {
   u32 cur = target;
@@ -84,7 +141,9 @@ static u32 forward_jump_target_ex(JumpCleanupCtx* c, u32 target,
       break;
     }
     if (!single_jump_block(c, cur, &next) &&
-        (!allow_empty_fallthrough || !empty_fallthrough_block(c, cur, &next))) {
+        (!allow_empty_fallthrough ||
+         (!passthrough_succ_block(c, cur, &next) &&
+          !empty_fallthrough_block(c, cur, &next)))) {
       result = cur;
       c->forwarded_target_by_block[cur] = result;
       break;
@@ -169,7 +228,7 @@ static void cleanup_branch_targets(JumpCleanupCtx* c) {
     }
     for (u32 s = 0; s < nsucc; ++s) {
       u32 target = bl->succ[s];
-      u32 forwarded = forward_jump_target(c, target);
+      u32 forwarded = forward_jump_target_ex(c, target, 1);
       if (forwarded < f->nblocks) bl->succ[s] = forwarded;
     }
   }
diff --git a/test/opt/cg_ir_lower_test.c b/test/opt/cg_ir_lower_test.c
@@ -188,8 +188,71 @@ static void converter_builds_cfg_and_pregs(void) {
   tc_fini(&tc);
 }
 
+static void jump_cleanup_threads_empty_fallthrough_target(void) {
+  TestCtx tc;
+  tc_init(&tc);
+
+  Func f;
+  memset(&f, 0, sizeof f);
+  f.c = tc.c;
+  f.arena = tc.c->tu;
+  f.entry = ir_block_new(&f);
+  u32 then_block = ir_block_new(&f);
+  u32 scope_block = ir_block_new(&f);
+  u32 empty_block = ir_block_new(&f);
+  u32 merge_block = ir_block_new(&f);
+  ir_note_emit(&f, f.entry);
+  ir_note_emit(&f, then_block);
+  ir_note_emit(&f, scope_block);
+  ir_note_emit(&f, empty_block);
+  ir_note_emit(&f, merge_block);
+
+  Inst* br = ir_emit(&f, f.entry, IR_CMP_BRANCH);
+  br->extra.imm = CMP_EQ;
+  f.blocks[f.entry].succ[0] = scope_block;
+  f.blocks[f.entry].succ[1] = then_block;
+  f.blocks[f.entry].nsucc = 2;
+
+  Inst* then_body = ir_emit(&f, then_block, IR_LOAD_IMM);
+  (void)then_body;
+  Inst* then_br = ir_emit(&f, then_block, IR_BR);
+  (void)then_br;
+  f.blocks[then_block].succ[0] = merge_block;
+  f.blocks[then_block].nsucc = 1;
+
+  Inst* scope_end = ir_emit(&f, scope_block, IR_SCOPE_END);
+  (void)scope_end;
+  f.blocks[scope_block].succ[0] = empty_block;
+  f.blocks[scope_block].nsucc = 1;
+
+  f.blocks[empty_block].succ[0] = merge_block;
+  f.blocks[empty_block].nsucc = 1;
+
+  Inst* ret = ir_emit(&f, merge_block, IR_RET);
+  (void)ret;
+
+  opt_build_cfg(&f);
+  EXPECT(f.blocks[f.entry].succ[0] == scope_block,
+         "precondition: branch should target scope block");
+  EXPECT(f.blocks[scope_block].npreds == 1,
+         "precondition: scope block should have one predecessor");
+
+  opt_jump_cleanup(&f, OPT_JUMP_CLEANUP_CFG);
+  opt_build_cfg(&f);
+
+  EXPECT(f.blocks[f.entry].succ[0] == merge_block,
+         "branch target should forward through empty fallthrough block");
+  EXPECT(f.blocks[scope_block].npreds == 0 && f.blocks[scope_block].nsucc == 0,
+         "scope block should become unreachable after forwarding");
+  EXPECT(f.blocks[empty_block].npreds == 0 && f.blocks[empty_block].nsucc == 0,
+         "empty block should become unreachable after forwarding");
+
+  tc_fini(&tc);
+}
+
 int main(void) {
   converter_builds_cfg_and_pregs();
+  jump_cleanup_threads_empty_fallthrough_target();
   if (g_fails) {
     fprintf(stderr, "cg-ir-lower: %d/%d failed\n", g_fails, g_checks);
     return 1;
diff --git a/test/opt/zero_arg.sh b/test/opt/zero_arg.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+# Behavioral check for the O1 "zero through a temp" call-arg wart (PERCALL.md
+# item 3). A pointer-typed null argument (`(void*)0`) introduces an int->ptr
+# IR_CONVERT between the constant's `load_imm` and the call arg. At -O1 the
+# arg-register hint reaches the convert's def but not its load_imm source, so
+# the source lands in a scratch (x8) and the convert emits `mov x0, x8` --
+# routing each zero through a temp:
+#
+#   movz x8, 0x0 ; mov x0, x8 ; movz x8, 0x0 ; mov x1, x8     (4 insns)
+#
+# Optimal is `movz x0, 0x0 ; movz x1, 0x0` (2 insns, no temp). This test
+# asserts the args are materialized directly into the arg registers.
+set -euo pipefail
+
+ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
+CFREE="${CFREE:-$ROOT/build/cfree}"
+WORK="$ROOT/build/test/opt/zero_arg"
+mkdir -p "$WORK"
+
+SRC="$WORK/leaf.c"
+cat > "$SRC" <<'EOF'
+#define NULL ((void*)0)
+typedef struct tn { struct tn* l; struct tn* r; } treeNode;
+treeNode* NewTreeNode(treeNode* left, treeNode* right);
+treeNode* leaf(void) { return NewTreeNode(NULL, NULL); }
+EOF
+
+"$CFREE" cc -target aarch64-linux-gnu -O1 -std=c99 -c "$SRC" \
+  -o "$WORK/leaf.o" > "$WORK/cc.out" 2>&1
+"$CFREE" objdump -d "$WORK/leaf.o" > "$WORK/dis.out" 2>&1
+
+# Slice out the leaf() function body.
+awk '
+  /^[0-9a-f]+ <leaf>:/ { in_fn = 1; next }
+  /^[0-9a-f]+ </ { in_fn = 0 }
+  in_fn { print }
+' "$WORK/dis.out" > "$WORK/leaf.dis"
+
+fail() {
+  printf 'zero-arg check FAILED: %s\n' "$1" >&2
+  sed 's/^/  | /' "$WORK/leaf.dis" >&2
+  exit 1
+}
+
+# The wart: a register-to-register move into an arg register (the zero routed
+# through a scratch temp). Optimal codegen materializes the constant straight
+# into the arg reg, so there must be no `mov x0, x<n>` / `mov x1, x<n>`.
+# (`movz x0, ...` is a separate mnemonic and is not matched by `\bmov x`.)
+if grep -Eq '\bmov[[:space:]]+x[01],[[:space:]]*x[0-9]' "$WORK/leaf.dis"; then
+  fail 'arg zero routed through a scratch temp (mov x0/x1, x<n>)'
+fi
+
+# Both args must be loaded directly with movz into x0 and x1.
+if ! grep -Eq '\bmovz[[:space:]]+x0,' "$WORK/leaf.dis"; then
+  fail 'arg x0 not materialized directly (movz x0, ...)'
+fi
+if ! grep -Eq '\bmovz[[:space:]]+x1,' "$WORK/leaf.dis"; then
+  fail 'arg x1 not materialized directly (movz x1, ...)'
+fi
+
+printf 'zero-arg: ok\n'
diff --git a/test/test.mk b/test/test.mk
@@ -509,7 +509,7 @@ test-macho: lib $(TEST_RT_DEP) $(ROUNDTRIP_BIN_MACHO) $(LINK_EXE_RUNNER) $(JIT_R
 OPT_TEST_BIN = build/test/cg_ir_lower_test
 TINY_INLINE_TEST_BIN = build/test/tiny_inline_test
 
-test-opt: bin $(OPT_TEST_BIN) test-opt-tiny-inline test-opt-inline
+test-opt: bin $(OPT_TEST_BIN) test-opt-tiny-inline test-opt-inline test-opt-zero-arg
 	$(OPT_TEST_BIN)
 
 $(OPT_TEST_BIN): test/opt/cg_ir_lower_test.c $(LIB_OBJS)
@@ -527,6 +527,11 @@ $(TINY_INLINE_TEST_BIN): test/opt/tiny_inline_test.c $(LIB_OBJS)
 test-opt-inline: bin
 	@CFREE=$(abspath $(BIN)) bash test/opt/run.sh
 
+# Behavioral disasm check: a pointer-typed null call arg is not routed through
+# a scratch temp at -O1 (PERCALL.md item 3, "zero through a temp").
+test-opt-zero-arg: bin
+	@CFREE=$(abspath $(BIN)) bash test/opt/zero_arg.sh
+
 test-parse: test-parse-ok test-parse-err
 
 test-parse-ok: lib $(TEST_RT_DEP) $(PARSE_RUNNER) $(ROUNDTRIP_BIN) $(LINK_EXE_RUNNER) $(JIT_RUNNER)

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	doc/PERCALL.md	\|	28	++++++++++++++++++----------
M	src/opt/pass_combine.c	\|	112	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	src/opt/pass_jump.c	\|	67	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
M	test/opt/cg_ir_lower_test.c	\|	63	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A	test/opt/zero_arg.sh	\|	61	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M	test/test.mk	\|	7	++++++-