Add OPT benchmark harness - kit

commit 28f717c79bf761e9f3bcf3b33276733f67a37347
parent f9d2f3384870b09083786f88b7ecdf5209f68794
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 22 May 2026 04:54:01 -0700

Add OPT benchmark harness

Diffstat:
M Makefile  | 5 ++++-
M doc/OPT.md  | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M driver/run.c  | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
A scripts/opt_bench.sh  | 463 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

4 files changed, 646 insertions(+), 10 deletions(-)
diff --git a/Makefile b/Makefile
@@ -42,7 +42,7 @@ LIB_AR      = build/libcfree.a
 LANG_TOY_AR = build/libcfree_toy.a
 BIN         = build/cfree
 
-.PHONY: all lib bin format clean bootstrap
+.PHONY: all lib bin format clean bootstrap bench-opt
 
 all: lib bin
 
@@ -115,6 +115,9 @@ bootstrap: $(BIN)
 	cmp $(BOOTSTRAP_STAGE2_BIN) $(BOOTSTRAP_STAGE3_BIN)
 	shasum -a 256 $(BOOTSTRAP_STAGE2_BIN) $(BOOTSTRAP_STAGE3_BIN)
 
+# Run the optimizer benchmark harness (scripts/opt_bench.sh). The `bin`
+# prerequisite is brought up to date first; the recipe is silenced with `@`
+# because the script prints its own progress lines.
+bench-opt: bin
+	@bash scripts/opt_bench.sh
+
 format:
 	find src include driver lang test rt \( -path test/lex -o -path test/pp \) -prune -o \( -name '*.c' -o -name '*.h' \) -print | xargs clang-format -i --style=google
 
diff --git a/doc/OPT.md b/doc/OPT.md
@@ -1151,14 +1151,84 @@ Benchmark set:
   benchmark that requires only supported libc/runtime features.
 - cfree-specific stress cases for ABI, TLS, atomics, and inline asm.
 
+Primary harness:
+
+```
+make bench-opt
+```
+
+`scripts/opt_bench.sh` compares MIR `c2m`, `clang`, `gcc-15`, `cfree cc`,
+and `cfree run` at `-O0`, `-O1`, and `-O2` on MIR's `c-benchmarks` sources
+from `~/tmp/mir/c-benchmarks`. It writes per-case logs, `results.csv`, and a
+geomean summary under `build/bench/opt/`.
+
+The hosted `cfree cc` lane passes `--sysroot` and `-lc`; on Darwin the
+sysroot defaults to `xcrun --show-sdk-path`, and it can be overridden with
+`CFREE_OPT_BENCH_SYSROOT`. The `cfree run` lane is an in-process JIT run for
+direct comparison with MIR; it also passes `--sysroot` and `-lc`, which makes
+the driver add the hosted libc include directories and defines, and it relies
+on host-symbol fallback (host `dlsym`) to resolve libc calls.
+
+The benchmark intentionally records unsupported cases instead of stopping at
+the first failure. A cfree row that cannot compile a hosted benchmark is a
+`COMPILE_FAIL` data point, not a harness failure; this lets optimizer work
+track coverage and performance in the same run.
+
+Useful focused runs:
+
+```
+CFREE_OPT_BENCHES="sieve spectral-norm" make bench-opt
+CFREE_OPT_BENCH_LEVELS="1 2" CFREE_OPT_BENCH_RUN_REPEATS=5 make bench-opt
+GCC=/opt/homebrew/bin/gcc-15 MIR_DIR=~/tmp/mir make bench-opt
+```
+
 Measure:
 
 - Compile wall time for `-O0`, `-O1`, `-O2`.
-- Executable run time against clang/gcc `-O2` when available.
+- Executable run time against `gcc-15 -O2`, `clang -O2`, and MIR `c2m -O2`
+  when available.
+- MIR-specific split: `compile_ms` is C-to-binary-MIR time, `codegen_ms` is
+  the JIT link/generation slice reported by `c2m -v`, and `runtime_ms` is the
+  execution slice that `c2m -v` reports for the generated code. Native
+  compiler rows use compile+link wall time for `compile_ms` and executable
+  wall time for `runtime_ms`.
+- `cfree-run` uses `cfree run --bench-time`: `compile_ms` is compile+JIT time,
+  and `runtime_ms` is the in-process entry-call execution slice.
 - Code size for hot text sections.
 - Pass counters: removed GVN expressions, folded branches, removed stores,
   coalesced moves, spills/restores, split ranges, post-RA deleted moves.
 
+Initial representative run, 2026-05-22:
+
+- Scope: `array`, `binary-trees`, `hash`, `hash2`, `matrix`, `nbody`,
+  `sieve`, and `spectral-norm`; levels `0 1 2`; one compile repeat and one
+  run repeat. Output was written to `build/bench/opt/results.csv` and
+  `build/bench/opt/summary.md`.
+- Coverage: 120 data rows; 91 `OK`, 18 `COMPILE_FAIL`, 9 `RUN_FAIL`, and
+  2 `OUTPUT_FAIL`.
+- `gcc-15` and `clang` completed all rows. cfree and MIR are blocked on some
+  hosted/math cases: `binary-trees`, `nbody`, and `spectral-norm` hit Darwin
+  `math.h`/builtin compatibility issues. cfree also has an `-O2` wrong-code
+  failure on `matrix`.
+- Runtime geomean versus `gcc-15 -O2` on completed rows:
+  `cfree-run` improved from `0.372x` at `-O0` to `0.445x` at `-O1` and
+  `0.480x` at `-O2`; MIR improved from `0.554x` to `0.609x` to `0.796x`.
+  `clang -O2` measured `1.076x`.
+- Compile/JIT geomean versus `gcc-15 -O2`: `cfree-run` was about `11.5x` to
+  `12.0x` faster, while MIR was about `2.7x` faster. On cases where both
+  completed, cfree-run compile/JIT time was roughly 4-5x faster than MIR.
+- Direct cfree-run versus MIR runtime on common successful cases:
+  `0.67x` at `-O0`, `0.73x` at `-O1`, and `0.57x` at `-O2`. This confirms
+  the current split: cfree's compile/JIT path is very fast, but generated-code
+  quality still trails MIR, especially at `-O2`.
+
+Immediate benchmark blockers:
+
+- Fix the `matrix -O2` wrong-code regression before trusting O2 timing.
+- Add or model the hosted math builtins needed by Darwin `math.h`, starting
+  with `__builtin_fabsf`.
+- Re-run the full MIR benchmark set after those blockers, then increase repeat
+  counts for stable numbers.
+
 Target:
 
 - `-O1` should be the fast optimized tier and materially faster to compile
diff --git a/driver/run.c b/driver/run.c
@@ -1,15 +1,15 @@
+#include <cfree/compile.h>
+#include <cfree/core.h>
+#include <cfree/jit.h>
+#include <cfree/link.h>
 #include <stdint.h>
 #include <stdlib.h>
 
 #include "cflags.h"
 #include "driver.h"
+#include "hosted.h"
 #include "inputs.h"
 
-#include <cfree/compile.h>
-#include <cfree/core.h>
-#include <cfree/jit.h>
-#include <cfree/link.h>
-
 /* `cfree run` — JIT-compile one or more inputs and invoke the entry symbol
  * (default `main`) in-process. Args after `--` are passed to the JITed
  * program as argv. Mirrors the cc front-end for input shape (.c / - sources,
@@ -30,10 +30,14 @@ typedef struct RunOptions {
   int opt_level;
   int debug_info;
   int metrics;
+  int bench_time;
   int warnings_are_errors; /* -Werror     */
   uint32_t max_errors;     /* -fmax-errors=N */
   const char* entry;       /* -e, default "main" */
+  const char* sysroot;     /* --sysroot   */
+  int wants_hosted_libc;   /* -lc         */
   CfreeTarget target;      /* -target / host */
+  DriverHostedPlan hosted;
 
   DriverCflags cf;
   DriverInputs inputs;
@@ -75,8 +79,8 @@ static void run_metrics_scope_end(void* user, const char* name) {
   f = m->stack[depth];
   end_ns = driver_now_ns();
   elapsed = (end_ns >= f.start_ns) ? (end_ns - f.start_ns) : 0;
-  driver_logf("%*s%s %.3f ms", (int)(depth * 2u), "",
-              f.name ? f.name : name, (double)elapsed / 1000000.0);
+  driver_logf("%*s%s %.3f ms", (int)(depth * 2u), "", f.name ? f.name : name,
+              (double)elapsed / 1000000.0);
 }
 
 static void run_metrics_count(void* user, const char* name, uint64_t value) {
@@ -109,6 +113,10 @@ static void run_metrics_finish(RunMetrics* m) {
   }
 }
 
+/* Log one `--bench-time` sample: the slice `name` followed by `ns` converted
+ * from nanoseconds to milliseconds and printed with three decimals. */
+static void run_bench_time(const char* name, uint64_t ns) {
+  const double elapsed_ms = (double)ns / 1000000.0;
+  driver_logf("cfree-run %s -- %.3f msec", name, elapsed_ms);
+}
+
 static void run_usage(void) {
   driver_errf(RUN_TOOL, "%s",
               "usage: cfree run [options] inputs... [-- prog-arg...]\n"
@@ -146,8 +154,12 @@ void driver_help_run(void) {
       "  -O0 -O1 -O2       Optimization level (default -O0)\n"
       "  -g                Emit DWARF debug info\n"
       "  --time, --metrics Emit scoped compile/link/JIT timing to stderr\n"
+      "  --bench-time      Emit parseable compile/JIT/execution timings\n"
       "  -e SYMBOL         Entry symbol (default `main`)\n"
       "  -target TRIPLE    Cross-compile target (see `cfree cc --help`)\n"
+      "  --sysroot DIR     Hosted libc sysroot for headers/defines with -lc\n"
+      "  -lc               Enable hosted libc headers/defines; calls resolve "
+      "via host dlsym\n"
       "  -fPIC -fpic       Position-independent code (no-op for the JIT)\n"
       "  -fPIE -fpie       Position-independent executable (no-op for the "
       "JIT)\n"
@@ -192,7 +204,9 @@ static int run_alloc_arrays(RunOptions* o, int argc) {
     return 1;
   }
   if (driver_inputs_init(&o->inputs, o->env, RUN_TOOL, argc) != 0) return 1;
-  if (driver_cflags_init(&o->cf, o->env, argc) != 0) {
+  if (driver_cflags_init(
+          &o->cf, o->env,
+          argc + DRIVER_HOSTED_MAX_INCLUDES + DRIVER_HOSTED_MAX_DEFINES) != 0) {
     driver_errf(RUN_TOOL, "out of memory");
     return 1;
   }
@@ -245,6 +259,31 @@ static int run_classify_positional(RunOptions* o, const char* a) {
   return 0;
 }
 
+/* Fold the hosted-libc profile into the compile flags when `-lc` was given.
+ * Resolves a hosted plan for the selected target and sysroot, then appends
+ * the plan's system include directories and defines to `o->cf`. Returns 0 on
+ * success (including when hosted libc was not requested) and 1 when the plan
+ * cannot be resolved. */
+static int run_apply_hosted_profile(RunOptions* o) {
+  DriverHostedRequest req = {0};
+  uint32_t idx;
+
+  if (o->wants_hosted_libc == 0) return 0;
+
+  req.env = o->env;
+  req.tool = RUN_TOOL;
+  req.target = o->target;
+  req.sysroot = o->sysroot;
+  /* Only headers/defines are requested: no static link, no link inputs. */
+  req.static_link = 0;
+  req.link_inputs = 0;
+  if (driver_hosted_resolve(&req, &o->hosted) != 0) return 1;
+
+  idx = 0;
+  while (idx < o->hosted.nsystem_includes) {
+    o->cf.system_include_dirs[o->cf.nsystem_include_dirs] =
+        o->hosted.system_includes[idx];
+    o->cf.nsystem_include_dirs++;
+    ++idx;
+  }
+  idx = 0;
+  while (idx < o->hosted.ndefines) {
+    o->cf.defines[o->cf.ndefines] = o->hosted.defines[idx];
+    o->cf.ndefines++;
+    ++idx;
+  }
+  return 0;
+}
+
 static int run_parse(int argc, char** argv, RunOptions* o) {
   int i;
   int after_dash_dash = 0;
@@ -278,6 +317,10 @@ static int run_parse(int argc, char** argv, RunOptions* o) {
       o->debug_info = 1;
       continue;
     }
+    if (driver_streq(a, "--bench-time")) {
+      o->bench_time = 1;
+      continue;
+    }
     if (driver_streq(a, "--time") || driver_streq(a, "--metrics")) {
       o->metrics = 1;
       continue;
@@ -342,6 +385,43 @@ static int run_parse(int argc, char** argv, RunOptions* o) {
       }
       continue;
     }
+    if (driver_streq(a, "--sysroot") || driver_streq(a, "-isysroot")) {
+      if (++i >= argc) {
+        driver_errf(RUN_TOOL, "%s requires an argument", a);
+        return 1;
+      }
+      o->sysroot = argv[i];
+      continue;
+    }
+    if (driver_strneq(a, "--sysroot=", 10)) {
+      o->sysroot = a + 10;
+      continue;
+    }
+    if (driver_streq(a, "-lc")) {
+      o->wants_hosted_libc = 1;
+      continue;
+    }
+    if (driver_streq(a, "-l")) {
+      if (++i >= argc) {
+        driver_errf(RUN_TOOL, "-l requires an argument");
+        return 1;
+      }
+      if (!driver_streq(argv[i], "c")) {
+        driver_errf(RUN_TOOL, "unsupported hosted library for JIT: -l%s",
+                    argv[i]);
+        return 1;
+      }
+      o->wants_hosted_libc = 1;
+      continue;
+    }
+    if (driver_strneq(a, "-l", 2)) {
+      if (!driver_streq(a + 2, "c")) {
+        driver_errf(RUN_TOOL, "unsupported hosted library for JIT: %s", a);
+        return 1;
+      }
+      o->wants_hosted_libc = 1;
+      continue;
+    }
 
     if (driver_streq(a, "-e")) {
       if (++i >= argc) {
@@ -370,6 +450,7 @@ static int run_parse(int argc, char** argv, RunOptions* o) {
     return 1;
   }
   if (!o->entry) o->entry = "main";
+  if (run_apply_hosted_profile(o) != 0) return 1;
 
   /* Synthetic argv[0]. Hosted programs conventionally read argv[0] as
    * the program name; under `cfree run` there is no executable path, so
@@ -380,6 +461,7 @@ static int run_parse(int argc, char** argv, RunOptions* o) {
 
 static void run_options_release(RunOptions* o) {
   size_t bound = o->argv_bound;
+  driver_hosted_plan_fini(o->env, &o->hosted);
   driver_inputs_release(&o->inputs);
   driver_cflags_fini(&o->cf, o->env);
   driver_free(o->env, o->prog_argv, bound * sizeof(*o->prog_argv));
@@ -467,6 +549,11 @@ int driver_run(int argc, char** argv) {
   void* sym;
   MainFn entry_fn;
   int rc;
+  uint64_t bench_total_start = 0;
+  uint64_t bench_compile_start = 0;
+  uint64_t bench_compile_end = 0;
+  uint64_t bench_exec_start = 0;
+  uint64_t bench_exec_end = 0;
 
   if (argc < 2 || driver_argv_wants_help(argc, argv, 1)) {
     driver_help_run();
@@ -489,6 +576,7 @@ int driver_run(int argc, char** argv) {
     driver_logf("cfree metrics:");
     run_metrics_begin(metrics, "run.total");
   }
+  if (ro.bench_time) bench_total_start = driver_now_ns();
 
   /* Compiler backs the JIT image — keep it alive across cfree_jit_lookup
    * and the entry call, free after cfree_jit_free. */
@@ -502,10 +590,15 @@ int driver_run(int argc, char** argv) {
     return 1;
   }
 
+  if (ro.bench_time) bench_compile_start = driver_now_ns();
   run_metrics_begin(metrics, "run.compile_and_jit");
   rc = run_compile_and_jit(&ro, compiler, &jhost, &jit);
   run_metrics_end(metrics, "run.compile_and_jit");
+  if (ro.bench_time) bench_compile_end = driver_now_ns();
   if (rc != 0) {
+    if (ro.bench_time)
+      run_bench_time("compile_and_jit",
+                     bench_compile_end - bench_compile_start);
     driver_compiler_free(compiler);
     run_metrics_finish(metrics);
     run_options_release(&ro);
@@ -536,9 +629,16 @@ int driver_run(int argc, char** argv) {
   }
 
   run_metrics_begin(metrics, "run.entry_call");
+  if (ro.bench_time) bench_exec_start = driver_now_ns();
   if (!run_call_wasm_entry(&ro, jit, sym, &rc))
     rc = entry_fn((int)ro.prog_argc, ro.prog_argv);
+  if (ro.bench_time) bench_exec_end = driver_now_ns();
   run_metrics_end(metrics, "run.entry_call");
+  if (ro.bench_time) {
+    run_bench_time("compile_and_jit", bench_compile_end - bench_compile_start);
+    run_bench_time("execution", bench_exec_end - bench_exec_start);
+    run_bench_time("total", bench_exec_end - bench_total_start);
+  }
 
   cfree_jit_free(jit);
   driver_compiler_free(compiler);
diff --git a/scripts/opt_bench.sh b/scripts/opt_bench.sh
@@ -0,0 +1,463 @@
+#!/usr/bin/env bash
+set -uo pipefail
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+MIR_DIR="${MIR_DIR:-$HOME/tmp/mir}"
+MIR_C2M="${MIR_C2M:-$MIR_DIR/c2m}"
+CFREE="${CFREE:-$ROOT/build/cfree}"
+CLANG="${CLANG:-clang}"
+GCC="${GCC:-gcc-15}"
+OUT_DIR="${CFREE_OPT_BENCH_OUT:-$ROOT/build/bench/opt}"
+CFREE_SYSROOT="${CFREE_OPT_BENCH_SYSROOT:-}"
+
+LEVELS="${CFREE_OPT_BENCH_LEVELS:-0 1 2}"
+BENCHES="${CFREE_OPT_BENCHES:-array binary-trees except funnkuch-reduce hash hash2 heapsort lists matrix method-call mandelbrot nbody sieve spectral-norm strcat}"
+COMPILE_REPEATS="${CFREE_OPT_BENCH_COMPILE_REPEATS:-1}"
+RUN_REPEATS="${CFREE_OPT_BENCH_RUN_REPEATS:-3}"
+MIR_MAKE="${MIR_MAKE:-}"
+
+case "$(uname -s 2>/dev/null || true)" in
+  Darwin)
+    DEFAULT_MATH_LIBS=""
+    if command -v xcrun >/dev/null 2>&1; then
+      DEFAULT_SYSROOT="$(xcrun --show-sdk-path)"
+      DEFAULT_CFLAGS_EXTRA="-isysroot $DEFAULT_SYSROOT"
+    else
+      DEFAULT_SYSROOT=""
+      DEFAULT_CFLAGS_EXTRA=""
+    fi
+    ;;
+  *)
+    DEFAULT_MATH_LIBS="-lm"
+    DEFAULT_SYSROOT="${SYSROOT:-}"
+    DEFAULT_CFLAGS_EXTRA=""
+    ;;
+esac
+if [ -z "$CFREE_SYSROOT" ]; then
+  CFREE_SYSROOT="$DEFAULT_SYSROOT"
+fi
+MATH_LIBS="${CFREE_OPT_BENCH_MATH_LIBS:-$DEFAULT_MATH_LIBS}"
+CFLAGS_EXTRA="${CFREE_OPT_BENCH_CFLAGS:-$DEFAULT_CFLAGS_EXTRA}"
+CFREE_FLAGS_EXTRA="${CFREE_OPT_BENCH_CFREE_FLAGS:-}"
+CFREE_RUN_FLAGS_EXTRA="${CFREE_OPT_BENCH_CFREE_RUN_FLAGS:-}"
+
+CSV="$OUT_DIR/results.csv"
+SUMMARY="$OUT_DIR/summary.md"
+LOG_DIR="$OUT_DIR/logs"
+BIN_DIR="$OUT_DIR/bin"
+
+mkdir -p "$OUT_DIR" "$LOG_DIR" "$BIN_DIR"
+
+now_ns() {
+  python3 -c 'import time; print(time.monotonic_ns())'
+}
+
+ns_to_ms() {
+  awk -v ns="$1" 'BEGIN { printf "%.3f", ns / 1000000.0 }'
+}
+
+min_ms() {
+  awk -v a="$1" -v b="$2" 'BEGIN {
+    if (a == "" || a == "NA") printf "%s", b;
+    else if (b + 0 < a + 0) printf "%s", b;
+    else printf "%s", a;
+  }'
+}
+
+# Print $1 as one quoted CSV field: wrapped in double quotes, with embedded
+# double quotes doubled. No trailing newline is written.
+#
+# Done with shell parameter expansion rather than a per-line sed pipeline so
+# that an empty value still yields `""` and a value containing newlines stays
+# inside a single quoted field; it also avoids two forks per field.
+csv_field() {
+  # The quote characters live in variables so the substitution parses the
+  # same way on old and new bash versions.
+  local quote='"'
+  local doubled='""'
+  printf '"%s"' "${1//$quote/$doubled}"
+}
+
+record_row() {
+  local bench="$1" tool="$2" opt="$3" status="$4"
+  local compile_ms="$5" codegen_ms="$6" runtime_ms="$7" rc="$8" log="$9"
+  {
+    csv_field "$bench"; printf ','
+    csv_field "$tool"; printf ','
+    csv_field "$opt"; printf ','
+    csv_field "$status"; printf ','
+    csv_field "$compile_ms"; printf ','
+    csv_field "$codegen_ms"; printf ','
+    csv_field "$runtime_ms"; printf ','
+    csv_field "$rc"; printf ','
+    csv_field "$log"; printf '\n'
+  } >>"$CSV"
+}
+
+# Run a command with its output captured and its wall time measured.
+#   $1 = file receiving stdout, $2 = file receiving stderr,
+#   remaining arguments = the command line to execute.
+# Results are left in globals: RUN_RC holds the command's exit status and
+# RUN_MS the elapsed time in milliseconds.
+run_timed() {
+  local stdout_file="$1" stderr_file="$2"
+  local began ended
+  shift 2
+  began="$(now_ns)"
+  "$@" >"$stdout_file" 2>"$stderr_file"
+  RUN_RC=$?  # must be read before anything else resets $?
+  ended="$(now_ns)"
+  RUN_MS="$(ns_to_ms "$((ended - began))")"
+}
+
+read_arg_file() {
+  local f="$1"
+  if [ -f "$f" ]; then
+    sh "$f"
+  fi
+}
+
+check_expected() {
+  local expect="$1" got="$2" diff_out="$3"
+  if [ ! -f "$expect" ]; then
+    return 0
+  fi
+  cmp "$expect" "$got" >/dev/null 2>&1 && return 0
+  diff -u "$expect" "$got" >"$diff_out" 2>&1
+  return 1
+}
+
+# Extract one timing, in milliseconds, from a timing log.
+#   $1 = awk regular expression selecting the line, $2 = log file.
+# The first matching line is used; its last field is read as the unit and the
+# field before it as the value. "usec" values are divided by 1000, anything
+# else is taken as-is. Prints the value with three decimals, or nothing when
+# no line matches.
+parse_mir_ms() {
+  local regex="$1" log_file="$2"
+  awk -v regex="$regex" '
+    $0 !~ regex { next }
+    {
+      ms = ($NF == "usec") ? $(NF - 1) / 1000.0 : $(NF - 1) + 0
+      printf "%.3f\n", ms
+      exit
+    }
+  ' "$log_file"
+}
+
+# Make sure the MIR `c2m` binary at $MIR_C2M is available, building it in
+# $MIR_DIR when it is not already executable.
+# Uses $MIR_MAKE as the make program; when that is empty, gmake is preferred
+# over make. Returns 0 when $MIR_C2M is executable, non-zero otherwise.
+ensure_mir() {
+  if [ -x "$MIR_C2M" ]; then
+    return 0
+  fi
+  if [ ! -d "$MIR_DIR" ]; then
+    printf 'opt-bench: MIR_DIR does not exist: %s\n' "$MIR_DIR" >&2
+    return 1
+  fi
+  if [ -z "$MIR_MAKE" ]; then
+    if command -v gmake >/dev/null 2>&1; then
+      MIR_MAKE=gmake
+    else
+      MIR_MAKE=make
+    fi
+  fi
+  printf 'opt-bench: building MIR c2m with %s -C %s c2m\n' "$MIR_MAKE" "$MIR_DIR"
+  "$MIR_MAKE" -C "$MIR_DIR" c2m || return $?
+  # A successful build does not guarantee the configured path exists (for
+  # example when MIR_C2M was overridden), so check it before benchmarking.
+  if [ ! -x "$MIR_C2M" ]; then
+    printf 'opt-bench: MIR build finished but c2m is not executable: %s\n' "$MIR_C2M" >&2
+    return 1
+  fi
+  return 0
+}
+
+# Print the label used for a tool in result rows: the basename of $1.
+tool_label() {
+  local tool_path="$1"
+  basename "$tool_path"
+}
+
+bench_source_dir() {
+  printf '%s/c-benchmarks' "$MIR_DIR"
+}
+
+# Compile one benchmark to a native executable and keep the best time.
+#   $1 = bench name, $2 = tool label, $3 = compiler command line (split on
+#   whitespace), $4 = optimization level, $5 = source file, $6 = output
+#   executable, $7 = log path prefix.
+# The compile runs $COMPILE_REPEATS times. On success the minimum wall time
+# is left in the global COMPILE_MS and 0 is returned. On the first failing
+# compile a COMPILE_FAIL row is recorded and 1 is returned.
+# For the "cfree" tool the generic extra cflags are replaced by the
+# cfree-specific flags (plus --sysroot when one is configured) and -lc is
+# appended to the libraries.
+#
+# Arrays that may be empty are expanded as ${arr[@]+"${arr[@]}"}: under
+# `set -u`, bash releases before 4.4 treat "${arr[@]}" on an empty array as an
+# unbound variable and abort the script.
+compile_native() {
+  local bench="$1" tool="$2" cc="$3" opt="$4" src="$5" exe="$6" log_base="$7"
+  local best="NA" rep out err
+  local cmd=() cflags=() libs=() cfree_flags=()
+  read -r -a cmd <<<"$cc"
+  read -r -a cflags <<<"$CFLAGS_EXTRA"
+  read -r -a libs <<<"$MATH_LIBS"
+  read -r -a cfree_flags <<<"$CFREE_FLAGS_EXTRA"
+  if [ "$tool" = "cfree" ]; then
+    cflags=(${cfree_flags[@]+"${cfree_flags[@]}"})
+    if [ -n "$CFREE_SYSROOT" ]; then
+      cflags+=(--sysroot "$CFREE_SYSROOT")
+    fi
+    libs+=("-lc")
+  fi
+  for rep in $(seq 1 "$COMPILE_REPEATS"); do
+    out="$log_base.compile.$rep.out"
+    err="$log_base.compile.$rep.err"
+    # Remove any stale binary so a failed compile cannot be run by mistake.
+    rm -f "$exe"
+    run_timed "$out" "$err" "${cmd[@]}" -std=c99 "-O$opt" -I"$(bench_source_dir)" \
+      ${cflags[@]+"${cflags[@]}"} "$src" ${libs[@]+"${libs[@]}"} -o "$exe"
+    if [ "$RUN_RC" -ne 0 ]; then
+      record_row "$bench" "$tool" "$opt" "COMPILE_FAIL" "$RUN_MS" "NA" \
+        "NA" "$RUN_RC" "$err"
+      return 1
+    fi
+    best="$(min_ms "$best" "$RUN_MS")"
+  done
+  COMPILE_MS="$best"
+  return 0
+}
+
+# Run a compiled benchmark executable and record its result row.
+#   $1 = bench name, $2 = tool label, $3 = optimization level, $4 = the
+#   executable, $5 = expected-output file, $6 = argument line (split on
+#   whitespace), $7 = log path prefix.
+# The program runs $RUN_REPEATS times. A non-zero exit records RUN_FAIL and
+# an output mismatch records OUTPUT_FAIL; both return 1 immediately.
+# Otherwise an OK row with the minimum wall time is recorded. The compile
+# column of every row is read from the global COMPILE_MS.
+#
+# The argument array is expanded as ${args[@]+"${args[@]}"} because
+# benchmarks without arguments leave it empty, and under `set -u` bash
+# releases before 4.4 abort on "${args[@]}" for an empty array.
+run_native() {
+  local bench="$1" tool="$2" opt="$3" exe="$4" expect="$5" arg_line="$6" log_base="$7"
+  local best="NA" best_rc=0 rep out err diff_out
+  local args=()
+  read -r -a args <<<"$arg_line"
+  for rep in $(seq 1 "$RUN_REPEATS"); do
+    out="$log_base.run.$rep.out"
+    err="$log_base.run.$rep.err"
+    run_timed "$out" "$err" "$exe" ${args[@]+"${args[@]}"}
+    if [ "$RUN_RC" -ne 0 ]; then
+      record_row "$bench" "$tool" "$opt" "RUN_FAIL" "$COMPILE_MS" "NA" \
+        "$RUN_MS" "$RUN_RC" "$err"
+      return 1
+    fi
+    diff_out="$log_base.run.$rep.diff"
+    if ! check_expected "$expect" "$out" "$diff_out"; then
+      record_row "$bench" "$tool" "$opt" "OUTPUT_FAIL" "$COMPILE_MS" "NA" \
+        "$RUN_MS" "$RUN_RC" "$diff_out"
+      return 1
+    fi
+    best="$(min_ms "$best" "$RUN_MS")"
+    best_rc="$RUN_RC"
+  done
+  record_row "$bench" "$tool" "$opt" "OK" "$COMPILE_MS" "NA" "$best" \
+    "$best_rc" "$log_base"
+}
+
+# Benchmark one native compiler on one benchmark at one optimization level.
+#   $1 = bench name, $2 = tool label, $3 = compiler command line, $4 = opt
+#   level, $5 = source file, $6 = expected-output file, $7 = argument line.
+# Records TOOL_MISSING when the first word of the command line is neither
+# found by `command -v` nor an executable path. Otherwise compiles and, if
+# the compile succeeded, runs the result.
+bench_native_tool() {
+  local bench="$1" tool="$2" cc="$3" opt="$4" src="$5" expect="$6" arg_line="$7"
+  local stem="$tool.O$opt.$bench"
+  local exe="$BIN_DIR/$stem.exe"
+  local log_base="$LOG_DIR/$stem"
+  local words=() program
+  read -r -a words <<<"$cc"
+  program="${words[0]}"
+  if command -v "$program" >/dev/null 2>&1 || [ -x "$program" ]; then
+    # compile_native records its own failure row, so just stop here.
+    compile_native "$bench" "$tool" "$cc" "$opt" "$src" "$exe" "$log_base" || return 0
+    run_native "$bench" "$tool" "$opt" "$exe" "$expect" "$arg_line" "$log_base"
+  else
+    record_row "$bench" "$tool" "$opt" "TOOL_MISSING" "NA" "NA" "NA" \
+      "127" "$cc"
+    return 0
+  fi
+}
+
+bench_mir() {
+  local bench="$1" opt="$2" src="$3" expect="$4" arg_line="$5"
+  local tool="mir-c2m"
+  local bmir="$BIN_DIR/$tool.O$opt.$bench.bmir"
+  local log_base="$LOG_DIR/$tool.O$opt.$bench"
+  local best_run="NA" best_codegen="NA" best_rc=0 rep out err diff_out exec_ms cg_ms
+  local args=()
+  read -r -a args <<<"$arg_line"
+
+  run_timed "$log_base.compile.out" "$log_base.compile.err" \
+    "$MIR_C2M" "-O$opt" -c -I"$(bench_source_dir)" -o "$bmir" "$src"
+  if [ "$RUN_RC" -ne 0 ]; then
+    record_row "$bench" "$tool" "$opt" "COMPILE_FAIL" "$RUN_MS" "NA" \
+      "NA" "$RUN_RC" "$log_base.compile.err"
+    return 0
+  fi
+  COMPILE_MS="$RUN_MS"
+
+  for rep in $(seq 1 "$RUN_REPEATS"); do
+    out="$log_base.run.$rep.out"
+    err="$log_base.run.$rep.err"
+    run_timed "$out" "$err" "$MIR_C2M" -v "-O$opt" "$bmir" -eg "${args[@]}"
+    exec_ms="$(parse_mir_ms 'execution' "$err")"
+    cg_ms="$(parse_mir_ms 'MIR link finish' "$err")"
+    [ -z "$exec_ms" ] && exec_ms="$RUN_MS"
+    [ -z "$cg_ms" ] && cg_ms="NA"
+    if [ "$RUN_RC" -ne 0 ]; then
+      record_row "$bench" "$tool" "$opt" "RUN_FAIL" "$COMPILE_MS" "$cg_ms" \
+        "$exec_ms" "$RUN_RC" "$err"
+      return 0
+    fi
+    diff_out="$log_base.run.$rep.diff"
+    if ! check_expected "$expect" "$out" "$diff_out"; then
+      record_row "$bench" "$tool" "$opt" "OUTPUT_FAIL" "$COMPILE_MS" "$cg_ms" \
+        "$exec_ms" "$RUN_RC" "$diff_out"
+      return 0
+    fi
+    best_run="$(min_ms "$best_run" "$exec_ms")"
+    if [ "$cg_ms" != "NA" ]; then
+      best_codegen="$(min_ms "$best_codegen" "$cg_ms")"
+    fi
+    best_rc="$RUN_RC"
+  done
+  record_row "$bench" "$tool" "$opt" "OK" "$COMPILE_MS" "$best_codegen" \
+    "$best_run" "$best_rc" "$log_base"
+}
+
+# Benchmark `cfree run` (in-process JIT) on one benchmark at one opt level.
+#   $1 = bench name, $2 = opt level, $3 = source file, $4 = expected-output
+#   file, $5 = argument line (split on whitespace).
+# Each of the $RUN_REPEATS runs invokes `cfree run --bench-time` and parses
+# the "cfree-run compile_and_jit" and "cfree-run execution" timings from
+# stderr. When no execution timing is found the run's wall time is used
+# instead. Records TOOL_MISSING when $CFREE is not executable, RUN_FAIL or
+# OUTPUT_FAIL on the first bad run, and otherwise an OK row holding the
+# minimum compile and execution times. Always returns 0 on failure paths.
+#
+# Arrays that may be empty are expanded as ${arr[@]+"${arr[@]}"}: under
+# `set -u`, bash releases before 4.4 treat "${arr[@]}" on an empty array as an
+# unbound variable and abort the script.
+bench_cfree_run() {
+  local bench="$1" opt="$2" src="$3" expect="$4" arg_line="$5"
+  local tool="cfree-run"
+  local log_base="$LOG_DIR/$tool.O$opt.$bench"
+  local best_run="NA" best_compile="NA" best_rc=0 rep out err diff_out cm_ms exec_ms
+  local args=() cfree_flags=() run_flags=() cmd=()
+  read -r -a args <<<"$arg_line"
+  read -r -a cfree_flags <<<"$CFREE_FLAGS_EXTRA"
+  read -r -a run_flags <<<"$CFREE_RUN_FLAGS_EXTRA"
+
+  if [ ! -x "$CFREE" ]; then
+    record_row "$bench" "$tool" "$opt" "TOOL_MISSING" "NA" "NA" "NA" \
+      "127" "$CFREE"
+    return 0
+  fi
+
+  for rep in $(seq 1 "$RUN_REPEATS"); do
+    out="$log_base.run.$rep.out"
+    err="$log_base.run.$rep.err"
+    cmd=("$CFREE" run --bench-time "-O$opt" -I"$(bench_source_dir)")
+    if [ -n "$CFREE_SYSROOT" ]; then
+      cmd+=(--sysroot "$CFREE_SYSROOT")
+    fi
+    # Everything after `--` is passed to the JITed program as its arguments.
+    cmd+=(${cfree_flags[@]+"${cfree_flags[@]}"} ${run_flags[@]+"${run_flags[@]}"} \
+      "$src" -lc -- ${args[@]+"${args[@]}"})
+    run_timed "$out" "$err" "${cmd[@]}"
+    cm_ms="$(parse_mir_ms 'cfree-run compile_and_jit' "$err")"
+    exec_ms="$(parse_mir_ms 'cfree-run execution' "$err")"
+    [ -z "$cm_ms" ] && cm_ms="NA"
+    [ -z "$exec_ms" ] && exec_ms="$RUN_MS"
+    if [ "$RUN_RC" -ne 0 ]; then
+      record_row "$bench" "$tool" "$opt" "RUN_FAIL" "$cm_ms" "NA" \
+        "$exec_ms" "$RUN_RC" "$err"
+      return 0
+    fi
+    diff_out="$log_base.run.$rep.diff"
+    if ! check_expected "$expect" "$out" "$diff_out"; then
+      record_row "$bench" "$tool" "$opt" "OUTPUT_FAIL" "$cm_ms" "NA" \
+        "$exec_ms" "$RUN_RC" "$diff_out"
+      return 0
+    fi
+    if [ "$cm_ms" != "NA" ]; then
+      best_compile="$(min_ms "$best_compile" "$cm_ms")"
+    fi
+    best_run="$(min_ms "$best_run" "$exec_ms")"
+    best_rc="$RUN_RC"
+  done
+  record_row "$bench" "$tool" "$opt" "OK" "$best_compile" "NA" "$best_run" \
+    "$best_rc" "$log_base"
+}
+
+# Generate the markdown summary ($SUMMARY) from the results CSV ($CSV).
+# Runs an inline python3 script, fed on stdin, with three arguments: the CSV
+# path, the summary output path, and the label of the baseline tool (the
+# basename of $GCC). The script counts rows per status and, over rows whose
+# status is OK, reports for every (tool, opt) pair the geomean compile and
+# runtime speed ratios against the baseline tool's -O2 rows, plus the average
+# compile+codegen and runtime milliseconds.
+write_summary() {
+  python3 - "$CSV" "$SUMMARY" "$(tool_label "$GCC")" <<'PY'
+import csv
+import math
+import sys
+from collections import defaultdict
+
+csv_path, out_path, base_tool = sys.argv[1:4]
+with open(csv_path, newline="") as f:
+    rows = list(csv.DictReader(f))
+
+def fnum(v):
+    if v in ("", "NA", None):
+        return None
+    try:
+        return float(v)
+    except ValueError:
+        return None
+
+ok = [r for r in rows if r["status"] == "OK"]
+groups = defaultdict(list)
+for r in ok:
+    groups[(r["tool"], r["opt"])].append(r)
+
+base_runtime = {}
+base_compile = {}
+for r in ok:
+    if r["tool"] == base_tool and r["opt"] == "2":
+        rt = fnum(r["runtime_ms"])
+        cm = fnum(r["compile_ms"])
+        if rt and rt > 0:
+            base_runtime[r["bench"]] = rt
+        if cm and cm > 0:
+            base_compile[r["bench"]] = cm
+
+def geo(xs):
+    xs = [x for x in xs if x and x > 0]
+    if not xs:
+        return "NA"
+    return f"{math.exp(sum(math.log(x) for x in xs) / len(xs)):.3f}"
+
+def avg(xs):
+    xs = [x for x in xs if x is not None]
+    if not xs:
+        return "NA"
+    return f"{sum(xs) / len(xs):.3f}"
+
+status_counts = defaultdict(int)
+for r in rows:
+    status_counts[r["status"]] += 1
+
+lines = []
+lines.append("# OPT Benchmark Summary")
+lines.append("")
+lines.append(f"Base for speed ratios: `{base_tool} -O2`.")
+lines.append("For MIR, `compile_ms` is C-to-binary-MIR time and `codegen_ms` is the JIT link/generation slice reported by `c2m -v`; compile ratios use their sum. `cfree-run` uses `--bench-time`: `compile_ms` is compile+JIT time, and `runtime_ms` is the in-process entry-call execution slice.")
+lines.append("")
+lines.append("## Status")
+lines.append("")
+lines.append("| status | rows |")
+lines.append("| --- | ---: |")
+for k in sorted(status_counts):
+    lines.append(f"| {k} | {status_counts[k]} |")
+lines.append("")
+lines.append("## Geomean Ratios")
+lines.append("")
+lines.append("| tool | opt | ok cases | compile speed vs base | runtime speed vs base | avg compile+codegen ms | avg runtime ms |")
+lines.append("| --- | ---: | ---: | ---: | ---: | ---: | ---: |")
+for key in sorted(groups):
+    vals = groups[key]
+    comp_totals = []
+    run_times = []
+    comp_ratios = []
+    run_ratios = []
+    for r in vals:
+        cm = fnum(r["compile_ms"])
+        cg = fnum(r["codegen_ms"]) or 0.0
+        rt = fnum(r["runtime_ms"])
+        total = None if cm is None else cm + cg
+        comp_totals.append(total)
+        run_times.append(rt)
+        b = r["bench"]
+        if total and b in base_compile:
+            comp_ratios.append(base_compile[b] / total)
+        if rt and b in base_runtime:
+            run_ratios.append(base_runtime[b] / rt)
+    lines.append(
+        f"| {key[0]} | {key[1]} | {len(vals)} | {geo(comp_ratios)} | "
+        f"{geo(run_ratios)} | {avg(comp_totals)} | {avg(run_times)} |"
+    )
+lines.append("")
+lines.append(f"Raw CSV: `{csv_path}`")
+with open(out_path, "w") as f:
+    f.write("\n".join(lines) + "\n")
+PY
+}
+
+# ---- Main flow -------------------------------------------------------------
+# Validate the environment first; each failed precondition exits with 2.
+BENCH_DIR="$(bench_source_dir)"
+if [ ! -d "$BENCH_DIR" ]; then
+  printf 'opt-bench: benchmark directory not found: %s\n' "$BENCH_DIR" >&2
+  exit 2
+fi
+if [ ! -x "$CFREE" ]; then
+  printf 'opt-bench: cfree binary not found: %s\n' "$CFREE" >&2
+  printf 'opt-bench: run `make bin` or set CFREE=/path/to/cfree\n' >&2
+  exit 2
+fi
+ensure_mir || exit 2
+
+# Start a fresh results file only after every precondition holds, so a
+# misconfigured invocation does not wipe the CSV of an earlier run.
+printf 'bench,tool,opt,status,compile_ms,codegen_ms,runtime_ms,exit_code,log\n' >"$CSV"
+
+printf 'opt-bench: output: %s\n' "$OUT_DIR"
+printf 'opt-bench: benches: %s\n' "$BENCHES"
+printf 'opt-bench: levels: %s\n' "$LEVELS"
+printf 'opt-bench: compile repeats=%s run repeats=%s\n' "$COMPILE_REPEATS" "$RUN_REPEATS"
+
+# $BENCHES and $LEVELS are whitespace-separated lists, so they are expanded
+# unquoted on purpose.
+for bench in $BENCHES; do
+  src="$BENCH_DIR/$bench.c"
+  expect="$BENCH_DIR/$bench.expect"
+  arg_line="$(read_arg_file "$BENCH_DIR/$bench.arg")"
+  if [ ! -f "$src" ]; then
+    printf 'opt-bench: skipping missing benchmark source: %s\n' "$src" >&2
+    continue
+  fi
+  printf '+++++ %s %s +++++\n' "$bench" "$arg_line"
+  for opt in $LEVELS; do
+    # Every lane records its own row, including failures, so one broken tool
+    # never stops the remaining measurements.
+    bench_native_tool "$bench" "$(tool_label "$GCC")" "$GCC" "$opt" "$src" "$expect" "$arg_line"
+    bench_native_tool "$bench" "$(tool_label "$CLANG")" "$CLANG" "$opt" "$src" "$expect" "$arg_line"
+    bench_native_tool "$bench" "cfree" "$CFREE cc" "$opt" "$src" "$expect" "$arg_line"
+    bench_cfree_run "$bench" "$opt" "$src" "$expect" "$arg_line"
+    bench_mir "$bench" "$opt" "$src" "$expect" "$arg_line"
+  done
+done
+
+write_summary
+printf 'opt-bench: wrote %s\n' "$CSV"
+printf 'opt-bench: wrote %s\n' "$SUMMARY"

	kit kit
	git clone https://git.ryansepassi.com/git/kit.git
	Log \| Files \| Refs \| README

M	Makefile	\|	5	++++-
M	doc/OPT.md	\|	72	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
M	driver/run.c	\|	116	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
A	scripts/opt_bench.sh	\|	463	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++