commit 68c9946c896959ba7af3e72b6cefdedef6bf6caf
parent b732a5d4604781dd27b372bb604ad6210062b2d3
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Thu, 14 May 2026 17:53:06 -0700
Document run JIT perf and compile metrics
Diffstat:
5 files changed, 351 insertions(+), 1 deletion(-)
diff --git a/doc/PERF.md b/doc/PERF.md
@@ -0,0 +1,284 @@
+# Cfree Run/JIT Performance Notes
+
+This file records early measurements for `cfree run --time -O1` and the
+current hypotheses for reducing time from submit to first execution.
+
+The metrics are emitted through `CfreeEnv.metrics`; `cfree run --time` installs
+a hosted stderr logger in the driver. Core/libcfree remain callback-only and do
+not write files or depend on hosted I/O.
+
+## Current Metric Shape
+
+Top-level `cfree run --time` scopes:
+
+- `run.total`: full driver path after option/input loading through entry return.
+- `run.compile_and_jit`: compile all inputs and build the JIT image.
+- `compile.tu`: one translation unit to an `ObjBuilder`.
+- `compile.frontend`: registered frontend body.
+- `compile.c.*`: C frontend setup, PP option application, parse/codegen, cleanup.
+- `opt.o1.total`: per-function O1 pipeline, nested under C parse/codegen.
+- `link_jit.all`: linker resolve plus JIT image materialization.
+- `link.resolve.*`: archive ingestion, symbol resolution, GC, layout, reloc emit.
+- `jit.*`: reservation, segment copy, relocation patching, protection, icache flush,
+ constructors.
+- `run.jit_lookup` and `run.entry_call`: symbol lookup and execution.
+
+Important counters:
+
+- `compile.input_bytes`
+- `compile.obj_sections`, `compile.obj_relocs`
+- `opt.funcs`, `opt.blocks`, `opt.insts`, `opt.vals`
+- `opt.live_words`, `opt.conflict_bytes`
+- `link.inputs`, `link.sections`, `link.segments`, `link.syms`, `link.relocs`
+- `jit.master_size`, `jit.nsegments`, `jit.input_section_bytes`,
+ `jit.segment_bytes`
+
+## Small Programs
+
+Each program was piped on stdin to `build/cfree run --time -O1 -`, 20 samples
+per program. Times are `min / p50 / p95 / mean / max` in milliseconds.
+
+`int main(){return 0;}`
+
+```text
+run.total 0.423 / 0.448 / 0.506 / 0.520 / 1.889
+run.compile_and_jit 0.395 / 0.416 / 0.473 / 0.469 / 1.427
+compile.tu 0.254 / 0.267 / 0.304 / 0.306 / 1.015
+link_jit.all 0.124 / 0.132 / 0.149 / 0.140 / 0.287
+opt.o1.total 0.100 / 0.106 / 0.121 / 0.108 / 0.128
+link.resolve.total 0.064 / 0.067 / 0.076 / 0.072 / 0.159
+```
+
+One small loop:
+
+```c
+int main(){int s=0; for(int i=0;i<10;i++) s+=i; return s==45?0:1;}
+```
+
+```text
+run.total 0.500 / 0.522 / 0.539 / 0.525 / 0.618
+run.compile_and_jit 0.471 / 0.492 / 0.510 / 0.495 / 0.582
+compile.tu 0.324 / 0.340 / 0.356 / 0.345 / 0.428
+opt.o1.total 0.148 / 0.154 / 0.160 / 0.154 / 0.166
+link_jit.all 0.127 / 0.133 / 0.137 / 0.133 / 0.137
+link.resolve.total 0.064 / 0.066 / 0.068 / 0.066 / 0.069
+```
+
+For tiny programs, compile dominates, with O1 and JIT/link image setup as the
+main visible sub-buckets. Link/JIT is already small, around `0.13 ms` at p50.
+
+## Scaling Programs
+
+Two synthetic ladders were used:
+
+- `straight_main`: N loop-heavy static functions, and `main` directly calls
+ every function. This creates one large straight-line function in addition to
+ N small loop functions.
+- `table_main`: the same N loop-heavy functions, but `main` iterates through a
+ function-pointer table. This keeps `main` compact and isolates the cost of
+ many functions from the cost of one large function.
+
+Samples: 7 or 5 runs per size through 256 functions, 3 runs at 512 and 1024.
+Both tables below use p50 timings in ms. O1 scopes are summed across functions.
+
+### Straight-Line Main
+
+```text
+funcs insts vals conflict_bytes run.total compile.tu opt.o1.total live_pre live_reg regalloc link_jit
+1 74 50 800 0.845 0.641 0.381 0.087 0.084 0.112 0.135
+4 278 188 3008 2.027 1.797 1.279 0.326 0.324 0.421 0.173
+16 1094 740 11840 5.644 5.460 4.088 1.216 1.203 1.528 0.119
+64 4358 2948 56576 22.089 21.817 16.926 5.406 5.365 6.683 0.149
+128 8710 5892 131520 48.249 47.906 38.203 12.782 12.696 15.655 0.170
+256 17414 11780 336704 111.786 111.161 92.228 33.042 32.883 40.054 0.224
+512 34822 23556 968256 289.752 288.507 249.098 95.864 96.069 115.167 0.309
+1024 69638 47108 3116096 877.053 874.755 793.174 325.557 321.390 379.543 0.513
+```
+
+The 512 to 1024 step is the clearest warning:
+
+```text
+compile.tu 288.507 -> 874.755 ms 3.03x
+opt.o1.total 249.098 -> 793.174 ms 3.18x
+opt.live_info.pre_dde 95.864 -> 325.557 ms 3.40x
+opt.live_info.regalloc 96.069 -> 321.390 ms 3.35x
+opt.regalloc 115.167 -> 379.543 ms 3.30x
+```
+
+The associated `opt.conflict_bytes` grows from `968256` to `3116096`, about
+3.2x for a 2x input. That points directly at dense liveness/conflict data as
+the superlinear path.
+
+### Function-Table Main
+
+```text
+funcs insts vals conflict_bytes run.total compile.tu opt.o1.total live_pre live_reg regalloc link_jit
+1 94 60 960 0.895 0.692 0.404 0.095 0.092 0.125 0.141
+4 286 189 3024 1.902 1.692 1.178 0.314 0.309 0.401 0.148
+16 1054 705 11280 5.506 5.315 3.940 1.146 1.126 1.433 0.120
+64 4126 2769 44304 19.621 19.353 14.632 4.411 4.394 5.538 0.152
+128 8222 5521 88336 38.127 37.795 28.668 8.718 8.699 10.934 0.169
+256 16414 11025 176400 75.585 75.103 57.207 17.368 17.411 21.901 0.215
+512 32798 22033 352528 151.642 150.752 114.097 34.786 34.678 43.643 0.289
+1024 65566 44049 704784 303.295 301.632 226.238 68.497 68.306 86.028 0.450
+```
+
+Here the 512 to 1024 step is essentially linear:
+
+```text
+compile.tu 150.752 -> 301.632 ms 2.00x
+opt.o1.total 114.097 -> 226.238 ms 1.98x
+opt.live_info.pre_dde 34.786 -> 68.497 ms 1.97x
+opt.live_info.regalloc 34.678 -> 68.306 ms 1.97x
+opt.regalloc 43.643 -> 86.028 ms 1.97x
+```
+
+This indicates that "many functions" is not the current scaling problem. The
+problem is a single large function whose liveness/conflict structures grow
+large enough that their cost scales superlinearly.
+
+## Compile Perf
+
+`compile.tu` is now split into generic and C-specific scopes:
+
+```text
+compile.tu
+ compile.input_bytes
+ compile.frontend
+ compile.c.setup
+ compile.c.pool_new
+ compile.c.lex_open
+ compile.c.pp_new
+ compile.c.cg_new
+ compile.c.decl_new
+ compile.c.pp_options
+ compile.c.pp_push_input
+ compile.c.parse_codegen
+ opt.o1.total ...
+ compile.c.cleanup
+ compile.obj_finalize
+ compile.obj_sections
+ compile.obj_relocs
+```
+
+Updated 1024-function single-run examples with the detailed compile scopes:
+
+```text
+metric straight_main table_main
+run.total 828.387 ms 314.678 ms
+compile.tu 825.154 ms 309.944 ms
+compile.frontend 825.129 ms 309.825 ms
+compile.c.setup 0.156 ms 0.946 ms
+compile.c.pp_options 0.021 ms 0.021 ms
+compile.c.parse_codegen 824.900 ms 308.676 ms
+compile.c.cleanup 0.026 ms 0.015 ms
+compile.obj_finalize 0.000 ms 0.001 ms
+opt.o1.total summed 746.533 ms 231.538 ms
+```
+
+Subtracting summed O1 from `compile.tu` leaves about `78 ms` in both variants:
+
+```text
+straight_main: 825.154 - 746.533 = 78.621 ms
+table_main: 309.944 - 231.538 = 78.406 ms
+```
+
+That residual is the current frontend/PP/parser/CG API cost for roughly
+330 KiB of generated C. It appears linear with input size and number of small
+functions in these tests. The setup, PP option application, cleanup, and object
+finalization scopes are negligible. The opaque part that remains is inside
+`compile.c.parse_codegen`, which streams PP tokens, parses declarations and
+statements, performs type work, drives the CG API, and triggers per-function O1
+when function bodies end.
+
+### Compile Scope Scaling
+
+After adding the detailed `compile.tu` scopes, the scale ladders were rerun.
+The tables below use p50 times in milliseconds. `opt.o1.total` is summed across
+functions, and `non_o1_residual` is `compile.tu - opt.o1.total`.
+
+Straight-line main:
+
+```text
+funcs input_bytes compile.tu parse_codegen opt.o1.total non_o1_residual
+1 383 0.743 0.521 0.383 0.360
+4 1338 1.748 1.519 1.148 0.600
+16 5193 5.485 5.264 4.026 1.459
+64 20697 21.657 21.447 16.719 4.938
+128 41599 46.748 46.530 37.164 9.584
+256 83839 109.467 109.241 90.519 18.948
+512 168319 281.974 281.750 244.471 37.503
+1024 337481 824.834 824.586 746.447 78.387
+```
+
+Function-table main:
+
+```text
+funcs input_bytes compile.tu parse_codegen opt.o1.total non_o1_residual
+1 504 0.783 0.563 0.402 0.381
+4 1429 1.781 1.561 1.171 0.610
+16 5166 5.272 5.062 3.839 1.433
+64 20190 19.208 18.990 14.467 4.741
+128 40454 38.143 37.929 28.862 9.281
+256 81414 75.035 74.816 56.935 18.100
+512 163334 150.504 150.275 114.101 36.403
+1024 327378 304.716 304.492 227.790 76.926
+```
+
+The detailed compile scopes show:
+
+- `compile.frontend` is effectively all of `compile.tu`; object finalization is
+ below measurable resolution in these runs.
+- `compile.c.parse_codegen` is effectively all of `compile.frontend`.
+- `compile.c.setup` stays flat around `0.14 ms`; most of that is `pp_new`
+ registering predefined macros.
+- `compile.c.pp_options` stays around `0.02 ms` with no `-I`, `-D`, or `-U`
+ options.
+- `compile.c.cleanup` grows only to about `0.01-0.02 ms`.
+- `non_o1_residual` is near-linear with source size at larger sizes, around
+ `228-244 us/KiB` from 64 through 1024 functions.
+
+So the compile-side scaling story is two-layered: the broad C parse/codegen
+bucket scales linearly with input size after startup overhead, while the nested
+O1 liveness/regalloc work can dominate that bucket and become superlinear for a
+single large function.
+
+Next useful compile instrumentation:
+
+- Count tokens consumed by `pp_next`.
+- Count top-level declarations and function definitions in `parse_c`.
+- Split parse/codegen around function bodies, global declarations, and data
+ emission.
+- Add CG API counters for emitted ops, stack/value operations, local slots, and
+ object writes.
+- Add pool/arena allocation counters or high-water marks for frontend and CG
+ arenas.
+
+## Performance Priorities
+
+1. Make `opt_live_info` cheaper for large functions.
+ The same liveness computation currently runs before DDE and again inside
+ regalloc. The measurements show both passes growing superlinearly on the
+ large straight-line function.
+
+2. Replace or narrow dense conflict structures.
+ `opt.conflict_bytes` tracks the observed curve closely. Investigate sparse
+ sets, segmented bitsets, per-block live sets, or interval-style structures
+ that avoid touching full dense rows for values that cannot overlap.
+
+3. Avoid whole-function contiguous growth where hot passes scan repeatedly.
+ Large `Val`/block/instruction arrays and dense bit matrices are likely to
+ hurt cache locality. Segmented arrays may help by reducing large copies and
+ stabilizing addresses, but pass algorithms need to avoid turning segment
+ traversal into extra indirection in inner loops.
+
+4. Keep link/JIT lower priority for now.
+ Even at 1024 functions, `link_jit.all` is around `0.5 ms`; copy, reloc, and
+ icache flush are small and roughly linear. They are not the bottleneck for
+ submit-to-execution latency in these tests.
+
+5. Split `compile.c.parse_codegen` further before changing parser/CG data
+ structures.
+ The non-O1 compile residual is meaningful at large source sizes, but it is
+ currently broad. More counters are needed before choosing parser, PP, type,
+ or CG API changes.
diff --git a/include/cfree/frontend.h b/include/cfree/frontend.h
@@ -64,6 +64,14 @@ int cfree_source_file(CfreeCompiler*, uint32_t file_id, CfreeSourceFile* out);
typedef int (*CfreeFrontendRunFn)(CfreeCompiler*, void* user);
int cfree_frontend_run(CfreeCompiler*, CfreeFrontendRunFn, void* user);
+/* Optional metrics bridge for frontends. These are no-ops unless the host
+ * supplied CfreeEnv.metrics. Frontends use this public shim instead of
+ * depending on libcfree's internal core headers. */
+void cfree_frontend_metrics_scope_begin(CfreeCompiler*, const char* name);
+void cfree_frontend_metrics_scope_end(CfreeCompiler*, const char* name);
+void cfree_frontend_metrics_count(CfreeCompiler*, const char* name,
+ uint64_t value);
+
_Noreturn void cfree_frontend_fatal(CfreeCompiler*, CfreeSrcLoc,
const char* fmt, ...);
_Noreturn void cfree_frontend_vfatal(CfreeCompiler*, CfreeSrcLoc,
diff --git a/lang/c/c.c b/lang/c/c.c
@@ -182,25 +182,51 @@ int cfree_c_compile(CfreeCompiler* c, const CfreeCompileOptions* opts,
DeclTable* decls;
CfreeCg* cg;
+ cfree_frontend_metrics_scope_begin(c, "compile.c.setup");
+ cfree_frontend_metrics_scope_begin(c, "compile.c.pool_new");
pool = c_pool_new(c);
+ cfree_frontend_metrics_scope_end(c, "compile.c.pool_new");
if (!pool) compiler_panic(c, c_no_loc(), "C compiler out of memory");
+ cfree_frontend_metrics_scope_begin(c, "compile.c.lex_open");
lex = lex_open_mem(c, input->name, (const char*)input->data, input->len);
+ cfree_frontend_metrics_scope_end(c, "compile.c.lex_open");
+ cfree_frontend_metrics_scope_begin(c, "compile.c.pp_new");
pp = pp_new(c);
+ cfree_frontend_metrics_scope_end(c, "compile.c.pp_new");
+ cfree_frontend_metrics_scope_begin(c, "compile.c.cg_new");
cg = cfree_cg_new(c, out, opts);
+ cfree_frontend_metrics_scope_end(c, "compile.c.cg_new");
if (!lex || !pp || !cg)
compiler_panic(c, c_no_loc(), "C compiler out of memory");
(void)out;
+ cfree_frontend_metrics_scope_begin(c, "compile.c.decl_new");
decls = decl_new(c, cg);
-
+ cfree_frontend_metrics_scope_end(c, "compile.c.decl_new");
+ cfree_frontend_metrics_scope_end(c, "compile.c.setup");
+
+ cfree_frontend_metrics_scope_begin(c, "compile.c.pp_options");
+ cfree_frontend_metrics_count(c, "compile.c.pp_include_dirs",
+ opts->pp.ninclude_dirs);
+ cfree_frontend_metrics_count(c, "compile.c.pp_system_include_dirs",
+ opts->pp.nsystem_include_dirs);
+ cfree_frontend_metrics_count(c, "compile.c.pp_defines", opts->pp.ndefines);
+ cfree_frontend_metrics_count(c, "compile.c.pp_undefines", opts->pp.nundefines);
c_apply_pp_options(pp, &opts->pp);
+ cfree_frontend_metrics_scope_end(c, "compile.c.pp_options");
+ cfree_frontend_metrics_scope_begin(c, "compile.c.pp_push_input");
pp_push_input(pp, lex);
+ cfree_frontend_metrics_scope_end(c, "compile.c.pp_push_input");
+ cfree_frontend_metrics_scope_begin(c, "compile.c.parse_codegen");
parse_c(c, pool, pp, decls, cg);
+ cfree_frontend_metrics_scope_end(c, "compile.c.parse_codegen");
+ cfree_frontend_metrics_scope_begin(c, "compile.c.cleanup");
cfree_cg_free(cg);
decl_free(decls);
pp_free(pp);
c_pool_free(pool);
+ cfree_frontend_metrics_scope_end(c, "compile.c.cleanup");
return 0;
}
diff --git a/src/api/frontend.c b/src/api/frontend.c
@@ -5,6 +5,7 @@
#include "core/arena.h"
#include "core/core.h"
+#include "core/metrics.h"
#include "core/pool.h"
struct CfreeArena {
@@ -120,6 +121,19 @@ int cfree_frontend_run(CfreeCompiler* c, CfreeFrontendRunFn fn, void* user) {
return rc;
}
+void cfree_frontend_metrics_scope_begin(CfreeCompiler* c, const char* name) {
+ metrics_scope_begin((Compiler*)c, name);
+}
+
+void cfree_frontend_metrics_scope_end(CfreeCompiler* c, const char* name) {
+ metrics_scope_end((Compiler*)c, name);
+}
+
+void cfree_frontend_metrics_count(CfreeCompiler* c, const char* name,
+ uint64_t value) {
+ metrics_count((Compiler*)c, name, (u64)value);
+}
+
void cfree_frontend_fatal(CfreeCompiler* c, CfreeSrcLoc loc, const char* fmt,
...) {
va_list ap;
diff --git a/src/api/pipeline.c b/src/api/pipeline.c
@@ -60,20 +60,38 @@ static void compile_into(Compiler* c, const CfreeCompileOptions* opts,
frontend = c->frontends[input->lang];
}
if (frontend) {
+ metrics_scope_begin(c, "compile.frontend");
if (frontend(c, opts, input, ob) != 0) {
compiler_panic(c, no_loc(), "frontend failed for input: %s", input->name);
}
+ metrics_scope_end(c, "compile.frontend");
+ metrics_scope_begin(c, "compile.obj_finalize");
obj_finalize(ob);
+ metrics_scope_end(c, "compile.obj_finalize");
+ metrics_count(c, "compile.obj_sections", obj_section_count(ob));
+ metrics_count(c, "compile.obj_relocs", obj_reloc_total(ob));
return;
}
if (input->lang == CFREE_LANG_ASM) {
+ metrics_scope_begin(c, "compile.asm.lex_open");
lex = asm_lex_open_mem(c, input->name, (const char*)input->data, input->len);
+ metrics_scope_end(c, "compile.asm.lex_open");
+ metrics_scope_begin(c, "compile.asm.mc_new");
mc = mc_new(c, ob);
+ metrics_scope_end(c, "compile.asm.mc_new");
/* Asm-irrelevant fields on opts (pp, opt_level) are ignored. */
+ metrics_scope_begin(c, "compile.asm.parse");
asm_parse(c, lex, mc);
+ metrics_scope_end(c, "compile.asm.parse");
+ metrics_scope_begin(c, "compile.obj_finalize");
obj_finalize(ob);
+ metrics_scope_end(c, "compile.obj_finalize");
+ metrics_count(c, "compile.obj_sections", obj_section_count(ob));
+ metrics_count(c, "compile.obj_relocs", obj_reloc_total(ob));
+ metrics_scope_begin(c, "compile.asm.mc_free");
mc_free(mc);
+ metrics_scope_end(c, "compile.asm.mc_free");
/* The assembler owns the lexer it was handed; no pp_free release. */
return;
}