kit

kit
git clone https://git.ryansepassi.com/git/kit.git
Log | Files | Refs | README

commit a28452e0597151ea9bb31eaf7c220e4ee5ee801c
parent 28f717c79bf761e9f3bcf3b33276733f67a37347
Author: Ryan Sepassi <rsepassi@gmail.com>
Date:   Fri, 22 May 2026 08:29:54 -0700

Fix O2 matrix and Darwin math blockers

Diffstat:
M doc/OPT.md | 43 ++++++++++++++++++++++++++++++++++++++++---
M lang/c/parse/parse.c | 10 ++++++++++
M lang/c/parse/parse_expr.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
M lang/c/parse/parse_priv.h | 10 ++++++++++
M lang/c/parse/parse_type.c | 3 ++-
M lang/c/pp/pp.c | 26 ++++++++++++++++++++++++++
M src/opt/ir.h | 2 ++
M src/opt/pass_coalesce.c | 1 +
M src/opt/pass_ssa.c | 1 +
A test/parse/cases/builtin_28_fabs_inf.c | 12 ++++++++++++
A test/parse/cases/builtin_28_fabs_inf.expected | 1 +
A test/parse/cases/float16_01_decl.c | 7 +++++++
A test/parse/cases/float16_01_decl.expected | 1 +
A test/parse/cases/o2_loop_postinc_store.c | 16 ++++++++++++++++
A test/parse/cases/o2_loop_postinc_store.expected | 1 +
15 files changed, 218 insertions(+), 4 deletions(-)

diff --git a/doc/OPT.md b/doc/OPT.md @@ -1223,12 +1223,49 @@ Initial representative run, 2026-05-22: Immediate benchmark blockers: -- Fix the `matrix -O2` wrong-code regression before trusting O2 timing. -- Add or model the hosted math builtins needed by Darwin `math.h`, starting - with `__builtin_fabsf`. +- [x] Fix the `matrix -O2` wrong-code regression before trusting O2 timing. +- [x] Add or model the hosted math builtins needed by Darwin `math.h`, + including `__builtin_fabsf`, `__builtin_inf*`, `__builtin_huge_val*`, + `_Float16` declarations, and the float/double predefined limit macros used + by the system header. - Re-run the full MIR benchmark set after those blockers, then increase repeat counts for stable numbers. +Follow-up representative run after cfree blocker fixes, 2026-05-22: + +- Same 8-benchmark scope, levels `0 1 2`, one compile repeat, and one run + repeat. +- Coverage: 120 data rows; 111 `OK`, 9 `COMPILE_FAIL`, 0 `RUN_FAIL`, and + 0 `OUTPUT_FAIL`. +- All `cfree` and `cfree-run` rows completed successfully, including + `matrix -O2`, `binary-trees`, `nbody`, and `spectral-norm`. The remaining + `COMPILE_FAIL` rows are all `mir-c2m` on the hosted/math benchmarks. +- Updated runtime geomean versus `gcc-15 -O2` on the representative scope: + `cfree-run` measured `0.363x` at `-O0`, `0.519x` at `-O1`, and `0.481x` + at `-O2`; MIR measured `0.550x`, `0.574x`, and `0.786x` on its five + completed cases. `clang -O2` measured `1.072x`. + +Focused rerun of the formerly broken cfree rows, 2026-05-22: + +- Scope: `matrix`, `binary-trees`, `nbody`, and `spectral-norm`; levels + `0 1 2`; one compile repeat and one run repeat. Output was written to + `build/bench/opt/rerun-broken-cfree/results.csv`. +- All `cfree` and `cfree-run` rows were `OK`. 
+- Geomean speed ratios use each benchmark's `gcc-15 -O2` row as `1.0x` for + both compile time and run time: + +| tool | opt | compile speed | runtime speed | +| --- | ---: | ---: | ---: | +| `cfree` | `O0` | `7.050x` | `0.319x` | +| `cfree` | `O1` | `7.062x` | `0.534x` | +| `cfree` | `O2` | `6.831x` | `0.483x` | +| `cfree-run` | `O0` | `13.709x` | `0.323x` | +| `cfree-run` | `O1` | `14.286x` | `0.550x` | +| `cfree-run` | `O2` | `11.791x` | `0.512x` | + +- On this focused scope, `O1` currently has the best cfree runtime geomean; + `O2` remains correct but is slower than `O1` on the measured mix. + Target: - `-O1` should be the fast optimized tier and materially faster to compile diff --git a/lang/c/parse/parse.c b/lang/c/parse/parse.c @@ -39,6 +39,7 @@ static const char* const kw_names[KW_COUNT] = { "enum", "extern", "float", + "_Float16", "for", "goto", "if", @@ -1495,6 +1496,15 @@ void parse_c(Compiler* c, Pool* pool, Pp* pp, DeclTable* decls, CG* cg) { p.sym_b_memset = pool_intern_cstr(p.pool, "__builtin_memset"); p.sym_b_clear_cache = pool_intern_cstr(p.pool, "__builtin___clear_cache"); p.sym_b_isnan = pool_intern_cstr(p.pool, "__builtin_isnan"); + p.sym_b_fabs = pool_intern_cstr(p.pool, "__builtin_fabs"); + p.sym_b_fabsf = pool_intern_cstr(p.pool, "__builtin_fabsf"); + p.sym_b_fabsl = pool_intern_cstr(p.pool, "__builtin_fabsl"); + p.sym_b_inf = pool_intern_cstr(p.pool, "__builtin_inf"); + p.sym_b_inff = pool_intern_cstr(p.pool, "__builtin_inff"); + p.sym_b_infl = pool_intern_cstr(p.pool, "__builtin_infl"); + p.sym_b_huge_val = pool_intern_cstr(p.pool, "__builtin_huge_val"); + p.sym_b_huge_valf = pool_intern_cstr(p.pool, "__builtin_huge_valf"); + p.sym_b_huge_vall = pool_intern_cstr(p.pool, "__builtin_huge_vall"); p.sym_func = pool_intern_cstr(p.pool, "__func__"); p.sym_func_gcc = pool_intern_cstr(p.pool, "__FUNCTION__"); p.sym_pretty_func_gcc = pool_intern_cstr(p.pool, "__PRETTY_FUNCTION__"); diff --git a/lang/c/parse/parse_expr.c b/lang/c/parse/parse_expr.c 
@@ -1464,6 +1464,92 @@ static int parse_builtin_isnan_call(Parser* p, Sym name, SrcLoc loc) { return 1; } +static const Type* builtin_math_fp_type(Parser* p, Sym name) { + if (name == p->sym_b_fabsf || name == p->sym_b_inff || + name == p->sym_b_huge_valf) { + return type_prim(p->pool, TY_FLOAT); + } + if (name == p->sym_b_fabsl || name == p->sym_b_infl || + name == p->sym_b_huge_vall) { + return type_prim(p->pool, TY_LDOUBLE); + } + return type_prim(p->pool, TY_DOUBLE); +} + +static int parse_builtin_inf_call(Parser* p, Sym name, SrcLoc loc) { + const Type* ty; + if (name != p->sym_b_inf && name != p->sym_b_inff && + name != p->sym_b_infl && name != p->sym_b_huge_val && + name != p->sym_b_huge_valf && name != p->sym_b_huge_vall) { + return 0; + } + + ty = builtin_math_fp_type(p, name); + advance(p); /* IDENT */ + expect_punct(p, '(', "'(' after floating builtin"); + expect_punct(p, ')', "')' after floating builtin"); + cg_set_loc(p->cg, loc); + cg_push_float(p->cg, __builtin_inf(), ty); + return 1; +} + +static int parse_builtin_fabs_call(Parser* p, Sym name, SrcLoc loc) { + const Type* ty; + FrameSlot slot; + CGLabel L_nonneg; + CGLabel L_nonzero; + if (name != p->sym_b_fabs && name != p->sym_b_fabsf && + name != p->sym_b_fabsl) { + return 0; + } + + ty = builtin_math_fp_type(p, name); + advance(p); /* IDENT */ + expect_punct(p, '(', "'(' after __builtin_fabs"); + parse_assign_expr(p); + to_rvalue(p); + if (!type_is_fp(cg_top_type(p->cg))) { + perr(p, "__builtin_fabs argument must have floating type"); + } + coerce_top_to_type(p, ty); + expect_punct(p, ')', "')' after __builtin_fabs"); + + slot = builtin_tmp_slot(p, ty); + cg_push_local_typed(p->cg, slot, ty); + cg_swap(p->cg); + cg_store(p->cg); + cg_drop(p->cg); + + cg_set_loc(p->cg, loc); + cg_push_local_typed(p->cg, slot, ty); + cg_load(p->cg); + cg_push_float(p->cg, 0.0, ty); + cg_cmp(p->cg, CMP_LT_F); + L_nonneg = cg_label_new(p->cg); + cg_branch_false(p->cg, L_nonneg); + cg_push_local_typed(p->cg, slot, 
ty); + cg_push_local_typed(p->cg, slot, ty); + cg_load(p->cg); + cg_unop(p->cg, UO_NEG); + cg_store(p->cg); + cg_drop(p->cg); + cg_label_place(p->cg, L_nonneg); + cg_push_local_typed(p->cg, slot, ty); + cg_load(p->cg); + cg_push_float(p->cg, 0.0, ty); + cg_cmp(p->cg, CMP_EQ); + L_nonzero = cg_label_new(p->cg); + cg_branch_false(p->cg, L_nonzero); + cg_push_local_typed(p->cg, slot, ty); + cg_push_float(p->cg, 0.0, ty); + cg_store(p->cg); + cg_drop(p->cg); + cg_label_place(p->cg, L_nonzero); + cg_push_local_typed(p->cg, slot, ty); + cg_load(p->cg); + return 1; +} + static int try_parse_builtin_call(Parser* p) { Sym name = p->cur.v.ident; SrcLoc loc = p->cur.loc; @@ -1475,6 +1561,8 @@ static int try_parse_builtin_call(Parser* p) { if (parse_builtin_overflow_call(p, name, loc)) return 1; if (parse_builtin_isnan_call(p, name, loc)) return 1; + if (parse_builtin_inf_call(p, name, loc)) return 1; + if (parse_builtin_fabs_call(p, name, loc)) return 1; if (parse_builtin_clear_cache_call(p, name, loc)) return 1; if (name != p->sym_b_alloca && name != p->sym_b_ctz && diff --git a/lang/c/parse/parse_priv.h b/lang/c/parse/parse_priv.h @@ -37,6 +37,7 @@ typedef enum CKw { KW_ENUM, KW_EXTERN, KW_FLOAT, + KW_FLOAT16, /* _Float16 */ KW_FOR, KW_GOTO, KW_IF, @@ -223,6 +224,15 @@ typedef struct Parser { Sym sym_b_memset; Sym sym_b_clear_cache; Sym sym_b_isnan; + Sym sym_b_fabs; + Sym sym_b_fabsf; + Sym sym_b_fabsl; + Sym sym_b_inf; + Sym sym_b_inff; + Sym sym_b_infl; + Sym sym_b_huge_val; + Sym sym_b_huge_valf; + Sym sym_b_huge_vall; Sym sym_func; /* __func__ */ Sym sym_func_gcc; /* __FUNCTION__ */ Sym sym_pretty_func_gcc; /* __PRETTY_FUNCTION__ */ diff --git a/lang/c/parse/parse_type.c b/lang/c/parse/parse_type.c @@ -644,7 +644,7 @@ int parse_decl_specs(Parser* p, DeclSpecs* out) { acc.saw_explicit_type = 1; advance(p); seen = 1; - } else if (is_kw(p, &t, KW_FLOAT)) { + } else if (is_kw(p, &t, KW_FLOAT) || is_kw(p, &t, KW_FLOAT16)) { acc.saw_float = 1; acc.saw_explicit_type = 1; 
advance(p); @@ -1161,6 +1161,7 @@ int starts_type_name(const Parser* p, const Tok* t) { case KW_INT: case KW_LONG: case KW_FLOAT: + case KW_FLOAT16: case KW_DOUBLE: case KW_SIGNED: case KW_UNSIGNED: diff --git a/lang/c/pp/pp.c b/lang/c/pp/pp.c @@ -495,6 +495,32 @@ static void pp_register_target_predefined(Pp* pp) { pp_define(pp, "__ATOMIC_POINTER_LOCK_FREE", "2"); pp_define(pp, "__FLT_EVAL_METHOD__", "0"); + pp_define(pp, "__FLT_HAS_DENORM__", "1"); + pp_define(pp, "__FLT_MANT_DIG__", "24"); + pp_define(pp, "__FLT_DECIMAL_DIG__", "9"); + pp_define(pp, "__FLT_DIG__", "6"); + pp_define(pp, "__FLT_MIN_EXP__", "(-125)"); + pp_define(pp, "__FLT_MIN_10_EXP__", "(-37)"); + pp_define(pp, "__FLT_MAX_EXP__", "128"); + pp_define(pp, "__FLT_MAX_10_EXP__", "38"); + pp_define(pp, "__FLT_MAX__", "0x1.fffffep+127F"); + pp_define(pp, "__FLT_EPSILON__", "0x1p-23F"); + pp_define(pp, "__FLT_MIN__", "0x1p-126F"); + pp_define(pp, "__FLT_DENORM_MIN__", "0x1p-149F"); + + pp_define(pp, "__DBL_HAS_DENORM__", "1"); + pp_define(pp, "__DBL_MANT_DIG__", "53"); + pp_define(pp, "__DBL_DECIMAL_DIG__", "17"); + pp_define(pp, "__DBL_DIG__", "15"); + pp_define(pp, "__DBL_MIN_EXP__", "(-1021)"); + pp_define(pp, "__DBL_MIN_10_EXP__", "(-307)"); + pp_define(pp, "__DBL_MAX_EXP__", "1024"); + pp_define(pp, "__DBL_MAX_10_EXP__", "308"); + pp_define(pp, "__DBL_MAX__", "0x1.fffffffffffffp+1023"); + pp_define(pp, "__DBL_EPSILON__", "0x1p-52"); + pp_define(pp, "__DBL_MIN__", "0x1p-1022"); + pp_define(pp, "__DBL_DENORM_MIN__", "0x1p-1074"); + /* RV64 long double = double per the locked decision (matches RV64 * musl/glibc default). Only aarch64-linux still gets binary128 * long double. */ diff --git a/src/opt/ir.h b/src/opt/ir.h @@ -157,6 +157,8 @@ typedef struct IRPhiAux { u32 reg_id; /* 0 if not from mutable-pseudo SSA; else original Reg id */ } IRPhiAux; +#define IRF_NO_COALESCE (1u << 0) + /* IR_CALL aux. 
The CGTarget interface is rich enough that we keep the * full descriptor for replay; SSA passes inspect args/results in their * Val form via the CGABIValue.storage.v.reg fields where applicable. */ diff --git a/src/opt/pass_coalesce.c b/src/opt/pass_coalesce.c @@ -184,6 +184,7 @@ static int move_cmp(const void* va, const void* vb) { static int collect_move(Func* f, const OptLiveRangeSet* ranges, Inst* in, u32 block, CoalesceMove* out) { if ((IROp)in->op != IR_COPY || in->nopnds < 2) return 0; + if (in->flags & IRF_NO_COALESCE) return 0; if (in->opnds[0].kind != OPK_REG || in->opnds[1].kind != OPK_REG) return 0; PReg dst = (PReg)in->opnds[0].v.reg; PReg src = (PReg)in->opnds[1].v.reg; diff --git a/src/opt/pass_ssa.c b/src/opt/pass_ssa.c @@ -814,6 +814,7 @@ static Inst make_copy_inst(Func* f, Val dst, Val src, CfreeCgTypeId ty, Inst in; memset(&in, 0, sizeof in); in.op = IR_COPY; + in.flags = IRF_NO_COALESCE; ir_assign_inst_id(f, &in); in.type = ty; in.def = dst; diff --git a/test/parse/cases/builtin_28_fabs_inf.c b/test/parse/cases/builtin_28_fabs_inf.c @@ -0,0 +1,12 @@ +int test_main(void) { + float f = -1.25f; + double d = -2.5; + + if (__builtin_fabsf(f) != 1.25f) return 1; + if (__builtin_fabs(d) != 2.5) return 2; + if (__builtin_inff() <= 0.0f) return 3; + if (__builtin_fabsf(-__builtin_inff()) != __builtin_inff()) return 4; + if (__builtin_huge_val() != __builtin_inf()) return 5; + if (1.0 / __builtin_fabs(-0.0) < 0.0) return 6; + return 42; +} diff --git a/test/parse/cases/builtin_28_fabs_inf.expected b/test/parse/cases/builtin_28_fabs_inf.expected @@ -0,0 +1 @@ +42 diff --git a/test/parse/cases/float16_01_decl.c b/test/parse/cases/float16_01_decl.c @@ -0,0 +1,7 @@ +extern _Float16 __fabsf16(_Float16); + +int test_main(void) { + _Float16 x = 1.0f; + _Float16* px = &x; + return sizeof(_Float16) == sizeof(float) && px ? 
42 : 1; +} diff --git a/test/parse/cases/float16_01_decl.expected b/test/parse/cases/float16_01_decl.expected @@ -0,0 +1 @@ +42 diff --git a/test/parse/cases/o2_loop_postinc_store.c b/test/parse/cases/o2_loop_postinc_store.c @@ -0,0 +1,16 @@ +int test_main(void) { + int m[4]; + int i; + int count = 1; + + for (i = 0; i < 4; i++) { + m[i] = count++; + } + + if (m[0] != 1) return 1; + if (m[1] != 2) return 2; + if (m[2] != 3) return 3; + if (m[3] != 4) return 4; + if (count != 5) return 5; + return 42; +} diff --git a/test/parse/cases/o2_loop_postinc_store.expected b/test/parse/cases/o2_loop_postinc_store.expected @@ -0,0 +1 @@ +42