commit a28452e0597151ea9bb31eaf7c220e4ee5ee801c
parent 28f717c79bf761e9f3bcf3b33276733f67a37347
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 22 May 2026 08:29:54 -0700
Fix O2 matrix and Darwin math blockers
Diffstat:
15 files changed, 218 insertions(+), 4 deletions(-)
diff --git a/doc/OPT.md b/doc/OPT.md
@@ -1223,12 +1223,49 @@ Initial representative run, 2026-05-22:
Immediate benchmark blockers:
-- Fix the `matrix -O2` wrong-code regression before trusting O2 timing.
-- Add or model the hosted math builtins needed by Darwin `math.h`, starting
- with `__builtin_fabsf`.
+- [x] Fix the `matrix -O2` wrong-code regression before trusting O2 timing.
+- [x] Add or model the hosted math builtins needed by Darwin `math.h`,
+ including `__builtin_fabsf`, `__builtin_inf*`, `__builtin_huge_val*`,
+ `_Float16` declarations, and the float/double predefined limit macros used
+ by the system header.
- Re-run the full MIR benchmark set after those blockers, then increase repeat
counts for stable numbers.
+Follow-up representative run after cfree blocker fixes, 2026-05-22:
+
+- Same 8-benchmark scope, levels `0 1 2`, one compile repeat, and one run
+ repeat.
+- Coverage: 120 data rows; 111 `OK`, 9 `COMPILE_FAIL`, 0 `RUN_FAIL`, and
+ 0 `OUTPUT_FAIL`.
+- All `cfree` and `cfree-run` rows completed successfully, including
+ `matrix -O2`, `binary-trees`, `nbody`, and `spectral-norm`. The remaining
+ `COMPILE_FAIL` rows are all `mir-c2m` on the hosted/math benchmarks.
+- Updated runtime geomean versus `gcc-15 -O2` on the representative scope:
+ `cfree-run` measured `0.363x` at `-O0`, `0.519x` at `-O1`, and `0.481x`
+ at `-O2`; MIR measured `0.550x`, `0.574x`, and `0.786x` on its five
+ completed cases. `clang -O2` measured `1.072x`.
+
+Focused rerun of the formerly broken cfree rows, 2026-05-22:
+
+- Scope: `matrix`, `binary-trees`, `nbody`, and `spectral-norm`; levels
+ `0 1 2`; one compile repeat and one run repeat. Output was written to
+ `build/bench/opt/rerun-broken-cfree/results.csv`.
+- All `cfree` and `cfree-run` rows were `OK`.
+- Geomean speed ratios use each benchmark's `gcc-15 -O2` row as `1.0x` for
+ both compile time and run time:
+
+| tool | opt | compile speed | runtime speed |
+| --- | ---: | ---: | ---: |
+| `cfree` | `O0` | `7.050x` | `0.319x` |
+| `cfree` | `O1` | `7.062x` | `0.534x` |
+| `cfree` | `O2` | `6.831x` | `0.483x` |
+| `cfree-run` | `O0` | `13.709x` | `0.323x` |
+| `cfree-run` | `O1` | `14.286x` | `0.550x` |
+| `cfree-run` | `O2` | `11.791x` | `0.512x` |
+
+- On this focused scope, `O1` currently has the best cfree runtime geomean;
+ `O2` remains correct but is slower than `O1` on the measured mix.
+
Target:
- `-O1` should be the fast optimized tier and materially faster to compile
diff --git a/lang/c/parse/parse.c b/lang/c/parse/parse.c
@@ -39,6 +39,7 @@ static const char* const kw_names[KW_COUNT] = {
"enum",
"extern",
"float",
+ "_Float16",
"for",
"goto",
"if",
@@ -1495,6 +1496,15 @@ void parse_c(Compiler* c, Pool* pool, Pp* pp, DeclTable* decls, CG* cg) {
p.sym_b_memset = pool_intern_cstr(p.pool, "__builtin_memset");
p.sym_b_clear_cache = pool_intern_cstr(p.pool, "__builtin___clear_cache");
p.sym_b_isnan = pool_intern_cstr(p.pool, "__builtin_isnan");
+ p.sym_b_fabs = pool_intern_cstr(p.pool, "__builtin_fabs");
+ p.sym_b_fabsf = pool_intern_cstr(p.pool, "__builtin_fabsf");
+ p.sym_b_fabsl = pool_intern_cstr(p.pool, "__builtin_fabsl");
+ p.sym_b_inf = pool_intern_cstr(p.pool, "__builtin_inf");
+ p.sym_b_inff = pool_intern_cstr(p.pool, "__builtin_inff");
+ p.sym_b_infl = pool_intern_cstr(p.pool, "__builtin_infl");
+ p.sym_b_huge_val = pool_intern_cstr(p.pool, "__builtin_huge_val");
+ p.sym_b_huge_valf = pool_intern_cstr(p.pool, "__builtin_huge_valf");
+ p.sym_b_huge_vall = pool_intern_cstr(p.pool, "__builtin_huge_vall");
p.sym_func = pool_intern_cstr(p.pool, "__func__");
p.sym_func_gcc = pool_intern_cstr(p.pool, "__FUNCTION__");
p.sym_pretty_func_gcc = pool_intern_cstr(p.pool, "__PRETTY_FUNCTION__");
diff --git a/lang/c/parse/parse_expr.c b/lang/c/parse/parse_expr.c
@@ -1464,6 +1464,92 @@ static int parse_builtin_isnan_call(Parser* p, Sym name, SrcLoc loc) {
return 1;
}
+static const Type* builtin_math_fp_type(Parser* p, Sym name) {
+ if (name == p->sym_b_fabsf || name == p->sym_b_inff ||
+ name == p->sym_b_huge_valf) {
+ return type_prim(p->pool, TY_FLOAT);
+ }
+ if (name == p->sym_b_fabsl || name == p->sym_b_infl ||
+ name == p->sym_b_huge_vall) {
+ return type_prim(p->pool, TY_LDOUBLE);
+ }
+ return type_prim(p->pool, TY_DOUBLE);
+}
+
+static int parse_builtin_inf_call(Parser* p, Sym name, SrcLoc loc) {
+ const Type* ty;
+ if (name != p->sym_b_inf && name != p->sym_b_inff &&
+ name != p->sym_b_infl && name != p->sym_b_huge_val &&
+ name != p->sym_b_huge_valf && name != p->sym_b_huge_vall) {
+ return 0;
+ }
+
+ ty = builtin_math_fp_type(p, name);
+ advance(p); /* IDENT */
+ expect_punct(p, '(', "'(' after floating builtin");
+ expect_punct(p, ')', "')' after floating builtin");
+ cg_set_loc(p->cg, loc);
+ cg_push_float(p->cg, __builtin_inf(), ty);
+ return 1;
+}
+
+static int parse_builtin_fabs_call(Parser* p, Sym name, SrcLoc loc) {
+ const Type* ty;
+ FrameSlot slot;
+ CGLabel L_nonneg;
+ CGLabel L_nonzero;
+ if (name != p->sym_b_fabs && name != p->sym_b_fabsf &&
+ name != p->sym_b_fabsl) {
+ return 0;
+ }
+
+ ty = builtin_math_fp_type(p, name);
+ advance(p); /* IDENT */
+ expect_punct(p, '(', "'(' after __builtin_fabs");
+ parse_assign_expr(p);
+ to_rvalue(p);
+ if (!type_is_fp(cg_top_type(p->cg))) {
+ perr(p, "__builtin_fabs argument must have floating type");
+ }
+ coerce_top_to_type(p, ty);
+ expect_punct(p, ')', "')' after __builtin_fabs");
+
+ slot = builtin_tmp_slot(p, ty);
+ cg_push_local_typed(p->cg, slot, ty);
+ cg_swap(p->cg);
+ cg_store(p->cg);
+ cg_drop(p->cg);
+
+ cg_set_loc(p->cg, loc);
+ cg_push_local_typed(p->cg, slot, ty);
+ cg_load(p->cg);
+ cg_push_float(p->cg, 0.0, ty);
+ cg_cmp(p->cg, CMP_LT_F);
+ L_nonneg = cg_label_new(p->cg);
+ cg_branch_false(p->cg, L_nonneg);
+ cg_push_local_typed(p->cg, slot, ty);
+ cg_push_local_typed(p->cg, slot, ty);
+ cg_load(p->cg);
+ cg_unop(p->cg, UO_NEG);
+ cg_store(p->cg);
+ cg_drop(p->cg);
+ cg_label_place(p->cg, L_nonneg);
+ cg_push_local_typed(p->cg, slot, ty);
+ cg_load(p->cg);
+ cg_push_float(p->cg, 0.0, ty);
+ cg_cmp(p->cg, CMP_EQ);
+ L_nonzero = cg_label_new(p->cg);
+ cg_branch_false(p->cg, L_nonzero);
+ cg_push_local_typed(p->cg, slot, ty);
+ cg_push_float(p->cg, 0.0, ty);
+ cg_store(p->cg);
+ cg_drop(p->cg);
+ cg_label_place(p->cg, L_nonzero);
+ cg_push_local_typed(p->cg, slot, ty);
+ cg_load(p->cg);
+ return 1;
+}
+
static int try_parse_builtin_call(Parser* p) {
Sym name = p->cur.v.ident;
SrcLoc loc = p->cur.loc;
@@ -1475,6 +1561,8 @@ static int try_parse_builtin_call(Parser* p) {
if (parse_builtin_overflow_call(p, name, loc)) return 1;
if (parse_builtin_isnan_call(p, name, loc)) return 1;
+ if (parse_builtin_inf_call(p, name, loc)) return 1;
+ if (parse_builtin_fabs_call(p, name, loc)) return 1;
if (parse_builtin_clear_cache_call(p, name, loc)) return 1;
if (name != p->sym_b_alloca && name != p->sym_b_ctz &&
diff --git a/lang/c/parse/parse_priv.h b/lang/c/parse/parse_priv.h
@@ -37,6 +37,7 @@ typedef enum CKw {
KW_ENUM,
KW_EXTERN,
KW_FLOAT,
+ KW_FLOAT16, /* _Float16 */
KW_FOR,
KW_GOTO,
KW_IF,
@@ -223,6 +224,15 @@ typedef struct Parser {
Sym sym_b_memset;
Sym sym_b_clear_cache;
Sym sym_b_isnan;
+ Sym sym_b_fabs;
+ Sym sym_b_fabsf;
+ Sym sym_b_fabsl;
+ Sym sym_b_inf;
+ Sym sym_b_inff;
+ Sym sym_b_infl;
+ Sym sym_b_huge_val;
+ Sym sym_b_huge_valf;
+ Sym sym_b_huge_vall;
Sym sym_func; /* __func__ */
Sym sym_func_gcc; /* __FUNCTION__ */
Sym sym_pretty_func_gcc; /* __PRETTY_FUNCTION__ */
diff --git a/lang/c/parse/parse_type.c b/lang/c/parse/parse_type.c
@@ -644,7 +644,7 @@ int parse_decl_specs(Parser* p, DeclSpecs* out) {
acc.saw_explicit_type = 1;
advance(p);
seen = 1;
- } else if (is_kw(p, &t, KW_FLOAT)) {
+ } else if (is_kw(p, &t, KW_FLOAT) || is_kw(p, &t, KW_FLOAT16)) {
acc.saw_float = 1;
acc.saw_explicit_type = 1;
advance(p);
@@ -1161,6 +1161,7 @@ int starts_type_name(const Parser* p, const Tok* t) {
case KW_INT:
case KW_LONG:
case KW_FLOAT:
+ case KW_FLOAT16:
case KW_DOUBLE:
case KW_SIGNED:
case KW_UNSIGNED:
diff --git a/lang/c/pp/pp.c b/lang/c/pp/pp.c
@@ -495,6 +495,32 @@ static void pp_register_target_predefined(Pp* pp) {
pp_define(pp, "__ATOMIC_POINTER_LOCK_FREE", "2");
pp_define(pp, "__FLT_EVAL_METHOD__", "0");
+ pp_define(pp, "__FLT_HAS_DENORM__", "1");
+ pp_define(pp, "__FLT_MANT_DIG__", "24");
+ pp_define(pp, "__FLT_DECIMAL_DIG__", "9");
+ pp_define(pp, "__FLT_DIG__", "6");
+ pp_define(pp, "__FLT_MIN_EXP__", "(-125)");
+ pp_define(pp, "__FLT_MIN_10_EXP__", "(-37)");
+ pp_define(pp, "__FLT_MAX_EXP__", "128");
+ pp_define(pp, "__FLT_MAX_10_EXP__", "38");
+ pp_define(pp, "__FLT_MAX__", "0x1.fffffep+127F");
+ pp_define(pp, "__FLT_EPSILON__", "0x1p-23F");
+ pp_define(pp, "__FLT_MIN__", "0x1p-126F");
+ pp_define(pp, "__FLT_DENORM_MIN__", "0x1p-149F");
+
+ pp_define(pp, "__DBL_HAS_DENORM__", "1");
+ pp_define(pp, "__DBL_MANT_DIG__", "53");
+ pp_define(pp, "__DBL_DECIMAL_DIG__", "17");
+ pp_define(pp, "__DBL_DIG__", "15");
+ pp_define(pp, "__DBL_MIN_EXP__", "(-1021)");
+ pp_define(pp, "__DBL_MIN_10_EXP__", "(-307)");
+ pp_define(pp, "__DBL_MAX_EXP__", "1024");
+ pp_define(pp, "__DBL_MAX_10_EXP__", "308");
+ pp_define(pp, "__DBL_MAX__", "0x1.fffffffffffffp+1023");
+ pp_define(pp, "__DBL_EPSILON__", "0x1p-52");
+ pp_define(pp, "__DBL_MIN__", "0x1p-1022");
+ pp_define(pp, "__DBL_DENORM_MIN__", "0x1p-1074");
+
/* RV64 long double = double per the locked decision (matches RV64
* musl/glibc default). Only aarch64-linux still gets binary128
* long double. */
diff --git a/src/opt/ir.h b/src/opt/ir.h
@@ -157,6 +157,8 @@ typedef struct IRPhiAux {
u32 reg_id; /* 0 if not from mutable-pseudo SSA; else original Reg id */
} IRPhiAux;
+#define IRF_NO_COALESCE (1u << 0)
+
/* IR_CALL aux. The CGTarget interface is rich enough that we keep the
* full descriptor for replay; SSA passes inspect args/results in their
* Val form via the CGABIValue.storage.v.reg fields where applicable. */
diff --git a/src/opt/pass_coalesce.c b/src/opt/pass_coalesce.c
@@ -184,6 +184,7 @@ static int move_cmp(const void* va, const void* vb) {
static int collect_move(Func* f, const OptLiveRangeSet* ranges, Inst* in,
u32 block, CoalesceMove* out) {
if ((IROp)in->op != IR_COPY || in->nopnds < 2) return 0;
+ if (in->flags & IRF_NO_COALESCE) return 0;
if (in->opnds[0].kind != OPK_REG || in->opnds[1].kind != OPK_REG) return 0;
PReg dst = (PReg)in->opnds[0].v.reg;
PReg src = (PReg)in->opnds[1].v.reg;
diff --git a/src/opt/pass_ssa.c b/src/opt/pass_ssa.c
@@ -814,6 +814,7 @@ static Inst make_copy_inst(Func* f, Val dst, Val src, CfreeCgTypeId ty,
Inst in;
memset(&in, 0, sizeof in);
in.op = IR_COPY;
+ in.flags = IRF_NO_COALESCE;
ir_assign_inst_id(f, &in);
in.type = ty;
in.def = dst;
diff --git a/test/parse/cases/builtin_28_fabs_inf.c b/test/parse/cases/builtin_28_fabs_inf.c
@@ -0,0 +1,12 @@
+int test_main(void) {
+ float f = -1.25f;
+ double d = -2.5;
+
+ if (__builtin_fabsf(f) != 1.25f) return 1;
+ if (__builtin_fabs(d) != 2.5) return 2;
+ if (__builtin_inff() <= 0.0f) return 3;
+ if (__builtin_fabsf(-__builtin_inff()) != __builtin_inff()) return 4;
+ if (__builtin_huge_val() != __builtin_inf()) return 5;
+ if (1.0 / __builtin_fabs(-0.0) < 0.0) return 6;
+ return 42;
+}
diff --git a/test/parse/cases/builtin_28_fabs_inf.expected b/test/parse/cases/builtin_28_fabs_inf.expected
@@ -0,0 +1 @@
+42
diff --git a/test/parse/cases/float16_01_decl.c b/test/parse/cases/float16_01_decl.c
@@ -0,0 +1,7 @@
+extern _Float16 __fabsf16(_Float16);
+
+int test_main(void) {
+ _Float16 x = 1.0f;
+ _Float16* px = &x;
+ return sizeof(_Float16) == sizeof(float) && px ? 42 : 1;
+}
diff --git a/test/parse/cases/float16_01_decl.expected b/test/parse/cases/float16_01_decl.expected
@@ -0,0 +1 @@
+42
diff --git a/test/parse/cases/o2_loop_postinc_store.c b/test/parse/cases/o2_loop_postinc_store.c
@@ -0,0 +1,16 @@
+int test_main(void) {
+ int m[4];
+ int i;
+ int count = 1;
+
+ for (i = 0; i < 4; i++) {
+ m[i] = count++;
+ }
+
+ if (m[0] != 1) return 1;
+ if (m[1] != 2) return 2;
+ if (m[2] != 3) return 3;
+ if (m[3] != 4) return 4;
+ if (count != 5) return 5;
+ return 42;
+}
diff --git a/test/parse/cases/o2_loop_postinc_store.expected b/test/parse/cases/o2_loop_postinc_store.expected
@@ -0,0 +1 @@
+42