commit e3cc40502d041ed5d803683ac0afc4edf7b641ef
parent c1a6bf61d4e24de878d42dd717f69cc7589de591
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 15 May 2026 14:38:51 -0700
Complete O1 constfold cleanup
Diffstat:
| M | doc/CONSTFOLD.md | | | 54 | ++++++++++++++++++++++++++++++------------------------ |
| M | doc/OPT1.md | | | 169 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------- |
| M | doc/PERF.md | | | 37 | +++++++++++++++++++++++++++++++++++++ |
| M | src/api/cg.c | | | 384 | ++++++++++++++++++++++++++++++++++++++++++++----------------------------------- |
| M | test/api/cg_type_test.c | | | 85 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
5 files changed, 485 insertions(+), 244 deletions(-)
diff --git a/doc/CONSTFOLD.md b/doc/CONSTFOLD.md
@@ -25,12 +25,18 @@ Implemented:
- delayed `SV_ARITH` for unary and binary arithmetic;
- expression-local arithmetic chain folding;
- straight-line local constant shadowing with conservative boundary
- invalidation.
+ invalidation;
+- disassembly and metrics refresh for the phase 5 probes;
+- `ApiSValue` delayed compare/arithmetic payloads are now unioned;
+- local-shadow invalidation is routed through named memory, control, and
+ address-taking boundary helpers;
+- delayed arithmetic has a direct register-pressure materialization regression
+ test.
Remaining:
-- disassembly and metrics updates after the remaining phases land;
-- cleanup/refactor work listed in "Future Refactors".
+- consider a shadow generation counter if repeated `clear_all` scans show up in
+ future O1 metrics.
## Current Shape
@@ -416,8 +422,7 @@ Tests cover:
- delayed arithmetic consumed by another binop with immediate folding;
- delayed unary arithmetic consumed by another unop with chain folding;
- delayed arithmetic forced by store path;
-- register pressure path is still best covered indirectly by `test-opt`; add a
- direct CG API pressure case if a future regression appears.
+- register-pressure materialization when no ordinary spill victim exists.
Run:
@@ -433,6 +438,9 @@ Done. Added local shadow state to `ApiSourceLocal` and invalidation helpers:
```c
static void api_local_const_clear(ApiSourceLocal*);
static void api_local_const_clear_all(CfreeCg*);
+static void api_local_const_memory_boundary(CfreeCg*);
+static void api_local_const_control_boundary(CfreeCg*);
+static void api_local_const_address_taken(CfreeCg*, CfreeCgLocal);
static int api_local_const_can_track(CfreeCg*, const ApiSourceLocal*,
CfreeCgMemAccess);
static void api_local_const_store(CfreeCg*, CfreeCgLocal, CfreeCgTypeId, i64);
@@ -485,19 +493,18 @@ and an RV64 targeted case if available locally.
### Future Refactors
-These are not required for the phase 3/4 patch, but should be considered before
-growing the vstack simplifier further:
-
-- Move `SV_CMP` and `SV_ARITH` payload fields into a small union once the shape
- stabilizes. The current flat struct keeps the first implementation simple but
- increases every vstack entry.
-- Centralize local-shadow invalidation behind named boundary helpers such as
- `api_memory_boundary`, `api_control_boundary`, and
- `api_local_address_taken`. The first implementation clears at call sites so
- correctness is visible, but the repeated calls are easy to miss when adding
- new CG API operations.
-- Add an explicit CG API register-pressure test that forces delayed arithmetic
- materialization when no ordinary spill victim exists.
+Completed cleanup from the original phase 3/4 implementation:
+
+- `SV_CMP` and `SV_ARITH` payload fields now live in an `ApiSValue` union.
+- Local-shadow invalidation now goes through named boundary helpers:
+ `api_local_const_memory_boundary`, `api_local_const_control_boundary`, and
+ `api_local_const_address_taken`.
+- `test/api/cg_type_test.c` includes an explicit delayed-arithmetic
+ register-pressure case that forces materialization when no ordinary spill
+ victim exists.
+
+Still worth considering before growing the vstack simplifier further:
+
- Consider a shadow generation counter if whole-function local counts grow
enough for repeated `clear_all` scans to show up in O1 metrics.
- Consider one small builder helper in `test/api/cg_type_test.c` for creating
@@ -506,26 +513,25 @@ growing the vstack simplifier further:
### Phase 5: Disassembly And Metrics
-Re-run the probe cases from `doc/OPT1.md`:
+Done. Re-ran the probe cases from `doc/OPT1.md`:
- `6_5_17_compound_assign`;
- a direct literal return case;
- a compare/branch literal case;
- one local-address-taken case.
-For each of AArch64, x64, and RV64, compare:
+For each of AArch64, x64, and RV64, recorded:
- `.text` size;
- instruction count;
- `mov`/`load_imm` count;
- arithmetic instruction count;
-- O1 wall time;
-- live-range time;
-- regalloc time;
- spills/reloads;
- rewrite inserted instruction count.
-Update `doc/PERF.md` only after the implementation lands and numbers are real.
+Host JIT `run --time` samples recorded O1 wall time, live-range time,
+regalloc time, spills/reloads, and rewrite inserted instruction count. The
+current numbers are in `doc/PERF.md` under "Vstack Constfold Probe".
## Expected Results
diff --git a/doc/OPT1.md b/doc/OPT1.md
@@ -179,66 +179,126 @@ make test-smoke-x64
- rewrite reloads/stores/inserted instructions/live traffic;
- link/JIT and compile frontend scopes.
-## Current Disassembly Observations
+## Current State
-A May 2026 probe compiled existing parse corpus cases at `-O0` and `-O1`,
-then disassembled with `llvm-objdump -dr`. The cases covered straight-line
-scalar arithmetic, a direct call, a while loop, and a backward-goto loop:
+Fresh May 2026 scaling probes used the same direct-call and function-table
+families tracked in `doc/PERF.md`, with three samples per point and p50 times
+below. Both normal-path ladders are now essentially linear through 1024
+generated functions. The old dense conflict matrix is absent from the normal
+path (`opt.conflict_bytes=0`), and the live-range and allocator buckets double
+with the input.
-| case | AArch64 O0 -> O1 | x64 O0 -> O1 | RV64 O0 -> O1 |
-| ---- | ---------------- | ------------ | ------------- |
-| `6_5_17_compound_assign` | 40 -> 18 insns, 160 -> 72 bytes | 93 -> 22 insns, 158 -> 96 bytes | 53 -> 28 insns, 212 -> 112 bytes |
-| `6_5_24_func_call` | 70 -> 26 insns, 276 -> 100 bytes | 194 -> 34 insns, 281 -> 131 bytes | 97 -> 47 insns, 384 -> 184 bytes |
-| `6_8_02_while_sum` | 52 -> 26 insns, 208 -> 104 bytes | 100 -> 29 insns, 210 -> 127 bytes | 67 -> 38 insns, 268 -> 152 bytes |
-| `6_8_11_goto_backward` | 46 -> 20 insns, 184 -> 80 bytes | 93 -> 24 insns, 186 -> 106 bytes | 61 -> 32 insns, 244 -> 128 bytes |
+```text
+straight direct-call ladder
+N run.total opt.o1.total live_reg regalloc
+64 18.155 14.826 3.899 8.056
+128 35.244 29.371 7.727 15.970
+256 70.254 58.873 15.765 32.168
+512 144.874 120.613 30.894 64.842
+1024 283.198 233.653 61.392 126.779
+
+function-table ladder
+N run.total opt.o1.total live_reg regalloc
+64 17.536 14.571 3.822 7.915
+128 34.270 28.855 7.594 15.685
+256 68.305 57.831 15.183 31.413
+512 135.433 114.115 30.025 62.018
+1024 277.842 230.563 60.167 124.895
+```
+
+A nonconstant wide-local spill probe also scales linearly in spill/reload and
+rewrite counts, though regalloc time grows slightly faster than 2x per
+doubling of N under heavy spill pressure:
+
+```text
+N opt.o1 live_reg regalloc spills reloads inserted
+128 1.541 0.398 0.918 120 120 240
+256 3.230 0.778 2.048 248 248 496
+512 6.991 1.474 4.924 504 504 1008
+```
+
+The same spill family exposed a correctness risk in the JIT/run path: some
+large spill-heavy generated functions returned correctly while nearby sizes
+segfaulted during execution (`N=256` and `N=512` in the focused probe). Treat
+that as a codegen/runtime correctness bug before using the spill ladder as a
+pure performance benchmark.
+
+Current code-shape probes compiled `identity_param`, `scalar_add`,
+`while_sum`, `simple_branch`, `direct_call`, `const_local`, and
+`local_addr_taken` across x64, AArch64, and RV64 with `-O1`, then disassembled
+with `llvm-objdump -dr`.
Observed progress:
-- O1 is materially smaller across all three probed targets.
+- O1 is materially smaller than O0 across all three supported native backends
+ in the older corpus probes.
- The old prologue NOP sleds are gone on O1 for AArch64, x64, and RV64.
-- Simple loop locals are now kept in registers in the AArch64, x64, and RV64
- loop probes instead of being reloaded from stack slots on each iteration.
-- Branch-consuming compares now lower to direct compare branches in the probed
- AArch64 cases. For example, the while-loop condition is:
+- Simple loop locals are kept in registers in the loop probes instead of being
+ reloaded from stack slots on each iteration.
+- Branch-consuming compares lower to direct compare branches. For example,
+ AArch64 while-loop conditions use `cmp w19, #0xa` plus `b.ge ...` rather
+ than a `cmp; cset; cmp #0; b.cond` bridge.
+- Very local constant simplification handles the direct O1 probes covered by
+ `doc/CONSTFOLD.md`: immediate arithmetic, compare literals, delayed
+ arithmetic chains, and straight-line scalar-local shadows. The
+ `int x = 40; x += 2; return x;` probe now reduces to immediate return
+ materialization on AArch64:
```asm
-cmp w19, #0xa
-b.ge ...
+mov w0, #0x2a
+b ...
```
- The larger short-circuit control-flow probe likewise shows `cmp` plus direct
- conditional branches and no `cset` bridge for branch consumers.
+Remaining O1 shape issues visible in the current dumps:
-Remaining O1 shape issues visible in the same dumps:
+- Cheap branch layout cleanup is still missing. Every quality probe had at
+ least one unconditional branch to the immediately following block, including
+ trivial returns and shared return epilogues.
+- Parameter and entry-slot promotion is incomplete. The trivial AArch64
+ identity function still stores the incoming argument to a frame slot, reloads
+ it into `w19`, then copies it back to `w0`:
+
+```asm
+stur w0, [x29, #-0x4]
+ldur w19, [x29, #-0x4]
+mov w0, w19
+```
-- Cheap branch layout cleanup is still missing. O1 frequently emits an
- unconditional branch to the immediately following block, including before
- loop headers and before shared return epilogues.
- O1 still saves/restores more callee-saved registers than the body appears to
- use in small functions. For example, the AArch64 while-loop probe saves
- `x22` even though the body uses `x19-x21`; x64 and RV64 show similar
- over-preservation.
-- Post-RA copy cleanup still leaves avoidable moves such as `add tmp, ...`
- followed by `mov live, tmp`. More two-address or destination-selection
- folding would help on all targets.
-- Parameter and entry-slot promotion is incomplete. The trivial identity
- function at O1 still stores `w0` to a frame slot and reloads it before
- returning on AArch64.
-- Very local constant simplification is still absent in O1. The
- `int x = 40; x += 2; return x;` probe is much smaller than O0, but still
- emits immediate materialization plus an add instead of returning `42`.
-
-## Todo: Code Quality
+ need in small functions. The AArch64 while-loop probe saves `x19-x22`, and
+ the x64 direct-call probe saves `rbx/r12/r13/r14` in tiny functions.
+- Post-RA copy cleanup still leaves avoidable moves such as:
+
+```asm
+add w21, w20, w19
+mov w20, w21
+add w21, w19, #1
+mov w19, w21
+```
+
+- Direct-call tiny functions are still heavy at O1. The x64 `callee(x) + 2`
+ probe emitted 167 bytes and 47 instructions across two small functions,
+ mostly frame setup, callee-save traffic, copies, and branch-to-epilogue
+ artifacts. O1 does not need general inlining, but call-side frame/save
+ discipline remains expensive.
+
+## Todo
MIR's O1 path suggests these high-value local cleanups that still fit cfree's
fast tier:
-1. Keep compare-branch fusion covered by tests.
- The current probes show direct `cmp` plus branch shapes for branch-consuming
- compares on AArch64. Add focused regression coverage so the old
- `cmp; cset; cmp #0; b.cond` bridge does not return.
+1. Isolate and fix the spill-heavy JIT/runtime crash.
+ The nonconstant wide-local spill probe returned correctly for many sizes but
+ segfaulted at nearby large sizes. Isolate this to a small parse or CG API
+ testcase before doing more spill-pressure perf work.
-2. Promote remaining scalar entry slots before backend allocation.
+2. Clean up local branch layout artifacts.
+ MIR's full jump optimizer is O2-only, but its cheap pieces are appropriate
+ for O1: delete branches to immediate fallthrough blocks, forward
+ branch-to-branch targets, and invert a branch when it removes an
+ unconditional jump. Avoid full CFG layout work.
+
+3. Promote remaining scalar entry slots before backend allocation.
MIR's C frontend represents normal scalar block locals as MIR registers and
leaves stack slots for aggregates, forced-stack cases, and address-taken
values. O1 now keeps simple loop locals in registers in the probe, but still
@@ -246,12 +306,6 @@ fast tier:
pass should promote remaining integer/pointer scalars whose address does not
escape, starting with parameters and single-entry structured control flow.
-3. Clean up local branch layout artifacts.
- MIR's full jump optimizer is O2-only, but its cheap pieces are appropriate
- for O1: delete branches to immediate fallthrough blocks, forward
- branch-to-branch targets, and invert a branch when it removes an
- unconditional jump. Avoid full CFG layout work.
-
4. Avoid unnecessary callee-save traffic.
Reserve and preserve only hard registers that survive final post-rewrite
cleanup, and consider caller-saved registers for values that are not live
@@ -264,10 +318,21 @@ fast tier:
traffic. Also fold single-use arithmetic temporaries into their destination
when target constraints allow it.
-6. Add tiny local constant simplification where it is cheap.
- O1 should not grow full SSA value optimization, but folding immediate-only
- straight-line arithmetic before allocation would remove obvious code in
- small functions without pulling O2 machinery into the fast tier.
+6. Keep compare-branch fusion covered by tests.
+ The current probes show direct `cmp` plus branch shapes for branch-consuming
+ compares on AArch64. Add focused regression coverage so the old
+ `cmp; cset; cmp #0; b.cond` bridge does not return.
+
+7. Keep tiny local constant simplification bounded.
+ The vstack constfold path now removes obvious immediate and straight-line
+ scalar-local code before allocation. Keep it basic-block-local; broader
+ propagation belongs in the O2 SSA/value optimizer.
+
+8. Watch spill-pressure regalloc slope.
+ Normal-path scaling is linear, but regalloc time still grows slightly
+ faster than linearly under heavy spill pressure. After the correctness
+ crash is fixed, rerun the nonconstant wide-local ladder and decide whether
+ interval probing or stack slot assignment needs another narrow cleanup.
- Keep `opt_combine` legality target-aware.
Existing one-use copy/immediate/convert folds should stay conservative. New
diff --git a/doc/PERF.md b/doc/PERF.md
@@ -803,6 +803,43 @@ O1 allocator cleanup item identified by the MIR-shape report; further allocator
work should now be justified by generated-code quality, O2 coalescing, or live
range splitting rather than by O1 compile-time scaling.
+### Vstack Constfold Probe
+
+After `doc/CONSTFOLD.md` phase 5, the focused O1 probes were re-run with
+`build/cfree cc -O1 -target ... -c` and disassembled with `llvm-objdump -dr`.
+Instruction counts below include function prologue/epilogue traffic; the useful
+shape is that the direct literal, compound-assign, and literal compare/branch
+probes all reduce to immediate return materialization, while the address-taken
+probe still performs the load/add/store/reload sequence.
+
+```text
+probe arch text insn mov arithmetic
+compound_assign x64 45 11 6 1
+compound_assign aa64 32 8 2 2
+compound_assign rv64 72 18 1 3
+literal_return x64 45 11 6 1
+literal_return aa64 32 8 2 2
+literal_return rv64 72 18 1 3
+compare_branch_literal x64 45 11 6 1
+compare_branch_literal aa64 32 8 2 2
+compare_branch_literal rv64 72 18 1 3
+local_addr_taken x64 107 25 18 1
+local_addr_taken aa64 88 22 3 5
+local_addr_taken rv64 128 32 1 4
+```
+
+Host JIT timings were measured with seven `build/cfree run --time -O1 -e
+test_main` samples per probe; the table uses p50 milliseconds for time buckets
+and stable first-sample counters for rewrite activity.
+
+```text
+probe opt.o1 live_ranges regalloc spills reloads inserted
+compound_assign 0.285 0.071 0.153 0 0 0
+literal_return 0.289 0.072 0.155 0 0 0
+compare_branch_literal 0.351 0.088 0.188 0 0 0
+local_addr_taken 0.351 0.088 0.187 0 0 0
+```
+
## Performance Priorities
1. Keep O1 on interval occupancy.
diff --git a/src/api/cg.c b/src/api/cg.c
@@ -1004,25 +1004,38 @@ typedef enum ApiDelayedArithKind {
API_DELAYED_BINOP,
} ApiDelayedArithKind;
+typedef struct ApiDelayedCmp { /* payload of a delayed compare; filled by api_make_cmp for SV_CMP values */
+ Operand a; /* first operand passed to target->cmp when the compare is materialized */
+ Operand b; /* second operand passed to target->cmp when the compare is materialized */
+ CmpOp op; /* comparison operator passed to target->cmp */
+ u8 a_owned; /* nonzero: the value owns a's register and releases it via api_release_operand_reg */
+ u8 b_owned; /* nonzero: same for b; api_release_cmp skips it when b is the same owned register as a */
+ u8 pad[2]; /* presumably explicit padding; not referenced by the visible code */
+} ApiDelayedCmp;
+
+typedef struct ApiDelayedArith { /* payload of a delayed unary/binary op; filled by api_make_arith_* for SV_ARITH values */
+ Operand a; /* sole operand of a unop, first operand of a binop */
+ Operand b; /* second binop operand; left zeroed by api_make_arith_unop */
+ BinOp bin_op; /* operator passed to target->binop when kind is not API_DELAYED_UNOP */
+ UnOp un_op; /* operator passed to target->unop when kind == API_DELAYED_UNOP */
+ u8 kind; /* API_DELAYED_UNOP or API_DELAYED_BINOP (ApiDelayedArithKind) */
+ u8 a_owned; /* nonzero: the value owns a's register and releases it via api_release_operand_reg */
+ u8 b_owned; /* nonzero: same for b; api_release_arith skips it when b is the same owned register as a */
+ u8 pad; /* presumably explicit padding; not referenced by the visible code */
+} ApiDelayedArith;
+
typedef struct ApiSValue {
Operand op;
- Operand cmp_a;
- Operand cmp_b;
- Operand arith_a;
- Operand arith_b;
+ union {
+ ApiDelayedCmp cmp;
+ ApiDelayedArith arith;
+ } delayed;
CfreeCgTypeId type;
- CmpOp cmp_op;
- BinOp arith_bin_op;
- UnOp arith_un_op;
u8 kind;
- u8 arith_kind;
u8 res;
u8 pinned;
u8 lvalue;
- u8 cmp_a_owned;
- u8 cmp_b_owned;
- u8 arith_a_owned;
- u8 arith_b_owned;
+ u8 pad;
FrameSlot spill_slot;
CfreeCgLocal source_local;
} ApiSValue;
@@ -1292,11 +1305,11 @@ static ApiSValue api_make_cmp(CmpOp op, Operand a, Operand b,
memset(&sv, 0, sizeof sv);
sv.kind = SV_CMP;
sv.type = result_ty;
- sv.cmp_op = op;
- sv.cmp_a = a;
- sv.cmp_b = b;
- sv.cmp_a_owned = a_owned ? 1u : 0u;
- sv.cmp_b_owned = b_owned ? 1u : 0u;
+ sv.delayed.cmp.op = op;
+ sv.delayed.cmp.a = a;
+ sv.delayed.cmp.b = b;
+ sv.delayed.cmp.a_owned = a_owned ? 1u : 0u;
+ sv.delayed.cmp.b_owned = b_owned ? 1u : 0u;
sv.res = RES_INHERENT;
sv.spill_slot = FRAME_SLOT_NONE;
sv.source_local = CFREE_CG_LOCAL_NONE;
@@ -1308,11 +1321,11 @@ static ApiSValue api_make_arith_unop(UnOp op, Operand a, CfreeCgTypeId ty,
ApiSValue sv;
memset(&sv, 0, sizeof sv);
sv.kind = SV_ARITH;
- sv.arith_kind = API_DELAYED_UNOP;
+ sv.delayed.arith.kind = API_DELAYED_UNOP;
sv.type = ty;
- sv.arith_un_op = op;
- sv.arith_a = a;
- sv.arith_a_owned = a_owned ? 1u : 0u;
+ sv.delayed.arith.un_op = op;
+ sv.delayed.arith.a = a;
+ sv.delayed.arith.a_owned = a_owned ? 1u : 0u;
sv.res = RES_INHERENT;
sv.spill_slot = FRAME_SLOT_NONE;
sv.source_local = CFREE_CG_LOCAL_NONE;
@@ -1325,13 +1338,13 @@ static ApiSValue api_make_arith_binop(BinOp op, Operand a, Operand b,
ApiSValue sv;
memset(&sv, 0, sizeof sv);
sv.kind = SV_ARITH;
- sv.arith_kind = API_DELAYED_BINOP;
+ sv.delayed.arith.kind = API_DELAYED_BINOP;
sv.type = ty;
- sv.arith_bin_op = op;
- sv.arith_a = a;
- sv.arith_b = b;
- sv.arith_a_owned = a_owned ? 1u : 0u;
- sv.arith_b_owned = b_owned ? 1u : 0u;
+ sv.delayed.arith.bin_op = op;
+ sv.delayed.arith.a = a;
+ sv.delayed.arith.b = b;
+ sv.delayed.arith.a_owned = a_owned ? 1u : 0u;
+ sv.delayed.arith.b_owned = b_owned ? 1u : 0u;
sv.res = RES_INHERENT;
sv.spill_slot = FRAME_SLOT_NONE;
sv.source_local = CFREE_CG_LOCAL_NONE;
@@ -1632,51 +1645,57 @@ static int api_sv_owns_operand_reg(const ApiSValue *sv, const Operand *op) {
}
static void api_release_cmp(CfreeCg *g, ApiSValue *sv) {
- if (sv->cmp_a_owned)
- api_release_operand_reg(g, sv->cmp_a);
- if (sv->cmp_b_owned &&
- (sv->cmp_b.kind != OPK_REG || sv->cmp_a.kind != OPK_REG ||
- sv->cmp_b.v.reg != sv->cmp_a.v.reg || sv->cmp_b.cls != sv->cmp_a.cls ||
- !sv->cmp_a_owned)) {
- api_release_operand_reg(g, sv->cmp_b);
- }
- memset(&sv->cmp_a, 0, sizeof sv->cmp_a);
- memset(&sv->cmp_b, 0, sizeof sv->cmp_b);
- sv->cmp_a_owned = 0;
- sv->cmp_b_owned = 0;
+ if (sv->delayed.cmp.a_owned)
+ api_release_operand_reg(g, sv->delayed.cmp.a);
+ if (sv->delayed.cmp.b_owned &&
+ (sv->delayed.cmp.b.kind != OPK_REG || sv->delayed.cmp.a.kind != OPK_REG ||
+ sv->delayed.cmp.b.v.reg != sv->delayed.cmp.a.v.reg ||
+ sv->delayed.cmp.b.cls != sv->delayed.cmp.a.cls ||
+ !sv->delayed.cmp.a_owned)) {
+ api_release_operand_reg(g, sv->delayed.cmp.b);
+ }
+ memset(&sv->delayed.cmp.a, 0, sizeof sv->delayed.cmp.a);
+ memset(&sv->delayed.cmp.b, 0, sizeof sv->delayed.cmp.b);
+ sv->delayed.cmp.a_owned = 0;
+ sv->delayed.cmp.b_owned = 0;
sv->kind = SV_OPERAND;
}
static void api_release_arith(CfreeCg *g, ApiSValue *sv) {
- if (sv->arith_a_owned)
- api_release_operand_reg(g, sv->arith_a);
- if (sv->arith_b_owned &&
- (sv->arith_b.kind != OPK_REG || sv->arith_a.kind != OPK_REG ||
- sv->arith_b.v.reg != sv->arith_a.v.reg ||
- sv->arith_b.cls != sv->arith_a.cls || !sv->arith_a_owned)) {
- api_release_operand_reg(g, sv->arith_b);
- }
- memset(&sv->arith_a, 0, sizeof sv->arith_a);
- memset(&sv->arith_b, 0, sizeof sv->arith_b);
- sv->arith_a_owned = 0;
- sv->arith_b_owned = 0;
+ if (sv->delayed.arith.a_owned)
+ api_release_operand_reg(g, sv->delayed.arith.a);
+ if (sv->delayed.arith.b_owned &&
+ (sv->delayed.arith.b.kind != OPK_REG ||
+ sv->delayed.arith.a.kind != OPK_REG ||
+ sv->delayed.arith.b.v.reg != sv->delayed.arith.a.v.reg ||
+ sv->delayed.arith.b.cls != sv->delayed.arith.a.cls ||
+ !sv->delayed.arith.a_owned)) {
+ api_release_operand_reg(g, sv->delayed.arith.b);
+ }
+ memset(&sv->delayed.arith.a, 0, sizeof sv->delayed.arith.a);
+ memset(&sv->delayed.arith.b, 0, sizeof sv->delayed.arith.b);
+ sv->delayed.arith.a_owned = 0;
+ sv->delayed.arith.b_owned = 0;
sv->kind = SV_OPERAND;
}
static void api_materialize_cmp_to(CfreeCg *g, ApiSValue *sv, Operand dst) {
- g->target->cmp(g->target, sv->cmp_op, dst, sv->cmp_a, sv->cmp_b);
- if (sv->cmp_a_owned && sv->cmp_a.kind == OPK_REG &&
- (sv->cmp_a.v.reg != dst.v.reg || sv->cmp_a.cls != dst.cls)) {
- api_release_operand_reg(g, sv->cmp_a);
- }
- if (sv->cmp_b_owned && sv->cmp_b.kind == OPK_REG &&
- (sv->cmp_b.v.reg != dst.v.reg || sv->cmp_b.cls != dst.cls)) {
- api_release_operand_reg(g, sv->cmp_b);
- }
- memset(&sv->cmp_a, 0, sizeof sv->cmp_a);
- memset(&sv->cmp_b, 0, sizeof sv->cmp_b);
- sv->cmp_a_owned = 0;
- sv->cmp_b_owned = 0;
+ g->target->cmp(g->target, sv->delayed.cmp.op, dst, sv->delayed.cmp.a,
+ sv->delayed.cmp.b);
+ if (sv->delayed.cmp.a_owned && sv->delayed.cmp.a.kind == OPK_REG &&
+ (sv->delayed.cmp.a.v.reg != dst.v.reg ||
+ sv->delayed.cmp.a.cls != dst.cls)) {
+ api_release_operand_reg(g, sv->delayed.cmp.a);
+ }
+ if (sv->delayed.cmp.b_owned && sv->delayed.cmp.b.kind == OPK_REG &&
+ (sv->delayed.cmp.b.v.reg != dst.v.reg ||
+ sv->delayed.cmp.b.cls != dst.cls)) {
+ api_release_operand_reg(g, sv->delayed.cmp.b);
+ }
+ memset(&sv->delayed.cmp.a, 0, sizeof sv->delayed.cmp.a);
+ memset(&sv->delayed.cmp.b, 0, sizeof sv->delayed.cmp.b);
+ sv->delayed.cmp.a_owned = 0;
+ sv->delayed.cmp.b_owned = 0;
sv->kind = SV_OPERAND;
sv->op = dst;
sv->type = dst.type;
@@ -1685,24 +1704,28 @@ static void api_materialize_cmp_to(CfreeCg *g, ApiSValue *sv, Operand dst) {
}
static void api_materialize_arith_to(CfreeCg *g, ApiSValue *sv, Operand dst) {
- if (sv->arith_kind == API_DELAYED_UNOP) {
- g->target->unop(g->target, sv->arith_un_op, dst, sv->arith_a);
+ if (sv->delayed.arith.kind == API_DELAYED_UNOP) {
+ g->target->unop(g->target, sv->delayed.arith.un_op, dst,
+ sv->delayed.arith.a);
} else {
- g->target->binop(g->target, sv->arith_bin_op, dst, sv->arith_a,
- sv->arith_b);
- }
- if (sv->arith_a_owned && sv->arith_a.kind == OPK_REG &&
- (sv->arith_a.v.reg != dst.v.reg || sv->arith_a.cls != dst.cls)) {
- api_release_operand_reg(g, sv->arith_a);
- }
- if (sv->arith_b_owned && sv->arith_b.kind == OPK_REG &&
- (sv->arith_b.v.reg != dst.v.reg || sv->arith_b.cls != dst.cls)) {
- api_release_operand_reg(g, sv->arith_b);
- }
- memset(&sv->arith_a, 0, sizeof sv->arith_a);
- memset(&sv->arith_b, 0, sizeof sv->arith_b);
- sv->arith_a_owned = 0;
- sv->arith_b_owned = 0;
+ g->target->binop(g->target, sv->delayed.arith.bin_op, dst,
+ sv->delayed.arith.a,
+ sv->delayed.arith.b);
+ }
+ if (sv->delayed.arith.a_owned && sv->delayed.arith.a.kind == OPK_REG &&
+ (sv->delayed.arith.a.v.reg != dst.v.reg ||
+ sv->delayed.arith.a.cls != dst.cls)) {
+ api_release_operand_reg(g, sv->delayed.arith.a);
+ }
+ if (sv->delayed.arith.b_owned && sv->delayed.arith.b.kind == OPK_REG &&
+ (sv->delayed.arith.b.v.reg != dst.v.reg ||
+ sv->delayed.arith.b.cls != dst.cls)) {
+ api_release_operand_reg(g, sv->delayed.arith.b);
+ }
+ memset(&sv->delayed.arith.a, 0, sizeof sv->delayed.arith.a);
+ memset(&sv->delayed.arith.b, 0, sizeof sv->delayed.arith.b);
+ sv->delayed.arith.a_owned = 0;
+ sv->delayed.arith.b_owned = 0;
sv->kind = SV_OPERAND;
sv->op = dst;
sv->type = dst.type;
@@ -1711,9 +1734,9 @@ static void api_materialize_arith_to(CfreeCg *g, ApiSValue *sv, Operand dst) {
}
static int api_arith_rhs_reusable(const ApiSValue *sv) {
- if (sv->arith_kind == API_DELAYED_UNOP)
+ if (sv->delayed.arith.kind == API_DELAYED_UNOP)
return 0;
- switch (sv->arith_bin_op) {
+ switch (sv->delayed.arith.bin_op) {
case BO_IADD:
case BO_IMUL:
case BO_AND:
@@ -1733,12 +1756,12 @@ static int api_materialize_cmp_victim(CfreeCg *g, u8 cls) {
Operand dst;
if (sv->kind != SV_CMP || sv->pinned)
continue;
- if (sv->cmp_a_owned && sv->cmp_a.kind == OPK_REG &&
- sv->cmp_a.cls == RC_INT) {
- dst = api_op_reg(sv->cmp_a.v.reg, api_sv_type(sv));
- } else if (sv->cmp_b_owned && sv->cmp_b.kind == OPK_REG &&
- sv->cmp_b.cls == RC_INT) {
- dst = api_op_reg(sv->cmp_b.v.reg, api_sv_type(sv));
+ if (sv->delayed.cmp.a_owned && sv->delayed.cmp.a.kind == OPK_REG &&
+ sv->delayed.cmp.a.cls == RC_INT) {
+ dst = api_op_reg(sv->delayed.cmp.a.v.reg, api_sv_type(sv));
+ } else if (sv->delayed.cmp.b_owned && sv->delayed.cmp.b.kind == OPK_REG &&
+ sv->delayed.cmp.b.cls == RC_INT) {
+ dst = api_op_reg(sv->delayed.cmp.b.v.reg, api_sv_type(sv));
} else {
continue;
}
@@ -1756,12 +1779,13 @@ static int api_materialize_arith_victim(CfreeCg *g, u8 cls) {
Operand dst;
if (sv->kind != SV_ARITH || sv->pinned)
continue;
- if (sv->arith_a_owned && sv->arith_a.kind == OPK_REG &&
- sv->arith_a.cls == RC_INT) {
- dst = api_op_reg(sv->arith_a.v.reg, api_sv_type(sv));
- } else if (api_arith_rhs_reusable(sv) && sv->arith_b_owned &&
- sv->arith_b.kind == OPK_REG && sv->arith_b.cls == RC_INT) {
- dst = api_op_reg(sv->arith_b.v.reg, api_sv_type(sv));
+ if (sv->delayed.arith.a_owned && sv->delayed.arith.a.kind == OPK_REG &&
+ sv->delayed.arith.a.cls == RC_INT) {
+ dst = api_op_reg(sv->delayed.arith.a.v.reg, api_sv_type(sv));
+ } else if (api_arith_rhs_reusable(sv) && sv->delayed.arith.b_owned &&
+ sv->delayed.arith.b.kind == OPK_REG &&
+ sv->delayed.arith.b.cls == RC_INT) {
+ dst = api_op_reg(sv->delayed.arith.b.v.reg, api_sv_type(sv));
} else {
continue;
}
@@ -1820,12 +1844,12 @@ static void api_ensure_reg(CfreeCg *g, ApiSValue *sv) {
if (sv->kind == SV_CMP) {
CfreeCgTypeId ty = api_sv_type(sv);
Operand dst;
- if (sv->cmp_a_owned && sv->cmp_a.kind == OPK_REG &&
- sv->cmp_a.cls == RC_INT) {
- dst = api_op_reg(sv->cmp_a.v.reg, ty);
- } else if (sv->cmp_b_owned && sv->cmp_b.kind == OPK_REG &&
- sv->cmp_b.cls == RC_INT) {
- dst = api_op_reg(sv->cmp_b.v.reg, ty);
+ if (sv->delayed.cmp.a_owned && sv->delayed.cmp.a.kind == OPK_REG &&
+ sv->delayed.cmp.a.cls == RC_INT) {
+ dst = api_op_reg(sv->delayed.cmp.a.v.reg, ty);
+ } else if (sv->delayed.cmp.b_owned && sv->delayed.cmp.b.kind == OPK_REG &&
+ sv->delayed.cmp.b.cls == RC_INT) {
+ dst = api_op_reg(sv->delayed.cmp.b.v.reg, ty);
} else {
Reg r =
api_alloc_reg_or_spill(g, RC_INT,
@@ -1838,12 +1862,13 @@ static void api_ensure_reg(CfreeCg *g, ApiSValue *sv) {
if (sv->kind == SV_ARITH) {
CfreeCgTypeId ty = api_sv_type(sv);
Operand dst;
- if (sv->arith_a_owned && sv->arith_a.kind == OPK_REG &&
- sv->arith_a.cls == RC_INT) {
- dst = api_op_reg(sv->arith_a.v.reg, ty);
- } else if (api_arith_rhs_reusable(sv) && sv->arith_b_owned &&
- sv->arith_b.kind == OPK_REG && sv->arith_b.cls == RC_INT) {
- dst = api_op_reg(sv->arith_b.v.reg, ty);
+ if (sv->delayed.arith.a_owned && sv->delayed.arith.a.kind == OPK_REG &&
+ sv->delayed.arith.a.cls == RC_INT) {
+ dst = api_op_reg(sv->delayed.arith.a.v.reg, ty);
+ } else if (api_arith_rhs_reusable(sv) && sv->delayed.arith.b_owned &&
+ sv->delayed.arith.b.kind == OPK_REG &&
+ sv->delayed.arith.b.cls == RC_INT) {
+ dst = api_op_reg(sv->delayed.arith.b.v.reg, ty);
} else {
Reg r =
api_alloc_reg_or_spill(g, RC_INT,
@@ -2341,6 +2366,19 @@ static void api_local_const_clear_all(CfreeCg *g) {
api_local_const_clear(&g->locals[i]);
}
+static void api_local_const_memory_boundary(CfreeCg *g) { /* named memory-boundary hook, e.g. a volatile load in cfree_cg_load */
+ api_local_const_clear_all(g); /* currently just forwards to the clear-all helper for g */
+}
+
+static void api_local_const_control_boundary(CfreeCg *g) { /* named control-boundary hook; same body as the memory boundary */
+ api_local_const_clear_all(g); /* currently just forwards to the clear-all helper for g */
+}
+
+static void api_local_const_address_taken(CfreeCg *g, CfreeCgLocal local) { /* address-taking boundary hook for `local` */
+ api_local_const_clear_all(g); /* conservative: clears via clear_all, not only `local` */
+ api_local_const_clear(api_local_from_handle(g, local)); /* NOTE(review): looks redundant after clear_all; confirm whether `local` can live outside g->locals */
+}
+
static int api_local_const_can_track(CfreeCg *g, const ApiSourceLocal *rec,
CfreeCgMemAccess access) {
u32 width;
@@ -2458,21 +2496,24 @@ static int api_try_fold_arith_chain(CfreeCg *g, BinOp op, CfreeCgTypeId ty,
ApiSValue *out) {
i64 folded;
BinOp result_op;
- if (a->kind != SV_ARITH || a->arith_kind != API_DELAYED_BINOP ||
- a->arith_a.kind != OPK_REG || a->arith_b.kind != OPK_IMM ||
+ if (a->kind != SV_ARITH || a->delayed.arith.kind != API_DELAYED_BINOP ||
+ a->delayed.arith.a.kind != OPK_REG ||
+ a->delayed.arith.b.kind != OPK_IMM ||
b->kind != SV_OPERAND || b->op.kind != OPK_IMM) {
return 0;
}
- result_op = a->arith_bin_op;
- switch (a->arith_bin_op) {
+ result_op = a->delayed.arith.bin_op;
+ switch (a->delayed.arith.bin_op) {
case BO_IADD:
if (op == BO_IADD) {
- if (!api_try_fold_int_binop(g, BO_IADD, ty, a->arith_b.v.imm, b->op.v.imm,
+ if (!api_try_fold_int_binop(g, BO_IADD, ty,
+ a->delayed.arith.b.v.imm, b->op.v.imm,
&folded))
return 0;
result_op = BO_IADD;
} else if (op == BO_ISUB) {
- if (!api_try_fold_int_binop(g, BO_ISUB, ty, a->arith_b.v.imm, b->op.v.imm,
+ if (!api_try_fold_int_binop(g, BO_ISUB, ty,
+ a->delayed.arith.b.v.imm, b->op.v.imm,
&folded))
return 0;
result_op = BO_IADD;
@@ -2482,12 +2523,14 @@ static int api_try_fold_arith_chain(CfreeCg *g, BinOp op, CfreeCgTypeId ty,
break;
case BO_ISUB:
if (op == BO_IADD) {
- if (!api_try_fold_int_binop(g, BO_ISUB, ty, b->op.v.imm, a->arith_b.v.imm,
+ if (!api_try_fold_int_binop(g, BO_ISUB, ty, b->op.v.imm,
+ a->delayed.arith.b.v.imm,
&folded))
return 0;
result_op = BO_IADD;
} else if (op == BO_ISUB) {
- if (!api_try_fold_int_binop(g, BO_IADD, ty, a->arith_b.v.imm, b->op.v.imm,
+ if (!api_try_fold_int_binop(g, BO_IADD, ty,
+ a->delayed.arith.b.v.imm, b->op.v.imm,
&folded))
return 0;
result_op = BO_ISUB;
@@ -2496,19 +2539,22 @@ static int api_try_fold_arith_chain(CfreeCg *g, BinOp op, CfreeCgTypeId ty,
}
break;
case BO_XOR:
- if (op != BO_XOR || !api_try_fold_int_binop(g, BO_XOR, ty, a->arith_b.v.imm,
+ if (op != BO_XOR || !api_try_fold_int_binop(g, BO_XOR, ty,
+ a->delayed.arith.b.v.imm,
b->op.v.imm, &folded))
return 0;
result_op = BO_XOR;
break;
case BO_AND:
- if (op != BO_AND || !api_try_fold_int_binop(g, BO_AND, ty, a->arith_b.v.imm,
+ if (op != BO_AND || !api_try_fold_int_binop(g, BO_AND, ty,
+ a->delayed.arith.b.v.imm,
b->op.v.imm, &folded))
return 0;
result_op = BO_AND;
break;
case BO_OR:
- if (op != BO_OR || !api_try_fold_int_binop(g, BO_OR, ty, a->arith_b.v.imm,
+ if (op != BO_OR || !api_try_fold_int_binop(g, BO_OR, ty,
+ a->delayed.arith.b.v.imm,
b->op.v.imm, &folded))
return 0;
result_op = BO_OR;
@@ -2517,31 +2563,34 @@ static int api_try_fold_arith_chain(CfreeCg *g, BinOp op, CfreeCgTypeId ty,
return 0;
}
if (api_op_is_int_identity(g, result_op, ty, folded)) {
- *out = api_make_sv_with_reg_ownership(a->arith_a, ty, a->arith_a_owned);
- a->arith_a_owned = 0;
- memset(&a->arith_a, 0, sizeof a->arith_a);
+ *out = api_make_sv_with_reg_ownership(a->delayed.arith.a, ty,
+ a->delayed.arith.a_owned);
+ a->delayed.arith.a_owned = 0;
+ memset(&a->delayed.arith.a, 0, sizeof a->delayed.arith.a);
return 1;
}
- a->arith_bin_op = result_op;
- a->arith_b.v.imm = folded;
+ a->delayed.arith.bin_op = result_op;
+ a->delayed.arith.b.v.imm = folded;
*out = *a;
- a->arith_a_owned = 0;
- a->arith_b_owned = 0;
- memset(&a->arith_a, 0, sizeof a->arith_a);
- memset(&a->arith_b, 0, sizeof a->arith_b);
+ a->delayed.arith.a_owned = 0;
+ a->delayed.arith.b_owned = 0;
+ memset(&a->delayed.arith.a, 0, sizeof a->delayed.arith.a);
+ memset(&a->delayed.arith.b, 0, sizeof a->delayed.arith.b);
return 1;
}
static int api_try_fold_unary_chain(ApiSValue *a, UnOp op, CfreeCgTypeId ty,
ApiSValue *out) {
if (op != UO_BNOT || a->kind != SV_ARITH ||
- a->arith_kind != API_DELAYED_UNOP || a->arith_un_op != UO_BNOT ||
- a->arith_a.kind != OPK_REG) {
+ a->delayed.arith.kind != API_DELAYED_UNOP || /* fold only a BNOT applied to */
+ a->delayed.arith.un_op != UO_BNOT || /* a delayed BNOT whose operand is */
+ a->delayed.arith.a.kind != OPK_REG) { /* a register; otherwise report 0 */
return 0;
}
- *out = api_make_sv_with_reg_ownership(a->arith_a, ty, a->arith_a_owned);
- a->arith_a_owned = 0;
- memset(&a->arith_a, 0, sizeof a->arith_a);
+ *out = api_make_sv_with_reg_ownership(a->delayed.arith.a, ty, /* BNOT of BNOT: result is */
+ a->delayed.arith.a_owned); /* the inner operand, typed as ty */
+ a->delayed.arith.a_owned = 0; /* flag was passed to the helper above; clear it in *a */
+ memset(&a->delayed.arith.a, 0, sizeof a->delayed.arith.a); /* leave no stale operand in *a */
return 1;
}
@@ -3333,7 +3382,7 @@ void cfree_cg_load(CfreeCg *g, CfreeCgMemAccess access) {
if (!g)
return;
if (access.flags & CFREE_CG_MEM_VOLATILE)
- api_local_const_clear_all(g);
+ api_local_const_memory_boundary(g);
v = api_pop(g);
if (!api_is_lvalue_sv(&v)) {
api_push(g, v);
@@ -3392,11 +3441,9 @@ void cfree_cg_addr(CfreeCg *g) {
ApiSourceLocal *rec;
if (!g)
return;
- api_local_const_clear_all(g);
T = g->target;
v = api_pop(g);
- if (v.source_local != CFREE_CG_LOCAL_NONE)
- api_local_const_clear(api_local_from_handle(g, v.source_local));
+ api_local_const_address_taken(g, v.source_local);
api_ensure_reg(g, &v);
if (!api_is_lvalue_sv(&v)) {
compiler_panic(g->c, g->cur_loc, "CfreeCg: addr operand is not an lvalue");
@@ -3424,7 +3471,7 @@ void cfree_cg_store(CfreeCg *g, CfreeCgMemAccess access) {
if (!g)
return;
if (access.flags & CFREE_CG_MEM_VOLATILE)
- api_local_const_clear_all(g);
+ api_local_const_memory_boundary(g);
T = g->target;
rv = api_pop(g);
lv = api_pop(g);
@@ -3451,7 +3498,7 @@ void cfree_cg_store(CfreeCg *g, CfreeCgMemAccess access) {
}
} else if (lv.op.kind == OPK_INDIRECT || lv.op.kind == OPK_GLOBAL ||
(access.flags & CFREE_CG_MEM_VOLATILE)) {
- api_local_const_clear_all(g);
+ api_local_const_memory_boundary(g);
}
if (lv.source_local != CFREE_CG_LOCAL_NONE && lv.op.kind == OPK_REG) {
Operand dst = lv.op;
@@ -4020,7 +4067,7 @@ void cfree_cg_atomic_load(CfreeCg *g, CfreeCgMemAccess access,
Reg rr;
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_memory_boundary(g);
ptr = api_pop(g);
pty = api_sv_type(&ptr);
val_ty = resolve_type(g->c, access.type);
@@ -4042,7 +4089,7 @@ void cfree_cg_atomic_store(CfreeCg *g, CfreeCgMemAccess access,
Operand addr, src;
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_memory_boundary(g);
val = api_pop(g);
ptr = api_pop(g);
pty = api_sv_type(&ptr);
@@ -4067,7 +4114,7 @@ void cfree_cg_atomic_rmw(CfreeCg *g, CfreeCgMemAccess access,
Reg rr;
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_memory_boundary(g);
val = api_pop(g);
ptr = api_pop(g);
pty = api_sv_type(&ptr);
@@ -4095,7 +4142,7 @@ void cfree_cg_atomic_cmpxchg(CfreeCg *g, CfreeCgMemAccess access,
Reg pr, kr;
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_memory_boundary(g);
(void)weak;
desired = api_pop(g);
expected = api_pop(g);
@@ -4129,7 +4176,7 @@ void cfree_cg_atomic_cmpxchg(CfreeCg *g, CfreeCgMemAccess access,
void cfree_cg_atomic_fence(CfreeCg *g, CfreeCgMemOrder order) {
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_memory_boundary(g); /* memory boundary for the local-constant shadow; was a direct clear_all */
g->target->fence(g->target, api_map_mem_order(order));
}
@@ -4214,7 +4261,7 @@ void cfree_cg_inline_asm(CfreeCg *g, CfreeCgInlineAsm asm_block) {
(void)asm_block.clobber_abi_sets;
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_memory_boundary(g);
T = g->target;
h = g->c->env->heap;
fallback_ty = builtin_id(CFREE_CG_BUILTIN_I64);
@@ -4483,14 +4530,14 @@ CfreeCgLabel cfree_cg_label_new(CfreeCg *g) {
void cfree_cg_label_place(CfreeCg *g, CfreeCgLabel label) {
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g); /* control boundary for the local-constant shadow; was a direct clear_all */
g->target->label_place(g->target, (Label)label);
}
void cfree_cg_jump(CfreeCg *g, CfreeCgLabel label) {
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g); /* control boundary for the local-constant shadow; was a direct clear_all */
g->target->jump(g->target, (Label)label);
}
@@ -4500,7 +4547,7 @@ static void api_branch_if(CfreeCg *g, ApiSValue *v, int branch_when_true,
CfreeCgTypeId ty;
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g);
T = g->target;
ty = v->type ? v->type : builtin_id(CFREE_CG_BUILTIN_I32);
if (v->op.kind == OPK_IMM && v->kind == SV_OPERAND) {
@@ -4510,8 +4557,9 @@ static void api_branch_if(CfreeCg *g, ApiSValue *v, int branch_when_true,
return;
}
if (v->kind == SV_CMP) {
- CmpOp op = branch_when_true ? v->cmp_op : api_invert_cmp(v->cmp_op);
- T->cmp_branch(T, op, v->cmp_a, v->cmp_b, label);
+ CmpOp op =
+ branch_when_true ? v->delayed.cmp.op : api_invert_cmp(v->delayed.cmp.op);
+ T->cmp_branch(T, op, v->delayed.cmp.a, v->delayed.cmp.b, label);
api_release(g, v);
return;
}
@@ -4547,7 +4595,7 @@ void cfree_cg_switch(CfreeCg *g, CfreeCgSwitch sw) {
return;
if (g->sp == 0)
return;
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g);
selector = api_pop(g);
ty = resolve_type(g->c, sw.selector_type);
if (!ty)
@@ -4581,7 +4629,7 @@ void cfree_cg_computed_goto(CfreeCg *g, const CfreeCgLabel *valid_targets,
(void)ntargets;
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g);
target = api_pop(g);
api_release(g, &target);
compiler_panic(g->c, g->cur_loc,
@@ -4591,7 +4639,7 @@ void cfree_cg_computed_goto(CfreeCg *g, const CfreeCgLabel *valid_targets,
void cfree_cg_unreachable(CfreeCg *g) {
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g); /* control boundary for the local-constant shadow; was a direct clear_all */
g->target->intrinsic(g->target, INTRIN_UNREACHABLE, NULL, 0, NULL, 0);
}
@@ -4676,7 +4724,7 @@ CfreeCgScope cfree_cg_scope_begin(CfreeCg *g, CfreeCgTypeId result_type) {
return 0;
break_lbl = g->target->label_new(g->target);
cont_lbl = g->target->label_new(g->target);
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g);
g->target->label_place(g->target, cont_lbl);
if (g->nscopes >= API_CG_MAX_SCOPES) {
@@ -4723,7 +4771,7 @@ void cfree_cg_scope_end(CfreeCg *g, CfreeCgScope scope) {
ApiSValue result = api_pop(g);
api_scope_store_result(g, s, &result);
}
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g);
g->target->label_place(g->target, s->break_lbl);
g->target->scope_end(g->target, s->target_scope);
api_scope_push_result(g, s);
@@ -4739,7 +4787,7 @@ void cfree_cg_break(CfreeCg *g, CfreeCgScope scope) {
ApiSValue result = api_pop(g);
api_scope_store_result(g, s, &result);
}
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g);
g->target->jump(g->target, s->break_lbl);
}
@@ -4758,7 +4806,7 @@ void cfree_cg_break_true(CfreeCg *g, CfreeCgScope scope) {
if (cond.kind == SV_OPERAND && cond.op.kind == OPK_IMM) {
if (cond.op.v.imm != 0) {
api_scope_store_result(g, s, &result);
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g);
g->target->jump(g->target, s->break_lbl);
} else {
api_release(g, &result);
@@ -4768,9 +4816,9 @@ void cfree_cg_break_true(CfreeCg *g, CfreeCgScope scope) {
Label skip = g->target->label_new(g->target);
api_branch_if(g, &cond, 0, skip);
api_scope_store_result(g, s, &result);
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g);
g->target->jump(g->target, s->break_lbl);
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g);
g->target->label_place(g->target, skip);
}
} else {
@@ -4793,7 +4841,7 @@ void cfree_cg_break_false(CfreeCg *g, CfreeCgScope scope) {
if (cond.kind == SV_OPERAND && cond.op.kind == OPK_IMM) {
if (cond.op.v.imm == 0) {
api_scope_store_result(g, s, &result);
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g);
g->target->jump(g->target, s->break_lbl);
} else {
api_release(g, &result);
@@ -4803,9 +4851,9 @@ void cfree_cg_break_false(CfreeCg *g, CfreeCgScope scope) {
Label skip = g->target->label_new(g->target);
api_branch_if(g, &cond, 1, skip);
api_scope_store_result(g, s, &result);
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g);
g->target->jump(g->target, s->break_lbl);
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g);
g->target->label_place(g->target, skip);
}
} else {
@@ -4817,7 +4865,7 @@ void cfree_cg_continue(CfreeCg *g, CfreeCgScope scope) {
ApiCgScope *s = api_scope_from_handle(g, scope, 0, "CfreeCg: continue");
if (!s)
return;
- api_local_const_clear_all(g);
+ api_local_const_control_boundary(g);
g->target->jump(g->target, s->continue_lbl);
}
@@ -4949,7 +4997,7 @@ void cfree_cg_memcpy(CfreeCg *g, uint64_t size, CfreeCgMemAccess dst_access,
Operand dst_op, src_op;
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_memory_boundary(g);
(void)src_access;
if (size > UINT32_MAX) {
compiler_panic(g->c, g->cur_loc, "CfreeCg: memcpy size exceeds CGTarget");
@@ -4974,7 +5022,7 @@ void cfree_cg_memmove(CfreeCg *g, uint64_t size, CfreeCgMemAccess dst_access,
Operand args[3];
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_memory_boundary(g);
(void)dst_access;
(void)src_access;
if (size > INT64_MAX) {
@@ -4999,7 +5047,7 @@ void cfree_cg_memset(CfreeCg *g, uint8_t val, uint64_t size,
Operand dst_op, byte_val;
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_memory_boundary(g);
if (size > UINT32_MAX) {
compiler_panic(g->c, g->cur_loc, "CfreeCg: memset size exceeds CGTarget");
return;
@@ -5168,7 +5216,7 @@ void cfree_cg_call(CfreeCg *g, uint32_t nargs, CfreeCgTypeId fn_type,
int tail;
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_memory_boundary(g);
tail =
attrs.tail == CFREE_CG_TAIL_ALLOWED || attrs.tail == CFREE_CG_TAIL_MUST;
T = g->target;
@@ -5289,7 +5337,7 @@ static void api_cg_tail_call(CfreeCg *g, uint32_t nargs,
ApiSValue callee;
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_memory_boundary(g);
T = g->target;
fty = resolve_type(g->c, fn_type);
if (!fty)
@@ -5348,7 +5396,7 @@ static void api_call_symbol_common(CfreeCg *g, CfreeCgSym sym, uint32_t nargs,
Operand callee_op;
if (!g)
return;
- api_local_const_clear_all(g);
+ api_local_const_memory_boundary(g);
int tail =
attrs.tail == CFREE_CG_TAIL_ALLOWED || attrs.tail == CFREE_CG_TAIL_MUST;
T = g->target;
diff --git a/test/api/cg_type_test.c b/test/api/cg_type_test.c
@@ -692,6 +692,87 @@ static uint32_t cg_emit_delayed_store(CfreeCompiler* c, CfreeCgTypeId i32_ty,
return size;
}
+static uint32_t cg_emit_delayed_pressure(CfreeCompiler* c,
+ CfreeCgTypeId i32_ty,
+ const char* name) {
+ enum { NPARAMS = 13 };
+ CfreeCompileOptions opts;
+ CfreeObjBuilder* ob;
+ CfreeCg* cg;
+ CfreeCgFuncParam param_desc[NPARAMS];
+ CfreeCgFuncSig sig;
+ CfreeCgDecl decl;
+ CfreeCgSym sym;
+ CfreeCgLocalAttrs attrs;
+ CfreeCgLocal params[NPARAMS];
+ CfreeCgMemAccess mem;
+ uint32_t size;
+ /* Regression helper: emit function `name` at opt level 1 and return the
+ * size reported by text_size() for the finished object. Returns 0 if the
+ * object builder or the codegen cannot be created. */
+ memset(&opts, 0, sizeof opts);
+ opts.opt_level = 1;
+ ob = (CfreeObjBuilder*)obj_new((Compiler*)c);
+ EXPECT(ob != NULL, "delayed pressure obj builder allocation failed");
+ if (!ob) return 0;
+ cg = cfree_cg_new(c, ob, &opts);
+ EXPECT(cg != NULL, "delayed pressure cg allocation failed");
+ if (!cg) {
+ obj_free((ObjBuilder*)ob);
+ return 0;
+ }
+ /* Signature: NPARAMS parameters of type i32_ty, returning i32_ty. */
+ memset(param_desc, 0, sizeof param_desc);
+ for (uint32_t i = 0; i < NPARAMS; ++i)
+ param_desc[i].type = i32_ty;
+ memset(&sig, 0, sizeof sig);
+ sig.ret = i32_ty;
+ sig.params = param_desc;
+ sig.nparams = NPARAMS;
+ sig.call_conv = CFREE_CG_CC_TARGET_C;
+ /* Declare a global function symbol; linkage and display names are `name`. */
+ memset(&decl, 0, sizeof decl);
+ decl.kind = CFREE_CG_DECL_FUNC;
+ decl.linkage_name = cfree_sym_intern(c, name);
+ decl.display_name = decl.linkage_name;
+ decl.type = cfree_cg_type_func(c, sig);
+ decl.sym.bind = CFREE_SB_GLOBAL;
+ decl.sym.visibility = CFREE_CG_VIS_DEFAULT;
+ sym = cfree_cg_decl(cg, decl);
+ EXPECT(sym != CFREE_CG_SYM_NONE, "delayed pressure decl failed");
+ /* Open the body and create one local per parameter, named p0..p12. */
+ cfree_cg_func_begin(cg, sym);
+ memset(&attrs, 0, sizeof attrs);
+ memset(&mem, 0, sizeof mem);
+ mem.type = i32_ty;
+ mem.align = cfree_cg_type_align(c, i32_ty);
+ for (uint32_t i = 0; i < NPARAMS; ++i) {
+ char pname[8];
+ snprintf(pname, sizeof pname, "p%u", (unsigned)i);
+ attrs.name = cfree_sym_intern(c, pname);
+ params[i] = cfree_cg_param(cg, i, i32_ty, attrs);
+ EXPECT(params[i] != CFREE_CG_LOCAL_NONE, "delayed pressure param failed");
+ }
+ /* For every parameter but the last: load it, add the constant 1, and leave
+ * the result unconsumed. Then load the last parameter, drop it, and drop
+ * the NPARAMS - 1 pending results. Per doc/CONSTFOLD.md this is the direct
+ * register-pressure materialization regression for delayed arithmetic;
+ * whether 13 parameters exhaust the target's registers is not visible
+ * here (TODO confirm). */
+ for (uint32_t i = 0; i + 1 < NPARAMS; ++i) {
+ cfree_cg_push_local(cg, params[i]);
+ cfree_cg_load(cg, mem);
+ cfree_cg_push_int(cg, 1, i32_ty);
+ cfree_cg_int_binop(cg, CFREE_CG_INT_ADD, 0);
+ }
+ cfree_cg_push_local(cg, params[NPARAMS - 1]);
+ cfree_cg_load(cg, mem);
+ cfree_cg_drop(cg);
+ for (uint32_t i = 0; i + 1 < NPARAMS; ++i)
+ cfree_cg_drop(cg);
+ cfree_cg_push_int(cg, 0, i32_ty);
+ cfree_cg_ret(cg);
+ cfree_cg_func_end(cg);
+ /* Free the codegen, read the text size, then free the object builder. */
+ cfree_cg_free(cg);
+ size = text_size((ObjBuilder*)ob);
+ obj_free((ObjBuilder*)ob);
+ return size;
+}
+
typedef enum CgShadowBoundary {
CG_SHADOW_LABEL,
CG_SHADOW_BRANCH,
@@ -883,6 +964,8 @@ static void exercise_cg_constfold_phases(CfreeCompiler* c,
cg_emit_delayed_cmp(c, i32_ty, "cg_delayed_cmp_o1");
uint32_t delayed_store_size =
cg_emit_delayed_store(c, i32_ty, "cg_delayed_store_o1");
+ uint32_t pressure_size =
+ cg_emit_delayed_pressure(c, i32_ty, "cg_delayed_pressure_o1");
uint32_t label_size = cg_emit_local_shadow_boundary(
c, i32_ty, "cg_shadow_label_o1", CG_SHADOW_LABEL);
uint32_t branch_size = cg_emit_local_shadow_boundary(
@@ -913,6 +996,8 @@ static void exercise_cg_constfold_phases(CfreeCompiler* c,
EXPECT(delayed_store_size <= 64,
"delayed arithmetic forced by store should stay compact, text size=%u",
delayed_store_size);
+ EXPECT(pressure_size > 0,
+ "delayed arithmetic pressure materialization should emit code");
EXPECT(label_size > local_size,
"label should clear local shadow, label=%u folded=%u", label_size,
local_size);