commit 4a221b1ee6db53d17a49c0af494e10f234502f53
parent 5e4d8e66ad3facbda764b97e7df3cd9cef32ede6
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 15 May 2026 13:42:37 -0700
Add vstack immediate constant folding
Diffstat:
| A | doc/CONSTFOLD.md | | | 572 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | doc/OPT1.md | | | 25 | ------------------------- |
| M | src/api/cg.c | | | 258 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- |
| M | test/api/cg_type_test.c | | | 95 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
4 files changed, 913 insertions(+), 37 deletions(-)
diff --git a/doc/CONSTFOLD.md b/doc/CONSTFOLD.md
@@ -0,0 +1,572 @@
+# CONSTFOLD - Vstack Constant Simplification Plan
+
+`-O1` needs a tiny constant simplification path that improves obvious scalar
+code without pulling the O2 SSA/value-optimization machinery into the fast
+backend tier. The right first home is the public CG API value stack in
+`src/api/cg.c`, beside the existing delayed compare representation used by
+compare-consuming branches.
+
+This plan covers integer constant folding and local expression delay in the
+vstack. It intentionally avoids global propagation, CFG-wide reasoning, and
+target-specific peepholes.
+
+## Status
+
+Implemented:
+
+- immediate-only integer binop folding for flag-free integer operations up to
+ 64 bits;
+- immediate-only integer unop folding for flag-free integer operations up to
+ 64 bits;
+- immediate-only integer and pointer-like compare folding;
+- scalar immediate return preservation through `cfree_cg_ret`;
+- CG API shape coverage for O1 literal folds;
+- delayed-compare materialization now reuses only owned operand registers.
+
+Remaining:
+
+- delayed `SV_ARITH` for unary and binary arithmetic;
+- expression-local arithmetic chain folding;
+- straight-line local constant shadowing;
+- disassembly and metrics updates after the remaining phases land.
+
+## Current Shape
+
+The API value stack currently carries:
+
+- `SV_OPERAND`: an immediate, register, local, global, or indirect operand;
+- `SV_CMP`: a delayed integer compare with source operands and register
+ ownership bits.
+
+Integer constants enter the stack as `SV_OPERAND` plus `OPK_IMM`.
+
+Integer compare already has the desired delayed shape:
+
+```text
+push a
+push b
+int_cmp -> pushes SV_CMP, emits no boolean materialization
+branch_true -> consumes SV_CMP as target->cmp_branch
+```
+
+This avoids the old branch-consumer shape:
+
+```text
+cmp
+cset
+cmp #0
+b.cond
+```
+
+Arithmetic now has a first immediate-only fold. For flag-free integer
+operations where both operands are `OPK_IMM`, `api_cg_binop` and
+`api_cg_unop` fold before allocating a result register. `api_cg_cmp` likewise
+folds immediate-only integer and pointer-like compares. Non-immediate
+arithmetic still materializes immediately; delayed arithmetic is the next
+planned step.
+
+The visible O1 quality gap is:
+
+```c
+int x = 40;
+x += 2;
+return x;
+```
+
+The expression part can be fixed by immediate folding, but the full example
+also crosses a local store/load. That requires a narrow local constant shadow,
+not just expression-stack folding.
+
+## Goals
+
+- Fold immediate-only integer arithmetic before register allocation.
+- Delay simple integer arithmetic on the vstack when doing so can avoid
+ materialization or combine a following immediate.
+- Preserve the existing `SV_CMP` compare-branch behavior.
+- Improve straight-line scalar-local code such as `x = 40; x = x + 2`.
+- Keep the implementation target-neutral and bounded.
+- Do not introduce VLAs or global state.
+- Keep all new state attached to `CfreeCg`, `ApiSValue`, or `ApiSourceLocal`.
+
+## Non-Goals
+
+- No SSA construction.
+- No global constant propagation.
+- No CFG-wide dataflow.
+- No folding across labels, jumps, branches, calls, volatile accesses, inline
+ asm, or unknown memory effects.
+- No FP constant folding in this pass.
+- No i128 folding until there is a deliberate wide-integer helper.
+- No target-specific immediate legality checks in the vstack. The target
+ backend already decides whether an `OPK_IMM` source fits an instruction
+ encoding or must be materialized.
+
+## Design
+
+### 1. Immediate-Only Integer Folding
+
+Implemented in `src/api/cg.c` with helpers shaped as:
+
+```c
+static int api_try_fold_int_binop(CfreeCg*, BinOp, CfreeCgTypeId,
+ i64 a, i64 b, i64* out);
+static int api_try_fold_int_unop(CfreeCg*, UnOp, CfreeCgTypeId,
+ i64 a, i64* out);
+static int api_try_fold_int_cmp(CfreeCg*, CmpOp, CfreeCgTypeId,
+ i64 a, i64 b, i64* out);
+```
+
+The binop/unop paths run before allocating a destination register. If both
+source operands are `OPK_IMM` and the operation is supported, they push a new
+`OPK_IMM` result and release the inputs.
+
+Implemented operations:
+
+- `BO_IADD`, `BO_ISUB`, `BO_IMUL`;
+- `BO_AND`, `BO_OR`, `BO_XOR`;
+- `BO_SHL`, `BO_SHR_U`, `BO_SHR_S`;
+- `UO_NEG`, `UO_BNOT`, `UO_NOT`;
+- integer compares.
+
+Still skipped:
+
+- signed and unsigned division/remainder, to avoid changing divide-by-zero
+ behavior and signed-overflow corner cases in the first patch;
+- any operation with trap, saturating, or exact/nowrap flags until flags are
+ actually modeled by the vstack fold;
+- FP operations;
+- i128 types.
+
+Evaluation rules:
+
+- Use unsigned arithmetic internally where possible so host signed overflow is
+ not invoked.
+- Mask the result to the destination integer width for widths below 64.
+- For arithmetic right shift, sign-extend the masked input before shifting.
+- Normalize `_Bool` results to `0` or `1`.
+- Treat pointer-typed `OPK_IMM` only as integer-sized values for compares.
+ Do not fold pointer arithmetic here.
+
+### 2. Immediate Return Preservation
+
+Implemented. `cfree_cg_ret` preserves non-aggregate scalar `OPK_IMM` values by
+passing them directly to `target->ret`.
+
+Required behavior:
+
+```text
+return 40 + 2
+ -> vstack folds to OPK_IMM 42
+ -> ret receives OPK_IMM 42
+ -> backend emits only return-value materialization
+```
+
+If the return value is delayed arithmetic or any non-immediate expression, it
+can still materialize through the normal register path.
+
+### 3. Delayed Arithmetic SValues
+
+Add a generic delayed arithmetic stack value after the immediate-only fold is
+stable. It should cover both unary and binary integer arithmetic, with an
+operation-kind discriminator and an optional second operand.
+
+Suggested shape:
+
+```c
+typedef enum ApiDelayedArithKind {
+ API_DELAYED_UNOP,
+ API_DELAYED_BINOP,
+} ApiDelayedArithKind;
+
+typedef enum ApiSValueKind {
+ SV_OPERAND,
+ SV_CMP,
+ SV_ARITH,
+} ApiSValueKind;
+
+typedef struct ApiSValue {
+ Operand op;
+ Operand cmp_a;
+ Operand cmp_b;
+ Operand arith_a;
+ Operand arith_b;
+ CfreeCgTypeId type;
+ CmpOp cmp_op;
+ BinOp arith_bin_op;
+ UnOp arith_un_op;
+ u8 kind;
+ u8 arith_kind;
+ u8 arith_has_b;
+ u8 res;
+ u8 pinned;
+ u8 lvalue;
+ u8 cmp_a_owned;
+ u8 cmp_b_owned;
+ u8 arith_a_owned;
+ u8 arith_b_owned;
+ FrameSlot spill_slot;
+ CfreeCgLocal source_local;
+} ApiSValue;
+```
+
+If struct size becomes a concern, the compare and arithmetic payloads can be
+moved into a small union. Keep the first patch simple unless stack footprint
+shows up in measurements.
+
+Materialization mirrors `SV_CMP`:
+
+- `api_ensure_reg` dispatches `SV_ARITH` to `api_materialize_arith_to`.
+- `api_materialize_arith_to` calls `target->unop` when
+ `arith_kind == API_DELAYED_UNOP` and `target->binop` when
+ `arith_kind == API_DELAYED_BINOP`.
+- `api_release` releases owned operand registers for `SV_ARITH`.
+- `api_pick_victim` ignores delayed non-register values.
+- `api_alloc_reg_or_spill` may materialize a delayed arithmetic value if no
+ normal spill victim exists, just as it does for delayed compares.
+
+Creation rules:
+
+- Only create `SV_ARITH` for integer-class operations.
+- For binary operations, require at least one non-immediate source.
+ Immediate-only cases fold to `OPK_IMM` instead.
+- For unary operations, delay only non-immediate sources. Immediate-only cases
+ fold to `OPK_IMM` instead.
+- Store unary operations with `arith_has_b == 0`; store binary operations with
+ `arith_has_b == 1`.
+- Own registers only when the source `ApiSValue` owns that register, following
+ the existing compare ownership pattern.
+- Never create delayed arithmetic for lvalues, aggregates, FP values, calls, or
+ memory operands that need a load.
+- Do not encode unary operations as synthetic binary operations. For example,
+ keep `-x` as `UO_NEG`, `~x` as `UO_BNOT`, and `!x` as `UO_NOT` or a delayed
+ compare only when the consumer naturally wants compare semantics.
+
+### 4. Local Arithmetic Chain Folding
+
+Once `SV_ARITH` exists, fold cheap expression-local chains before
+materializing:
+
+```text
+(x + C1) + C2 -> x + (C1 + C2)
+(x + C1) - C2 -> x + (C1 - C2)
+(x - C1) + C2 -> x + (C2 - C1)
+(x - C1) - C2 -> x - (C1 + C2)
+(x ^ C1) ^ C2 -> x ^ (C1 ^ C2)
+(x & C1) & C2 -> x & (C1 & C2)
+(x | C1) | C2 -> x | (C1 | C2)
+~~x -> x
+!!x -> x != 0, if the result is consumed as a branch or compare-like boolean
+```
+
+Keep this conservative:
+
+- only one delayed level;
+- only one real non-immediate operand;
+- no reassociation across non-immediate operands;
+- no multiply reassociation initially;
+- no transforms that depend on signed-overflow assumptions.
+- no unary chain transform that changes C truth-value semantics for non-boolean
+ integer results.
+
+If the folded constant becomes an identity, collapse the result:
+
+- `x + 0`, `x - 0`, `x ^ 0`, `x | 0` -> `x`;
+- `x & -1` -> `x` after width masking;
+- `x * 1` -> `x` if multiply folding is later added;
+- `x * 0` -> `0` only when flags and side effects are known safe.
+
+### 5. Straight-Line Local Constant Shadow
+
+To handle `int x = 40; x += 2; return x;`, track constants for source locals
+that are eligible for register-local treatment.
+
+Extend `ApiSourceLocal`:
+
+```c
+typedef struct ApiSourceLocal {
+ ...
+ i64 const_value;
+ u8 const_valid;
+ u8 const_type_class;
+ u8 pad[2];
+} ApiSourceLocal;
+```
+
+Rules:
+
+- A local can be shadowed only if it is scalar integer or pointer typed.
+- Address-taken locals are not shadowed.
+- Volatile accesses are not shadowed.
+- Parameters start unknown.
+- On store of an immediate to an eligible source local, set the shadow value.
+- On store of a non-immediate to that local, clear the shadow value.
+- On load of an eligible source local with a valid shadow, push `OPK_IMM`
+ instead of loading/materializing the local.
+- On taking a local address, clear the local's shadow.
+- On unknown memory or control-flow boundary, clear all shadows.
+
+Clear all local shadows at:
+
+- label placement;
+- unconditional jump;
+- conditional branch;
+- switch;
+- scope break/continue control transfer if it can leave the current block;
+- call, unless later alias analysis proves the call cannot observe the local;
+- inline asm;
+- atomic operations and fences;
+- direct stores through `OPK_INDIRECT`;
+- address-taking of any shadowed local.
+
+This is intentionally a basic-block-local cache. It is not a dataflow fact.
+
+### 6. Interaction With Existing Register Locals
+
+The CG API already treats some scalar autos as source-register lvalues when the
+target returns `CG_LOCAL_STORAGE_REG`. Loads from those locals can avoid memory.
+The constant shadow sits above that:
+
+```text
+local has valid constant
+ -> push OPK_IMM
+else local stored in register
+ -> push register-backed rvalue
+else
+ -> normal frame load path
+```
+
+Stores to register locals still update the real storage when necessary. The
+shadow only affects subsequent vstack reads in the same straight-line region.
+
+### 7. Boundary And Correctness Rules
+
+Every helper that consumes stack values must understand the new delayed value:
+
+- `api_release`;
+- `api_ensure_reg`;
+- `api_force_reg`;
+- `api_class_of_sv`;
+- `api_reg_of_sv`;
+- `api_owned_reg_type`;
+- `api_pick_victim`;
+- `cfree_cg_dup`;
+- `cfree_cg_store`;
+- call argument preparation;
+- return handling;
+- branch handling.
+
+If an operation does not explicitly support delayed arithmetic, it should force
+materialization first. That keeps behavior local and avoids subtle ownership
+bugs.
+
+Do not fold across volatile memory. `CfreeCgMemAccess.flags` already carries
+`CFREE_CG_MEM_VOLATILE`; the local shadow and vstack folds must respect it.
+
+Do not depend on C signed-overflow undefined behavior. The CG API integer types
+are width-only, and the existing frontend currently passes `CFREE_CG_INTOP_NONE`
+for ordinary integer arithmetic. The fold should preserve the backend's
+current wrap-like behavior unless flags explicitly request something else.
+
+## Implementation Phases
+
+### Phase 1: Red Tests For Literal Folding
+
+Done. The CG API test now covers O1 literal fold shapes for:
+
+- direct API function returning `40 + 2`;
+- direct API function returning `(5 << 3) | 2`;
+- direct API compare through `40 + 2 == 42`;
+- direct API unary folding through `~~41 + 1`.
+
+Expected shape should check for fewer emitted arithmetic operations where the
+existing test harness can observe it. Runtime correctness alone is not enough.
+
+### Phase 2: Immediate-Only Fold
+
+Done. The fold helpers are wired into:
+
+- `api_cg_binop`;
+- `api_cg_unop`;
+- `api_cg_cmp`;
+- `cfree_cg_ret`.
+
+Validation run:
+
+```sh
+make test-cg-api test-opt
+CFREE_TEST_ALLOW_SKIP=1 make test-parse
+```
+
+Final observed parse result:
+
+```text
+2496 pass, 0 fail, 4 skip
+```
+
+### Phase 3: Delayed Arithmetic
+
+Add `SV_ARITH`, operation-kind metadata, optional second-operand handling,
+ownership helpers, materialization, and release logic.
+
+Add tests for:
+
+- one delayed unary arithmetic value consumed by return;
+- one delayed binary arithmetic value consumed by return;
+- delayed arithmetic consumed by compare;
+- delayed arithmetic consumed by another binop with immediate folding;
+- delayed unary arithmetic consumed by another unop with chain folding;
+- delayed arithmetic forced by store/call path;
+- register pressure path materializing delayed arithmetic when no ordinary
+ victim is available.
+
+Run:
+
+```sh
+make test-cg-api
+make test-opt
+```
+
+### Phase 4: Local Constant Shadow
+
+Add local shadow state to `ApiSourceLocal` and invalidation helpers:
+
+```c
+static void api_local_const_clear(ApiSourceLocal*);
+static void api_local_const_clear_all(CfreeCg*);
+static int api_local_const_can_track(CfreeCg*, const ApiSourceLocal*,
+ CfreeCgMemAccess);
+static void api_local_const_store(CfreeCg*, CfreeCgLocal, CfreeCgTypeId, i64);
+static int api_local_const_load(CfreeCg*, CfreeCgLocal, CfreeCgTypeId,
+ Operand* out);
+```
+
+Wire store/load behavior through `cfree_cg_store` and `cfree_cg_load`.
+
+Wire invalidation through:
+
+- `cfree_cg_label_place`;
+- `cfree_cg_jump`;
+- `api_branch_if`;
+- `cfree_cg_switch`;
+- call paths;
+- inline asm path;
+- atomics/fences;
+- indirect stores;
+- local address-taking.
+
+Add tests for:
+
+- `int x = 40; x = x + 2; return x;`;
+- shadow cleared by label/jump;
+- shadow cleared by branch;
+- shadow cleared by address-taking;
+- shadow cleared by volatile access;
+- shadow cleared by indirect store;
+- parameter locals are initially unknown.
+
+Run:
+
+```sh
+make test-cg-api
+make test-opt
+make test-parse
+```
+
+If the change affects emitted target code shape, also run one targeted smoke
+case per supported O1 target:
+
+```sh
+make test-smoke-x64
+make test-aa64-inline
+```
+
+and an RV64 targeted case if available locally.
+
+### Phase 5: Disassembly And Metrics
+
+Re-run the probe cases from `doc/OPT1.md`:
+
+- `6_5_17_compound_assign`;
+- a direct literal return case;
+- a compare/branch literal case;
+- one local-address-taken case.
+
+For each of AArch64, x64, and RV64, compare:
+
+- `.text` size;
+- instruction count;
+- `mov`/`load_imm` count;
+- arithmetic instruction count;
+- O1 wall time;
+- live-range time;
+- regalloc time;
+- spills/reloads;
+- rewrite inserted instruction count.
+
+Update `doc/PERF.md` only after the implementation lands and numbers are real.
+
+## Expected Results
+
+For direct literal arithmetic:
+
+```c
+int test_main(void) { return 40 + 2; }
+```
+
+O1 should emit only return-value materialization for `42`, with no separate add.
+
+For scalar locals:
+
+```c
+int test_main(void) {
+ int x = 40;
+ x = x + 2;
+ return x;
+}
+```
+
+O1 should avoid the load/add/store/reload chain in the straight-line case. The
+best expected shape is return-value materialization for `42`, unless debug or
+local-storage requirements intentionally force a store.
+
+For address-taken locals:
+
+```c
+int test_main(void) {
+ int x = 40;
+ int* p = &x;
+ *p = *p + 2;
+ return x;
+}
+```
+
+The local shadow must not fold through the pointer access. Correctness is more
+important than preserving the constant.
+
+## Risks
+
+- Register ownership bugs in delayed arithmetic are the main implementation
+ risk. Reuse the compare ownership pattern and add tests where delayed
+ arithmetic values are forced, duplicated, dropped, and spilled.
+- Local shadow invalidation can become unsound if it is treated as a dataflow
+ fact. Keep it basic-block-local and clear aggressively.
+- Signed arithmetic folding can diverge from backend behavior if it uses host
+ signed overflow. Use unsigned operations plus width masking.
+- Return-path changes can accidentally bypass ABI requirements for aggregate
+ returns. Only preserve immediates for non-aggregate scalar returns.
+- Debug-info expectations for locals may eventually require materialized
+ storage even when constant-shadowed. Do not remove local declarations or
+ frame-slot allocation in this pass.
+
+## Acceptance Criteria
+
+- Immediate-only integer expressions fold before backend allocation.
+- Delayed compare-branch behavior remains covered and unchanged.
+- Delayed arithmetic materializes correctly in all consumers.
+- Straight-line scalar-local constants fold in the common `x = 40; x += 2`
+ shape.
+- Shadowed constants are cleared across control-flow, volatile, indirect
+ memory, call, asm, atomic, and address-taking boundaries.
+- Targeted tests pass for CG API, O1 opt, and parser coverage.
+- Disassembly shows fewer arithmetic/materialization instructions in the probe
+ cases without introducing target-specific vstack logic.
diff --git a/doc/OPT1.md b/doc/OPT1.md
@@ -273,28 +273,3 @@ fast tier:
Existing one-use copy/immediate/convert folds should stay conservative. New
folds must be gated by target operand legality rather than assuming every
backend accepts the same IR operand forms.
-
-## Todo: Compile-Time And Validation
-
-- Decide whether O1 can avoid the second block-liveness pass after DDE.
- The current duplicated liveness run is simple and no longer the dominant
- bucket, but DDE could eventually return enough local update information to
- reuse or cheaply repair liveness for regalloc.
-
-- Keep interval occupancy as the O1 allocator baseline.
- Do not reintroduce point-index bitmap row OR/mark loops in O1. O2 can build
- richer interval/event state for splitting if needed.
-
-- Add full CG corpus O1 validation for x64 and RV64.
- The focused opt suite covers all three main targets, but full corpus O1 runs
- should be part of release confidence for x64 and RV64.
-
-- Add stable targeted dumps for allocator and rewrite regressions.
- Metrics catch scaling regressions, but compact dumps for live ranges,
- assignments, and rewrite insertions would make code-quality changes easier to
- review.
-
-- Keep `doc/PERF.md` current after each O1 code-quality pass.
- The most useful before/after numbers are `.text` size, instruction counts by
- mnemonic, O1 wall time, live-range time, regalloc time, spills, reloads, and
- rewrite inserted instruction counts.
diff --git a/src/api/cg.c b/src/api/cg.c
@@ -1617,9 +1617,11 @@ static int api_materialize_cmp_victim(CfreeCg *g, u8 cls) {
Operand dst;
if (sv->kind != SV_CMP || sv->pinned)
continue;
- if (sv->cmp_a.kind == OPK_REG && sv->cmp_a.cls == RC_INT) {
+ if (sv->cmp_a_owned && sv->cmp_a.kind == OPK_REG &&
+ sv->cmp_a.cls == RC_INT) {
dst = api_op_reg(sv->cmp_a.v.reg, api_sv_type(sv));
- } else if (sv->cmp_b.kind == OPK_REG && sv->cmp_b.cls == RC_INT) {
+ } else if (sv->cmp_b_owned && sv->cmp_b.kind == OPK_REG &&
+ sv->cmp_b.cls == RC_INT) {
dst = api_op_reg(sv->cmp_b.v.reg, api_sv_type(sv));
} else {
continue;
@@ -1673,9 +1675,11 @@ static void api_ensure_reg(CfreeCg *g, ApiSValue *sv) {
if (sv->kind == SV_CMP) {
CfreeCgTypeId ty = api_sv_type(sv);
Operand dst;
- if (sv->cmp_a.kind == OPK_REG && sv->cmp_a.cls == RC_INT) {
+ if (sv->cmp_a_owned && sv->cmp_a.kind == OPK_REG &&
+ sv->cmp_a.cls == RC_INT) {
dst = api_op_reg(sv->cmp_a.v.reg, ty);
- } else if (sv->cmp_b.kind == OPK_REG && sv->cmp_b.cls == RC_INT) {
+ } else if (sv->cmp_b_owned && sv->cmp_b.kind == OPK_REG &&
+ sv->cmp_b.cls == RC_INT) {
dst = api_op_reg(sv->cmp_b.v.reg, ty);
} else {
Reg r =
@@ -1957,6 +1961,204 @@ static AsmDir api_map_asm_dir(uint8_t dir) {
return ASM_IN;
}
+/* ---- immediate integer folding ---- */
+
+static u32 api_int_like_width(Compiler *c, CfreeCgTypeId id) {
+ const CgType *ty = cg_type_get(c, id);
+ if (!ty)
+ return 0;
+ if (ty->kind == CFREE_CG_TYPE_ALIAS)
+ return api_int_like_width(c, ty->alias.base);
+ if (ty->kind == CFREE_CG_TYPE_INT || ty->kind == CFREE_CG_TYPE_BOOL)
+ return ty->integer.width;
+ if (ty->kind == CFREE_CG_TYPE_ENUM)
+ return (u32)(ty->size * 8u);
+ if (ty->kind == CFREE_CG_TYPE_PTR)
+ return (u32)(ty->size * 8u);
+ return 0;
+}
+
+static int api_type_is_bool(Compiler *c, CfreeCgTypeId id) {
+ const CgType *ty = cg_type_get(c, id);
+ if (!ty)
+ return 0;
+ if (ty->kind == CFREE_CG_TYPE_ALIAS)
+ return api_type_is_bool(c, ty->alias.base);
+ return ty->kind == CFREE_CG_TYPE_BOOL;
+}
+
+static u64 api_width_mask(u32 width) {
+ if (width >= 64)
+ return UINT64_MAX;
+ return (1ull << width) - 1ull;
+}
+
+static u64 api_mask_width(u64 v, u32 width) {
+ return v & api_width_mask(width);
+}
+
+static i64 api_sign_extend_width(u64 v, u32 width) {
+ v = api_mask_width(v, width);
+ if (width >= 64)
+ return (i64)v;
+ u64 sign = 1ull << (width - 1u);
+ return (i64)((v ^ sign) - sign);
+}
+
+static int api_foldable_int_like_type(Compiler *c, CfreeCgTypeId ty,
+ u32 *width_out) {
+ u32 width = api_int_like_width(c, ty);
+ if (!width || width > 64)
+ return 0;
+ *width_out = width;
+ return 1;
+}
+
+static int api_foldable_int_type(Compiler *c, CfreeCgTypeId ty,
+ u32 *width_out) {
+ if (!cg_type_is_int(c, ty))
+ return 0;
+ return api_foldable_int_like_type(c, ty, width_out);
+}
+
+static i64 api_fold_result(Compiler *c, CfreeCgTypeId ty, u64 v, u32 width) {
+ v = api_mask_width(v, width);
+ if (api_type_is_bool(c, ty))
+ v = v != 0;
+ return (i64)v;
+}
+
+static int api_try_fold_int_binop(CfreeCg *g, BinOp op, CfreeCgTypeId ty,
+ i64 a, i64 b, i64 *out) {
+ u32 width;
+ u64 ua, ub, r;
+ if (!g || !out || !api_foldable_int_type(g->c, ty, &width))
+ return 0;
+ ua = api_mask_width((u64)a, width);
+ ub = api_mask_width((u64)b, width);
+ r = 0;
+ switch (op) {
+ case BO_IADD:
+ r = ua + ub;
+ break;
+ case BO_ISUB:
+ r = ua - ub;
+ break;
+ case BO_IMUL:
+ r = ua * ub;
+ break;
+ case BO_AND:
+ r = ua & ub;
+ break;
+ case BO_OR:
+ r = ua | ub;
+ break;
+ case BO_XOR:
+ r = ua ^ ub;
+ break;
+ case BO_SHL: {
+ u32 sh = (u32)(ub & (u64)(width - 1u));
+ r = ua << sh;
+ break;
+ }
+ case BO_SHR_U: {
+ u32 sh = (u32)(ub & (u64)(width - 1u));
+ r = ua >> sh;
+ break;
+ }
+ case BO_SHR_S: {
+ u32 sh = (u32)(ub & (u64)(width - 1u));
+ if (!sh) {
+ r = ua;
+ } else {
+ u64 sign = 1ull << (width - 1u);
+ r = ua >> sh;
+ if (ua & sign)
+ r |= api_width_mask(width) << (width - sh);
+ }
+ break;
+ }
+ default:
+ return 0;
+ }
+ *out = api_fold_result(g->c, ty, r, width);
+ return 1;
+}
+
+static int api_try_fold_int_unop(CfreeCg *g, UnOp op, CfreeCgTypeId ty, i64 a,
+ i64 *out) {
+ u32 width;
+ u64 ua, r;
+ if (!g || !out || !api_foldable_int_type(g->c, ty, &width))
+ return 0;
+ ua = api_mask_width((u64)a, width);
+ switch (op) {
+ case UO_NEG:
+ r = 0u - ua;
+ break;
+ case UO_NOT:
+ r = ua == 0;
+ break;
+ case UO_BNOT:
+ r = ~ua;
+ break;
+ default:
+ return 0;
+ }
+ *out = api_fold_result(g->c, ty, r, width);
+ return 1;
+}
+
+static int api_try_fold_int_cmp(CfreeCg *g, CmpOp op, CfreeCgTypeId ty, i64 a,
+ i64 b, i64 *out) {
+ u32 width;
+ u64 ua, ub;
+ i64 sa, sb;
+ int r;
+ if (!g || !out || !api_foldable_int_like_type(g->c, ty, &width))
+ return 0;
+ ua = api_mask_width((u64)a, width);
+ ub = api_mask_width((u64)b, width);
+ sa = api_sign_extend_width(ua, width);
+ sb = api_sign_extend_width(ub, width);
+ switch (op) {
+ case CMP_EQ:
+ r = ua == ub;
+ break;
+ case CMP_NE:
+ r = ua != ub;
+ break;
+ case CMP_LT_S:
+ r = sa < sb;
+ break;
+ case CMP_LE_S:
+ r = sa <= sb;
+ break;
+ case CMP_GT_S:
+ r = sa > sb;
+ break;
+ case CMP_GE_S:
+ r = sa >= sb;
+ break;
+ case CMP_LT_U:
+ r = ua < ub;
+ break;
+ case CMP_LE_U:
+ r = ua <= ub;
+ break;
+ case CMP_GT_U:
+ r = ua > ub;
+ break;
+ case CMP_GE_U:
+ r = ua >= ub;
+ break;
+ default:
+ return 0;
+ }
+ *out = r ? 1 : 0;
+ return 1;
+}
+
/* ---- C-symbol mangling ---- */
static SymBind api_map_bind(CfreeSymBind b) {
@@ -2924,13 +3126,14 @@ void cfree_cg_rot3(CfreeCg *g) {
* Arithmetic / compare / convert
* ============================================================ */
-static void api_cg_binop(CfreeCg *g, BinOp iop) {
+static void api_cg_binop(CfreeCg *g, BinOp iop, u32 flags) {
ApiSValue b, a;
CGTarget *T;
CfreeCgTypeId ty;
Operand ra, rb;
Reg rr;
Operand dst;
+ i64 folded;
if (!g)
return;
T = g->target;
@@ -2938,6 +3141,14 @@ static void api_cg_binop(CfreeCg *g, BinOp iop) {
a = api_pop(g);
ty = a.type ? a.type : b.type;
+ if (!flags && api_sv_op_is(&a, OPK_IMM) && api_sv_op_is(&b, OPK_IMM) &&
+ api_try_fold_int_binop(g, iop, ty, a.op.v.imm, b.op.v.imm, &folded)) {
+ api_release(g, &a);
+ api_release(g, &b);
+ api_push(g, api_make_sv(api_op_imm(folded, ty), ty));
+ return;
+ }
+
ra = api_force_reg_unless_imm(g, &a, ty);
rb = api_force_reg_unless_imm(g, &b, ty);
rr = api_alloc_reg_or_spill(g, api_type_class(ty), ty);
@@ -2948,19 +3159,27 @@ static void api_cg_binop(CfreeCg *g, BinOp iop) {
api_push(g, api_make_sv(dst, ty));
}
-static void api_cg_unop(CfreeCg *g, UnOp iop) {
+static void api_cg_unop(CfreeCg *g, UnOp iop, u32 flags) {
ApiSValue a;
CGTarget *T;
CfreeCgTypeId ty;
Operand ra;
Reg rr;
Operand dst;
+ i64 folded;
if (!g)
return;
T = g->target;
a = api_pop(g);
ty = a.type ? a.type : a.op.type;
+ if (!flags && api_sv_op_is(&a, OPK_IMM) &&
+ api_try_fold_int_unop(g, iop, ty, a.op.v.imm, &folded)) {
+ api_release(g, &a);
+ api_push(g, api_make_sv(api_op_imm(folded, ty), ty));
+ return;
+ }
+
ra = api_force_reg_unless_imm(g, &a, ty);
rr = api_alloc_reg_or_spill(g, api_type_class(ty), ty);
dst = api_op_reg(rr, ty);
@@ -2977,6 +3196,7 @@ static void api_cg_cmp(CfreeCg *g, CmpOp cop) {
Operand ra, rb;
Reg rr;
Operand dst;
+ i64 folded;
if (!g)
return;
T = g->target;
@@ -2985,6 +3205,14 @@ static void api_cg_cmp(CfreeCg *g, CmpOp cop) {
opty = a.type ? a.type : b.type;
i32 = builtin_id(CFREE_CG_BUILTIN_I32);
+ if (api_sv_op_is(&a, OPK_IMM) && api_sv_op_is(&b, OPK_IMM) &&
+ api_try_fold_int_cmp(g, cop, opty, a.op.v.imm, b.op.v.imm, &folded)) {
+ api_release(g, &a);
+ api_release(g, &b);
+ api_push(g, api_make_sv(api_op_imm(folded, i32), i32));
+ return;
+ }
+
ra = api_force_reg_unless_imm(g, &a, opty);
rb = api_force_reg_unless_imm(g, &b, opty);
if (api_type_class(opty) != RC_FP) {
@@ -3040,13 +3268,11 @@ static void api_cg_convert_kind(CfreeCg *g, CfreeCgTypeId dst_type,
}
void cfree_cg_int_binop(CfreeCg *g, CfreeCgIntBinOp op, uint32_t flags) {
- (void)flags;
- api_cg_binop(g, api_map_int_binop(op));
+ api_cg_binop(g, api_map_int_binop(op), flags);
}
void cfree_cg_int_unop(CfreeCg *g, CfreeCgIntUnOp op, uint32_t flags) {
- (void)flags;
- api_cg_unop(g, api_map_int_unop(op));
+ api_cg_unop(g, api_map_int_unop(op), flags);
}
void cfree_cg_int_cmp(CfreeCg *g, CfreeCgIntCmpOp op) {
@@ -3059,13 +3285,13 @@ void cfree_cg_fp_binop(CfreeCg *g, CfreeCgFpBinOp op, uint32_t flags) {
compiler_panic(g->c, g->cur_loc, "CfreeCg: FP remainder is unsupported");
return;
}
- api_cg_binop(g, api_map_fp_binop(op));
+ api_cg_binop(g, api_map_fp_binop(op), 0);
}
void cfree_cg_fp_unop(CfreeCg *g, CfreeCgFpUnOp op, uint32_t flags) {
(void)flags;
(void)op;
- api_cg_unop(g, UO_NEG);
+ api_cg_unop(g, UO_NEG, 0);
}
void cfree_cg_fp_cmp(CfreeCg *g, CfreeCgFpCmpOp op) {
@@ -4755,6 +4981,14 @@ void cfree_cg_ret(CfreeCg *g) {
T->ret(T, &av);
return;
}
+ if (api_sv_op_is(&v, OPK_IMM)) {
+ ret_op = v.op;
+ ret_op.type = rty;
+ av.storage = ret_op;
+ T->ret(T, &av);
+ api_release(g, &v);
+ return;
+ }
ret_op = api_force_reg(g, &v, rty);
av.storage = ret_op;
T->ret(T, &av);
diff --git a/test/api/cg_type_test.c b/test/api/cg_type_test.c
@@ -259,6 +259,100 @@ static void exercise_cg_late_local_addr(CfreeCompiler* c, CfreeCgTypeId i32_ty,
obj_free((ObjBuilder*)ob);
}
+static uint32_t text_size(const ObjBuilder* ob) {
+ uint32_t total = 0;
+ uint32_t n = obj_section_count(ob);
+ for (uint32_t i = 0; i < n; ++i) {
+ const Section* sec = obj_section_get(ob, i);
+ if (sec && sec->kind == SEC_TEXT) total += sec->bytes.total;
+ }
+ return total;
+}
+
+static CfreeCgSym begin_i32_func(CfreeCompiler* c, CfreeCg* cg,
+ CfreeCgTypeId i32_ty, const char* name) {
+ CfreeCgFuncSig sig;
+ CfreeCgDecl decl;
+ memset(&sig, 0, sizeof sig);
+ sig.ret = i32_ty;
+ sig.call_conv = CFREE_CG_CC_TARGET_C;
+
+ memset(&decl, 0, sizeof decl);
+ decl.kind = CFREE_CG_DECL_FUNC;
+ decl.linkage_name = cfree_sym_intern(c, name);
+ decl.display_name = decl.linkage_name;
+ decl.type = cfree_cg_type_func(c, sig);
+ decl.sym.bind = CFREE_SB_GLOBAL;
+ decl.sym.visibility = CFREE_CG_VIS_DEFAULT;
+ return cfree_cg_decl(cg, decl);
+}
+
+static void exercise_cg_literal_folds(CfreeCompiler* c, CfreeCgTypeId i32_ty) {
+ static const char* names[] = {
+ "cg_fold_add_o1",
+ "cg_fold_shift_or_o1",
+ "cg_fold_cmp_o1",
+ "cg_fold_unop_o1",
+ };
+ CfreeCompileOptions opts;
+ CfreeObjBuilder* ob;
+ CfreeCg* cg;
+
+ memset(&opts, 0, sizeof opts);
+ opts.opt_level = 1;
+ ob = (CfreeObjBuilder*)obj_new((Compiler*)c);
+ EXPECT(ob != NULL, "literal fold obj builder allocation failed");
+ if (!ob) return;
+ cg = cfree_cg_new(c, ob, &opts);
+ EXPECT(cg != NULL, "literal fold cg allocation failed");
+ if (!cg) {
+ obj_free((ObjBuilder*)ob);
+ return;
+ }
+
+ for (uint32_t i = 0; i < 4; ++i) {
+ CfreeCgSym sym = begin_i32_func(c, cg, i32_ty, names[i]);
+ EXPECT(sym != CFREE_CG_SYM_NONE, "literal fold decl failed");
+ cfree_cg_func_begin(cg, sym);
+ switch (i) {
+ case 0:
+ cfree_cg_push_int(cg, 40, i32_ty);
+ cfree_cg_push_int(cg, 2, i32_ty);
+ cfree_cg_int_binop(cg, CFREE_CG_INT_ADD, 0);
+ break;
+ case 1:
+ cfree_cg_push_int(cg, 5, i32_ty);
+ cfree_cg_push_int(cg, 3, i32_ty);
+ cfree_cg_int_binop(cg, CFREE_CG_INT_SHL, 0);
+ cfree_cg_push_int(cg, 2, i32_ty);
+ cfree_cg_int_binop(cg, CFREE_CG_INT_OR, 0);
+ break;
+ case 2:
+ cfree_cg_push_int(cg, 40, i32_ty);
+ cfree_cg_push_int(cg, 2, i32_ty);
+ cfree_cg_int_binop(cg, CFREE_CG_INT_ADD, 0);
+ cfree_cg_push_int(cg, 42, i32_ty);
+ cfree_cg_int_cmp(cg, CFREE_CG_INT_EQ);
+ break;
+ case 3:
+ cfree_cg_push_int(cg, 41, i32_ty);
+ cfree_cg_int_unop(cg, CFREE_CG_INT_BNOT, 0);
+ cfree_cg_int_unop(cg, CFREE_CG_INT_BNOT, 0);
+ cfree_cg_push_int(cg, 1, i32_ty);
+ cfree_cg_int_binop(cg, CFREE_CG_INT_ADD, 0);
+ break;
+ }
+ cfree_cg_ret(cg);
+ cfree_cg_func_end(cg);
+ }
+
+ cfree_cg_free(cg);
+ EXPECT(text_size((ObjBuilder*)ob) <= 128,
+ "literal folds should avoid arithmetic materialization, text size=%u",
+ text_size((ObjBuilder*)ob));
+ obj_free((ObjBuilder*)ob);
+}
+
int main(void) {
CfreeTarget target;
CfreeEnv env;
@@ -417,6 +511,7 @@ int main(void) {
exercise_cg_scalar_local(c, i32_ty, 1);
exercise_cg_late_local_addr(c, i32_ty, 0);
exercise_cg_late_local_addr(c, i32_ty, 1);
+ exercise_cg_literal_folds(c, i32_ty);
cfree_compiler_free(c);
return g_fail ? 1 : 0;