commit 0b6a8f1ea1f601f5ab95420d780170f57d6f7ee7
parent f639d9174f8a364814dc16e21a1bd1ffa3634902
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sun, 17 May 2026 17:05:19 -0700
Complete O1 codegen cleanup
Diffstat:
16 files changed, 502 insertions(+), 331 deletions(-)
diff --git a/doc/OPT1.md b/doc/OPT1.md
@@ -8,7 +8,8 @@ cleanup, and emits through the target backend.
This document has two jobs:
1. document the implemented O1 pipeline; and
-2. track the remaining work needed to clean up O1 generated code.
+2. record the completed O1 code-shape cleanup and remaining performance
+ watchpoints.
Compile-time measurements and scaling notes live in `doc/PERF.md`. Broader
optimizer direction lives in `doc/OPT.md`.
@@ -202,311 +203,86 @@ Current progress:
- Address-taken locals still go through memory, as expected for current O1.
- The old prologue NOP patch path is gone; known-frame entry is used where the
target supports it.
+- Empty leaf functions omit avoidable frame setup and return directly.
+- Scalar return producers can be retargeted into ABI return registers, removing
+ target-level return self-copies.
+- Unused scalar parameters consume ABI locations without materializing moves,
+ and used stack-passed scalar parameters load directly from the incoming stack
+ area.
Representative AArch64 `const_local` output:
```asm
-sub sp, sp, #0x10
-stp x29, x30, [sp]
-mov x29, sp
mov w0, #0x2a
-b .Lepilogue
-ldp x29, x30, [sp]
-add sp, sp, #0x10
ret
```
-Representative AArch64 `scalar_add` output still contains redundant copies:
+Representative AArch64 `scalar_add` output:
```asm
-mov w2, w0
-mov w0, w1
-add w2, w2, w0
-mov w0, w2
-mov w0, w0
-b .Lepilogue
+add w0, w0, w1
+ret
+```
+
+A ninth-argument AArch64 scalar probe now loads directly from the incoming
+stack area in a frameless leaf:
+
+```asm
+ldur w0, [sp]
+add w0, w0, #1
+ret
+```
+
+## Completed O1 Codegen Work
+
+The O1 cleanup list is now implemented for the current backend path:
+
+- Leaf functions can omit the frame on AArch64, x64, and RV64 when the known
+ frame is empty, the function has no calls or dynamic stack allocation, and the
+ backend has no callee-saved state to preserve. Returns in such functions emit
+ the target return instruction directly instead of branching to an adjacent
+ epilogue.
+- Post-RA cleanup removes physical self-copies, collapses short copy chains,
+ retargets safe single-use producers, and retargets adjacent scalar return
+ producers into ABI return registers when target metadata allows it.
+- Target physical-register metadata exposes caller-saved, callee-saved,
+ argument, return, reserved, and scratch policy. O1 prefers caller-saved
+ registers for non-call-crossing values, uses callee-saved registers for
+ call-crossing values when profitable, and ties non-call-crossing scalar
+ parameters to their incoming ABI registers when legal.
+- Non-address-taken scalar parameters remain register-backed. Unused
+ register-backed parameters consume their ABI locations without materializing
+ pointless moves, and stack-passed scalar parameters are loaded directly from
+ the incoming stack area. Frameless functions use `sp`/`rsp` for those loads;
+ framed functions use the target frame pointer as before.
+- Planned call replay is the normal O1 call path for supported target plans.
+ Replay uses a local parallel-move resolver for register moves, stack argument
+ stores, return extraction, cycles, and indirect callees that overlap argument
+ destinations.
+- Branch cleanup remains intentionally local: branch-target forwarding,
+ compare-branch inversion for local jump-block removal, unreachable pruning via
+ CFG, and physical fallthrough branch deletion.
+
+Representative AArch64 output after these slices:
+
+```asm
+f:
+ mov w0, #0x2a
+ ret
+
+g:
+ add w0, w0, w1
+ ret
+
+h: // ninth integer argument
+ ldur w0, [sp]
+ add w0, w0, #1
+ ret
```
-A parameter-heavy 32-argument AArch64 probe shows the main remaining shape
-problem: stack arguments are copied into frame slots, many callee-saved
-registers are used and therefore saved/restored, and returns still branch to an
-immediately following epilogue block.
-
-## Remaining O1 Codegen Work
-
-These are code-shape issues, not known correctness issues.
-
-1. Leaf-frame and epilogue cleanup.
- Leaf functions such as `const_local` still get a frame and save/restore
- `x29/x30`. O1 should let targets omit frames when the function has no calls,
- no dynamic stack, no required frame-address use, and no spills/locals that
- require a frame. Return-to-immediate-epilogue branches should also be removed
- or emitted as direct fallthrough where legal.
-
-2. Redundant physical-copy cleanup.
- Copies like `mov w0, w0`, return-register self copies, and copy chains around
- simple arithmetic are still visible. Extend post-RA combine/DCE to remove
- more physical self-copies and to coalesce simple return/argument copies
- without requiring a full O2 coalescer.
-
-3. AArch64 caller/callee register choice for leaf and small functions.
- Current AArch64 O1 allocable integer registers are callee-saved
- `x19-x28`. This is conservative and correct, but a leaf function that uses
- many of them pays save/restore traffic. A targeted O1 improvement is to make
- caller-saved temporaries available when doing so does not create call
- preservation work, especially in leaf functions.
-
-4. Stack-parameter handling.
- Parameter-heavy functions copy stack arguments into local frame slots before
- use. For non-address-taken scalar parameters, O1 should prefer using the
- incoming stack location directly or loading it once into an assigned hard
- register when profitable, instead of manufacturing extra frame traffic.
-
-5. Planned/parallel call argument setup.
- O1 records call plans before allocation, but final backend call emission is
- still largely through existing sequential call emitters. Moving O1 call
- emission closer to planned/parallel argument setup should reduce copies,
- stack shuffling, and conservative register choices around calls.
-
-6. Keep branch cleanup local.
- O1 should keep only cheap CFG/layout cleanup. Global block ordering and
- heavier branch layout belong in O2.
-
-## Implementation Plan
-
-The remaining O1 work should land as small vertical slices. Each slice should
-start with a focused `test/opt` unit or small smoke case that exposes the bad
-shape, then make the narrowest backend/opt change needed for that shape. Avoid
-SSA, global block layout, dense conflict data, and whole-function coalescing in
-these steps.
-
-### 1. Leaf Frames And Return Fallthrough
-
-Goal: make trivial leaf functions emit without an avoidable frame, callee-save
-traffic, or a branch to the immediately following epilogue.
-
-Implementation:
-
-- Add an O1 function summary in `src/opt/opt.c` before `opt_emit`: has calls,
- has `IR_ALLOCA`, has frame-address-required storage, has non-empty frame
- slots after known-frame collection, has outgoing stack args, and has used
- callee-saved registers after rewrite.
-- Extend `CGKnownFrameDesc` or add a sibling target hook bit so
- `func_begin_known_frame` can receive "frame may be omitted" without each
- backend rediscovering O1 state. The backend remains authoritative because
- CFI, platform ABI rules, and frame-pointer policy are target-owned.
-- Teach AArch64 first, then x64/RV64, to omit the frame only when the known
- frame is empty, the function is leaf, no dynamic stack exists, no outgoing
- call area exists, and no callee-saved register needs preservation.
-- Keep `reserve_hard_regs` authoritative. If post-RA code still mentions a
- callee-saved register, the target must preserve it even for a leaf.
-- Remove return-to-next-epilogue branches in the layout stage. Either let
- `opt_jump_cleanup(...LAYOUT)` delete an `IR_BR` to the next emitted epilogue
- block, or add a target-local direct-return fast path only when the backend's
- epilogue is empty. Prefer the IR/layout cleanup first because it is target
- neutral.
-
-Tests:
-
-- Add `test/opt` mock coverage for the summary: empty leaf, leaf with spill,
- leaf with alloca, non-leaf, and leaf using a callee-saved hard reg.
-- Add AArch64 smoke/disassembly cases for `return 42;` and one-reg arithmetic:
- no `stp x29, x30`, no stack adjustment, no branch to an adjacent epilogue.
-- Run `make test-opt test-smoke-x64 test-aa64-inline`; add RV64/x64 targeted
- smoke once their frame omission hooks are enabled.
-
-### 2. Post-RA Physical Copy Cleanup
-
-Goal: remove obvious copy noise such as `mov w0, w0`, return-register self
-copies, and short copy chains around a single arithmetic result without adding
-an O2 coalescer.
-
-Implementation:
-
-- Extend `src/opt/pass_lower.c:opt_combine` with a post-RA copy-propagation
- sweep over hard registers. Reuse the existing `HardBlockLive` and local
- single-use machinery; keep it block-local except for already-computed
- live-out safety checks.
-- Add ABI-value helpers that can rewrite a return or planned-call operand from
- a copied hard register to its original source when the source has not been
- clobbered and the copied destination is not live out.
-- Handle these cases first:
- - physical self-copy deletion for all copy-like ops already modeled as
- `IR_COPY`;
- - `copy retreg, src; ret retreg` to `ret src` when the target return emitter
- accepts that operand shape;
- - `producer tmp; copy retreg, tmp; ret retreg` by retargeting the producer to
- `retreg` when `retarget_producer_legal` already says that is safe;
- - adjacent `copy a, b; copy c, a` to `copy c, b` when `a` has no other use.
-- Keep target legality explicit. If an instruction requires same-dst operands
- or cannot write a return register directly, leave it for O2.
-
-Tests:
-
-- Extend `test/opt/opt_test.c` with post-rewrite blocks for each pattern,
- including a negative case where the source is clobbered by a call.
-- Add a small AArch64 scalar-add probe that checks the final disassembly has no
- `mov w0, w0` and fewer register-to-register moves.
-- Run `make test-opt test-cg-api test-toy`.
-
-### 3. Caller-Saved Registers In Leaf And Small Functions
-
-Goal: avoid save/restore traffic caused by preferring AArch64 callee-saved
-registers in leaf and low-pressure functions.
-
-Implementation:
-
-- Keep the backend physical register metadata in `src/arch/*/opt_coord.c` as
- the source of truth. Do not hard-code register classes in the allocator.
-- Split O1 register ordering into a per-function policy in
- `src/opt/pass_lower.c`: leaf/no-call values prefer caller-saved temp regs;
- values live across calls prefer callee-saved regs only when that avoids
- per-call save/restore; tied ABI regs keep their fixed priority.
-- For AArch64, make a caller-saved temp subset visible in the O1 allocation
- order for leaf functions. Avoid `x16/x17` because they are backend scratch
- registers, and continue to respect `opt_reserved_regs`, arg/ret hazards, and
- inline-asm fixed/clobber masks.
-- For non-leaf functions, keep current correctness first: caller-saved
- assignment is allowed only if rewrite will preserve the value across the
- specific call clobber mask.
-- Add metrics for hard-reg class choice if useful for regression tracking:
- caller-saved assignments, callee-saved assignments, and call-preserve inserts.
-
-Tests:
-
-- Extend existing allocator preference tests with AArch64-like metadata:
- leaf temporaries choose caller-saved temps, call-crossing values avoid them
- when a callee-saved reg is available, inline asm fixed regs still win.
-- Add AArch64 smoke for a leaf function with many scalar temps: no
- callee-save pair stores solely due to O1 temporaries.
-- Run `make test-opt test-aa64-inline test-smoke-x64`.
-
-### 4. Stack-Parameter Handling
-
-Goal: stop copying non-address-taken scalar stack parameters into new local
-frame slots before their first real use.
-
-Implementation:
-
-- Add an IR representation for incoming stack parameter storage that is
- distinct from ordinary callee frame slots. The target should be able to
- describe it as an immutable incoming argument address, e.g. base register plus
- ABI stack offset, without allocating `FS_PARAM`.
-- Extend `CGLocalStorage` or add O1-only parameter metadata so
- `w_param` can record direct incoming-stack storage for scalar,
- non-address-taken, non-variadic, direct ABI parameters.
-- Teach AArch64 first because it is the current quality target. `aa_param`
- should be able to bind a register-backed param by loading from the incoming
- stack location, and bind a memory-backed non-address-taken param without
- copying into a local frame slot. Preserve the existing copy behavior for
- address-taken params, aggregate/byval params, variadic save-area needs, and
- debug locations that require a stable local home.
-- Update alias metadata so loads from incoming stack params are not mistaken
- for writable local frame slots. Treat them as parameter memory that may alias
- conservatively with unknown memory unless a target/language fact says more.
-- After AArch64 is green, port the same storage description to x64 and RV64.
-
-Tests:
-
-- Add O1 unit coverage for `w_param`: scalar stack param with no address taken
- does not allocate `FS_PARAM`; address-taken and aggregate params still do.
-- Add AArch64 smoke for a 9+ integer-argument function: stack args are loaded
- directly or once into assigned regs, with no extra callee local slot traffic.
-- Run `make test-opt test-cg-api test-smoke-x64`; add RV64 smoke after the
- RV64 port.
-
-### 5. Planned/Parallel Call Argument Setup
-
-Goal: make O1 replay use target call plans as the normal call-emission route so
-argument setup is parallelized and copy-minimized.
-
-Implementation:
-
-- Promote `IRCallAux.plan_valid/use_plan_replay` from a partial path to the
- default O1 path after `opt_machinize`. Keep the old descriptor replay as a
- fallback for targets without `plan_call` or `emit_call_plan`.
-- Add a small parallel-move resolver in `src/opt/opt.c` replay that lowers the
- planned register moves, stack stores, return moves, and indirect callee
- preservation. It must handle cycles using target scratch registers or a
- temporary spill slot; no VLA and no global resolver state.
-- Keep call-stack sizing tied to `call_stack_size` and the known-frame path so
- outgoing areas are still allocated once per function.
-- Use target `call_clobber_mask` and plan return masks to avoid preserving
- registers that the concrete call cannot clobber.
-- Start with integer direct calls, then FP args/returns, then mixed stack args,
- then indirect calls. Leave varargs and aggregate edge cases on descriptor
- replay until their planned lowering has explicit tests.
-
-Tests:
-
-- Extend mock `plan_call` tests to assert O1 emits planned calls by default and
- preserves an indirect callee only when it conflicts with argument moves.
-- Add cycle tests for argument permutation, including `a -> b, b -> a`.
-- Add cross-target call smoke for direct, indirect, stack-heavy, and mixed
- int/FP calls.
-- Run `make test-opt test-cg-api test-toy test-smoke-x64`.
-
-### 6. Local Branch Cleanup Only
-
-Goal: improve obvious return/branch shape while preserving O1's cheap,
-block-local cleanup policy.
-
-Implementation:
-
-- Keep `src/opt/pass_jump.c` restricted to branch-target forwarding,
- compare-branch inversion, unreachable pruning already owned by CFG, and
- physical fallthrough branch deletion.
-- Add only local return/epilogue cleanup needed by step 1. Do not introduce
- trace layout, global block ordering, or profile-guided branch placement in
- O1.
-- Document any tempting global cleanup in `doc/OPT.md` as O2 work instead of
- expanding O1.
-
-Tests:
-
-- Add branch-cleanup tests for adjacent epilogue fallthrough and negative cases
- where the epilogue is not adjacent or has target-required body.
-- Run `make test-opt`.
-
-### 7. Stack-Heavy Compile-Time Watch
-
-Goal: keep stack-heavy `params` and `spill_argc` shapes close to linear before
-larger inputs become normal benchmarks.
-
-Implementation:
-
-- Instrument the current `used_words` growth source in
- `src/opt/pass_live.c` and `src/opt/pass_lower.c`: distinguish hard-register
- occupancy words, stack-slot occupancy words, live-range point count, and
- parameter/spill slot count.
-- Investigate whether stack slots are being modeled with more interval words
- than needed, especially after stack-parameter improvements remove some
- `FS_PARAM` traffic.
-- Keep the invariant that `opt.conflict_bytes=0` on normal O1. Any fix that
- reintroduces dense pseudo conflicts is a regression, even if it improves one
- microbenchmark.
-
-Tests:
-
-- Add or update a phase0 guardrail that records `opt.alloc.used_loc_words` for
- representative stack-heavy cases without asserting fragile exact values.
-- Refresh `doc/PERF.md` after the stack-parameter and caller-saved-register
- slices land.
-
-### Suggested Order
-
-1. Physical-copy cleanup. It is isolated, improves current AArch64 samples, and
- gives better post-RA liveness coverage for later steps.
-2. Leaf return/fallthrough cleanup. This removes visible branch noise and
- creates the summary data needed for frame omission.
-3. AArch64 caller-saved leaf policy. This directly reduces save/restore traffic
- once leaf frames are eligible for omission.
-4. Stack-parameter handling. This is higher risk because it changes parameter
- storage contracts, so land it per target.
-5. Planned call replay. This has the largest call/ABI surface and should build
- on the copy and register-choice cleanup.
-6. Compile-time stack-heavy investigation and `doc/PERF.md` refresh after the
- generated-code changes have reduced avoidable stack traffic.
+Further improvements such as global block ordering, general coalescing, SSA
+value optimization, and deeper stack-heavy compile-time tuning belong to O2 or
+`doc/PERF.md`, not the O1 cleanup baseline.
## Compile-Time Watchlist
diff --git a/src/arch/aa64/emit.c b/src/arch/aa64/emit.c
@@ -106,6 +106,7 @@ static void aa_func_begin_init(CGTarget *t, const CGFuncDesc *fd) {
a->nscopes = 0;
a->has_alloca = 0;
a->known_frame = 0;
+ a->omit_frame = 0;
a->nadd_patches = 0;
a->sret_ptr_slot = FRAME_SLOT_NONE;
a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0;
@@ -373,6 +374,13 @@ void aa_func_begin_known_frame(CGTarget *t, const CGFuncDesc *fd,
u32 n_int_saves = collect_mask_regs(a->used_cs_int_mask, 19u, 28u, int_regs);
u32 n_fp_saves = collect_mask_regs(a->used_cs_fp_mask, 8u, 15u, fp_regs);
+ if (frame && frame->may_omit_frame && frame->nslots == 0 &&
+ frame->max_outgoing == 0 && !frame->has_alloca && !frame->has_call &&
+ !a->has_sret && !a->is_variadic && n_int_saves == 0 &&
+ n_fp_saves == 0) {
+ a->omit_frame = 1;
+ return;
+ }
aa_compute_frame(a, n_int_saves, n_fp_saves, &int_save_off, &fp_save_off,
&fp_lr_off, &frame_size);
@@ -400,6 +408,8 @@ void aa_func_end(CGTarget *t) {
aa_compute_frame(a, n_int_saves, n_fp_saves, &int_save_off, &fp_save_off,
&fp_lr_off, &frame_size);
+ if (a->omit_frame) goto finish;
+
mc->label_place(mc, a->epilogue_label);
if (a->has_alloca) {
@@ -455,6 +465,8 @@ void aa_func_end(CGTarget *t) {
aa64_patch32(obj, sec, a->add_patches[i].pos, word);
}
+finish:
+ ;
u32 end = mc->pos(mc);
obj_symbol_define(obj, a->fd->sym, sec, (u64)a->func_start,
(u64)(end - a->func_start));
@@ -500,6 +512,32 @@ FrameSlot aa_frame_slot(CGTarget *t, const FrameSlotDesc *d) {
* Parameters
* ============================================================ */
+/* Advance the incoming-parameter cursors past `ai` without emitting code.
+ * An indirect argument, and each INT part of any other argument, takes the
+ * next integer register while fewer than 8 are used; FP parts do the same
+ * with the FP counter. Otherwise 8 bytes of incoming stack are consumed. */
+static void aa_consume_param_location(AAImpl *a, const ABIArgInfo *ai) {
+  if (!ai) return;
+  if (ai->kind == ABI_ARG_IGNORE) return;
+  if (ai->kind == ABI_ARG_INDIRECT) {
+    /* Counted as exactly one integer location; parts are not walked. */
+    if (a->next_param_int >= 8) a->next_param_stack += 8;
+    else ++a->next_param_int;
+    return;
+  }
+  /* Parts whose class is neither INT nor FP leave every cursor untouched. */
+  const ABIArgPart *part = ai->parts;
+  for (u16 left = ai->nparts; left > 0; --left, ++part) {
+    if (part->cls == ABI_CLASS_INT) {
+      if (a->next_param_int >= 8) a->next_param_stack += 8;
+      else ++a->next_param_int;
+    } else if (part->cls == ABI_CLASS_FP) {
+      if (a->next_param_fp >= 8) a->next_param_stack += 8;
+      else ++a->next_param_fp;
+    }
+  }
+}
+
CGLocalStorage aa_param(CGTarget *t, const CGParamDesc *p) {
AAImpl *a = impl_of(t);
CGLocalStorage st = p->storage;
@@ -521,9 +559,15 @@ CGLocalStorage aa_param(CGTarget *t, const CGParamDesc *p) {
compiler_panic(t->c, a->loc, "aarch64 param: bad slot");
}
const ABIArgInfo *ai = p->abi;
+ u32 incoming_stack_base = a->omit_frame ? 31u : 29u;
+ i32 incoming_stack_bias = a->omit_frame ? 0 : 16;
if (ai->kind == ABI_ARG_IGNORE)
return st;
+ if (st.kind == CG_LOCAL_STORAGE_REG && st.v.reg == (Reg)REG_NONE) {
+ aa_consume_param_location(a, ai);
+ return st;
+ }
if (st.kind == CG_LOCAL_STORAGE_REG) {
if (ai->kind != ABI_ARG_DIRECT || ai->nparts != 1) {
compiler_panic(t->c, a->loc,
@@ -541,8 +585,8 @@ CGLocalStorage aa_param(CGTarget *t, const CGParamDesc *p) {
} else {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
- aa64_emit_ldur_off(t->mc, sidx, dst, 29, (i32)(16 + caller_off),
- AA_TMP0);
+ aa64_emit_ldur_off(t->mc, sidx, dst, incoming_stack_base,
+ incoming_stack_bias + (i32)caller_off, AA_TMP0);
}
} else if (pt->cls == ABI_CLASS_FP) {
u32 dst = reg_num((Operand){.kind = OPK_REG, .v.reg = st.v.reg});
@@ -553,8 +597,8 @@ CGLocalStorage aa_param(CGTarget *t, const CGParamDesc *p) {
} else {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
- aa64_emit_ldur_fp_off(t->mc, sidx, dst, 29,
- (i32)(16 + caller_off), AA_TMP0);
+ aa64_emit_ldur_fp_off(t->mc, sidx, dst, incoming_stack_base,
+ incoming_stack_bias + (i32)caller_off, AA_TMP0);
}
} else {
compiler_panic(t->c, a->loc, "aarch64 param: ABI class %d unimpl",
@@ -569,8 +613,8 @@ CGLocalStorage aa_param(CGTarget *t, const CGParamDesc *p) {
} else {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
- aa64_emit_ldur_off(t->mc, 3, AA_TMP0, 29, (i32)(16 + caller_off),
- AA_TMP0);
+ aa64_emit_ldur_off(t->mc, 3, AA_TMP0, incoming_stack_base,
+ incoming_stack_bias + (i32)caller_off, AA_TMP0);
ptr_reg = AA_TMP0;
}
u32 nbytes = s->size;
@@ -615,8 +659,8 @@ CGLocalStorage aa_param(CGTarget *t, const CGParamDesc *p) {
} else {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
- aa64_emit_ldur_off(t->mc, sidx, AA_TMP0, 29,
- (i32)(16 + caller_off), AA_TMP0);
+ aa64_emit_ldur_off(t->mc, sidx, AA_TMP0, incoming_stack_base,
+ incoming_stack_bias + (i32)caller_off, AA_TMP0);
aa64_emit_stur_off(t->mc, sidx, AA_TMP0, 29,
-(i32)s->off + (i32)part_off, AA_TMP1);
}
@@ -628,8 +672,8 @@ CGLocalStorage aa_param(CGTarget *t, const CGParamDesc *p) {
} else {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
- aa64_emit_ldur_fp_off(t->mc, sidx, AA_FP_TMP0, 29,
- (i32)(16 + caller_off), AA_TMP0);
+ aa64_emit_ldur_fp_off(t->mc, sidx, AA_FP_TMP0, incoming_stack_base,
+ incoming_stack_bias + (i32)caller_off, AA_TMP0);
aa64_emit_stur_fp_off(t->mc, sidx, AA_FP_TMP0, 29,
-(i32)s->off + (i32)part_off, AA_TMP0);
}
diff --git a/src/arch/aa64/internal.h b/src/arch/aa64/internal.h
@@ -234,7 +234,8 @@ typedef struct AAImpl {
u32 prologue_words;
MCLabel epilogue_label;
u8 known_frame;
- u8 pad0[3];
+ u8 omit_frame;
+ u8 pad0[2];
AASlot* slots;
u32 nslots;
diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c
@@ -1257,10 +1257,12 @@ static void aa_ret(CGTarget* t, const CGABIValue* val) {
} else if (val->storage.kind == OPK_REG) {
if (val->storage.cls == RC_FP) {
u32 type = type_is_fp_double(val->storage.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_fmov_reg(type, /*Rd=*/0, reg_num(val->storage)));
+ if (reg_num(val->storage) != 0)
+ aa64_emit32(mc, aa64_fmov_reg(type, /*Rd=*/0, reg_num(val->storage)));
} else {
u32 sf = type_is_64(val->storage.type) ? 1u : 0u;
- aa64_emit32(mc, aa64_mov_reg(sf, /*Rd=*/0, reg_num(val->storage)));
+ if (reg_num(val->storage) != 0)
+ aa64_emit32(mc, aa64_mov_reg(sf, /*Rd=*/0, reg_num(val->storage)));
}
} else if (val->storage.kind == OPK_IMM) {
u32 sf = type_is_64(val->storage.type) ? 1u : 0u;
@@ -1294,6 +1296,10 @@ static void aa_ret(CGTarget* t, const CGABIValue* val) {
}
}
}
+ if (a->omit_frame) {
+ aa64_emit32(mc, aa64_ret(AA64_LR));
+ return;
+ }
u32 bpos = mc->pos(mc);
aa64_emit32(mc, aa64_b_base());
mc->emit_label_ref(mc, a->epilogue_label, R_AARCH64_JUMP26, 4, 0);
diff --git a/src/arch/arch.h b/src/arch/arch.h
@@ -373,7 +373,9 @@ typedef struct CGKnownFrameDesc {
u32 nslots;
u32 max_outgoing;
u8 has_alloca;
- u8 pad[3];
+ u8 has_call;
+ u8 may_omit_frame;
+ u8 pad;
} CGKnownFrameDesc;
typedef enum CGCallFlag {
diff --git a/src/arch/rv64/alloc.c b/src/arch/rv64/alloc.c
@@ -37,6 +37,31 @@ RvSlot* rv64_slot_get(RImpl* a, FrameSlot fs) {
/* ---- param ---- */
+/* Advance the incoming-parameter cursors past `ai` without emitting code.
+ * An indirect argument, and each INT part of any other argument, takes the
+ * next integer register while fewer than 8 are used; FP parts do the same
+ * with the FP counter. Otherwise 8 bytes of incoming stack are consumed. */
+static void rv_consume_param_location(RImpl* a, const ABIArgInfo* ai) {
+  if (!ai) return;
+  if (ai->kind == ABI_ARG_IGNORE) return;
+  if (ai->kind == ABI_ARG_INDIRECT) {
+    if (a->next_param_int >= 8) a->next_param_stack += 8;
+    else ++a->next_param_int;
+    return;
+  }
+  /* Parts whose class is neither INT nor FP leave every cursor untouched. */
+  const ABIArgPart* part = ai->parts;
+  for (u16 left = ai->nparts; left > 0; --left, ++part) {
+    if (part->cls == ABI_CLASS_INT) {
+      if (a->next_param_int >= 8) a->next_param_stack += 8;
+      else ++a->next_param_int;
+    } else if (part->cls == ABI_CLASS_FP) {
+      if (a->next_param_fp >= 8) a->next_param_stack += 8;
+      else ++a->next_param_fp;
+    }
+  }
+}
+
CGLocalStorage rv_param(CGTarget* t, const CGParamDesc* p) {
RImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
@@ -60,9 +85,14 @@ CGLocalStorage rv_param(CGTarget* t, const CGParamDesc* p) {
const ABIArgInfo* ai = p->abi;
/* Caller's stack args start above the saved-s0/ra pair, plus the
* 64-byte variadic save area when this function is variadic. */
- i32 caller_stack_base = 16 + (a->is_variadic ? 64 : 0);
+ u32 incoming_stack_base = a->omit_frame ? RV_SP : RV_S0;
+ i32 caller_stack_base = a->omit_frame ? 0 : 16 + (a->is_variadic ? 64 : 0);
if (ai->kind == ABI_ARG_IGNORE) return st;
+ if (st.kind == CG_LOCAL_STORAGE_REG && st.v.reg == (Reg)REG_NONE) {
+ rv_consume_param_location(a, ai);
+ return st;
+ }
if (st.kind == CG_LOCAL_STORAGE_REG) {
if (ai->kind != ABI_ARG_DIRECT || ai->nparts != 1) {
compiler_panic(t->c, a->loc,
@@ -79,7 +109,7 @@ CGLocalStorage rv_param(CGTarget* t, const CGParamDesc* p) {
} else {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
- rv64_emit32(mc, enc_int_load(sz, 0, dst, RV_S0,
+ rv64_emit32(mc, enc_int_load(sz, 0, dst, incoming_stack_base,
caller_stack_base + (i32)caller_off));
}
} else if (pt->cls == ABI_CLASS_FP) {
@@ -94,10 +124,10 @@ CGLocalStorage rv_param(CGTarget* t, const CGParamDesc* p) {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
if (sz == 8)
- rv64_emit32(mc, rv_fld(dst, RV_S0,
+ rv64_emit32(mc, rv_fld(dst, incoming_stack_base,
caller_stack_base + (i32)caller_off));
else
- rv64_emit32(mc, rv_flw(dst, RV_S0,
+ rv64_emit32(mc, rv_flw(dst, incoming_stack_base,
caller_stack_base + (i32)caller_off));
}
} else {
@@ -119,7 +149,8 @@ CGLocalStorage rv_param(CGTarget* t, const CGParamDesc* p) {
/* Incoming stack args live in the caller's outgoing-arg area,
* which is `frame_size - fp_pair_off` (= 16 + the saved-s0/ra
* pair) above s0 — same logic as aa64's `16 + caller_off`. */
- rv64_emit32(mc, rv_ld(RV_T1, RV_S0, caller_stack_base + (i32)caller_off));
+ rv64_emit32(mc, rv_ld(RV_T1, incoming_stack_base,
+ caller_stack_base + (i32)caller_off));
ptr_reg = RV_T1;
}
u32 nbytes = s->size;
@@ -161,7 +192,7 @@ CGLocalStorage rv_param(CGTarget* t, const CGParamDesc* p) {
} else {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
- rv64_emit32(mc, enc_int_load(sz, 0, RV_T2, RV_S0,
+ rv64_emit32(mc, enc_int_load(sz, 0, RV_T2, incoming_stack_base,
caller_stack_base + (i32)caller_off));
rv64_emit32(mc, enc_int_store(sz, RV_T2, RV_S0,
-(i32)s->off + (i32)part_off));
@@ -180,10 +211,12 @@ CGLocalStorage rv_param(CGTarget* t, const CGParamDesc* p) {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
if (sz == 8) {
- rv64_emit32(mc, rv_fld(0, RV_S0, caller_stack_base + (i32)caller_off));
+ rv64_emit32(mc, rv_fld(0, incoming_stack_base,
+ caller_stack_base + (i32)caller_off));
rv64_emit32(mc, rv_fsd(0, RV_S0, -(i32)s->off + (i32)part_off));
} else {
- rv64_emit32(mc, rv_flw(0, RV_S0, caller_stack_base + (i32)caller_off));
+ rv64_emit32(mc, rv_flw(0, incoming_stack_base,
+ caller_stack_base + (i32)caller_off));
rv64_emit32(mc, rv_fsw(0, RV_S0, -(i32)s->off + (i32)part_off));
}
}
diff --git a/src/arch/rv64/emit.c b/src/arch/rv64/emit.c
@@ -149,6 +149,7 @@ static void rv_func_begin_init(CGTarget *t, const CGFuncDesc *fd) {
a->has_sret = (fd->abi && fd->abi->has_sret) ? 1 : 0;
a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0;
a->known_frame = 0;
+ a->omit_frame = 0;
a->cum_off = 0;
a->max_outgoing = 0;
a->fp_pair_off = 0;
@@ -354,6 +355,13 @@ void rv_func_begin_known_frame(CGTarget *t, const CGFuncDesc *fd,
u32 n_int_saves = collect_mask_regs(a->used_cs_int_mask, 18u, 27u, int_regs);
u32 n_fp_saves = collect_mask_regs(a->used_cs_fp_mask, 18u, 27u, fp_regs);
+ if (frame && frame->may_omit_frame && frame->nslots == 0 &&
+ frame->max_outgoing == 0 && !frame->has_alloca && !frame->has_call &&
+ !a->has_sret && !a->is_variadic && n_int_saves == 0 &&
+ n_fp_saves == 0) {
+ a->omit_frame = 1;
+ return;
+ }
rv_compute_frame(a, n_int_saves, n_fp_saves, &fl);
a->fp_pair_off = fl.fp_pair_off;
a->prologue_pos = t->mc->pos(t->mc);
@@ -378,6 +386,8 @@ void rv_func_end(CGTarget *t) {
rv_compute_frame(a, n_int_saves, n_fp_saves, &fl);
a->fp_pair_off = fl.fp_pair_off;
+ if (a->omit_frame) goto finish;
+
/* Place the epilogue label at current pos. */
mc->label_place(mc, a->epilogue_label);
@@ -457,6 +467,8 @@ void rv_func_end(CGTarget *t) {
rv64_patch32(obj, sec, a->add_patches[i].pos, word);
}
+finish:
+ ;
/* Define the function symbol. */
u32 end = mc->pos(mc);
obj_symbol_define(obj, a->fd->sym, sec, (u64)a->func_start,
diff --git a/src/arch/rv64/internal.h b/src/arch/rv64/internal.h
@@ -56,7 +56,8 @@ typedef struct RImpl {
u32 next_param_stack;
u8 has_sret;
u8 known_frame;
- u8 pad0[2];
+ u8 omit_frame;
+ u8 pad0;
FrameSlot sret_ptr_slot;
u32 used_cs_int_mask; /* bit reg set for s2-s11 */
diff --git a/src/arch/rv64/ops.c b/src/arch/rv64/ops.c
@@ -1239,9 +1239,11 @@ static void rv_ret(CGTarget* t, const CGABIValue* val) {
if (val->storage.cls == RC_FP) {
u32 fmt = type_is_fp_double(val->storage.type) ? RV_FMT_D : RV_FMT_S;
u32 r = reg_num(val->storage);
- rv64_emit32(mc, rv_fsgnj(fmt, 10u, r, r)); /* fa0 = freg 10 */
+ if (r != 10u)
+ rv64_emit32(mc, rv_fsgnj(fmt, 10u, r, r)); /* fa0 = freg 10 */
} else {
- rv64_emit32(mc, rv_addi(RV_A0, reg_num(val->storage), 0));
+ if (reg_num(val->storage) != RV_A0)
+ rv64_emit32(mc, rv_addi(RV_A0, reg_num(val->storage), 0));
}
} else if (val->storage.kind == OPK_IMM) {
u32 sf = type_is_64(val->storage.type) ? 1u : 0u;
@@ -1277,6 +1279,10 @@ static void rv_ret(CGTarget* t, const CGABIValue* val) {
}
}
}
+ if (a->omit_frame) {
+ rv64_emit32(mc, rv_ret_());
+ return;
+ }
/* Jump to epilogue. */
rv64_emit32(mc, rv_jal(RV_ZERO, 0));
mc->emit_label_ref(mc, a->epilogue_label, R_RV_JAL, 4, 0);
diff --git a/src/arch/x64/alloc.c b/src/arch/x64/alloc.c
@@ -49,6 +49,31 @@ XSlot* x64_slot_get(XImpl* a, FrameSlot fs) {
}
/* ---- param: bind incoming arg(s) to the requested storage ---- */
+/* Advance the incoming-parameter cursors past `ai` without emitting code.
+ * An indirect argument, and each INT part of any other argument, takes the
+ * next integer register while fewer than 6 are used; FP parts take the next
+ * FP register while fewer than 8 are used. Otherwise 8 stack bytes are used. */
+static void x_consume_param_location(XImpl* a, const ABIArgInfo* ai) {
+  if (!ai) return;
+  if (ai->kind == ABI_ARG_IGNORE) return;
+  if (ai->kind == ABI_ARG_INDIRECT) {
+    if (a->next_param_int >= 6) a->next_param_stack += 8;
+    else ++a->next_param_int;
+    return;
+  }
+  /* Parts whose class is neither INT nor FP leave every cursor untouched. */
+  const ABIArgPart* part = ai->parts;
+  for (u16 left = ai->nparts; left > 0; --left, ++part) {
+    if (part->cls == ABI_CLASS_INT) {
+      if (a->next_param_int >= 6) a->next_param_stack += 8;
+      else ++a->next_param_int;
+    } else if (part->cls == ABI_CLASS_FP) {
+      if (a->next_param_fp >= 8) a->next_param_stack += 8;
+      else ++a->next_param_fp;
+    }
+  }
+}
+
CGLocalStorage x_param(CGTarget* t, const CGParamDesc* p) {
XImpl* a = impl_of(t);
CGLocalStorage st = p->storage;
@@ -69,8 +94,14 @@ CGLocalStorage x_param(CGTarget* t, const CGParamDesc* p) {
if (st.kind == CG_LOCAL_STORAGE_FRAME && !s)
compiler_panic(t->c, a->loc, "x64 param: bad slot");
const ABIArgInfo* ai = p->abi;
+ u32 incoming_stack_base = a->omit_frame ? X64_RSP : X64_RBP;
+ i32 incoming_stack_bias = a->omit_frame ? 8 : 16;
if (ai->kind == ABI_ARG_IGNORE) return st;
+ if (st.kind == CG_LOCAL_STORAGE_REG && st.v.reg == (Reg)REG_NONE) {
+ x_consume_param_location(a, ai);
+ return st;
+ }
if (st.kind == CG_LOCAL_STORAGE_REG) {
if (ai->kind != ABI_ARG_DIRECT || ai->nparts != 1) {
compiler_panic(t->c, a->loc,
@@ -87,8 +118,8 @@ CGLocalStorage x_param(CGTarget* t, const CGParamDesc* p) {
} else {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
- emit_mov_load(t->mc, sz, 0, st.v.reg & 0xFu, X64_RBP,
- (i32)(16 + caller_off));
+ emit_mov_load(t->mc, sz, 0, st.v.reg & 0xFu, incoming_stack_base,
+ incoming_stack_bias + (i32)caller_off);
}
} else if (pt->cls == ABI_CLASS_FP) {
u8 prefix = (sz == 8) ? 0xF2 : 0xF3;
@@ -99,8 +130,8 @@ CGLocalStorage x_param(CGTarget* t, const CGParamDesc* p) {
} else {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
- emit_sse_load(t->mc, prefix, 0x10, dst, X64_RBP,
- (i32)(16 + caller_off));
+ emit_sse_load(t->mc, prefix, 0x10, dst, incoming_stack_base,
+ incoming_stack_bias + (i32)caller_off);
}
} else {
compiler_panic(t->c, a->loc, "x64 param: ABI class %d unimpl",
@@ -116,7 +147,8 @@ CGLocalStorage x_param(CGTarget* t, const CGParamDesc* p) {
} else {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
- emit_mov_load(t->mc, 8, 0, X64_R11, X64_RBP, (i32)(16 + caller_off));
+ emit_mov_load(t->mc, 8, 0, X64_R11, incoming_stack_base,
+ incoming_stack_bias + (i32)caller_off);
ptr_reg = X64_R11;
}
u32 nbytes = s->size;
@@ -156,8 +188,8 @@ CGLocalStorage x_param(CGTarget* t, const CGParamDesc* p) {
} else {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
- emit_mov_load(t->mc, sz, 0, X64_RAX, X64_RBP,
- (i32)(16 + caller_off));
+ emit_mov_load(t->mc, sz, 0, X64_RAX, incoming_stack_base,
+ incoming_stack_bias + (i32)caller_off);
emit_mov_store(t->mc, sz, X64_RAX, X64_RBP,
-(i32)s->off + (i32)part_off);
}
@@ -171,8 +203,8 @@ CGLocalStorage x_param(CGTarget* t, const CGParamDesc* p) {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
u8 prefix = (sz == 8) ? 0xF2 : 0xF3;
- emit_sse_load(t->mc, prefix, 0x10, X64_XMM0, X64_RBP,
- (i32)(16 + caller_off));
+ emit_sse_load(t->mc, prefix, 0x10, X64_XMM0, incoming_stack_base,
+ incoming_stack_bias + (i32)caller_off);
emit_sse_store(t->mc, prefix, 0x11, X64_XMM0, X64_RBP,
-(i32)s->off + (i32)part_off);
}
diff --git a/src/arch/x64/emit.c b/src/arch/x64/emit.c
@@ -435,7 +435,7 @@ void emit_extend_rr(MCEmitter *mc, int w, int signed_ext, u32 src_size, u32 dst,
debug_emit_row(mc->debug, mc->section_id, ofs, mc->loc);
}
-static void emit_ret(MCEmitter *mc) {
+void emit_ret(MCEmitter *mc) {
u8 op = 0xC3;
mc->emit_bytes(mc, &op, 1);
}
@@ -528,6 +528,7 @@ static void x_func_begin_init(CGTarget *t, const CGFuncDesc *fd) {
a->has_alloca = 0;
a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0;
a->known_frame = 0;
+ a->omit_frame = 0;
a->cum_off = 0;
a->max_outgoing = 0;
a->used_cs_int_mask = a->has_planned_regs ? a->planned_cs_int_mask : 0;
@@ -716,6 +717,12 @@ void x_func_begin_known_frame(CGTarget *t, const CGFuncDesc *fd,
}
u32 cs_used = x_collect_cs_regs(a, cs_regs);
+ if (frame && frame->may_omit_frame && frame->nslots == 0 &&
+ frame->max_outgoing == 0 && !frame->has_alloca && !frame->has_call &&
+ !a->has_sret && !a->is_variadic && cs_used == 0) {
+ a->omit_frame = 1;
+ return;
+ }
u32 frame_size = x_compute_frame_size(a, cs_used);
a->prologue_pos = t->mc->pos(t->mc);
u32 nbytes = x_build_prologue(t, buf, X64_PROLOGUE_BYTES, frame_size,
@@ -738,6 +745,8 @@ void x_func_end(CGTarget *t) {
* mod 16, so frame_size must be a multiple of 16. */
u32 frame_size = x_compute_frame_size(a, cs_used);
+ if (a->omit_frame) goto finish;
+
mc->label_place(mc, a->epilogue_label);
/* Restore callee-saves. Each at rbp - (cum_off + (i+1)*8). */
@@ -778,6 +787,8 @@ void x_func_end(CGTarget *t) {
dbuf, 4);
}
+finish:
+ ;
/* Define the function symbol. */
u32 end = mc->pos(mc);
obj_symbol_define(t->obj, a->fd->sym, a->fd->text_section_id,
diff --git a/src/arch/x64/internal.h b/src/arch/x64/internal.h
@@ -75,6 +75,8 @@ typedef struct XImpl {
u8 has_alloca;
u8 is_variadic;
u8 known_frame;
+ u8 omit_frame;
+ u8 pad0[3];
FrameSlot sret_ptr_slot;
FrameSlot reg_save_slot; /* variadic: 176-byte __va_list_tag reg save area */
@@ -169,6 +171,7 @@ void emit_mov_load(MCEmitter *mc, u32 size, int signed_ext, u32 dst, u32 base,
i32 disp);
void emit_mov_store(MCEmitter *mc, u32 size, u32 src, u32 base, i32 disp);
void emit_lea(MCEmitter *mc, u32 dst, u32 base, i32 disp);
+void emit_ret(MCEmitter *mc);
void x64_emit_load_imm(MCEmitter *mc, int is64, u32 dst, i64 imm);
void emit_alu_rr(MCEmitter *mc, int w, u8 op, u32 dst, u32 src);
void emit_imul_rr(MCEmitter *mc, int w, u32 dst, u32 src);
diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c
@@ -1234,6 +1234,10 @@ static void x_ret(CGTarget* t, const CGABIValue* val) {
}
}
}
+ if (a->omit_frame) {
+ emit_ret(mc);
+ return;
+ }
emit_jmp_label(mc, a->epilogue_label);
}
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1246,6 +1246,15 @@ static CGLocalStorage xlat_storage(ReplayCtx* r, CGLocalStorage st,
return st;
}
+/* Reports whether a register-kind storage names a value that replay can treat
+ * as unused. The answer is always 0 unless the replay context uses identity
+ * registers and the function is marked opt_rewritten with val_info present.
+ * Under those conditions st.v.reg is read as a Val index, and the result is
+ * nonzero when that value has alloc_kind OPT_ALLOC_NONE or a use_freq of 0.
+ * A VAL_NONE or out-of-range index yields 0. */
+static int replay_reg_storage_unused(ReplayCtx* r, CGLocalStorage st) {
+  if (!r) return 0;
+  if (st.kind != CG_LOCAL_STORAGE_REG) return 0;
+  if (!r->identity_regs || !r->f->opt_rewritten || !r->f->val_info) return 0;
+
+  Val val = (Val)st.v.reg;
+  if (val == VAL_NONE) return 0;
+  if (val >= r->f->nvals) return 0;
+
+  if (r->f->val_info[val].alloc_kind == OPT_ALLOC_NONE) return 1;
+  return r->f->val_info[val].use_freq == 0;
+}
+
static Operand xlat_op(ReplayCtx* r, Operand op) {
switch ((OpKind)op.kind) {
case OPK_IMM:
@@ -2047,7 +2056,9 @@ static void collect_known_frame(Func* f, CGTarget* w, CGKnownFrameDesc* out) {
Inst* in = &bl->insts[i];
if ((IROp)in->op == IR_ALLOCA) {
out->has_alloca = 1;
- } else if ((IROp)in->op == IR_CALL && w->call_stack_size) {
+ } else if ((IROp)in->op == IR_CALL) {
+ out->has_call = 1;
+ if (!w->call_stack_size) continue;
IRCallAux* aux = (IRCallAux*)in->extra.aux;
if (!aux) continue;
u32 need = w->call_stack_size(w, &aux->desc);
@@ -2055,6 +2066,11 @@ static void collect_known_frame(Func* f, CGTarget* w, CGKnownFrameDesc* out) {
}
}
}
+ out->may_omit_frame =
+ (!out->has_call && !out->has_alloca && out->nslots == 0 &&
+ out->max_outgoing == 0)
+ ? 1u
+ : 0u;
}
static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
@@ -2139,7 +2155,12 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
d.size = p->size;
d.align = p->align;
d.flags = p->flags;
- d.storage = xlat_storage(&r, p->storage, p->type);
+ if (replay_reg_storage_unused(&r, p->storage)) {
+ d.storage = p->storage;
+ d.storage.v.reg = REG_NONE;
+ } else {
+ d.storage = xlat_storage(&r, p->storage, p->type);
+ }
d.abi = p->abi;
d.loc = p->loc;
(void)w->param(w, &d);
diff --git a/src/opt/pass_lower.c b/src/opt/pass_lower.c
@@ -305,6 +305,8 @@ static int phys_arg_reg_for_index(Func* f, u8 cls, u32 abi_index, Reg* out) {
return 0;
}
+static int hard_available(Func* f, u8 cls, Reg r);
+
static void apply_param_incoming_register_hazards(Func* f) {
if (!f || !f->val_info || !f->desc.abi || !f->nparams) return;
Reg incoming_regs[64];
@@ -361,6 +363,14 @@ static void apply_param_incoming_register_hazards(Func* f) {
Val v = (Val)p->storage.v.reg;
if (v == VAL_NONE || v >= f->nvals) continue;
u8 cls = f->val_info[v].cls;
+ if (has_incoming[i] && incoming_cls[i] == cls &&
+ f->val_info[v].tied_hard_reg < 0 &&
+ f->val_info[v].live_across_call_freq == 0 &&
+ hard_available(f, cls, incoming_regs[i]) &&
+ incoming_regs[i] < 32 &&
+ (f->val_info[v].forbidden_hard_regs & (1u << incoming_regs[i])) == 0) {
+ f->val_info[v].tied_hard_reg = (i32)incoming_regs[i];
+ }
for (u32 j = i + 1u; j < nparams; ++j) {
if (!has_incoming[j] || incoming_cls[j] != cls) continue;
forbid_val_reg(f, v, cls, incoming_regs[j]);
@@ -2056,6 +2066,25 @@ static int retarget_producer_legal(Inst* producer, const Operand* copy_dst,
}
}
+/* Looks up the lowest-numbered register (0..31) whose bit is set in
+ * f->opt_ret_regs[cls]. On success stores it in *out and returns 1. Returns 0,
+ * leaving *out untouched, when f is NULL, cls is not below OPT_REG_CLASSES,
+ * or the mask is empty. */
+static int first_return_reg(Func* f, u8 cls, Reg* out) {
+  if (!f) return 0;
+  if (cls >= OPT_REG_CLASSES) return 0;
+
+  u32 ret_bits = f->opt_ret_regs[cls];
+  Reg candidate = 0;
+  while (candidate < 32) {
+    if ((ret_bits >> candidate) & 1u) {
+      *out = candidate;
+      return 1;
+    }
+    ++candidate;
+  }
+  return 0;
+}
+
+/* Accepts a return value only if it is held in a register operand and has at
+ * most one part. On acceptance *out is pointed at v->storage and 1 is
+ * returned; otherwise *out is left untouched and 0 is returned. */
+static int ret_scalar_storage(CGABIValue* v, Operand** out) {
+  int is_scalar_reg = v && v->storage.kind == OPK_REG && v->nparts <= 1;
+  if (is_scalar_reg) *out = &v->storage;
+  return is_scalar_reg;
+}
+
static int find_single_direct_use(Func* f, Block* bl,
const HardBlockLive* hard_live, u32 def_i,
const Operand* def, const Operand* src,
@@ -2117,6 +2146,32 @@ static void opt_combine_fold_block(Func* f, Block* bl,
u32 use_i = 0;
u32 op_i = 0;
+ if (f->opt_rewritten && (IROp)in->op == IR_RET && i > 0) {
+ IRRetAux* aux = (IRRetAux*)in->extra.aux;
+ Operand* ret_op = NULL;
+ Reg ret_reg = REG_NONE;
+ if (aux && aux->present && ret_scalar_storage(&aux->val, &ret_op) &&
+ first_return_reg(f, ret_op->cls, &ret_reg) &&
+ ret_reg != (Reg)REG_NONE && ret_reg != ret_op->v.reg) {
+ Inst* producer = &bl->insts[i - 1u];
+ Operand ret_dst = *ret_op;
+ ret_dst.v.reg = ret_reg;
+ int swap_binop = 0;
+ if (producer->nopnds >= 1 &&
+ same_phys_reg(&producer->opnds[0], ret_op) &&
+ retarget_producer_legal(producer, &ret_dst, &swap_binop)) {
+ if (swap_binop) {
+ Operand tmp = producer->opnds[1];
+ producer->opnds[1] = producer->opnds[2];
+ producer->opnds[2] = tmp;
+ }
+ producer->opnds[0] = ret_dst;
+ *ret_op = ret_dst;
+ continue;
+ }
+ }
+ }
+
if (f->opt_rewritten &&
((IROp)in->op == IR_BINOP || (IROp)in->op == IR_UNOP) &&
in->nopnds >= 1 && in->opnds[0].kind == OPK_REG &&
diff --git a/test/opt/opt_test.c b/test/opt/opt_test.c
@@ -149,6 +149,17 @@ static Operand op_indirect_(Reg base, CfreeCgTypeId ty) {
return o;
}
+/* Builds an OPK_GLOBAL operand of class RC_INT that refers to `sym` plus
+ * `addend`, typed as `ty`. All remaining bytes of the operand are zero. */
+static Operand op_global_(ObjSymId sym, i64 addend, CfreeCgTypeId ty) {
+  Operand global_op;
+  memset(&global_op, 0, sizeof global_op);
+
+  /* Symbol reference first, then the operand header fields. */
+  global_op.v.global.sym = sym;
+  global_op.v.global.addend = addend;
+  global_op.type = ty;
+  global_op.cls = RC_INT;
+  global_op.kind = OPK_GLOBAL;
+  return global_op;
+}
+
static MemAccess mem_local_(FrameSlot fs, CfreeCgTypeId ty, u32 size,
u16 flags) {
MemAccess m;
@@ -423,6 +434,8 @@ typedef struct MockCGTarget {
int plan_calls[OPT_REG_CLASSES];
int plan_regs[OPT_REG_CLASSES];
int func_begin_plan_calls;
+ int known_frame_calls;
+ CGKnownFrameDesc last_known_frame;
int reserve_calls[OPT_REG_CLASSES];
int load_imm_calls;
Reg last_load_imm_dst;
@@ -454,6 +467,7 @@ typedef struct MockCGTarget {
int planned_stack_arg;
int param_calls;
CGLocalStorage last_param_storage;
+ int alloca_calls;
} MockCGTarget;
static void mock_func_begin(CGTarget* t, const CGFuncDesc* d) {
@@ -465,6 +479,24 @@ static void mock_func_begin(CGTarget* t, const CGFuncDesc* d) {
}
static void mock_func_end(CGTarget* t) { (void)t; }
+/* Mock for the known-frame entry hook. Runs the plain mock_func_begin, counts
+ * the call, stores a copy of the frame summary when one is given, and hands
+ * out slot ids 1..nslots through out_slots when the caller provides it. */
+static void mock_func_begin_known_frame(CGTarget* t, const CGFuncDesc* d,
+                                        const CGKnownFrameDesc* frame,
+                                        FrameSlot* out_slots) {
+  MockCGTarget* mock = (MockCGTarget*)t;
+
+  mock_func_begin(t, d);
+  mock->known_frame_calls += 1;
+
+  if (!frame) return;
+  mock->last_known_frame = *frame;
+
+  if (!out_slots) return;
+  for (u32 slot = 0; slot < frame->nslots; ++slot)
+    out_slots[slot] = (FrameSlot)(slot + 1u);
+}
+
+/* Mock outgoing-stack query: reports 8 bytes when the mock's
+ * planned_stack_arg flag is set and 0 otherwise. The call descriptor is
+ * ignored. */
+static u32 mock_call_stack_size(CGTarget* t, const CGCallDesc* d) {
+  const MockCGTarget* mock = (const MockCGTarget*)t;
+  (void)d;
+  if (mock->planned_stack_arg) return 8u;
+  return 0u;
+}
+
static void mock_get_allocable_regs(CGTarget* t, RegClass cls, const Reg** out,
u32* nregs) {
MockCGTarget* m = (MockCGTarget*)t;
@@ -735,10 +767,19 @@ static void mock_set_loc(CGTarget* t, SrcLoc loc) {
(void)loc;
}
+/* Mock alloca hook: counts the call in alloca_calls and ignores its
+ * operands. */
+static void mock_alloca(CGTarget* t, Operand dst, Operand size, u32 align) {
+  (void)align;
+  (void)size;
+  (void)dst;
+  ((MockCGTarget*)t)->alloca_calls++;
+}
+
static void mock_init(MockCGTarget* m, Compiler* c) {
memset(m, 0, sizeof *m);
m->base.c = c;
m->base.func_begin = mock_func_begin;
+ m->base.func_begin_known_frame = mock_func_begin_known_frame;
m->base.func_end = mock_func_end;
m->base.frame_slot = mock_frame_slot;
m->base.label_new = mock_label_new;
@@ -756,12 +797,14 @@ static void mock_init(MockCGTarget* m, Compiler* c) {
m->base.param = mock_param;
m->base.addr_of = mock_addr_of;
m->base.ret = mock_ret;
+ m->base.alloca_ = mock_alloca;
m->base.set_loc = mock_set_loc;
m->base.get_allocable_regs = mock_get_allocable_regs;
m->base.get_phys_regs = mock_get_phys_regs;
m->base.get_scratch_regs = mock_get_scratch_regs;
m->base.is_caller_saved = mock_is_caller_saved;
m->base.call_clobber_mask = mock_call_clobber_mask;
+ m->base.call_stack_size = mock_call_stack_size;
m->base.return_reg_mask = mock_return_reg_mask;
m->base.callee_save_mask = mock_callee_save_mask;
m->base.call = mock_call;
@@ -2714,6 +2757,19 @@ static void opt_combine_retargets_single_use_producer_copy(void) {
add->opnds[1].v.reg == 20 && add->opnds[2].v.reg == 19,
"commutative rhs overlap should swap operands before retargeting");
+ Func* retreg = new_func(&tc);
+ retreg->opt_rewritten = 1;
+ retreg->opt_ret_regs[RC_INT] = 1u << 20;
+ emit_phys_binop(retreg, retreg->entry, 21, 19, 20, tc.i32, BO_IADD);
+ emit_ret_val(retreg, retreg->entry, 21, tc.i32);
+
+ opt_combine(retreg);
+ add = &retreg->blocks[retreg->entry].insts[0];
+ IRRetAux* raux = (IRRetAux*)retreg->blocks[retreg->entry].insts[1].extra.aux;
+ EXPECT(add->opnds[0].v.reg == 20 && add->opnds[1].v.reg == 20 &&
+ add->opnds[2].v.reg == 19 && raux->val.storage.v.reg == 20,
+ "adjacent scalar return producer should retarget to ABI return reg");
+
Func* sub = new_func(&tc);
sub->opt_rewritten = 1;
emit_phys_binop(sub, sub->entry, 21, 19, 20, tc.i32, BO_ISUB);
@@ -3450,6 +3506,112 @@ static void opt_cmp_branch_keeps_fallthrough_after_block_growth(void) {
}
static void begin_mock_opt_func(TestCtx* tc, CGTarget* opt,
+ CfreeCgTypeId ret_ty);
+static CGLocalDesc local_desc_(CfreeCgTypeId ty, u32 size, u32 align,
+ u32 flags);
+
+/* A function that only loads an immediate and returns it must reach the mock
+ * backend through func_begin_known_frame exactly once, with a frame summary
+ * that has no slots, reports no calls, and is marked as omittable. */
+static void opt_known_frame_marks_empty_leaf_omittable(void) {
+  static const Reg int_pool[] = {2};
+  static const Reg int_scratch[] = {9, 10};
+  TestCtx tc;
+  MockCGTarget mock;
+
+  tc_init(&tc);
+  mock_init(&mock, tc.c);
+  mock_set_pool(&mock, RC_INT, int_pool, 1, int_scratch, 2, 0x4007FFFFu);
+
+  CGTarget* opt = opt_cgtarget_new(tc.c, &mock.base, 1);
+  begin_mock_opt_func(&tc, opt, tc.i32);
+
+  /* Body: r1 = 42; return r1. */
+  opt->load_imm(opt, op_reg_(1, tc.i32), 42);
+  CGABIValue leaf_ret = {.type = tc.i32, .storage = op_reg_(1, tc.i32)};
+  opt->ret(opt, &leaf_ret);
+  opt->func_end(opt);
+
+  const CGKnownFrameDesc* seen = &mock.last_known_frame;
+  EXPECT(mock.known_frame_calls == 1, "O1 should use known-frame replay");
+  EXPECT(seen->may_omit_frame,
+         "empty leaf known frame should be marked omittable");
+  EXPECT(seen->nslots == 0, "empty leaf should have no slots");
+  EXPECT(!seen->has_call, "empty leaf should not report calls");
+
+  opt->destroy(opt);
+  tc_fini(&tc);
+}
+
+/* Each of three things must clear may_omit_frame in the known-frame summary
+ * handed to the backend: a frame-backed local, a call, and an alloca. Every
+ * scenario uses its own mock target and optimizer wrapper; all three share one
+ * TestCtx. */
+static void opt_known_frame_keeps_frame_for_slot_call_and_alloca(void) {
+  static const Reg int_pool[] = {2, 3, 4};
+  static const Reg int_scratch[] = {9, 10};
+  TestCtx tc;
+  tc_init(&tc);
+
+  /* Scenario 1: an address-taken, memory-required local needs a slot. */
+  MockCGTarget slot_mock;
+  mock_init(&slot_mock, tc.c);
+  mock_set_pool(&slot_mock, RC_INT, int_pool, 3, int_scratch, 2, 0x4007FFFFu);
+  CGTarget* slot_opt = opt_cgtarget_new(tc.c, &slot_mock.base, 1);
+  begin_mock_opt_func(&tc, slot_opt, tc.i32);
+  CGLocalDesc local =
+      local_desc_(tc.i32, 4, 4, CG_LOCAL_ADDR_TAKEN | CG_LOCAL_MEMORY_REQUIRED);
+  CGLocalStorage local_st = slot_opt->local(slot_opt, &local);
+  Operand local_ptr = op_reg_(1, cfree_cg_type_ptr(tc.c, tc.i32, 0));
+  slot_opt->local_addr(slot_opt, local_ptr, &local, local_st);
+  slot_opt->load_imm(slot_opt, op_reg_(2, tc.i32), 7);
+  CGABIValue slot_ret = {.type = tc.i32, .storage = op_reg_(2, tc.i32)};
+  slot_opt->ret(slot_opt, &slot_ret);
+  slot_opt->func_end(slot_opt);
+  EXPECT(!slot_mock.last_known_frame.may_omit_frame,
+         "frame slot should block frame omission");
+  EXPECT(slot_mock.last_known_frame.nslots != 0,
+         "frame-backed local should be in known-frame slots");
+  slot_opt->destroy(slot_opt);
+
+  /* Scenario 2: a call makes the function a non-leaf. */
+  MockCGTarget call_mock;
+  mock_init(&call_mock, tc.c);
+  mock_set_pool(&call_mock, RC_INT, int_pool, 3, int_scratch, 2, 0x4007FFFFu);
+  CGTarget* call_opt = opt_cgtarget_new(tc.c, &call_mock.base, 1);
+  begin_mock_opt_func(&tc, call_opt, tc.i32);
+  CGCallDesc call_desc;
+  memset(&call_desc, 0, sizeof call_desc);
+  call_desc.fn_type = cfree_cg_type_func(tc.c, (CfreeCgFuncSig){.ret = tc.i32});
+  call_desc.callee = op_global_(OBJ_SYM_NONE, 0, call_desc.fn_type);
+  call_opt->call(call_opt, &call_desc);
+  call_opt->load_imm(call_opt, op_reg_(1, tc.i32), 3);
+  CGABIValue call_ret = {.type = tc.i32, .storage = op_reg_(1, tc.i32)};
+  call_opt->ret(call_opt, &call_ret);
+  call_opt->func_end(call_opt);
+  EXPECT(call_mock.last_known_frame.has_call,
+         "known-frame summary should report calls");
+  EXPECT(!call_mock.last_known_frame.may_omit_frame,
+         "non-leaf function should block frame omission");
+  call_opt->destroy(call_opt);
+
+  /* Scenario 3: an alloca in the function body. */
+  MockCGTarget alloca_mock;
+  mock_init(&alloca_mock, tc.c);
+  mock_set_pool(&alloca_mock, RC_INT, int_pool, 3, int_scratch, 2, 0x4007FFFFu);
+  CGTarget* alloca_opt = opt_cgtarget_new(tc.c, &alloca_mock.base, 1);
+  begin_mock_opt_func(&tc, alloca_opt, tc.i32);
+  Operand alloca_dst = op_reg_(1, cfree_cg_type_ptr(tc.c, tc.i32, 0));
+  Operand alloca_size = op_imm_(16, tc.i32);
+  alloca_opt->alloca_(alloca_opt, alloca_dst, alloca_size, 16);
+  alloca_opt->load_imm(alloca_opt, op_reg_(2, tc.i32), 5);
+  CGABIValue alloca_ret = {.type = tc.i32, .storage = op_reg_(2, tc.i32)};
+  alloca_opt->ret(alloca_opt, &alloca_ret);
+  alloca_opt->func_end(alloca_opt);
+  EXPECT(alloca_mock.last_known_frame.has_alloca,
+         "known-frame summary should report alloca");
+  EXPECT(!alloca_mock.last_known_frame.may_omit_frame,
+         "alloca should block frame omission");
+  alloca_opt->destroy(alloca_opt);
+
+  tc_fini(&tc);
+}
+
+static void begin_mock_opt_func(TestCtx* tc, CGTarget* opt,
CfreeCgTypeId ret_ty) {
CGFuncDesc fd;
CfreeCgFuncSig sig;
@@ -3711,6 +3873,8 @@ int main(void) {
opt_emit_no_virtual_alloc();
opt_records_const_bytes_by_value();
opt_cmp_branch_keeps_fallthrough_after_block_growth();
+ opt_known_frame_marks_empty_leaf_omittable();
+ opt_known_frame_keeps_frame_for_slot_call_and_alloca();
opt_local_hook_chooses_register_for_scalar();
opt_param_hook_chooses_register_for_scalar();
opt_param_memory_required_uses_frame();