commit 3594417c5b6b345d722d1e63dd096cf59f6ec5ab
parent 2054863eaad4a1cb13b851e90f6e9630a114c398
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 15 May 2026 07:40:16 -0700
Emit exact O1 prologues for known frames
Diffstat:
12 files changed, 1072 insertions(+), 441 deletions(-)
diff --git a/doc/OPT1.md b/doc/OPT1.md
@@ -1,114 +1,270 @@
-# OPT1 Checklist
-
-This is the focused implementation checklist for the first production `-O1`
-slice. The fuller design in `doc/OPT.md` is the source of truth for pass
-semantics, phase order, and exit criteria, especially:
-
-- `doc/OPT.md` §3.3, "Lowering and Allocation"
-- `doc/OPT.md` §4.1, "`-O1` Minimal Schedule"
-- `doc/OPT.md` Phase A, "Production `-O1` Lowering"
-
-If this checklist is ambiguous, follow `doc/OPT.md`; do not reduce scope or
-substitute a behaviorally similar shortcut without updating both documents.
-
-- [x] Set up `test/opt/` as the first task. Add IR-level and mock-target
- red-green tests before implementation:
- - `opt_liveness_branch`
- - `opt_regalloc_priority`
- - `opt_rewrite_spill_use_def`
- - `opt_emit_no_virtual_alloc`
- These tests must assert pass/rewrite shape, not just executable output.
-- [x] Add the liveness data model: block `live_in`/`live_out`, value live
- ranges, loop depth/frequency, and complete use/def walkers.
-- [x] Implement simple `opt_regalloc(..., false)`: priority allocation by
- tied hard-reg needs, frequency, live length, then stable id.
-- [x] Add the rewrite pass: map virtual regs to hard regs or `FS_SPILL`
- slots, inserting reloads/stores for spilled uses and defs.
-- [x] Make `opt_emit` stop relying on wrapped-target register allocation for virtual
- values after rewrite.
-- [x] Fill in target-aware `opt_machinize`/`opt_combine`, starting with
- AArch64 ABI/call constraints, noop move deletion, and safe single-use
- folds. **Done:** expanded AArch64/x64/RV64 register pools to match
- backend pools, added caller-saved masks, and narrowed call save/restore
- to caller-saved only. Noop physical move deletion implemented.
- General safe single-use folds remain future work.
-- [x] Add focused `-O1` tests for branch liveness, call-clobber preservation,
- spill pressure, inline asm tied/fixed registers, post-rewrite DCE,
- dead-def elimination, and cross-target validation (x64/RV64).
-- [ ] Review AArch64 backend internal scratch-register usage and decide how
- those backend scratch conventions should interact with `opt` register
- allocation.
-
-## Completion Notes
-
-- Implemented in `src/opt/pass_lower.c`, `src/opt/opt.c`, `src/opt/ir.h`,
- `src/arch/aa64/opt_coord.c`, `src/arch/x64/opt_coord.c`,
- `src/arch/rv64/opt_coord.c`, and `test/opt/opt_test.c`.
-- Added `opt_dead_def_elim` pass (pre-RA backward walk with dynamic liveness,
- removes cascading dead defs before rewrite).
-- Added `Func->opt_caller_saved[OPT_REG_CLASSES]` bitmask and
- `is_caller_saved()` helper used by rewrite to narrow call save/restore.
-- Expanded `opt_machinize` pools:
- - AArch64: INT x19-x28 (10), FP v8-v23 (16)
- - x64: INT RBX/R12/R13/R14/R15/R10 (6), FP XMM6-XMM15 (10)
- - RV64: INT s2-s11 (10), FP fs2-fs11 (10)
-- Fixed backend prologue interaction at `-O1`: `replay_func_to` now calls
- `CGTarget.reserve_hard_regs` with the exact set of assigned hard regs before
- `func_end`, so the backend saves/restores only the callee-saved subset that
- opt actually used. Replaces the old alloc/free hwm-bump hack.
-- `-O1` pipeline: `build_cfg`, `machinize`, `live_info`, `dead_def_elim`,
- `regalloc`, `combine`, `dce`, `emit`.
-- Focused opt tests now cover:
- `opt_liveness_branch`, `opt_regalloc_priority`,
- `opt_rewrite_spill_use_def`, `opt_emit_no_virtual_alloc`,
- `opt_call_clobber_preservation`, `opt_call_clobber_caller_saved`,
- `opt_spill_pressure`, `opt_inline_asm_tied_fixed_regs`,
- `opt_post_rewrite_dce`, `opt_dead_def_elim`, all running on aa64/x64/rv64.
-- Validation run after this round:
- - `make test-opt` passed with 69 checks (9 tests × 3 archs + 1 aa64-only).
- - `CFREE_OPT_LEVELS=1 CFREE_TEST_PATHS=D make test-cg` passed
- 196 cases with 0 failures.
- - `make test-cg` (full suite, -O0) passed 1573 cases with 0 failures.
- - `make test-link` passed 122 cases.
- - `make test-elf` passed 37 cases.
- - `make test-ar`, `make test-debug` passed.
-
-## Deviations
-
-- `opt_combine` is intentionally narrow: it removes noop physical copies but
- does not yet implement a broader safe single-use fold framework.
-- `opt_dce` is conservative post-rewrite cleanup, currently covering `IR_NOP`
- and empty non-side-effecting instructions. Dead-definition elimination now
- happens earlier via `opt_dead_def_elim` (pre-rewrite), which uses backward
- liveness to remove cascading dead defs. Full post-rewrite dead-def DCE
- remains future work once precise side-effect and use/def coverage is in place.
-- AArch64 allocation uses an expanded hard-register pool matching the backend
- (x19-x28, v8-v23). Call-clobber preservation is narrowed to caller-saved
- hard regs only; callee-saved regs rely on the backend prologue/epilogue
- (enabled by `reserve_hard_regs`).
-- x64 and RV64 register pools now match their respective backend pools and
- receive cross-target opt test coverage. Full CG corpus validation at `-O1`
- remains ongoing for those targets.
-- Backend scratch registers are declared via `CGTarget.get_scratch_regs` and
- kept disjoint from the allocable pool via a build-time assertion in
- `opt_machinize`. `is_caller_saved` is also backend-provided, so opt no
- longer hard-codes ABI masks.
-
-## Remaining Todos
-
-- Finish the general safe single-use fold portion of `opt_combine`:
- - [x] Substitute one-use physical copies into users when the rewritten IR
- form is guaranteed target-legal.
- - [x] Fold one-use integer `load_imm` defs into `binop`, `cmp`, and
- `cmp_branch` operands accepted by `CGTarget`.
- - [x] Collapse physical copy chains before post-RA DCE.
- - [x] Fold redundant `IR_CONVERT` chains, starting with identical
- conversion pairs.
- - [x] Keep branch-shape cleanup out of `-O1` unless it is purely local and
- does not require CFG/jump optimization.
-- Continue tightening post-rewrite DCE:
- - [ ] Model call clobbers and hard-reg call arguments explicitly.
- - [ ] Add tests for dead caller-saved defs before calls, live call returns,
- and hard-reg call arguments.
-- Add full CG corpus `-O1` validation runs for x64 and RV64 (currently
- validated only on aa64).
+# OPT1 - Current O1 Backend Path
+
+`-O1` is cfree's fast optimized backend tier. It records frontend CG operations
+as IR, runs a small MIR-shaped lowering/allocation pipeline, rewrites virtual
+values to hard registers or spill slots, performs narrow post-RA cleanup, and
+then emits through the normal target backend.
+
+This document describes the implemented O1 path and tracks remaining O1 code
+quality and compile-time work. Broader optimizer direction lives in
+`doc/OPT.md`; allocator performance history and measurements live in
+`doc/PERF.md` and `doc/MIR_RA_REPORT.md`.
+
+## Goals
+
+O1 is the bisection floor for optimized codegen. It should remain:
+
+- correct across supported targets and ABI lowering paths;
+- materially better than O0 generated code;
+- close enough to O0 compile time for JIT use;
+- free of unbounded dense pseudo-register conflict matrices;
+- simple enough that O2 can add coalescing, splitting, SSA, and value
+ optimization without destabilizing the O1 baseline.
+
+O1 is not intended to perform full SSA optimization, global value numbering,
+LICM, general inlining, or full branch layout. Those belong in O2 unless a
+local cleanup is required to keep O1 code shape reasonable.
+
+## Pipeline
+
+The normal O1 function-end path is:
+
+```text
+build_cfg
+machinize
+build_loop_tree
+live_blocks.pre_dde
+dead_def_elim
+regalloc:
+ live_blocks
+ live_ranges_build
+ assign_ranges
+ rewrite
+combine
+dce
+emit
+```
+
+Important pass boundaries:
+
+- `machinize` applies target-specific ABI/call/register constraints before
+ liveness and allocation.
+- `dead_def_elim` runs before RA using block liveness and operand-reference
+ events.
+- `regalloc` rebuilds liveness after DDE, constructs compressed live ranges,
+ assigns locations, and rewrites operands.
+- `combine` is deliberately narrow post-RA cleanup: noop physical copy
+ deletion, safe single-use folds, redundant spill load/store cleanup, and
+ similar local rewrites.
+- `dce` is conservative after rewrite and removes only trivial dead IR shapes.
+
+## Liveness And Ranges
+
+O1 liveness is pass-local. Persistent `Block.live_after` and dense value
+conflict storage are not part of the normal path.
+
+`opt_live_blocks` computes block `live_use`, `live_def`, `live_in`, and
+`live_out` sets using `OptBitset`. `OptBitset` tracks active words, trims
+trailing zero words, and bounds copy/union work by active length where possible.
+
+`opt_live_ranges_build` turns block liveness plus backward instruction walks
+into compressed live ranges:
+
+- per-instruction references are collected as use/def value lists;
+- open and touched values are tracked with generation marks;
+- raw instruction points are compressed to live-range boundary points;
+- live-across-call frequency is tracked from a rolling live-value list;
+- per-value summaries record live length, use/def frequency, block frequency,
+ call-crossing frequency, and spill cost.
+
+## Allocation
+
+The normal O1 allocator is range based. It does not build or consume a
+pseudo-register interference matrix.
+
+Candidates are ordered by:
+
+1. tied hard-register requirement;
+2. spill/frequency cost;
+3. shorter live length;
+4. stable value id.
+
+Each value is assigned either:
+
+- a target-provided hard register in the value's register class; or
+- a compatible `FS_SPILL` frame slot.
+
+Non-tied values known to be live across calls avoid caller-saved registers.
+Tied hard-register values are checked for availability, clobber constraints,
+and live-range conflicts.
+
+Occupancy is stored as sorted per-location interval sets:
+
+- one interval vector per physical hard-register location;
+- one interval vector per allocated stack spill slot.
+
+Conflict checks test whether any candidate live range overlaps the location's
+intervals. Marking inserts and merges intervals. The old point-index bitmap
+row OR/mark path is not part of normal O1.
+
+Metrics still report `opt.alloc.hard_loc_words`, `opt.alloc.stack_loc_words`,
+and point/mark counters for trend compatibility, but `opt.alloc.hard_word_ors`
+and `opt.alloc.stack_word_ors` should remain zero on the interval path.
+
+## Rewrite
+
+Rewrite consumes the assignment map and walks each block backward with rolling
+liveness:
+
+- hard-assigned pseudos become physical registers;
+- spilled uses receive reloads through target scratch registers;
+- spilled defs receive stores;
+- call save/restore insertion checks only hard-assigned caller-saved values
+ known to be live across some call;
+- inline-asm register constraints are applied only for functions containing
+ `IR_ASM_BLOCK`.
+
+Rewrite does not materialize per-instruction full live-after sets.
+
+## Target Contract
+
+O1 relies on each target backend to provide:
+
+- allocable hard-register pools per register class;
+- scratch-register pools disjoint from allocable pools;
+- caller-saved register classification;
+- optional hard-register reservation before backend `func_end`;
+- target legality for local folds performed by `opt_combine`.
+
+The current target pools are:
+
+- AArch64: integer `x19-x28`, FP `v8-v23`;
+- x64: integer `RBX/R12/R13/R14/R15/R10`, FP `XMM6-XMM15`;
+- RV64: integer `s2-s11`, FP `fs2-fs11`.
+
+Backends still own final prologue/epilogue emission and callee-saved register
+preservation. O1 calls `reserve_hard_regs` with the hard registers it assigned
+so backend save/restore decisions match the rewritten IR.
+
+Targets may also provide a known-frame entry path for O1. When
+`func_begin_known_frame` and `call_stack_size` are both available, O1 computes
+all frame slots, the outgoing call area size, and dynamic-allocation presence
+before body replay and passes them to the backend. AArch64, x64, and RV64 use
+this path to emit the exact prologue up front instead of reserving target NOP
+patch slots.
+
+## Tests And Metrics
+
+Focused O1 coverage lives in `test/opt/` and the phase0 guardrails. It covers
+branch liveness, call-clobber preservation, spill pressure, inline asm
+tied/fixed registers, pre-RA DDE, post-rewrite cleanup, and cross-target mock
+validation.
+
+Useful targeted runs:
+
+```sh
+make test-opt
+make test-cg-api
+make test-toy
+make test-aa64-inline
+make test-smoke-x64
+```
+
+`cfree run --time -O1` emits counters for:
+
+- function/block/inst/value counts;
+- block liveness active words and dataflow visits;
+- live range counts, compressed points, live-word touches, and value scans;
+- allocator hard/stack occupancy, interval probes/marks, spills;
+- rewrite reloads/stores/inserted instructions/live traffic;
+- link/JIT and compile frontend scopes.
+
+## Current Disassembly Observations
+
+A small AArch64 Linux probe with arithmetic, a conditional, a loop, and a call
+shows the current shape:
+
+- O1 is smaller than O0 and removes many post-RA physical moves.
+- The old prologue NOP sleds are gone on O1 for AArch64, x64, and RV64; the
+ known-frame entry path emits exact prologues for those targets.
+- Loads/stores for simple C locals and parameters are still unchanged between
+ O0 and O1 in the probe.
+- Boolean branch lowering still materializes compare results:
+
+```asm
+cmp w19, #0x64
+cset w20, gt
+cmp w20, #0x0
+b.eq ...
+```
+
+The desired local shape is a direct conditional branch:
+
+```asm
+cmp w19, #0x64
+b.le ...
+```
+
+The same pattern appears in loop conditions.
+
+## Todo: Code Quality
+
+MIR's O1 path suggests these high-value local cleanups that still fit cfree's
+fast tier:
+
+1. Fuse compare-result conditionals into direct compare branches.
+ MIR's C frontend emits direct compare branches for conditional compares, and
+ its generic compare-value-plus-branch fusion lives in the heavier SSA
+ combine path. For cfree O1, recover or preserve `IR_CMP_BRANCH` instead of
+ lowering through `IR_CMP` plus `IR_CONDBR` when the compare result has a
+ single branch use. This should use target `cmp_branch` support and should
+ remove the `cmp; cset; cmp #0; b.cond` pattern on AArch64.
+
+2. Promote simple scalar locals before backend allocation.
+ MIR's C frontend represents normal scalar block locals as MIR registers and
+ leaves stack slots for aggregates, forced-stack cases, and address-taken
+ values. O1 still stores parameters and scalar locals to frame slots and
+ reloads them in straightforward code. A conservative mem2reg-lite pass
+ should promote locals whose address does not escape, starting with
+ integer/pointer scalars in single-entry structured control flow.
+
+3. Clean up local branch layout artifacts.
+ MIR's full jump optimizer is O2-only, but its cheap pieces are appropriate
+ for O1: delete branches to immediate fallthrough blocks, forward
+ branch-to-branch targets, and invert a branch when it removes an
+ unconditional jump. Avoid full CFG layout work.
+
+- Continue tightening post-rewrite DCE.
+ Model hard-register call arguments and call clobbers precisely enough to
+ delete dead caller-saved defs before calls without removing required ABI
+ traffic.
+
+- Keep `opt_combine` legality target-aware.
+ Existing one-use copy/immediate/convert folds should stay conservative. New
+ folds must be gated by target operand legality rather than assuming every
+ backend accepts the same IR operand forms.
+
+## Todo: Compile-Time And Validation
+
+- Decide whether O1 can avoid the second block-liveness pass after DDE.
+ The current duplicated liveness run is simple and no longer the dominant
+ bucket, but DDE could eventually return enough local update information to
+ reuse or cheaply repair liveness for regalloc.
+
+- Keep interval occupancy as the O1 allocator baseline.
+ Do not reintroduce point-index bitmap row OR/mark loops in O1. O2 can build
+ richer interval/event state for splitting if needed.
+
+- Add full CG corpus O1 validation for x64 and RV64.
+ The focused opt suite covers all three main targets, but full corpus O1 runs
+ should be part of release confidence for x64 and RV64.
+
+- Add stable targeted dumps for allocator and rewrite regressions.
+ Metrics catch scaling regressions, but compact dumps for live ranges,
+ assignments, and rewrite insertions would make code-quality changes easier to
+ review.
+
+- Keep `doc/PERF.md` current after each O1 code-quality pass.
+ The most useful before/after numbers are `.text` size, instruction counts by
+ mnemonic, O1 wall time, live-range time, regalloc time, spills, reloads, and
+ rewrite inserted instruction counts.
diff --git a/src/arch/aa64/emit.c b/src/arch/aa64/emit.c
@@ -80,6 +80,95 @@ static u32 aa_planned_prologue_words(const AAImpl *a) {
return n ? n : 1u;
}
+static void aa_func_begin_init(CGTarget *t, const CGFuncDesc *fd) { /* Entry setup shared by aa_func_begin and aa_func_begin_known_frame: selects the section and resets per-function AAImpl state. */
+ AAImpl *a = impl_of(t);
+ MCEmitter *mc = t->mc;
+
+ mc->set_section(mc, fd->text_section_id);
+ mc->emit_align(mc, 4, 0);
+
+ a->fd = fd;
+ a->func_start = mc->pos(mc); /* position taken after the alignment above */
+ a->next_param_int = 0;
+ a->next_param_fp = 0;
+ a->next_param_stack = 0;
+ a->has_sret = (fd->abi && fd->abi->has_sret) ? 1 : 0;
+ a->cum_off = 0;
+ a->max_outgoing = 0;
+ a->used_cs_int_mask = a->has_planned_regs ? a->planned_cs_int_mask : 0; /* seed the used masks from the planned masks when a plan is present */
+ a->used_cs_fp_mask = a->has_planned_regs ? a->planned_cs_fp_mask : 0;
+ a->prologue_words = a->has_planned_regs ? aa_planned_prologue_words(a) /* planned size when available, otherwise the default placeholder size */
+ : AA_PROLOGUE_WORDS;
+ a->planned_cs_int_mask = 0; /* the plan is consumed once per function and then cleared */
+ a->planned_cs_fp_mask = 0;
+ a->has_planned_regs = 0;
+ a->nslots = 0;
+ a->nscopes = 0;
+ a->has_alloca = 0;
+ a->known_frame = 0; /* aa_func_begin_known_frame sets this to 1 right after calling here */
+ a->nadd_patches = 0;
+ a->sret_ptr_slot = FRAME_SLOT_NONE;
+ a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0;
+ a->gp_save_slot = FRAME_SLOT_NONE;
+ a->fp_save_slot = FRAME_SLOT_NONE;
+ a->epilogue_label = mc->label_new(mc);
+
+ mc->cfi_startproc(mc);
+}
+
+static void aa_add_entry_frame_slots(CGTarget *t) { /* Allocates the frame slots needed at function entry: the sret pointer slot and, for variadic functions, the two register save areas. */
+ AAImpl *a = impl_of(t);
+
+ if (a->has_sret) {
+ FrameSlotDesc fsd = {
+ .type = CFREE_CG_TYPE_NONE,
+ .name = 0,
+ .loc = (SrcLoc){0, 0, 0},
+ .size = 8,
+ .align = 8,
+ .kind = FS_SPILL,
+ .flags = 0,
+ };
+ a->sret_ptr_slot = aa_frame_slot(t, &fsd); /* 8-byte slot; the prologue stores register 8 here */
+ }
+
+ if (a->is_variadic) {
+ FrameSlotDesc gpd = {
+ .type = CFREE_CG_TYPE_NONE,
+ .name = 0,
+ .loc = (SrcLoc){0, 0, 0},
+ .size = 64, /* 8 registers x 8 bytes, matching the stride in aa_emit_variadic_reg_saves */
+ .align = 8,
+ .kind = FS_SPILL,
+ .flags = 0,
+ };
+ a->gp_save_slot = aa_frame_slot(t, &gpd);
+ FrameSlotDesc fpd = {
+ .type = CFREE_CG_TYPE_NONE,
+ .name = 0,
+ .loc = (SrcLoc){0, 0, 0},
+ .size = 128, /* 8 registers x 16 bytes, matching the stride in aa_emit_variadic_reg_saves */
+ .align = 16,
+ .kind = FS_SPILL,
+ .flags = 0,
+ };
+ a->fp_save_slot = aa_frame_slot(t, &fpd);
+ }
+}
+
+static void aa_emit_variadic_reg_saves(CGTarget *t) { /* For variadic functions, emits stores of registers 0..7 of each class into the two save-area slots; no-op otherwise. */
+ AAImpl *a = impl_of(t);
+ MCEmitter *mc = t->mc;
+
+ if (!a->is_variadic) return;
+ AASlot *gs = aa64_slot_get(a, a->gp_save_slot); /* NOTE(review): gs and fs are dereferenced without a NULL check — assumes aa_add_entry_frame_slots ran first */
+ AASlot *fs = aa64_slot_get(a, a->fp_save_slot);
+ for (u32 i = 0; i < 8; ++i) /* presumably x0-x7: 8-byte stride, addressed relative to register 29 */
+ aa64_emit32(mc, aa64_stur(3, i, 29, -(i32)gs->off + (i32)i * 8));
+ for (u32 i = 0; i < 8; ++i) /* presumably the FP argument registers 0-7: 16-byte stride */
+ aa64_emit32(mc, aa64_stur_fp(3, i, 29, -(i32)fs->off + (i32)i * 16));
+}
+
/* ============================================================
* Low-level emission
* ============================================================ */
@@ -170,104 +259,121 @@ void aa_func_begin(CGTarget *t, const CGFuncDesc *fd) {
AAImpl *a = impl_of(t);
MCEmitter *mc = t->mc;
- mc->set_section(mc, fd->text_section_id);
- mc->emit_align(mc, 4, 0);
-
- a->fd = fd;
- a->func_start = mc->pos(mc);
- a->next_param_int = 0;
- a->next_param_fp = 0;
- a->next_param_stack = 0;
- a->has_sret = (fd->abi && fd->abi->has_sret) ? 1 : 0;
- a->cum_off = 0;
- a->max_outgoing = 0;
- a->used_cs_int_mask = a->has_planned_regs ? a->planned_cs_int_mask : 0;
- a->used_cs_fp_mask = a->has_planned_regs ? a->planned_cs_fp_mask : 0;
- a->prologue_words = a->has_planned_regs ? aa_planned_prologue_words(a)
- : AA_PROLOGUE_WORDS;
- a->planned_cs_int_mask = 0;
- a->planned_cs_fp_mask = 0;
- a->has_planned_regs = 0;
- a->nslots = 0;
- a->nscopes = 0;
- a->has_alloca = 0;
- a->nadd_patches = 0;
- a->sret_ptr_slot = FRAME_SLOT_NONE;
- a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0;
- a->gp_save_slot = FRAME_SLOT_NONE;
- a->fp_save_slot = FRAME_SLOT_NONE;
- a->epilogue_label = mc->label_new(mc);
-
- mc->cfi_startproc(mc);
+ aa_func_begin_init(t, fd);
a->prologue_pos = mc->pos(mc);
for (u32 i = 0; i < a->prologue_words; ++i)
aa64_emit32(mc, AA64_NOP);
- if (a->has_sret) {
- FrameSlotDesc fsd = {
- .type = CFREE_CG_TYPE_NONE,
- .name = 0,
- .loc = (SrcLoc){0, 0, 0},
- .size = 8,
- .align = 8,
- .kind = FS_SPILL,
- .flags = 0,
- };
- a->sret_ptr_slot = aa_frame_slot(t, &fsd);
- }
+ aa_add_entry_frame_slots(t);
+ aa_emit_variadic_reg_saves(t);
+}
- if (a->is_variadic) {
- FrameSlotDesc gpd = {
- .type = CFREE_CG_TYPE_NONE,
- .name = 0,
- .loc = (SrcLoc){0, 0, 0},
- .size = 64,
- .align = 8,
- .kind = FS_SPILL,
- .flags = 0,
- };
- a->gp_save_slot = aa_frame_slot(t, &gpd);
- FrameSlotDesc fpd = {
- .type = CFREE_CG_TYPE_NONE,
- .name = 0,
- .loc = (SrcLoc){0, 0, 0},
- .size = 128,
- .align = 16,
- .kind = FS_SPILL,
- .flags = 0,
- };
- a->fp_save_slot = aa_frame_slot(t, &fpd);
- AASlot *gs = aa64_slot_get(a, a->gp_save_slot);
- AASlot *fs = aa64_slot_get(a, a->fp_save_slot);
- for (u32 i = 0; i < 8; ++i) {
- aa64_emit32(mc, aa64_stur(3, i, 29, -(i32)gs->off + (i32)i * 8));
- }
- for (u32 i = 0; i < 8; ++i) {
- aa64_emit32(mc, aa64_stur_fp(3, i, 29, -(i32)fs->off + (i32)i * 16));
+static u32 aa_build_prologue(CGTarget *t, u32 *words, u32 cap, u32 frame_size, /* Writes prologue instruction words into words[0..cap); returns the count written and panics if cap would be exceeded. */
+ u32 fp_lr_off, u32 int_save_off, u32 fp_save_off,
+ const u32 *int_regs, u32 n_int_saves,
+ const u32 *fp_regs, u32 n_fp_saves) {
+ AAImpl *a = impl_of(t);
+ u32 wi = 0; /* next free index in words[] */
+
+ if (frame_size <= 0xfff) { /* frame size fits a single 12-bit immediate */
+ if (wi >= cap) goto overflow;
+ words[wi++] = aa64_sub_imm(1, 31, 31, frame_size, 0);
+ } else if ((frame_size & 0xfff) == 0 && (frame_size >> 12) <= 0xfff) { /* low 12 bits clear: one sub with the last argument set (presumably the shift-by-12 form — confirm) */
+ if (wi >= cap) goto overflow;
+ words[wi++] = aa64_sub_imm(1, 31, 31, frame_size >> 12, 1);
+ } else { /* two subs: bits 12..23 first, then the low 12 bits */
+ if (wi + 2 > cap) goto overflow;
+ words[wi++] = aa64_sub_imm(1, 31, 31, (frame_size >> 12) & 0xfff, 1); /* NOTE(review): bits above 23 of frame_size are masked off here without a diagnostic */
+ words[wi++] = aa64_sub_imm(1, 31, 31, frame_size & 0xfff, 0);
+ }
+ if (wi + 2 > cap) goto overflow; /* room for the stp + add pair below */
+ words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off); /* store registers 29 and 30 at register 31 + fp_lr_off (presumably FP/LR relative to SP) */
+ words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0); /* register 29 = register 31 + fp_lr_off */
+ if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) {
+ AASlot *s = aa64_slot_get(a, a->sret_ptr_slot);
+ if (s) {
+ if (wi >= cap) goto overflow;
+ words[wi++] = aa64_stur(3, 8, 29, -(i32)s->off); /* store register 8 into the sret pointer slot, addressed below register 29 */
}
}
+ for (u32 i = 0; i < n_int_saves; ++i) { /* integer saves: 8-byte stride from int_save_off, relative to register 31 */
+ if (wi >= cap) goto overflow;
+ words[wi++] = aa64_str_uimm(3, int_regs[i], 31, int_save_off + i * 8u);
+ }
+ for (u32 i = 0; i < n_fp_saves; ++i) { /* FP saves: 8 bytes per register from fp_save_off */
+ if (wi >= cap) goto overflow;
+ words[wi++] = aa64_str_fp_uimm(3, fp_regs[i], 31, fp_save_off + i * 8u);
+ }
+ return wi;
+
+overflow: /* shared failure exit for every capacity check above */
+ compiler_panic(t->c, a->loc,
+ "aarch64: prologue too small (used more than %u words)", cap);
+ return 0; /* NOTE(review): presumably unreachable if compiler_panic does not return — confirm */
+}
+
+static void aa_compute_frame(const AAImpl *a, u32 n_int_saves, u32 n_fp_saves, /* Computes the frame layout offsets and the 16-byte-aligned total frame size from a->max_outgoing and a->cum_off. */
+ u32 *int_save_off, u32 *fp_save_off,
+ u32 *fp_lr_off, u32 *frame_size) {
+ *int_save_off = a->max_outgoing; /* integer saves start right after the outgoing call area */
+ *fp_save_off = *int_save_off + n_int_saves * 8u; /* FP saves follow, 8 bytes per integer save */
+ u32 locals_off = *fp_save_off + n_fp_saves * 8u; /* locals follow the FP saves */
+ *fp_lr_off = locals_off + a->cum_off; /* provisional value, replaced after rounding below */
+ *frame_size = *fp_lr_off + 16; /* plus a 16-byte pair stored at fp_lr_off */
+ *frame_size = (*frame_size + 15u) & ~15u; /* round the total up to a multiple of 16 */
+ *fp_lr_off = *frame_size - 16; /* the pair occupies the top 16 bytes of the rounded frame */
+}
+
+void aa_func_begin_known_frame(CGTarget *t, const CGFuncDesc *fd, /* Function entry that allocates all described slots and emits the final prologue immediately instead of NOP placeholders. */
+ const CGKnownFrameDesc *frame,
+ FrameSlot *out_slots) {
+ AAImpl *a = impl_of(t);
+ u32 int_regs[10]; /* sized for the 19..28 range collected below */
+ u32 fp_regs[8]; /* sized for the 8..15 range collected below */
+ u32 int_save_off, fp_save_off, fp_lr_off, frame_size;
+ u32 words[AA_PROLOGUE_WORDS];
+
+ aa_func_begin_init(t, fd);
+ a->known_frame = 1; /* must follow aa_func_begin_init, which resets the flag to 0 */
+ aa_add_entry_frame_slots(t); /* entry slots are allocated before the caller-described slots */
+ for (u32 i = 0; frame && i < frame->nslots; ++i) { /* a NULL frame allocates no extra slots */
+ FrameSlot fs = aa_frame_slot(t, &frame->slots[i]);
+ if (out_slots) out_slots[i] = fs; /* report the target slot id for descriptor i */
+ }
+ if (frame) {
+ a->max_outgoing = frame->max_outgoing; /* fixed up front; aa_call panics if a later call needs more */
+ a->has_alloca = frame->has_alloca ? 1u : 0u;
+ }
+
+ u32 n_int_saves = collect_mask_regs(a->used_cs_int_mask, 19u, 28u, int_regs);
+ u32 n_fp_saves = collect_mask_regs(a->used_cs_fp_mask, 8u, 15u, fp_regs);
+ aa_compute_frame(a, n_int_saves, n_fp_saves, &int_save_off, &fp_save_off,
+ &fp_lr_off, &frame_size); /* NOTE(review): aa_func_end recomputes this layout, so cum_off and the used masks presumably must not change during the body — confirm */
+
+ a->prologue_pos = t->mc->pos(t->mc);
+ u32 nwords = aa_build_prologue(t, words, AA_PROLOGUE_WORDS, frame_size,
+ fp_lr_off, int_save_off, fp_save_off,
+ int_regs, n_int_saves, fp_regs, n_fp_saves);
+ for (u32 i = 0; i < nwords; ++i) /* emit only the words actually built; no NOP padding */
+ aa64_emit32(t->mc, words[i]);
+ aa_emit_variadic_reg_saves(t); /* emitted after the prologue words */
}
void aa_func_end(CGTarget *t) {
AAImpl *a = impl_of(t);
MCEmitter *mc = t->mc;
+ ObjBuilder *obj = t->obj;
+ u32 sec = a->fd->text_section_id;
u32 int_regs[10];
u32 fp_regs[8];
u32 n_int_saves = collect_mask_regs(a->used_cs_int_mask, 19u, 28u, int_regs);
u32 n_fp_saves = collect_mask_regs(a->used_cs_fp_mask, 8u, 15u, fp_regs);
- u32 outgoing_off = 0;
- u32 int_save_off = a->max_outgoing;
- u32 fp_save_off = int_save_off + n_int_saves * 8u;
- u32 locals_off = fp_save_off + n_fp_saves * 8u;
- u32 fp_lr_off = locals_off + a->cum_off;
- u32 frame_size = fp_lr_off + 16;
- frame_size = (frame_size + 15u) & ~15u;
- fp_lr_off = frame_size - 16;
-
- (void)outgoing_off;
+ u32 int_save_off, fp_save_off, fp_lr_off, frame_size;
+ aa_compute_frame(a, n_int_saves, n_fp_saves, &int_save_off, &fp_save_off,
+ &fp_lr_off, &frame_size);
mc->label_place(mc, a->epilogue_label);
@@ -293,62 +399,18 @@ void aa_func_end(CGTarget *t) {
emit_sp_add(mc, frame_size);
aa64_emit32(mc, aa64_ret(AA64_LR));
- u32 pos = a->prologue_pos;
- ObjBuilder *obj = t->obj;
- u32 sec = a->fd->text_section_id;
-
- u32 words[AA_PROLOGUE_WORDS];
- u32 prologue_words = a->prologue_words ? a->prologue_words
- : AA_PROLOGUE_WORDS;
- for (u32 i = 0; i < prologue_words; ++i)
- words[i] = AA64_NOP;
- u32 wi = 0;
-
- if (frame_size <= 0xfff) {
- words[wi++] = aa64_sub_imm(1, 31, 31, frame_size, 0);
- } else if ((frame_size & 0xfff) == 0 && (frame_size >> 12) <= 0xfff) {
- words[wi++] = aa64_sub_imm(1, 31, 31, frame_size >> 12, 1);
- } else {
- if (wi + 2 > prologue_words) {
- compiler_panic(t->c, a->loc,
- "aarch64: prologue overflow for frame_size %u",
- frame_size);
- }
- words[wi++] = aa64_sub_imm(1, 31, 31, (frame_size >> 12) & 0xfff, 1);
- words[wi++] = aa64_sub_imm(1, 31, 31, frame_size & 0xfff, 0);
- }
- words[wi++] = aa64_stp_x(29, 30, 31, (i32)fp_lr_off);
- words[wi++] = aa64_add_imm(1, 29, 31, fp_lr_off, 0);
- if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) {
- AASlot *s = aa64_slot_get(a, a->sret_ptr_slot);
- if (s) {
- if (wi >= prologue_words)
- goto overflow;
- words[wi++] = aa64_stur(3, 8, 29, -(i32)s->off);
- }
- }
- for (u32 i = 0; i < n_int_saves; ++i) {
- u32 r0 = int_regs[i];
- if (wi >= prologue_words)
- goto overflow;
- words[wi++] = aa64_str_uimm(3, r0, 31, int_save_off + i * 8u);
- }
- for (u32 i = 0; i < n_fp_saves; ++i) {
- u32 r0 = fp_regs[i];
- if (wi >= prologue_words)
- goto overflow;
- words[wi++] = aa64_str_fp_uimm(3, r0, 31, fp_save_off + i * 8u);
- }
- if (0) {
- overflow:
- compiler_panic(
- t->c, a->loc,
- "aarch64: prologue placeholder too small (used %u of %u words)", wi,
- prologue_words);
- }
-
- for (u32 i = 0; i < prologue_words; ++i) {
- aa64_patch32(obj, sec, pos + i * 4u, words[i]);
+ if (!a->known_frame) {
+ u32 pos = a->prologue_pos;
+ u32 words[AA_PROLOGUE_WORDS];
+ u32 prologue_words = a->prologue_words ? a->prologue_words
+ : AA_PROLOGUE_WORDS;
+ for (u32 i = 0; i < prologue_words; ++i)
+ words[i] = AA64_NOP;
+ (void)aa_build_prologue(t, words, prologue_words, frame_size, fp_lr_off,
+ int_save_off, fp_save_off, int_regs, n_int_saves,
+ fp_regs, n_fp_saves);
+ for (u32 i = 0; i < prologue_words; ++i)
+ aa64_patch32(obj, sec, pos + i * 4u, words[i]);
}
if (a->max_outgoing > 0xfff) {
diff --git a/src/arch/aa64/internal.h b/src/arch/aa64/internal.h
@@ -232,6 +232,8 @@ typedef struct AAImpl {
u32 prologue_pos;
u32 prologue_words;
MCLabel epilogue_label;
+ u8 known_frame;
+ u8 pad0[3];
AASlot* slots;
u32 nslots;
@@ -285,6 +287,9 @@ void emit_global_addr(CGTarget* t, u32 dst_reg, ObjSymId sym, i64 addend);
/* emit.c public surface */
FrameSlot aa_frame_slot(CGTarget* t, const FrameSlotDesc* d);
void aa_func_begin(CGTarget* t, const CGFuncDesc* fd);
+void aa_func_begin_known_frame(CGTarget* t, const CGFuncDesc* fd,
+ const CGKnownFrameDesc* frame,
+ FrameSlot* out_slots);
void aa_func_end(CGTarget* t);
void aa_param(CGTarget* t, const CGParamDesc* p);
diff --git a/src/arch/aa64/ops.c b/src/arch/aa64/ops.c
@@ -921,6 +921,60 @@ static void emit_arg_value(CGTarget* t, const ABIFuncInfo* fi,
}
}
+static void count_arg_stack(const ABIFuncInfo* fi, const CGABIValue* av, /* Advances the register and stack counters for one call argument; emits no code. */
+ u32* next_int, u32* next_fp, u32* stack_off) {
+ ABIArgInfo va_ai;
+ ABIArgPart va_pt;
+ const ABIArgInfo* ai = av->abi;
+ if (!ai) { /* no ABI info: synthesize one direct part from the value's type and storage class (presumably an unnamed variadic argument — confirm) */
+ u32 sz = type_byte_size(av->type);
+ memset(&va_ai, 0, sizeof va_ai);
+ memset(&va_pt, 0, sizeof va_pt);
+ va_ai.kind = ABI_ARG_DIRECT;
+ va_ai.parts = &va_pt;
+ va_ai.nparts = 1;
+ va_pt.cls = (av->storage.cls == RC_FP) ? ABI_CLASS_FP : ABI_CLASS_INT;
+ va_pt.size = sz;
+ va_pt.align = sz;
+ va_pt.src_offset = 0;
+ ai = &va_ai;
+ if (fi && fi->vararg_on_stack) { /* mark all 8 registers of both classes as used so this argument is counted on the stack */
+ *next_int = 8;
+ *next_fp = 8;
+ }
+ }
+ if (ai->kind == ABI_ARG_IGNORE) return; /* ignored arguments consume nothing */
+ if (ai->kind == ABI_ARG_INDIRECT) { /* indirect: one integer register, or 8 stack bytes once 8 are in use */
+ if (*next_int < 8)
+ ++*next_int;
+ else
+ *stack_off += 8;
+ return;
+ }
+ for (u16 i = 0; i < ai->nparts; ++i) { /* each INT/FP part takes one register of its class or 8 stack bytes; other classes are not counted */
+ const ABIArgPart* pt = &ai->parts[i];
+ if (pt->cls == ABI_CLASS_INT) {
+ if (*next_int < 8)
+ ++*next_int;
+ else
+ *stack_off += 8; /* NOTE(review): always 8 bytes regardless of pt->size — presumably mirrors aa_call's accounting; confirm */
+ } else if (pt->cls == ABI_CLASS_FP) {
+ if (*next_fp < 8)
+ ++*next_fp;
+ else
+ *stack_off += 8;
+ }
+ }
+}
+
+static u32 aa_call_stack_size(CGTarget* t, const CGCallDesc* d) { /* Returns the stack bytes counted for this call's arguments, rounded up to a multiple of 16. */
+ (void)t; /* target state is not consulted */
+ u32 next_int = 0, next_fp = 0, stack_off = 0;
+ for (u32 i = 0; i < d->nargs; ++i) /* counters carry across arguments in order */
+ count_arg_stack(d->abi, &d->args[i], &next_int, &next_fp, &stack_off);
+ return (stack_off + 15u) & ~15u; /* same rounding aa_call applies to its own stack_off */
+}
+
static void aa_call(CGTarget* t, const CGCallDesc* d) {
AAImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
@@ -942,7 +996,13 @@ static void aa_call(CGTarget* t, const CGCallDesc* d) {
}
u32 needed = (stack_off + 15u) & ~15u;
- if (needed > a->max_outgoing) a->max_outgoing = needed;
+ if (needed > a->max_outgoing) {
+ if (a->known_frame) {
+ compiler_panic(t->c, a->loc,
+ "aarch64 call: known frame outgoing area too small");
+ }
+ a->max_outgoing = needed;
+ }
if (d->callee.kind == OPK_GLOBAL) {
u32 bl_pos = mc->pos(mc);
@@ -1871,6 +1931,7 @@ CGTarget* aa64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) {
t->mc = m;
t->func_begin = aa_func_begin;
+ t->func_begin_known_frame = aa_func_begin_known_frame;
t->func_end = aa_func_end;
t->frame_slot = aa_frame_slot;
t->param = aa_param;
@@ -1892,6 +1953,7 @@ CGTarget* aa64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) {
t->convert = aa_convert;
t->call = aa_call;
+ t->call_stack_size = aa_call_stack_size;
t->ret = aa_ret;
t->alloca_ = aa_alloca_;
diff --git a/src/arch/arch.h b/src/arch/arch.h
@@ -336,6 +336,14 @@ typedef struct CGFuncDesc {
u32 flags; /* CGFuncDescFlag */
} CGFuncDesc;
+typedef struct CGKnownFrameDesc { /* Frame facts passed to CGTarget.func_begin_known_frame before body emission. */
+ const FrameSlotDesc* slots; /* slot descriptors; the target allocates one FrameSlot per entry, in order */
+ u32 nslots; /* number of entries in slots */
+ u32 max_outgoing; /* outgoing call argument area size (presumably bytes, as returned by call_stack_size — confirm) */
+ u8 has_alloca; /* nonzero when the function uses dynamic stack allocation */
+ u8 pad[3];
+} CGKnownFrameDesc;
+
typedef enum CGCallFlag {
CG_CALL_NONE = 0,
/* Sibling call. The target emits the caller's epilogue, transfers
@@ -487,6 +495,12 @@ struct CGTarget {
void (*func_begin)(CGTarget*, const CGFuncDesc*);
void (*func_end)(CGTarget*);
+ /* Optional fast path for optimized emitters that know all frame slots and
+ * outgoing call area needs before body emission. `out_slots`, when non-NULL,
+ * has `frame->nslots` entries and receives target FrameSlot ids in order. */
+ void (*func_begin_known_frame)(CGTarget*, const CGFuncDesc*,
+ const CGKnownFrameDesc*, FrameSlot* out_slots);
+
/* ---- frame slots and spill/reload ----
* CG and opt allocate caller-visible registers and pass concrete Operand
* regs to the target. Plain machine targets consume hard regs; opt_cgtarget
@@ -530,6 +544,10 @@ struct CGTarget {
* Direct CG and opt both call this after emitting hard-register operands. */
void (*reserve_hard_regs)(CGTarget*, RegClass, const Reg* regs, u32 n);
+ /* Return the outgoing stack argument area needed by this call after target
+ * ABI routing/alignment. Optional; only needed by known-frame emitters. */
+ u32 (*call_stack_size)(CGTarget*, const CGCallDesc*);
+
/* ---- labels and control flow ---- */
Label (*label_new)(CGTarget*);
void (*label_place)(CGTarget*, Label);
diff --git a/src/arch/rv64/emit.c b/src/arch/rv64/emit.c
@@ -125,7 +125,16 @@ void emit_sp_addi(MCEmitter *mc, i64 imm) {
/* ---- function lifecycle ---- */
-void rv_func_begin(CGTarget *t, const CGFuncDesc *fd) {
+typedef struct RvFrameLayout {
+  u32 max_out;       /* max_outgoing rounded up to a multiple of 16 */
+  u32 fp_saves_sz;   /* bytes of fp register saves (8 per register) */
+  u32 fp_pair_off;   /* sp-relative offset of the saved s0/ra pair */
+  u32 frame_size;    /* total sp adjustment, a multiple of 16 */
+  i32 fp_save_base;  /* s0-relative offset of the first fp save */
+  i32 int_save_base; /* s0-relative offset of the first int save */
+} RvFrameLayout;
+
+static void rv_func_begin_init(CGTarget *t, const CGFuncDesc *fd) {
RImpl *a = impl_of(t);
MCEmitter *mc = t->mc;
@@ -139,6 +148,7 @@ void rv_func_begin(CGTarget *t, const CGFuncDesc *fd) {
a->next_param_stack = 0;
a->has_sret = (fd->abi && fd->abi->has_sret) ? 1 : 0;
a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0;
+ a->known_frame = 0;
a->cum_off = 0;
a->max_outgoing = 0;
a->fp_pair_off = 0;
@@ -158,11 +168,10 @@ void rv_func_begin(CGTarget *t, const CGFuncDesc *fd) {
a->epilogue_label = mc->label_new(mc);
mc->cfi_startproc(mc);
+}
- /* Reserve a NOP-filled prologue placeholder; func_end patches it. */
- a->prologue_pos = mc->pos(mc);
- for (u32 i = 0; i < a->prologue_words; ++i)
- rv64_emit32(mc, RV_NOP);
+static void rv_add_entry_frame_slots(CGTarget *t) {
+ RImpl *a = impl_of(t);
/* For an sret return, the caller passed the destination pointer in
* a0; reserve a hidden slot to spill it into so the body can use a0
@@ -191,6 +200,170 @@ void rv_func_begin(CGTarget *t, const CGFuncDesc *fd) {
* is_variadic is set. */
}
+static void rv_compute_frame(const RImpl *a, u32 n_int_saves, u32 n_fp_saves,
+                             RvFrameLayout *fl) {
+  const u32 out_area = (a->max_outgoing + 15u) & ~15u;
+  const u32 int_bytes = n_int_saves * 8u;
+  const u32 fp_bytes = n_fp_saves * 8u;
+  /* A variadic function keeps a 64-byte register save area at the very top
+   * of its frame, so that area and the caller's stack arguments form one
+   * contiguous byte stream for the va_list pointer to walk. */
+  const u32 va_bytes = a->is_variadic ? 64u : 0u;
+  /* Everything below the s0/ra pair, then the pair and the va area, rounded
+   * up to 16; the pair offset is re-derived from the rounded total. */
+  const u32 below_pair = out_area + int_bytes + fp_bytes + a->cum_off;
+  const u32 total = (below_pair + 16u + va_bytes + 15u) & ~15u;
+  /* Saves use 8-byte cells below the locals: the first fp save sits at
+   * s0 - (cum_off + 8), later ones 8 bytes lower each, ints below the fps. */
+  const i32 fp_base = -(i32)a->cum_off - 8;
+  fl->max_out = out_area;
+  fl->fp_saves_sz = fp_bytes;
+  fl->frame_size = total;
+  fl->fp_pair_off = total - 16u - va_bytes;
+  fl->fp_save_base = fp_base;
+  fl->int_save_base = fp_base - (i32)fp_bytes;
+}
+
+static u32 rv_variadic_first_saved_int(const CGFuncDesc *fd) {
+  /* Replays register assignment for the named parameters; the result is
+   * the index of the first a-register that none of them claimed. */
+  u32 int_taken = (fd->abi && fd->abi->has_sret) ? 1u : 0u, fp_taken = 0;
+  for (u32 p = 0; p < fd->nparams; ++p) {
+    const ABIArgInfo *info = fd->params[p].abi;
+    if (!info || info->kind == ABI_ARG_IGNORE) continue;
+    if (info->kind == ABI_ARG_INDIRECT) { /* counted as one int register */
+      if (int_taken < 8) ++int_taken;
+      continue;
+    }
+    for (u16 k = 0; k < info->nparts; ++k) {
+      switch (info->parts[k].cls) {
+      case ABI_CLASS_INT: if (int_taken < 8) ++int_taken; break;
+      case ABI_CLASS_FP: if (fp_taken < 8) ++fp_taken; break;
+      default: break;
+      }
+    }
+  }
+  return int_taken;
+}
+
+/* Encode the prologue into words[0..cap) and return the word count. Panics
+ * if `cap` is too small or a frame offset does not fit its immediate. */
+static u32 rv_build_prologue(CGTarget *t, u32 *words, u32 cap,
+                             const RvFrameLayout *fl, const u32 *int_regs,
+                             u32 n_int_saves, const u32 *fp_regs,
+                             u32 n_fp_saves, u32 variadic_first_int) {
+  RImpl *a = impl_of(t);
+  u32 wi = 0;
+
+  /* addi sp, sp, -frame_size (or lui/addiw/add via t0 if too large) */
+  if ((i64)fl->frame_size <= 2048) {
+    if (wi >= cap) goto overflow;
+    words[wi++] = rv_addi(RV_SP, RV_SP, -(i32)fl->frame_size);
+  } else {
+    i64 neg = -(i64)fl->frame_size;
+    if (!fits_signed32(neg))
+      compiler_panic(t->c, a->loc, "rv64: frame_size too large to patch");
+    /* Split in unsigned arithmetic. `neg` is negative here, so the 20-bit
+     * hi field always has its top bit set and `hi << 12` evaluated as an
+     * i32 would overflow, which is undefined behaviour for signed types. */
+    u32 hi = ((u32)(i32)neg + 0x800u) >> 12;
+    i32 lo = (i32)((u32)(i32)neg & 0xfffu);
+    if (lo >= 0x800) lo -= 0x1000; /* sign-extend the low 12 bits */
+    if (wi >= cap) goto overflow;
+    words[wi++] = rv_lui(RV_T0, hi & 0xfffffu);
+    if (lo) {
+      if (wi >= cap) goto overflow;
+      words[wi++] = rv_addiw(RV_T0, RV_T0, lo);
+    }
+    if (wi >= cap) goto overflow;
+    words[wi++] = rv_add(RV_SP, RV_SP, RV_T0);
+  }
+
+  /* Save s0/ra at fp_pair_off(sp), then point s0 at the saved pair. */
+  if ((i32)fl->fp_pair_off > 2047 || (i32)(fl->fp_pair_off + 8) > 2047)
+    compiler_panic(t->c, a->loc, "rv64: fp_pair_off out of imm12 range");
+  if (wi + 3 > cap) goto overflow;
+  words[wi++] = rv_sd(RV_S0, RV_SP, (i32)fl->fp_pair_off);
+  words[wi++] = rv_sd(RV_RA, RV_SP, (i32)fl->fp_pair_off + 8);
+  words[wi++] = rv_addi(RV_S0, RV_SP, (i32)fl->fp_pair_off);
+  /* If sret, spill incoming a0 into the hidden slot. */
+  if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) {
+    RvSlot *s = rv64_slot_get(a, a->sret_ptr_slot);
+    if (s) {
+      if (wi >= cap) goto overflow;
+      words[wi++] = rv_sd(RV_A0, RV_S0, -(i32)s->off);
+    }
+  }
+  /* Variadic: spill the still-unconsumed a-regs into the save area. */
+  if (a->is_variadic) {
+    for (u32 i = variadic_first_int; i < 8; ++i) {
+      if (wi >= cap) goto overflow;
+      words[wi++] = rv_sd(RV_A0 + i, RV_S0, 16 + (i32)i * 8);
+    }
+  }
+  for (u32 i = 0; i < n_int_saves; ++i) {
+    if (wi >= cap) goto overflow;
+    words[wi++] = rv_sd(int_regs[i], RV_S0, fl->int_save_base - 8 * (i32)i);
+  }
+  for (u32 i = 0; i < n_fp_saves; ++i) {
+    if (wi >= cap) goto overflow;
+    words[wi++] = rv_fsd(fp_regs[i], RV_S0, fl->fp_save_base - 8 * (i32)i);
+  }
+  return wi;
+
+overflow:
+  compiler_panic(t->c, a->loc,
+                 "rv64: prologue placeholder too small (cap %u)", cap);
+  return 0;
+}
+
+void rv_func_begin(CGTarget *t, const CGFuncDesc *fd) {
+  RImpl *impl = impl_of(t);
+  MCEmitter *em = t->mc;
+
+  rv_func_begin_init(t, fd);
+  /* Hold the prologue's place with NOPs for now; func_end patches the real
+   * instructions in once the body has been emitted. */
+  impl->prologue_pos = em->pos(em);
+  for (u32 left = impl->prologue_words; left != 0; --left)
+    rv64_emit32(em, RV_NOP);
+
+  rv_add_entry_frame_slots(t);
+}
+
+void rv_func_begin_known_frame(CGTarget *t, const CGFuncDesc *fd,
+                               const CGKnownFrameDesc *frame,
+                               FrameSlot *out_slots) {
+  RImpl *impl = impl_of(t);
+  MCEmitter *em = t->mc;
+  u32 int_saved[10], fp_saved[10];
+  u32 prologue[RV_PROLOGUE_WORDS];
+  RvFrameLayout layout;
+
+  rv_func_begin_init(t, fd);
+  impl->known_frame = 1; /* rv_func_end skips placeholder patching */
+  rv_add_entry_frame_slots(t);
+  /* Create the caller's slots; ids go back in frame->slots order. */
+  if (frame) {
+    for (u32 s = 0; s < frame->nslots; ++s) {
+      FrameSlot id = rv_frame_slot(t, frame->slots + s);
+      if (out_slots) out_slots[s] = id;
+    }
+    impl->max_outgoing = frame->max_outgoing;
+    impl->has_alloca = frame->has_alloca ? 1u : 0u;
+  }
+
+  u32 n_int = collect_mask_regs(impl->used_cs_int_mask, 18u, 27u, int_saved);
+  u32 n_fp = collect_mask_regs(impl->used_cs_fp_mask, 18u, 27u, fp_saved);
+  rv_compute_frame(impl, n_int, n_fp, &layout);
+  impl->fp_pair_off = layout.fp_pair_off;
+  impl->prologue_pos = em->pos(em);
+  u32 n = rv_build_prologue(t, prologue, RV_PROLOGUE_WORDS, &layout, int_saved,
+                            n_int, fp_saved, n_fp,
+                            rv_variadic_first_saved_int(fd));
+  for (u32 w = 0; w < n; ++w) rv64_emit32(em, prologue[w]);
+}
+
void rv_func_end(CGTarget *t) {
RImpl *a = impl_of(t);
MCEmitter *mc = t->mc;
@@ -201,20 +374,9 @@ void rv_func_end(CGTarget *t) {
u32 fp_regs[10];
u32 n_int_saves = collect_mask_regs(a->used_cs_int_mask, 18u, 27u, int_regs);
u32 n_fp_saves = collect_mask_regs(a->used_cs_fp_mask, 18u, 27u, fp_regs);
- u32 max_out = (a->max_outgoing + 15u) & ~15u;
- u32 int_saves_sz = n_int_saves * 8u;
- u32 fp_saves_sz = n_fp_saves * 8u;
-
- /* Variadic functions reserve a 64-byte save area at the very top of
- * the frame so the save area and caller's stack args form a single
- * contiguous byte stream walked by the va_list pointer. */
- u32 va_save_sz = a->is_variadic ? 64u : 0u;
- u32 locals_off = max_out + int_saves_sz + fp_saves_sz; /* from sp */
- u32 fp_pair_off = locals_off + a->cum_off;
- u32 frame_size = fp_pair_off + 16u + va_save_sz;
- frame_size = (frame_size + 15u) & ~15u;
- fp_pair_off = frame_size - 16u - va_save_sz;
- a->fp_pair_off = fp_pair_off;
+ RvFrameLayout fl;
+ rv_compute_frame(a, n_int_saves, n_fp_saves, &fl);
+ a->fp_pair_off = fl.fp_pair_off;
/* Place the epilogue label at current pos. */
mc->label_place(mc, a->epilogue_label);
@@ -244,127 +406,54 @@ void rv_func_end(CGTarget *t) {
* area. fp_save_base = offset of the first fp save (=-(L+8)); each
* subsequent save is 8 bytes lower. int saves start below the fp
* block. */
- i32 fp_save_base = -(i32)a->cum_off - 8;
- i32 int_save_base = fp_save_base - (i32)fp_saves_sz;
-
/* Reverse order: ints first (lowest address) on restore, but we emit
* the restore loop in reverse to keep the prologue/epilogue symmetric. */
for (i32 i = (i32)n_int_saves - 1; i >= 0; --i) {
u32 r = int_regs[i];
- i32 off = int_save_base - 8 * (i32)i;
+ i32 off = fl.int_save_base - 8 * (i32)i;
rv64_emit32(mc, rv_ld(r, RV_S0, off));
}
for (i32 i = (i32)n_fp_saves - 1; i >= 0; --i) {
u32 r = fp_regs[i];
- i32 off = fp_save_base - 8 * (i32)i;
+ i32 off = fl.fp_save_base - 8 * (i32)i;
rv64_emit32(mc, rv_fld(r, RV_S0, off));
}
/* Restore sp from s0 first so alloca-induced offsets don't matter.
* After this, sp == its post-prologue value. */
if (a->has_alloca) {
- if ((i32)fp_pair_off > 2047) {
+ if ((i32)fl.fp_pair_off > 2047) {
compiler_panic(t->c, a->loc, "rv64: fp_pair_off too large for alloca");
}
- rv64_emit32(mc, rv_addi(RV_SP, RV_S0, -(i32)fp_pair_off));
+ rv64_emit32(mc, rv_addi(RV_SP, RV_S0, -(i32)fl.fp_pair_off));
}
- rv64_emit32(mc, rv_ld(RV_S0, RV_SP, (i32)fp_pair_off));
- rv64_emit32(mc, rv_ld(RV_RA, RV_SP, (i32)fp_pair_off + 8));
- emit_sp_addi(mc, (i64)frame_size);
+ rv64_emit32(mc, rv_ld(RV_S0, RV_SP, (i32)fl.fp_pair_off));
+ rv64_emit32(mc, rv_ld(RV_RA, RV_SP, (i32)fl.fp_pair_off + 8));
+ emit_sp_addi(mc, (i64)fl.frame_size);
rv64_emit32(mc, rv_ret_());
- /* Now patch the prologue placeholder. */
- u32 pos = a->prologue_pos;
- u32 words[RV_PROLOGUE_WORDS];
- u32 prologue_words = a->prologue_words ? a->prologue_words
- : RV_PROLOGUE_WORDS;
- for (u32 i = 0; i < prologue_words; ++i)
- words[i] = RV_NOP;
- u32 wi = 0;
-
- /* addi sp, sp, -frame_size (or 2-insn if too large) */
- if ((i64)frame_size <= 2048) {
- words[wi++] = rv_addi(RV_SP, RV_SP, -(i32)frame_size);
- } else {
- /* li t0, -frame_size; add sp, sp, t0 */
- /* Use a small two-instruction expansion via LUI+ADDI if it fits 32-bit;
- * otherwise we'd need a full load_imm but that's overkill for tests. */
- i64 neg = -(i64)frame_size;
- if (fits_signed32(neg)) {
- i32 hi = (i32)((u32)((i32)neg + 0x800) >> 12);
- i32 lo = (i32)neg - (hi << 12);
- words[wi++] = rv_lui(RV_T0, (u32)hi & 0xfffffu);
- if (lo)
- words[wi++] = rv_addiw(RV_T0, RV_T0, lo);
- words[wi++] = rv_add(RV_SP, RV_SP, RV_T0);
- } else {
- compiler_panic(t->c, a->loc, "rv64: frame_size too large to patch");
- }
- }
- /* sd s0, fp_pair_off(sp); sd ra, fp_pair_off+8(sp); addi s0, sp, fp_pair_off
- */
- if ((i32)fp_pair_off > 2047 || (i32)(fp_pair_off + 8) > 2047) {
- compiler_panic(t->c, a->loc, "rv64: fp_pair_off out of imm12 range");
- }
- words[wi++] = rv_sd(RV_S0, RV_SP, (i32)fp_pair_off);
- words[wi++] = rv_sd(RV_RA, RV_SP, (i32)fp_pair_off + 8);
- words[wi++] = rv_addi(RV_S0, RV_SP, (i32)fp_pair_off);
-
- /* If sret, spill incoming a0 into the hidden slot. */
- if (a->has_sret && a->sret_ptr_slot != FRAME_SLOT_NONE) {
- RvSlot *s = rv64_slot_get(a, a->sret_ptr_slot);
- if (s) {
- if (wi >= prologue_words)
- goto overflow;
- words[wi++] = rv_sd(RV_A0, RV_S0, -(i32)s->off);
- }
- }
- /* Variadic: spill the still-unconsumed a-regs (a_{nparams_int}..a7)
- * into the save area at [s0 + 16 + i*8]. The save area sits between
- * the saved-s0/ra pair and the caller's stack args, so save_area[8]
- * == caller's first stack arg. */
- if (a->is_variadic) {
- for (u32 i = a->next_param_int; i < 8; ++i) {
- if (wi >= prologue_words)
- goto overflow;
- words[wi++] = rv_sd(RV_A0 + i, RV_S0, 16 + (i32)i * 8);
- }
- }
- /* int saves */
- for (u32 i = 0; i < n_int_saves; ++i) {
- u32 r = int_regs[i];
- i32 off = int_save_base - 8 * (i32)i;
- if (wi >= prologue_words)
- goto overflow;
- words[wi++] = rv_sd(r, RV_S0, off);
- }
- /* fp saves */
- for (u32 i = 0; i < n_fp_saves; ++i) {
- u32 r = fp_regs[i];
- i32 off = fp_save_base - 8 * (i32)i;
- if (wi >= prologue_words)
- goto overflow;
- words[wi++] = rv_fsd(r, RV_S0, off);
- }
- if (0) {
- overflow:
- compiler_panic(t->c, a->loc,
- "rv64: prologue placeholder too small (used %u of %u)", wi,
- prologue_words);
- }
-
- for (u32 i = 0; i < prologue_words; ++i) {
- rv64_patch32(obj, sec, pos + i * 4u, words[i]);
+ if (!a->known_frame) {
+ u32 pos = a->prologue_pos;
+ u32 words[RV_PROLOGUE_WORDS];
+ u32 prologue_words = a->prologue_words ? a->prologue_words
+ : RV_PROLOGUE_WORDS;
+ for (u32 i = 0; i < prologue_words; ++i)
+ words[i] = RV_NOP;
+ (void)rv_build_prologue(t, words, prologue_words, &fl, int_regs,
+ n_int_saves, fp_regs, n_fp_saves,
+ a->next_param_int);
+ for (u32 i = 0; i < prologue_words; ++i)
+ rv64_patch32(obj, sec, pos + i * 4u, words[i]);
}
/* Patch alloca placeholders with max_outgoing. */
- if (max_out > 2047u) {
+ if (fl.max_out > 2047u) {
compiler_panic(t->c, a->loc,
"rv64: max_outgoing %u out of imm12 for alloca patch",
- max_out);
+ fl.max_out);
}
for (u32 i = 0; i < a->nadd_patches; ++i) {
u32 dr = a->add_patches[i].dst_reg;
- u32 word = rv_addi(dr, RV_SP, (i32)max_out);
+ u32 word = rv_addi(dr, RV_SP, (i32)fl.max_out);
rv64_patch32(obj, sec, a->add_patches[i].pos, word);
}
diff --git a/src/arch/rv64/internal.h b/src/arch/rv64/internal.h
@@ -55,6 +55,8 @@ typedef struct RImpl {
u32 next_param_fp;
u32 next_param_stack;
u8 has_sret;
+ u8 known_frame;
+ u8 pad0[2];
FrameSlot sret_ptr_slot;
u32 used_cs_int_mask; /* bit reg set for s2-s11 */
@@ -113,6 +115,9 @@ static inline u32 reg_num(Operand op) { return op.v.reg & 0x1fu; }
/* ---- emit.c: function lifecycle (referenced by ops.c vtable) ---- */
void rv_func_begin(CGTarget *t, const CGFuncDesc *fd);
+void rv_func_begin_known_frame(CGTarget *t, const CGFuncDesc *fd,
+ const CGKnownFrameDesc *frame,
+ FrameSlot *out_slots);
void rv_func_end(CGTarget *t);
void rv_coord_vtable_init(CGTarget *t);
diff --git a/src/arch/rv64/ops.c b/src/arch/rv64/ops.c
@@ -933,6 +933,57 @@ static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int,
}
}
+static void count_arg_stack(const CGABIValue* av, u32* next_int,
+                            u32* next_fp, u32* stack_off) {
+  /* Dry run of argument routing for one argument: advances the register
+   * cursors or the stack byte count and emits nothing. */
+  ABIArgInfo fake_info;
+  ABIArgPart fake_part;
+  const ABIArgInfo* info = av->abi;
+  if (!info) {
+    /* No ABI info attached: describe the value as one direct INT part
+     * whose size and alignment are the byte size of its type. */
+    const u32 bytes = type_byte_size(av->type);
+    memset(&fake_part, 0, sizeof fake_part);
+    fake_part.cls = ABI_CLASS_INT;
+    fake_part.size = bytes;
+    fake_part.align = bytes;
+    fake_part.src_offset = 0;
+    memset(&fake_info, 0, sizeof fake_info);
+    fake_info.kind = ABI_ARG_DIRECT;
+    fake_info.parts = &fake_part;
+    fake_info.nparts = 1;
+    info = &fake_info;
+  }
+  if (info->kind == ABI_ARG_IGNORE) return;
+  if (info->kind == ABI_ARG_INDIRECT) {
+    /* One integer slot: a register if any is left, else 8 stack bytes. */
+    if (*next_int < 8)
+      ++*next_int;
+    else
+      *stack_off += 8;
+    return;
+  }
+  for (u16 k = 0; k < info->nparts; ++k) {
+    const ABIArgPart* part = info->parts + k;
+    u32* cursor;
+    if (part->cls == ABI_CLASS_INT) cursor = next_int;
+    else if (part->cls == ABI_CLASS_FP) cursor = next_fp;
+    else continue; /* other classes are not counted */
+    if (*cursor < 8) ++*cursor;
+    else *stack_off += 8;
+  }
+}
+
+static u32 rv_call_stack_size(CGTarget* t, const CGCallDesc* d) {
+  u32 int_used = (d->abi && d->abi->has_sret) ? 1u : 0u; /* sret pointer */
+  u32 fp_used = 0, bytes = 0;
+  (void)t; /* target state is not consulted */
+  for (u32 k = 0; k != d->nargs; ++k)
+    count_arg_stack(d->args + k, &int_used, &fp_used, &bytes);
+  return (bytes + 15u) & ~15u; /* round up to a multiple of 16 */
+}
+
static void rv_call(CGTarget* t, const CGCallDesc* d) {
RImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
@@ -960,7 +1011,12 @@ static void rv_call(CGTarget* t, const CGCallDesc* d) {
emit_arg_value(t, &d->args[i], &next_int, &next_fp, &stack_off);
}
u32 needed = (stack_off + 15u) & ~15u;
- if (needed > a->max_outgoing) a->max_outgoing = needed;
+ if (needed > a->max_outgoing) {
+ if (a->known_frame)
+ compiler_panic(t->c, a->loc,
+ "rv64 call: known frame outgoing area too small");
+ a->max_outgoing = needed;
+ }
if (d->callee.kind == OPK_GLOBAL) {
/* AUIPC ra, 0 ; JALR ra, ra, 0 with R_RV_CALL on AUIPC */
@@ -1792,6 +1848,7 @@ CGTarget* rv64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) {
t->mc = m;
t->func_begin = rv_func_begin;
+ t->func_begin_known_frame = rv_func_begin_known_frame;
t->func_end = rv_func_end;
t->frame_slot = rv_frame_slot;
@@ -1828,6 +1885,7 @@ CGTarget* rv64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) {
t->convert = rv_convert;
t->call = rv_call;
+ t->call_stack_size = rv_call_stack_size;
t->ret = rv_ret;
t->alloca_ = rv_alloca_;
diff --git a/src/arch/x64/emit.c b/src/arch/x64/emit.c
@@ -512,7 +512,7 @@ static u32 x64_planned_prologue_bytes(const XImpl *a) {
return n ? n : 1u;
}
-void x_func_begin(CGTarget *t, const CGFuncDesc *fd) {
+static void x_func_begin_init(CGTarget *t, const CGFuncDesc *fd) {
XImpl *a = impl_of(t);
MCEmitter *mc = t->mc;
@@ -527,6 +527,7 @@ void x_func_begin(CGTarget *t, const CGFuncDesc *fd) {
a->has_sret = (fd->abi && fd->abi->has_sret) ? 1 : 0;
a->has_alloca = 0;
a->is_variadic = (fd->abi && fd->abi->variadic) ? 1 : 0;
+ a->known_frame = 0;
a->cum_off = 0;
a->max_outgoing = 0;
a->used_cs_int_mask = a->has_planned_regs ? a->planned_cs_int_mask : 0;
@@ -544,11 +545,10 @@ void x_func_begin(CGTarget *t, const CGFuncDesc *fd) {
a->epilogue_label = mc->label_new(mc);
mc->cfi_startproc(mc);
+}
- /* Reserve a fixed-size prologue placeholder filled with NOPs. */
- a->prologue_pos = mc->pos(mc);
- for (u32 i = 0; i < a->prologue_nbytes; ++i)
- emit1(mc, 0x90);
+static void x_add_entry_frame_slots(CGTarget *t) {
+ XImpl *a = impl_of(t);
/* sret: rdi at entry holds the destination pointer. Spill it to a
* hidden slot so the body can use rdi freely. */
@@ -582,67 +582,53 @@ void x_func_begin(CGTarget *t, const CGFuncDesc *fd) {
.flags = 0,
};
a->reg_save_slot = x_frame_slot(t, &rsd);
- XSlot *rs = x64_slot_get(a, a->reg_save_slot);
- static const u32 gprs[6] = {X64_RDI, X64_RSI, X64_RDX,
- X64_RCX, X64_R8, X64_R9};
- for (u32 i = 0; i < 6; ++i) {
- emit_mov_store(mc, 8, gprs[i], X64_RBP, -(i32)rs->off + (i32)(i * 8u));
- }
- /* movsd writes the low 8 bytes of each xmm; va_arg reads 8 bytes per
- * FP slot, so the upper half of the 16-byte stride stays unused. */
- for (u32 i = 0; i < 8; ++i) {
- emit_sse_store(mc, 0xF2, 0x11, (u32)(X64_XMM0 + i), X64_RBP,
- -(i32)rs->off + (i32)(48u + i * 16u));
- }
}
}
-static u32 align_up_u32(u32 v, u32 a) { return (v + (a - 1u)) & ~(a - 1u); }
-
-void x_func_end(CGTarget *t) {
+static void x_emit_variadic_reg_saves(CGTarget *t) {
+  static const u32 gpr_order[6] = {X64_RDI, X64_RSI, X64_RDX,
+                                   X64_RCX, X64_R8,  X64_R9};
+  XImpl *impl = impl_of(t);
+  if (!impl->is_variadic) return;
+
+  /* Register save area layout, relative to its slot: six 8-byte GPR cells
+   * at +0, then eight 16-byte XMM cells starting at +48. */
+  MCEmitter *em = t->mc;
+  const i32 area = -(i32)x64_slot_get(impl, impl->reg_save_slot)->off;
+  for (u32 g = 0; g < 6; ++g)
+    emit_mov_store(em, 8, gpr_order[g], X64_RBP, area + (i32)(g * 8u));
+  /* movsd stores only the low 8 bytes of each xmm; va_arg reads 8 bytes
+   * per FP slot, so the upper half of every 16-byte cell is never used. */
+  for (u32 x = 0; x < 8; ++x)
+    emit_sse_store(em, 0xF2, 0x11, (u32)(X64_XMM0 + x), X64_RBP,
+                   area + (i32)(48u + x * 16u));
+}
+
+static u32 align_up_u32(u32 v, u32 a) { return (v + (a - 1u)) & ~(a - 1u); }
+
+static u32 x_collect_cs_regs(const XImpl *a, Reg *cs_regs) {
u32 cs_used = 0;
for (u32 i = 0; i < 5u; ++i) {
Reg r = g_int_order[i];
if (a->used_cs_int_mask & (1u << r))
cs_regs[cs_used++] = r;
}
- u32 cs_size = cs_used * 8u;
+ return cs_used;
+}
- /* Stack alignment: SysV requires rsp ≡ 0 mod 16 just before a call,
- * which means rsp ≡ 8 mod 16 inside the function (after the return
- * address is pushed). On entry, rsp ≡ 8 mod 16; after `push rbp` it
- * is 0 mod 16; after `sub rsp, frame_size` we need it back to 0
- * mod 16, so frame_size must be a multiple of 16. */
+static u32 x_compute_frame_size(const XImpl *a, u32 cs_used) {
+ u32 cs_size = cs_used * 8u;
u32 raw = a->max_outgoing + cs_size + a->cum_off;
u32 frame_size = align_up_u32(raw, 16u);
- if (frame_size == 0)
- frame_size = 16;
-
- mc->label_place(mc, a->epilogue_label);
-
- /* Restore callee-saves. Each at rbp - (cum_off + (i+1)*8). */
- for (i32 i = (i32)cs_used - 1; i >= 0; --i) {
- u32 reg = cs_regs[i];
- i32 off = -(i32)a->cum_off - (i32)(i + 1) * 8;
- emit_mov_load(mc, /*size=*/8, /*signed=*/0, reg, X64_RBP, off);
- }
-
- /* leave; ret. */
- emit_leave(mc);
- emit_ret(mc);
+ return frame_size ? frame_size : 16u;
+}
- /* Patch prologue placeholder. */
- u8 buf[X64_PROLOGUE_BYTES];
- u32 prologue_nbytes = a->prologue_nbytes ? a->prologue_nbytes
- : X64_PROLOGUE_BYTES;
- for (u32 i = 0; i < prologue_nbytes; ++i)
- buf[i] = 0x90;
+static u32 x_build_prologue(CGTarget *t, u8 *buf, u32 cap, u32 frame_size,
+ const Reg *cs_regs, u32 cs_used) {
+ XImpl *a = impl_of(t);
u32 wi = 0;
+ if (wi + 11 > cap) goto overflow;
/* push rbp (1 byte). */
buf[wi++] = 0x55;
/* mov rbp, rsp: REX.W 89 E5. */
@@ -663,8 +649,7 @@ void x_func_end(CGTarget *t) {
XSlot *s = x64_slot_get(a, a->sret_ptr_slot);
if (s) {
i32 off = -(i32)s->off;
- if (wi + 7 > prologue_nbytes)
- goto overflow;
+ if (wi + 7 > cap) goto overflow;
buf[wi++] = X64_REX_BASE | X64_REX_W;
buf[wi++] = 0x89;
buf[wi++] = modrm(2u, X64_RDI, X64_RBP);
@@ -679,8 +664,7 @@ void x_func_end(CGTarget *t) {
for (u32 i = 0; i < cs_used; ++i) {
u32 reg = cs_regs[i];
i32 off = -(i32)a->cum_off - (i32)(i + 1) * 8;
- if (wi + 7 > prologue_nbytes)
- goto overflow;
+ if (wi + 7 > cap) goto overflow;
buf[wi++] = (u8)(X64_REX_BASE | X64_REX_W | ((reg & 8) ? X64_REX_R : 0));
buf[wi++] = 0x89;
buf[wi++] = modrm(2u, (reg & 7u), X64_RBP);
@@ -689,15 +673,96 @@ void x_func_end(CGTarget *t) {
buf[wi++] = (u8)(off >> 16);
buf[wi++] = (u8)(off >> 24);
}
+ return wi;
+
+overflow:
+ compiler_panic(t->c, a->loc,
+ "x64: prologue placeholder overflow (cap %u bytes)", cap);
+ return 0;
+}
+
+void x_func_begin(CGTarget *t, const CGFuncDesc *fd) {
+  XImpl *impl = impl_of(t);
+  MCEmitter *em = t->mc;
+
+  x_func_begin_init(t, fd);
+  /* Hold the prologue's place with single-byte NOPs (0x90); x_func_end
+   * overwrites them once the body has been emitted. */
+  impl->prologue_pos = em->pos(em);
+  for (u32 left = impl->prologue_nbytes; left != 0; --left)
+    emit1(em, 0x90);
+
+  x_add_entry_frame_slots(t);
+  x_emit_variadic_reg_saves(t);
+}
+
+void x_func_begin_known_frame(CGTarget *t, const CGFuncDesc *fd,
+                              const CGKnownFrameDesc *frame,
+                              FrameSlot *out_slots) {
+  XImpl *impl = impl_of(t);
+  MCEmitter *em = t->mc;
+  Reg saved[5];
+  u8 code[X64_PROLOGUE_BYTES];
+  x_func_begin_init(t, fd);
+  impl->known_frame = 1; /* x_func_end skips placeholder patching */
+  x_add_entry_frame_slots(t);
+  /* Create the caller's slots; ids go back in frame->slots order. */
+  if (frame) {
+    for (u32 s = 0; s < frame->nslots; ++s) {
+      FrameSlot id = x_frame_slot(t, frame->slots + s);
+      if (out_slots) out_slots[s] = id;
+    }
+    impl->max_outgoing = frame->max_outgoing;
+    impl->has_alloca = frame->has_alloca ? 1u : 0u;
+  }
+
+  u32 n_saved = x_collect_cs_regs(impl, saved);
+  u32 size = x_compute_frame_size(impl, n_saved);
+  impl->prologue_pos = em->pos(em);
+  u32 len = x_build_prologue(t, code, X64_PROLOGUE_BYTES, size, saved, n_saved);
+  em->emit_bytes(em, code, len);
+  x_emit_variadic_reg_saves(t);
+}
+
+void x_func_end(CGTarget *t) {
+ XImpl *a = impl_of(t);
+ MCEmitter *mc = t->mc;
+
+ Reg cs_regs[5];
+ u32 cs_used = x_collect_cs_regs(a, cs_regs);
+
+ /* Stack alignment: SysV requires rsp ≡ 0 mod 16 just before a call,
+ * which means rsp ≡ 8 mod 16 inside the function (after the return
+ * address is pushed). On entry, rsp ≡ 8 mod 16; after `push rbp` it
+ * is 0 mod 16; after `sub rsp, frame_size` we need it back to 0
+ * mod 16, so frame_size must be a multiple of 16. */
+ u32 frame_size = x_compute_frame_size(a, cs_used);
+
+ mc->label_place(mc, a->epilogue_label);
+
+ /* Restore callee-saves. Each at rbp - (cum_off + (i+1)*8). */
+ for (i32 i = (i32)cs_used - 1; i >= 0; --i) {
+ u32 reg = cs_regs[i];
+ i32 off = -(i32)a->cum_off - (i32)(i + 1) * 8;
+ emit_mov_load(mc, /*size=*/8, /*signed=*/0, reg, X64_RBP, off);
+ }
+
+ /* leave; ret. */
+ emit_leave(mc);
+ emit_ret(mc);
- if (0) {
- overflow:
- compiler_panic(t->c, a->loc,
- "x64: prologue placeholder overflow (%u of %u bytes)", wi,
- prologue_nbytes);
+ if (!a->known_frame) {
+ /* Patch prologue placeholder. */
+ u8 buf[X64_PROLOGUE_BYTES];
+ u32 prologue_nbytes = a->prologue_nbytes ? a->prologue_nbytes
+ : X64_PROLOGUE_BYTES;
+ for (u32 i = 0; i < prologue_nbytes; ++i)
+ buf[i] = 0x90;
+ (void)x_build_prologue(t, buf, prologue_nbytes, frame_size, cs_regs,
+ cs_used);
+ obj_patch(t->obj, a->fd->text_section_id, a->prologue_pos, buf,
+ prologue_nbytes);
}
- obj_patch(t->obj, a->fd->text_section_id, a->prologue_pos, buf,
- prologue_nbytes);
/* Patch each alloca's `lea dst, [rsp + 0]` disp32 with the final
* max_outgoing (already 16-aligned via the `(stack_off+15)&~15` round
diff --git a/src/arch/x64/internal.h b/src/arch/x64/internal.h
@@ -74,7 +74,7 @@ typedef struct XImpl {
u8 has_sret;
u8 has_alloca;
u8 is_variadic;
- u8 pad0;
+ u8 known_frame;
FrameSlot sret_ptr_slot;
FrameSlot reg_save_slot; /* variadic: 176-byte __va_list_tag reg save area */
@@ -149,6 +149,9 @@ extern const u32 g_int_arg_regs[6];
/* --- emit.c exports (lifecycle used by ops.c vtable constructor,
* encoding helpers used by alloc.c and ops.c) --- */
void x_func_begin(CGTarget *t, const CGFuncDesc *fd);
+void x_func_begin_known_frame(CGTarget *t, const CGFuncDesc *fd,
+ const CGKnownFrameDesc *frame,
+ FrameSlot *out_slots);
void x_func_end(CGTarget *t);
void x_coord_vtable_init(CGTarget *t);
diff --git a/src/arch/x64/ops.c b/src/arch/x64/ops.c
@@ -846,6 +846,57 @@ static void emit_arg_value(CGTarget* t, const CGABIValue* av, u32* next_int,
}
}
+static void count_arg_stack(const CGABIValue* av, u32* next_int,
+                            u32* next_fp, u32* stack_off) {
+  /* Dry run of argument routing for one argument: advances the register
+   * cursors or the stack byte count and emits nothing. */
+  ABIArgInfo fake_info;
+  ABIArgPart fake_part;
+  const ABIArgInfo* info = av->abi;
+  if (!info) {
+    /* No ABI info attached: describe the value as one direct part, FP when
+     * its storage class is RC_FP and INT otherwise, sized by its type. */
+    const u32 bytes = type_byte_size(av->type);
+    memset(&fake_part, 0, sizeof fake_part);
+    fake_part.cls = (av->storage.cls == RC_FP) ? ABI_CLASS_FP : ABI_CLASS_INT;
+    fake_part.size = bytes;
+    fake_part.align = bytes;
+    fake_part.src_offset = 0;
+    memset(&fake_info, 0, sizeof fake_info);
+    fake_info.kind = ABI_ARG_DIRECT;
+    fake_info.parts = &fake_part;
+    fake_info.nparts = 1;
+    info = &fake_info;
+  }
+  if (info->kind == ABI_ARG_IGNORE) return;
+  if (info->kind == ABI_ARG_INDIRECT) {
+    /* One integer slot: a register if any is left, else 8 stack bytes. */
+    if (*next_int < 6)
+      ++*next_int;
+    else
+      *stack_off += 8;
+    return;
+  }
+  for (u16 k = 0; k < info->nparts; ++k) {
+    /* INT parts draw on 6 registers, FP parts on 8; overflow is 8 bytes. */
+    const ABIArgPart* part = info->parts + k;
+    const int is_int = part->cls == ABI_CLASS_INT;
+    if (!is_int && part->cls != ABI_CLASS_FP) continue; /* not counted */
+    u32* cursor = is_int ? next_int : next_fp;
+    if (*cursor < (is_int ? 6u : 8u)) ++*cursor;
+    else *stack_off += 8;
+  }
+}
+
+static u32 x_call_stack_size(CGTarget* t, const CGCallDesc* d) {
+  u32 int_used = (d->abi && d->abi->has_sret) ? 1u : 0u; /* sret pointer */
+  u32 fp_used = 0, bytes = 0;
+  (void)t; /* target state is not consulted */
+  for (u32 k = 0; k != d->nargs; ++k)
+    count_arg_stack(d->args + k, &int_used, &fp_used, &bytes);
+  return (bytes + 15u) & ~15u; /* round up to a multiple of 16 */
+}
+
static void x_call(CGTarget* t, const CGCallDesc* d) {
XImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
@@ -866,7 +917,12 @@ static void x_call(CGTarget* t, const CGCallDesc* d) {
emit_arg_value(t, &d->args[i], &next_int, &next_fp, &stack_off);
}
u32 needed = (stack_off + 15u) & ~15u;
- if (needed > a->max_outgoing) a->max_outgoing = needed;
+ if (needed > a->max_outgoing) {
+ if (a->known_frame)
+ compiler_panic(t->c, a->loc,
+ "x64 call: known frame outgoing area too small");
+ a->max_outgoing = needed;
+ }
/* Variadic calls: AL = number of XMM regs used. */
if (d->abi && d->abi->variadic) {
@@ -1859,6 +1915,7 @@ CGTarget* x64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) {
t->mc = m;
t->func_begin = x_func_begin;
+ t->func_begin_known_frame = x_func_begin_known_frame;
t->func_end = x_func_end;
t->frame_slot = x_frame_slot;
@@ -1895,6 +1952,7 @@ CGTarget* x64_cgtarget_new(Compiler* c, ObjBuilder* o, MCEmitter* m) {
t->convert = x_convert;
t->call = x_call;
+ t->call_stack_size = x_call_stack_size;
t->ret = x_ret;
t->alloca_ = x_alloca_;
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1249,6 +1249,41 @@ static u32 collect_opt_hard_regs(Func* f, CGTarget* w, RegClass cls,
return nused;
}
+static void collect_known_frame(Func* f, CGTarget* w, CGKnownFrameDesc* out) {
+  /* Pre-pass for known-frame emission: describe every IR frame slot and scan
+   * the body for allocas and the largest outgoing call-argument area. */
+  memset(out, 0, sizeof(*out));
+  const u32 nslots = f->nframe_slots;
+  FrameSlotDesc* descs =
+      nslots ? arena_zarray(f->arena, FrameSlotDesc, nslots) : NULL;
+  for (u32 k = 0; k < nslots; ++k) {
+    const IRFrameSlot* src = &f->frame_slots[k];
+    FrameSlotDesc* dst = &descs[k];
+    dst->type = src->type;
+    dst->name = src->name;
+    dst->loc = src->loc;
+    dst->size = src->size;
+    dst->align = src->align;
+    dst->kind = src->kind;
+    dst->flags = src->flags;
+  }
+  out->slots = descs;
+  out->nslots = nslots;
+
+  for (u32 b = 0; b < f->nblocks; ++b) {
+    const Block* blk = &f->blocks[b];
+    for (u32 k = 0; k < blk->ninsts; ++k) {
+      const Inst* ins = &blk->insts[k];
+      const IROp op = (IROp)ins->op;
+      if (op == IR_ALLOCA) { out->has_alloca = 1; continue; }
+      if (op != IR_CALL || !w->call_stack_size || !ins->extra.aux) continue;
+      IRCallAux* aux = (IRCallAux*)ins->extra.aux;
+      u32 need = w->call_stack_size(w, &aux->desc);
+      if (need > out->max_outgoing) out->max_outgoing = need;
+    }
+  }
+}
+
static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
ReplayCtx r;
r.c = c;
@@ -1278,10 +1313,23 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
}
}
- /* func_begin with the recorded descriptor. The desc.params[].slot
- * fields are wrapper IR slot ids; aarch64's func_begin doesn't
- * dereference them so we don't translate. */
- w->func_begin(w, &f->desc);
+ int known_frame = identity && w->func_begin_known_frame && w->call_stack_size;
+ if (known_frame) {
+ CGKnownFrameDesc frame;
+ FrameSlot* target_slots = f->nframe_slots
+ ? arena_zarray(f->arena, FrameSlot,
+ f->nframe_slots)
+ : NULL;
+ collect_known_frame(f, w, &frame);
+ w->func_begin_known_frame(w, &f->desc, &frame, target_slots);
+ for (u32 i = 0; i < f->nframe_slots; ++i)
+ r.slot_map[f->frame_slots[i].id] = target_slots[i];
+ } else {
+ /* func_begin with the recorded descriptor. The desc.params[].slot
+ * fields are wrapper IR slot ids; aarch64's func_begin doesn't
+ * dereference them so we don't translate. */
+ w->func_begin(w, &f->desc);
+ }
if (!r.identity_regs) {
for (u32 cidx = 0; cidx < OPT_REG_CLASSES; ++cidx) {
@@ -1295,17 +1343,19 @@ static void replay_func_to(Compiler* c, Func* f, CGTarget* w, int identity) {
}
}
- for (u32 i = 0; i < f->nframe_slots; ++i) {
- IRFrameSlot* s = &f->frame_slots[i];
- FrameSlotDesc d = {0};
- d.type = s->type;
- d.name = s->name;
- d.loc = s->loc;
- d.size = s->size;
- d.align = s->align;
- d.kind = s->kind;
- d.flags = s->flags;
- r.slot_map[s->id] = w->frame_slot(w, &d);
+ if (!known_frame) {
+ for (u32 i = 0; i < f->nframe_slots; ++i) {
+ IRFrameSlot* s = &f->frame_slots[i];
+ FrameSlotDesc d = {0};
+ d.type = s->type;
+ d.name = s->name;
+ d.loc = s->loc;
+ d.size = s->size;
+ d.align = s->align;
+ d.kind = s->kind;
+ d.flags = s->flags;
+ r.slot_map[s->id] = w->frame_slot(w, &d);
+ }
}
for (u32 i = 0; i < f->nparams; ++i) {