commit 1bfb41a5fc5c5da9f233bcf2d6846192694639e3
parent a7d80ffe34ba3daa813e372cc10b3d8ece9bf97e
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 29 May 2026 09:12:52 -0700
rv64: adopt NativeFrame, implement -O1 known-frame path
rv64 now embeds the shared NativeFrame (slot arena, callee-save set, outgoing
tracking, per-function reset) and implements func_begin_known_frame, mirroring
aa64: derive the callee-saved set from the optimizer's per-class masks (recorded
without slots — rv64 homes saves below the locals at rv_save_off), allocate the
static slots, reserve the sret entry-save, then emit the exact prologue eagerly
(rv_build_prologue, which already spills sret + variadic inline). reserve_callee_saves
is now non-NULL so plan_frame computes the mask set.
Frame-size fix: rv_frame_size now reserves the callee-save bytes (ncallee_saves*8)
that rv_save_off places below the locals — previously omitted, so at -O1 the saves
stomped outside the frame (SIGSEGV in recursion). rv_emit_tail_site restores
callee-saves before teardown and stages an indirect callee into t1 first (the
restore would otherwise clobber the pointer parked in a callee-saved reg).
va-save area now sized via native_frame_va_save_bytes (ABI va_list layout, now
populated for rv64 LP64D: 8 GP regs x 8 bytes) instead of a hardcoded 64.
Also fix scripts/toy_cross_batch.sh: brittle nested sh -c quoting (write the
in-container loop to a file), per-arch image tags (localhost/alpine-*), and WORK
under build/ (the macOS podman VM shares the repo tree, not /tmp).
rv64 toy X-O0 156/0, X-O1 156/0; parse E-O0+O1 900 pass, only deferred ldbl128
fail (28 = 14 x 2 opts), zero non-ldbl regressions.
Diffstat:
4 files changed, 161 insertions(+), 100 deletions(-)
diff --git a/doc/INTERFACES.md b/doc/INTERFACES.md
@@ -89,7 +89,7 @@ fills in. This is the most actively-changing area (x64/rv64 are being ported ont
| Semantic CG | `src/cg/cgtarget.h` | `CgTarget` | `native_direct_target` (-O0) or `opt_cgtarget` (-O≥1) | frontend-facing lowering, pre-regalloc |
| -O0 adapter | `src/cg/native_direct_target.h` | `NativeDirectTarget` + `NativeOps` | shared, parameterized by arch `NativeOps` | adapts `NativeTarget` to `CgTarget` for -O0 |
| Physical emit | `src/arch/native_target.h` | `NativeTarget` | `aa64`/`x64`/`rv64` `*_native_target_new()` | hard-register, machine-code emission + frame/CFI |
-| Frame model (shared) | `src/cg/native_frame.h` | `NativeFrame` | shared impl (`native_frame.c`); each backend embeds one | arch-neutral frame-slot bookkeeping the `NativeTarget` impls delegate to |
+| Frame model (shared) | `src/cg/native_frame.h` | `NativeFrame` | shared impl (`native_frame.c`); embedded by aa64 + rv64 (x64 shortly) | arch-neutral frame-slot bookkeeping the `NativeTarget` impls delegate to |
| Machine code | `src/arch/mc.h` | `MCEmitter` | one generic impl, `mc_new(Compiler, ObjBuilder)` | section/label/reloc/CFI byte emission for all MC archs |
**Per-arch entry points** (the surface each backend exposes to the rest of the
@@ -115,7 +115,7 @@ compiler):
`ArchImpl.apply_label_fixup` + CFI constants. Don't leak arch knowledge into
the generic emitter.
-### Native frame model (`src/cg/native_frame.h`) — NEW, adoption in progress
+### Native frame model (`src/cg/native_frame.h`) — in use by aa64 + rv64
A shared frame-bookkeeping module extracted because aa64/rv64/x64 all lay out a
stack frame the same way at the bookkeeping level. `NativeFrame` owns the
@@ -132,10 +132,10 @@ arch-neutral parts; each backend embeds one and keeps the ISA/ABI-specific parts
| Vararg save-area size from ABI va_list layout (`native_frame_va_save_bytes`) | — |
**Review notes:**
-- ⚠️ **Status: untracked / not yet consumed.** `native_frame.{h,c}` exist but no
- backend includes them yet (`grep` shows only self-reference). The contract is
- defined ahead of adoption; treat the *migration of aa64/rv64/x64 onto it* as
- the open work, mirroring the `NativeTarget` port itself.
+- **Status: in use by aa64 and rv64** (`src/arch/{aa64,rv64}/native.c` embed a
+ `NativeFrame`); x64 adoption is next, landing with its `NativeTarget` port.
+ Until x64 is on it, the contract is proven against two of three backends — the
+ x64 port is the remaining validation that the split generalizes.
- It consolidates the per-arch vararg-save magic numbers (rv64 64, x64 176,
aa64 64+128) into the single ABI-driven `native_frame_va_save_bytes` query —
aligned with the no-magic-numbers rule. When adopting per arch, verify the old
@@ -328,7 +328,7 @@ Track interface-review passes here. Status: ⬜ not reviewed · 🔶 in progress
| `compile.h` / `frontend.h` / `source.h` | 1 | ⬜ | frontend-facing |
| other Tier-1 (`archive`, `asm_emit`, `emu`, `preprocess`, `wasm`, `config`, support) | 1 | ⬜ | smaller surfaces |
| `NativeTarget` (`native_target.h`) | 2 | 🔶 | aa64 ✅ reference; x64/rv64 porting |
-| `NativeFrame` (`native_frame.h`) | 2 | 🔶 | ⚠️ NEW, untracked, no backend consumes it yet — adoption pending |
+| `NativeFrame` (`native_frame.h`) | 2 | 🔶 | in use by aa64 + rv64; x64 adoption pending |
| `CgTarget` (`cgtarget.h`) | 2 | ⬜ | — |
| `NativeDirectTarget`/`NativeOps` | 2 | ⬜ | -O0 adapter; semantic/physical split |
| `MCEmitter` (`mc.h`) | 2 | ⬜ | arch-neutral; keep it that way |
diff --git a/scripts/toy_cross_batch.sh b/scripts/toy_cross_batch.sh
@@ -18,16 +18,21 @@ ARCH="${1:?arch}"
OPT="${2:?opt}"
FILTER="${3:-}"
CASES="$ROOT/test/toy/cases"
-WORK="/tmp/toy_cross_batch/$ARCH-O$OPT"
+# Under the repo build dir, not /tmp: the podman VM (macOS) shares the repo tree
+# but not /tmp, so a /tmp bind-mount fails with "statfs ...: no such file".
+WORK="$ROOT/build/toy_cross_batch/$ARCH-O$OPT"
rm -rf "$WORK"; mkdir -p "$WORK"
+# Per-arch image: the three platforms otherwise collide on a shared
+# alpine:latest tag (a multi-platform pull overwrites it). Distinct local tags
+# (localhost/alpine-{rv64,amd64,arm64}) avoid that; override via RUN_{RV64,X64,AARCH64}_IMAGE, or IMAGE_OVERRIDE for any arch.
case "$ARCH" in
- rv64) TRIPLE=riscv64-linux-gnu; PLAT=linux/riscv64 ;;
- x64) TRIPLE=x86_64-linux-gnu; PLAT=linux/amd64 ;;
- aa64) TRIPLE=aarch64-linux-gnu; PLAT=linux/arm64 ;;
+ rv64) TRIPLE=riscv64-linux-gnu; PLAT=linux/riscv64; IMAGE="${RUN_RV64_IMAGE:-localhost/alpine-rv64}" ;;
+ x64) TRIPLE=x86_64-linux-gnu; PLAT=linux/amd64; IMAGE="${RUN_X64_IMAGE:-localhost/alpine-amd64}" ;;
+ aa64) TRIPLE=aarch64-linux-gnu; PLAT=linux/arm64; IMAGE="${RUN_AARCH64_IMAGE:-localhost/alpine-arm64}" ;;
*) echo "unknown arch $ARCH"; exit 2 ;;
esac
-IMAGE="${IMAGE:-alpine:latest}"
+IMAGE="${IMAGE_OVERRIDE:-$IMAGE}"
# Build a freestanding _start.o for the target via clang (mirrors run.sh).
START_C="$WORK/start.c"
@@ -85,13 +90,20 @@ for src in "$CASES"/*.toy; do
done
# Single batched container run: execute every linked exe, print "name rc".
+# The in-container loop is written to a file (bind-mounted via $WORK) rather than
+# inlined through nested sh/bash quoting, which is brittle and shell-dependent.
RESULTS="$WORK/results.txt"
+INRUN="$WORK/run_in_container.sh"
+cat > "$INRUN" <<EOF
+cd "$WORK" || exit 2
+while read n; do
+ "./\$n.exe" >/dev/null 2>"$WORK/\$n.run.err"; echo "\$n \$?"
+done < "$RUNLIST"
+EOF
if [ -s "$RUNLIST" ]; then
podman run --rm --pull=never --platform "$PLAT" --net=none \
- -v "$WORK:$WORK" -w "$WORK" "$IMAGE" sh -c '
- while read n; do
- "./'$'"$n"'.exe" >/dev/null 2>"'"$WORK"'/$n.run.err"; echo "$n $?";
- done < "'"$RUNLIST"'"' > "$RESULTS" 2>"$WORK/podman.err" || {
+ -v "$WORK:$WORK" -w "$WORK" "$IMAGE" sh "$INRUN" \
+ > "$RESULTS" 2>"$WORK/podman.err" || {
echo "FATAL: podman batch run failed"; cat "$WORK/podman.err"; exit 2; }
fi
diff --git a/src/abi/abi_rv64.c b/src/abi/abi_rv64.c
@@ -246,6 +246,15 @@ static ABIFuncInfo* rv64_compute_func_info(TargetABI* a, CfreeCgTypeId fn) {
const ABIVtable rv64_vtable = {
.compute_func_info = rv64_compute_func_info,
.va_list_info = {8, 8, ABI_SC_PTR, 0, 0, 0},
- .va_list_layout =
- {.type = {8, 8, ABI_SC_PTR, 0, 0, 0}, .kind = ABI_VA_LIST_POINTER},
+ /* LP64D va_list is a plain pointer, but the variadic register-save area is
+ * the 8 integer arg registers (a0..a7) spilled contiguously = 64 bytes; FP
+ * varargs are passed in GPRs, so there is no separate FP save area. The
+ * gp_reg_count/gp_slot_size fields let native_frame_va_save_bytes size that
+ * area from the ABI rather than a backend constant. */
+ .va_list_layout = {.type = {8, 8, ABI_SC_PTR, 0, 0, 0},
+ .kind = ABI_VA_LIST_POINTER,
+ .gp_reg_count = 8,
+ .fp_reg_count = 0,
+ .gp_slot_size = 8,
+ .fp_slot_size = 0},
};
diff --git a/src/arch/rv64/native.c b/src/arch/rv64/native.c
@@ -24,6 +24,7 @@
#include "arch/rv64/regs.h"
#include "arch/rv64/rv64.h"
#include "cg/native_direct_target.h"
+#include "cg/native_frame.h"
#include "cg/type.h"
#include "core/arena.h"
#include "core/bytes.h"
@@ -43,10 +44,16 @@ enum {
/* Single-pass (-O0) worst-case prologue: sp adjust (3) + far save pair (7)
* + sret spill (1) + variadic GP spills (8). No callee-saves at -O0. */
RV_PROLOGUE_WORDS = 32u,
+ /* Known-frame (-O1) prologues are emitted directly, not into the fixed -O0
+ * NOP region, and additionally save callee-saved registers (up to 11 int + 12
+ * fp, each up to 4 words for a far s0-relative offset) on top of the header,
+ * sret, and variadic spills. Size the build buffer for the worst case. */
+ RV_KNOWN_PROLOGUE_WORDS = 192u,
RV_FRAME_SAVE_SIZE = 16u,
};
-#define RV_MAX_CALLEE_SAVES 22u /* s1..s11 (11) + fs0..fs11 (12)... capped */
+/* Per-class cap for the separate int/fp collect arrays: s1..s11 (11 int) or fs0..fs11 (12 fp), never their sum. */
+#define RV_MAX_CALLEE_SAVES 16u
#define RV_MAX_REG_ARG_MOVES 16u
extern void debug_emit_row(Debug*, ObjSecId text_section, u32 offset, SrcLoc);
@@ -172,20 +179,10 @@ static u32 enc_int_load(u32 nbytes, int sign_ext, u32 rd, u32 base, i32 off) {
/* ============================ target state ============================ */
-typedef struct RvNativeSlot {
- u32 off; /* bytes below s0 (positive); address = s0 - off */
- u32 size;
- u32 align;
- u8 kind; /* NativeFrameSlotKind */
- u8 pad[3];
-} RvNativeSlot;
-
-typedef struct RvCalleeSave {
- NativeFrameSlot slot;
- CfreeCgTypeId type;
- u8 cls; /* NativeAllocClass */
- Reg reg;
-} RvCalleeSave;
+/* Frame slots and callee-save records live in the shared NativeFrame
+ * bookkeeping (cg/native_frame.h); these aliases keep the rv64-local spellings. */
+typedef NativeFrameSlotEntry RvNativeSlot;
+typedef NativeFrameCalleeSave RvCalleeSave;
typedef enum RvPatchKind { RV_PATCH_ALLOCA } RvPatchKind;
@@ -200,13 +197,12 @@ typedef struct RvNativeTarget {
SrcLoc loc;
const CGFuncDesc* func;
- RvNativeSlot* slots;
- u32 nslots;
- u32 slots_cap;
- u32 cum_off; /* sum of frame-slot reservations below s0 */
- u32 max_outgoing; /* max outgoing-arg bytes across all calls */
+ /* Shared frame bookkeeping: slot table, cum_off, max_outgoing, callee-save
+ * set, and the known_frame / has_alloca / frame_final flags. */
+ NativeFrame frame;
u32 frame_size_final;
u32 fp_pair_off;
+ u32 minimal_prologue_words; /* known-frame path: exact prologue length, else 0 */
u32 incoming_stack_size; /* fixed-param stack bytes (tail-call check) */
u32 next_param_int;
@@ -224,13 +220,6 @@ typedef struct RvNativeTarget {
u32 func_start;
u32 prologue_pos;
MCLabel epilogue_label;
-
- RvCalleeSave callee_saves[RV_MAX_CALLEE_SAVES];
- u32 ncallee_saves;
-
- u8 known_frame;
- u8 has_alloca;
- u8 frame_final;
} RvNativeTarget;
static RvNativeTarget* rv_of(NativeTarget* t) { return (RvNativeTarget*)t; }
@@ -240,9 +229,7 @@ static _Noreturn void rv_panic(RvNativeTarget* a, const char* msg) {
}
static RvNativeSlot* rv_slot_get(RvNativeTarget* a, NativeFrameSlot fs) {
- if (fs == NATIVE_FRAME_SLOT_NONE || fs > a->nslots)
- rv_panic(a, "bad frame slot");
- return &a->slots[fs - 1u];
+ return native_frame_slot_at(&a->frame, fs);
}
/* s0-relative byte offset of a frame slot's base (address = s0 + ret). */
@@ -257,11 +244,21 @@ static i32 rv_s0_off_in_arg(const RvNativeTarget* a, u32 byte_off) {
}
static u32 rv_va_save_sz(const RvNativeTarget* a) {
- return a->is_variadic ? 64u : 0u;
+ /* ABI-derived: the variadic register-save area is gp_reg_count*gp_slot_size
+ * (a0..a7 = 64 bytes for LP64D). Only present in variadic functions. */
+ return a->is_variadic ? native_frame_va_save_bytes(a->base.c->abi) : 0u;
+}
+
+/* Callee-saved registers are homed just below the locals at rv_save_off(), 8
+ * bytes each — they are NOT frame slots, so the frame size must reserve their
+ * bytes explicitly. Zero at -O0 (no callee-saves are taken). */
+static u32 rv_callee_save_bytes(const RvNativeTarget* a) {
+ return a->frame.ncallee_saves * 8u;
}
static u32 rv_frame_size(const RvNativeTarget* a) {
- u32 raw = RV_FRAME_SAVE_SIZE + a->cum_off + a->max_outgoing + rv_va_save_sz(a);
+ u32 raw = RV_FRAME_SAVE_SIZE + a->frame.cum_off + rv_callee_save_bytes(a) +
+ a->frame.max_outgoing + rv_va_save_sz(a);
return align_up_u32(raw, 16u);
}
@@ -1180,25 +1177,7 @@ static void rv_load_label_addr(NativeTarget* t, NativeLoc dst, MCLabel l) {
static NativeFrameSlot rv_frame_slot(NativeTarget* t,
const NativeFrameSlotDesc* d) {
- RvNativeTarget* a = rv_of(t);
- RvNativeSlot* s;
- u32 size = d->size ? d->size : 8u;
- u32 align = d->align ? d->align : 1u;
- if (a->frame_final) rv_panic(a, "frame slot requested after prologue");
- if (a->nslots == a->slots_cap) {
- u32 cap = a->slots_cap ? a->slots_cap * 2u : 16u;
- RvNativeSlot* nb = arena_zarray(t->c->tu, RvNativeSlot, cap);
- if (a->slots) memcpy(nb, a->slots, sizeof(*nb) * a->nslots);
- a->slots = nb;
- a->slots_cap = cap;
- }
- a->cum_off = align_up_u32(a->cum_off + size, align);
- s = &a->slots[a->nslots++];
- s->off = a->cum_off;
- s->size = size;
- s->align = align;
- s->kind = d->kind;
- return (NativeFrameSlot)a->nslots;
+ return native_frame_slot_alloc(&rv_of(t)->frame, d);
}
static void rv_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
@@ -1207,9 +1186,9 @@ static void rv_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
const ABIFuncInfo* abi = abi_cg_func_info(t->c->abi, fd->fn_type);
a->func = fd;
a->loc = fd->loc;
- a->nslots = 0;
- a->cum_off = 0;
- a->max_outgoing = 0;
+ /* Shared frame bookkeeping: clears the slot table, cum_off, max_outgoing,
+ * callee-save set, and known_frame/has_alloca/frame_final. */
+ native_frame_reset(&a->frame);
a->incoming_stack_size = 0;
a->next_param_int = 0;
a->next_param_fp = 0;
@@ -1219,10 +1198,7 @@ static void rv_func_begin_common(NativeTarget* t, const CGFuncDesc* fd) {
a->sret_ptr_slot = NATIVE_FRAME_SLOT_NONE;
a->npatches = 0;
a->nalloca = 0;
- a->ncallee_saves = 0;
- a->known_frame = 0;
- a->has_alloca = 0;
- a->frame_final = 0;
+ a->minimal_prologue_words = 0;
mc->set_section(mc, fd->text_section_id);
mc->emit_align(mc, 4, 0);
@@ -1264,22 +1240,22 @@ static void rv_emit_entry_save_stores(RvNativeTarget* a) {
/* Collect the callee-saves the body used (none at -O0). */
static u32 rv_collect_int_saves(RvNativeTarget* a, u32* regs) {
u32 n = 0, i;
- for (i = 0; i < a->ncallee_saves; ++i)
- if (a->callee_saves[i].cls == NATIVE_REG_INT)
- regs[n++] = a->callee_saves[i].reg;
+ for (i = 0; i < a->frame.ncallee_saves; ++i)
+ if (a->frame.callee_saves[i].cls == NATIVE_REG_INT)
+ regs[n++] = a->frame.callee_saves[i].reg;
return n;
}
static u32 rv_collect_fp_saves(RvNativeTarget* a, u32* regs) {
u32 n = 0, i;
- for (i = 0; i < a->ncallee_saves; ++i)
- if (a->callee_saves[i].cls == NATIVE_REG_FP)
- regs[n++] = a->callee_saves[i].reg;
+ for (i = 0; i < a->frame.ncallee_saves; ++i)
+ if (a->frame.callee_saves[i].cls == NATIVE_REG_FP)
+ regs[n++] = a->frame.callee_saves[i].reg;
return n;
}
/* s0-relative offset of the i-th saved register (saves stack below locals). */
static i32 rv_save_off(RvNativeTarget* a, u32 idx) {
- return -(i32)(a->cum_off) - 8 - 8 * (i32)idx;
+ return -(i32)(a->frame.cum_off) - 8 - 8 * (i32)idx;
}
static void rv_load_s0(MCEmitter* mc, int fp, u32 reg, i32 off) {
@@ -1409,7 +1385,7 @@ static void rv_func_end(NativeTarget* t) {
rv_load_s0(mc, 0, int_regs[i], rv_save_off(a, (u32)i));
for (i = (i32)n_fp - 1; i >= 0; --i)
rv_load_s0(mc, 1, fp_regs[i], rv_save_off(a, n_int + (u32)i));
- if (a->has_alloca)
+ if (a->frame.has_alloca)
rv_emit_addr_adjust(mc, RV_SP, RV_S0, -(i32)fp_pair_off);
rv64_emit32(mc, rv_ld(RV_RA, RV_S0, 8));
rv64_emit32(mc, rv_ld(RV_S0, RV_S0, 0));
@@ -1423,7 +1399,7 @@ static void rv_func_end(NativeTarget* t) {
rv64_emit32(mc, rv_jalr(RV_ZERO, RV_RA, 0));
/* patch prologue */
- if (!a->known_frame) {
+ if (!a->frame.known_frame) {
u32 words[RV_PROLOGUE_WORDS];
u32 nwords, k;
for (k = 0; k < RV_PROLOGUE_WORDS; ++k) words[k] = RV_NOP;
@@ -1435,7 +1411,7 @@ static void rv_func_end(NativeTarget* t) {
}
/* patch alloca sites: addi dst, sp, max_outgoing */
{
- u32 mo = align_up_u32(a->max_outgoing, 16u);
+ u32 mo = align_up_u32(a->frame.max_outgoing, 16u);
u32 k;
if (mo > 2047u) rv_panic(a, "max_outgoing too large for alloca patch");
for (k = 0; k < a->npatches; ++k)
@@ -1446,7 +1422,9 @@ static void rv_func_end(NativeTarget* t) {
/* CFI: CFA = s0 + (frame_size - fp_pair_off) */
if (mc->cfi_set_next_pc_offset && mc->cfi_def_cfa && mc->cfi_offset) {
i32 cfa = (i32)frame_size - (i32)fp_pair_off;
- u32 post = a->prologue_pos + (a->known_frame ? 0u : RV_PROLOGUE_WORDS * 4u);
+ u32 post = a->prologue_pos + (a->frame.known_frame
+ ? a->minimal_prologue_words * 4u
+ : RV_PROLOGUE_WORDS * 4u);
u32 k;
mc->cfi_set_next_pc_offset(mc, post - a->func_start);
mc->cfi_def_cfa(mc, RV_S0, cfa);
@@ -1470,13 +1448,56 @@ static void rv_func_end(NativeTarget* t) {
a->func = NULL;
}
+/* rv64 homes its callee-saves below the locals at rv_save_off(idx) rather than
+ * in frame slots, so alloc_slots=0: native_frame just records the {reg,cls} set
+ * derived from the optimizer's per-class used-masks. */
+static void rv_reserve_callee_saves(NativeTarget* t, const u32* used,
+ u32 nclasses) {
+ native_frame_set_callee_saves(&rv_of(t)->frame, used, nclasses, NULL, 0, 0);
+}
+
+/* Optimizer entry point: the full frame is supplied up front, so the prologue
+ * is emitted final the moment it is built — no NOP region, no func_end patch
+ * (rv_func_end skips patching when known_frame). rv_build_prologue emits the
+ * sret spill and the variadic register-save stores inline, so there is no
+ * separate entry-save emission. Slot creation order matches the single-pass
+ * path: callee-saves first (only recorded for rv64), then static slots, then
+ * the sret entry-save slot. */
static void rv_func_begin_known_frame(NativeTarget* t, const CGFuncDesc* fd,
const NativeKnownFrameDesc* frame,
NativeFrameSlot* out_slots) {
- (void)fd;
- (void)frame;
- (void)out_slots;
- rv_panic(rv_of(t), "known-frame path not implemented yet");
+ RvNativeTarget* a = rv_of(t);
+ MCEmitter* mc = t->mc;
+ u32 int_regs[RV_MAX_CALLEE_SAVES], fp_regs[RV_MAX_CALLEE_SAVES];
+ u32 n_int, n_fp, frame_size, fp_pair_off, nwords, i;
+ u32 words[RV_KNOWN_PROLOGUE_WORDS];
+ rv_func_begin_common(t, fd);
+ a->frame.known_frame = 1;
+ if (frame) {
+ a->frame.has_alloca = frame->has_alloca;
+ if (frame->callee_saved_used && frame->ncallee_classes)
+ rv_reserve_callee_saves(t, frame->callee_saved_used,
+ frame->ncallee_classes);
+ for (i = 0; i < frame->nslots; ++i) {
+ NativeFrameSlot slot = rv_frame_slot(t, &frame->slots[i]);
+ if (out_slots) out_slots[i] = slot;
+ }
+ rv_reserve_entry_saves(a);
+ native_frame_note_outgoing(&a->frame, frame->max_outgoing);
+ }
+ /* Frame is final: size and offsets are settled, so emit the exact prologue. */
+ frame_size = rv_frame_size(a);
+ fp_pair_off = rv_fp_pair_off(a, frame_size);
+ a->frame_size_final = frame_size;
+ a->fp_pair_off = fp_pair_off;
+ n_int = rv_collect_int_saves(a, int_regs);
+ n_fp = rv_collect_fp_saves(a, fp_regs);
+ a->prologue_pos = mc->pos(mc);
+ nwords = rv_build_prologue(a, words, RV_KNOWN_PROLOGUE_WORDS, frame_size,
+ fp_pair_off, int_regs, n_int, fp_regs, n_fp);
+ for (i = 0; i < nwords; ++i) rv64_emit32(mc, words[i]);
+ a->minimal_prologue_words = nwords;
+ native_frame_set_final(&a->frame);
}
/* ============================ params / ABI helpers ============================ */
@@ -1865,8 +1886,8 @@ static void rv_plan_call(NativeTarget* t, const NativeCallDesc* desc,
plan->has_sret = abi && abi->has_sret;
plan->is_variadic = abi && abi->variadic;
plan->stack_arg_size = rv_call_stack_size(t, desc);
- if (plan->stack_arg_size > a->max_outgoing)
- a->max_outgoing = plan->stack_arg_size;
+ if (plan->stack_arg_size > a->frame.max_outgoing)
+ a->frame.max_outgoing = plan->stack_arg_size;
/* Indirect callee in an arg register would be clobbered by arg loads. */
if (plan->callee.kind == NATIVE_LOC_REG &&
(NativeAllocClass)plan->callee.cls == NATIVE_REG_INT &&
@@ -1992,8 +2013,24 @@ static void rv_emit_tail_site(NativeTarget* t, NativeLoc callee) {
RvNativeTarget* a = rv_of(t);
MCEmitter* mc = t->mc;
i32 cfa = (i32)(RV_FRAME_SAVE_SIZE + rv_va_save_sz(a));
- if (a->ncallee_saves)
- rv_panic(a, "tail call with callee-saves (O1 path) not implemented");
+ int indirect = callee.kind == NATIVE_LOC_REG;
+ u32 int_regs[RV_MAX_CALLEE_SAVES], fp_regs[RV_MAX_CALLEE_SAVES];
+ u32 n_int = rv_collect_int_saves(a, int_regs);
+ u32 n_fp = rv_collect_fp_saves(a, fp_regs);
+ i32 i;
+ /* Stage an indirect callee into a reserved scratch (t1) BEFORE the teardown:
+ * regalloc parks the function pointer in a callee-saved register so it
+ * survives arg marshalling, and the callee-save / s0 / ra restores below would
+ * otherwise overwrite it. t1 is reserved (never allocable) and untouched by
+ * the restore loop (which only uses t0 for far offsets). */
+ if (indirect) rv64_emit32(mc, rv_addi(RV_TMP1, loc_reg(callee), 0));
+ /* Restore callee-saves before tearing the frame down (O1 path; none at -O0).
+ * Their save offsets are s0-relative via rv_save_off, so the restore is
+ * frame-size- and teardown-order-independent. */
+ for (i = (i32)n_int - 1; i >= 0; --i)
+ rv_load_s0(mc, 0, int_regs[i], rv_save_off(a, (u32)i));
+ for (i = (i32)n_fp - 1; i >= 0; --i)
+ rv_load_s0(mc, 1, fp_regs[i], rv_save_off(a, n_int + (u32)i));
rv64_emit32(mc, rv_ld(RV_RA, RV_S0, 8));
rv64_emit32(mc, rv_addi(RV_SP, RV_S0, cfa));
rv64_emit32(mc, rv_ld(RV_S0, RV_S0, 0));
@@ -2003,8 +2040,8 @@ static void rv_emit_tail_site(NativeTarget* t, NativeLoc callee) {
rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP0, 0));
mc->emit_reloc_at(mc, mc->section_id, pos, R_RV_CALL, callee.v.global.sym,
callee.v.global.addend, 0, 0);
- } else if (callee.kind == NATIVE_LOC_REG) {
- rv64_emit32(mc, rv_jalr(RV_ZERO, loc_reg(callee), 0));
+ } else if (indirect) {
+ rv64_emit32(mc, rv_jalr(RV_ZERO, RV_TMP1, 0));
} else {
rv_panic(a, "unsupported tail call target");
}
@@ -2112,7 +2149,7 @@ static void rv_alloca(NativeTarget* t, NativeLoc dst, NativeLoc size,
rv_emit_load_imm(mc, 1, RV_TMP1, -(i64)al);
rv64_emit32(mc, rv_and(RV_TMP0, RV_TMP0, RV_TMP1));
rv64_emit32(mc, rv_sub(RV_SP, RV_SP, RV_TMP0));
- a->has_alloca = 1;
+ a->frame.has_alloca = 1;
/* dst = sp + max_outgoing (patched in func_end) */
if (a->npatches == a->patches_cap) {
u32 cap = a->patches_cap ? a->patches_cap * 2u : 8u;
@@ -3228,6 +3265,7 @@ NativeTarget* rv64_native_target_new(Compiler* c, ObjBuilder* obj,
t->c = c;
t->obj = obj;
t->mc = mc;
+ native_frame_init(&a->frame, c);
t->regs = &rv_reg_info;
t->class_for_type = rv_class_for_type;
t->imm_legal = rv_imm_legal;
@@ -3235,7 +3273,9 @@ NativeTarget* rv64_native_target_new(Compiler* c, ObjBuilder* obj,
t->func_begin = rv_func_begin;
t->func_begin_known_frame = rv_func_begin_known_frame;
t->note_frame_state = NULL;
- t->reserve_callee_saves = NULL;
+ /* Non-NULL so the optimizer emit path (plan_frame) computes the callee-saved
+ * set; rv_func_begin_known_frame derives the records from the masks. */
+ t->reserve_callee_saves = rv_reserve_callee_saves;
t->signature_stack_bytes = rv_signature_stack_bytes;
t->call_stack_bytes = rv_call_stack_bytes;
t->has_store_zero_reg = 1;
@@ -3313,7 +3353,7 @@ static const char* rv_no_tail(NativeDirectTarget* d, const CGCallDesc* call) {
NativeLoc* args = NULL;
NativeLoc* results = NULL;
u32 i, stack;
- if (a->ncallee_saves)
+ if (a->frame.ncallee_saves)
return "rv64 tail call: callee-saved registers in use";
memset(&nd, 0, sizeof nd);
if (call->nargs) args = arena_zarray(d->base.c->tu, NativeLoc, call->nargs);