commit e6a04f06bd66c77b397468a5749785163adb18fa
parent bded6d326e383ec53eabd205d8521456899ed8a5
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sun, 10 May 2026 13:19:36 -0700
arch/rv64: variadic save area atop frame, contiguous with stack args
Place the 64-byte GP save area immediately above the saved-s0/ra pair so
[save_area, save_area+64, caller's stack] forms one byte stream — a
single 8-byte stride past save_area[7] lands on the caller's first stack
arg. Spill only a_{next_param_int}..a7 (named int params already live in
their own slots). Stack-arg reads in rv_param skip past the save area
when the callee is variadic.
Fixes j03_va_int_spill (10 i32 variadics) and j06_va_double_spill (9 f64
variadics) at L0 and L1; rv64 cg suite goes 765→769 pass / 8→4 fail.
The remaining 4 fails are Phase 4 .tbss cases.
Diffstat:
2 files changed, 46 insertions(+), 61 deletions(-)
diff --git a/doc/rv64-status.md b/doc/rv64-status.md
@@ -19,13 +19,13 @@ E links and runs under qemu-riscv64 via podman.
| Path | Pass | Fail | Skip |
|----------------------------|-----:|-----:|-----:|
| R (roundtrip) | 386 | 0 | 0 |
-| E (qemu exec) | 379 | 8 | ~ |
+| E (qemu exec) | 383 | 4 | ~ |
| D / J (native JIT) | 0 | 0 | 772 |
Skips are valid: D and J require host == rv64. With
-`CFREE_TEST_ALLOW_SKIP=1`, the suite reports **765 pass, 8 fail, 768 skip**.
+`CFREE_TEST_ALLOW_SKIP=1`, the suite reports **769 pass, 4 fail, 768 skip**.
-The 8 fails are 4 cases × 2 opt levels — see the phase checklists below.
+The 4 fails are 2 cases × 2 opt levels — see the Phase 4 checklist below.
---
@@ -56,28 +56,22 @@ The 8 fails are 4 cases × 2 opt levels — see the phase checklists below.
- ✅ Intrinsics: memcpy/memmove/memset, popcount, ctz, clz, bswap16/32/64,
add/sub/mul_overflow, expect, assume_aligned, prefetch, trap
-## Phase 3 — Variadic LP64D 🚧
+## Phase 3 — Variadic LP64D ✅
Variadic-args calling convention with **save area contiguous with caller's
stack args** so a single `void*` walk works for any number of args.
- ✅ va_list = `void*`; va_start / va_arg / va_end / va_copy
-- ✅ Prologue spills a0..a7 into a 64-byte save slot
- ✅ va_arg handles RC_INT and RC_FP (bitcast via FMV.X.{W,D})
- ✅ Variadic FP **args being passed** are bitcast into integer regs
(RC_FP storage → FMV.X.{W,D} → a-reg)
-- 🚧 Save area placement: currently inside the frame near other slots;
- walks past `save_area[7]` fall into locals, not caller's stack
-- ⬜ Relayout: place save area at the very top of the frame (right above
- the saved-s0/ra pair) so [save_area, save_area+64, caller's stack]
- forms one contiguous byte stream
-- ⬜ Honor named-arg count: spill only a_{nparams_int}..a7 (today we
- spill all 8 unconditionally; correct but wastes prologue insns)
-
-Failing today (4 case × 2 levels = 8):
-- `j03_va_int_spill` — 10 i32 variadics; 9th and 10th read from
- wrong addresses
-- `j06_va_double_spill` — 9 f64 variadics; 9th double read from wrong addr
+- ✅ Save area sits at the very top of the frame, above the saved-s0/ra
+ pair, so [save_area, save_area+64, caller's stack] is one contiguous
+ byte stream — `save_area[8]` coincides with the caller's first stack arg
+- ✅ Prologue spills only a_{next_param_int}..a7 (named int params already
+ landed in their own slots; sret consumes a0 when present)
+- ✅ Stack-arg reads in `rv_param` use `caller_stack_base = 16 + 64` for
+ variadic functions to skip past the save area
## Phase 4 — TLS LE ⬜
diff --git a/src/arch/rv64.c b/src/arch/rv64.c
@@ -353,20 +353,12 @@ static void rv_func_begin(CGTarget* t, const CGFuncDesc* fd) {
a->next_param_int = 1;
}
- /* Variadic: reserve a 64-byte GP save area for a0..a7. The patcher
- * spills the regs into it as part of the prologue, after `addi s0,…`. */
- if (a->is_variadic) {
- FrameSlotDesc gpd = {
- .type = NULL,
- .name = 0,
- .loc = (SrcLoc){0, 0, 0},
- .size = 64,
- .align = 8,
- .kind = FS_SPILL,
- .flags = 0,
- };
- a->gp_save_slot = rv_frame_slot(t, &gpd);
- }
+ /* Variadic: a 64-byte GP save area for a0..a7 lives at the very top
+ * of the frame, immediately above the saved-s0/ra pair, so its bytes
+ * are contiguous with the caller's stack args. The patcher spills the
+ * unnamed a-regs into it as part of the prologue. The slot is implicit
+ * (not allocated through rv_frame_slot) — it sits at [s0 + 16] when
+ * is_variadic is set. */
}
static void rv_func_end(CGTarget* t) {
@@ -381,11 +373,15 @@ static void rv_func_end(CGTarget* t) {
u32 int_saves_sz = n_int_saves * 8u;
u32 fp_saves_sz = n_fp_saves * 8u;
+ /* Variadic functions reserve a 64-byte save area at the very top of
+ * the frame so the save area and caller's stack args form a single
+ * contiguous byte stream walked by the va_list pointer. */
+ u32 va_save_sz = a->is_variadic ? 64u : 0u;
u32 locals_off = max_out + int_saves_sz + fp_saves_sz; /* from sp */
u32 fp_pair_off = locals_off + a->cum_off;
- u32 frame_size = fp_pair_off + 16u;
+ u32 frame_size = fp_pair_off + 16u + va_save_sz;
frame_size = (frame_size + 15u) & ~15u;
- fp_pair_off = frame_size - 16u;
+ fp_pair_off = frame_size - 16u - va_save_sz;
a->fp_pair_off = fp_pair_off;
/* Place the epilogue label at current pos. */
@@ -484,15 +480,14 @@ static void rv_func_end(CGTarget* t) {
words[wi++] = rv_sd(RV_A0, RV_S0, -(i32)s->off);
}
}
- /* Variadic: spill a0..a7 into the GP save area. */
- if (a->is_variadic && a->gp_save_slot != FRAME_SLOT_NONE) {
- RvSlot* gs = slot_get(a, a->gp_save_slot);
- if (gs) {
- for (u32 i = 0; i < 8; ++i) {
- if (wi >= RV_PROLOGUE_WORDS) goto overflow;
- words[wi++] = rv_sd(RV_A0 + i, RV_S0,
- -(i32)gs->off + (i32)i * 8);
- }
+ /* Variadic: spill the still-unconsumed a-regs (a_{next_param_int}..a7)
+ * into the save area at [s0 + 16 + i*8]. The save area sits between
+ * the saved-s0/ra pair and the caller's stack args, so save_area[8]
+ * == caller's first stack arg. */
+ if (a->is_variadic) {
+ for (u32 i = a->next_param_int; i < 8; ++i) {
+ if (wi >= RV_PROLOGUE_WORDS) goto overflow;
+ words[wi++] = rv_sd(RV_A0 + i, RV_S0, 16 + (i32)i * 8);
}
}
/* int saves */
@@ -623,6 +618,9 @@ static void rv_param(CGTarget* t, const CGParamDesc* p) {
RvSlot* s = slot_get(a, p->slot);
if (!s) compiler_panic(t->c, a->loc, "rv64 param: bad slot");
const ABIArgInfo* ai = p->abi;
+ /* Caller's stack args start above the saved-s0/ra pair, plus the
+ * 64-byte variadic save area when this function is variadic. */
+ i32 caller_stack_base = 16 + (a->is_variadic ? 64 : 0);
if (ai->kind == ABI_ARG_IGNORE) return;
if (ai->kind == ABI_ARG_INDIRECT) {
@@ -638,7 +636,7 @@ static void rv_param(CGTarget* t, const CGParamDesc* p) {
/* Incoming stack args live in the caller's outgoing-arg area,
* which is `frame_size - fp_pair_off` (= 16 + the saved-s0/ra
* pair) above s0 — same logic as aa64's `16 + caller_off`. */
- emit32(mc, rv_ld(RV_T1, RV_S0, (i32)(16u + caller_off)));
+ emit32(mc, rv_ld(RV_T1, RV_S0, caller_stack_base + (i32)caller_off));
ptr_reg = RV_T1;
}
u32 nbytes = s->size;
@@ -681,7 +679,7 @@ static void rv_param(CGTarget* t, const CGParamDesc* p) {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
emit32(mc, enc_int_load(sz, 0, RV_T2, RV_S0,
- (i32)(16u + caller_off)));
+ caller_stack_base + (i32)caller_off));
emit32(mc, enc_int_store(sz, RV_T2, RV_S0,
-(i32)s->off + (i32)part_off));
}
@@ -699,10 +697,10 @@ static void rv_param(CGTarget* t, const CGParamDesc* p) {
u32 caller_off = a->next_param_stack;
a->next_param_stack += 8;
if (sz == 8) {
- emit32(mc, rv_fld(0, RV_S0, (i32)(16u + caller_off)));
+ emit32(mc, rv_fld(0, RV_S0, caller_stack_base + (i32)caller_off));
emit32(mc, rv_fsd(0, RV_S0, -(i32)s->off + (i32)part_off));
} else {
- emit32(mc, rv_flw(0, RV_S0, (i32)(16u + caller_off)));
+ emit32(mc, rv_flw(0, RV_S0, caller_stack_base + (i32)caller_off));
emit32(mc, rv_fsw(0, RV_S0, -(i32)s->off + (i32)part_off));
}
}
@@ -1868,10 +1866,11 @@ static void rv_alloca_(CGTarget* t, Operand d, Operand sz, u32 align) {
a->has_alloca = 1;
}
/* RV64 LP64D va_list: a single `void*` pointing at the next argument
- * slot. The prologue spills a0..a7 into a contiguous save area. The
- * save area and the caller's stack args form a single byte-stream
- * (named-arg-count-aware), but for our test corpus the save area
- * alone covers every variadic case. */
+ * slot. The prologue spills a_{next_param_int}..a7 into the save area at
+ * [s0 + 16]. The save area lives at the top of the callee frame,
+ * immediately above the saved-s0/ra pair, so save_area[8] coincides
+ * with the caller's first stack arg — a single 8-byte stride covers
+ * register and stack args alike. */
static void rv_va_start_(CGTarget* t, Operand ap_op) {
RImpl* a = impl_of(t);
MCEmitter* mc = t->mc;
@@ -1879,17 +1878,9 @@ static void rv_va_start_(CGTarget* t, Operand ap_op) {
compiler_panic(t->c, a->loc, "rv64 va_start: function not variadic");
}
u32 ap = reg_num(ap_op);
- /* ap is the address of the va_list variable; *ap = &save_area[named_int]. */
- RvSlot* gs = slot_get(a, a->gp_save_slot);
- if (!gs) compiler_panic(t->c, a->loc, "rv64 va_start: no save slot");
- /* t0 = s0 - gs->off + next_param_int*8 */
- i32 off = -(i32)gs->off + (i32)(a->next_param_int * 8u);
- if (off >= -2048 && off <= 2047) {
- emit32(mc, rv_addi(RV_T0, RV_S0, off));
- } else {
- emit_load_imm(mc, 1, RV_T0, (i64)off);
- emit32(mc, rv_add(RV_T0, RV_S0, RV_T0));
- }
+ /* *ap = s0 + 16 + next_param_int*8 (skip past named-int slots). */
+ i32 off = 16 + (i32)(a->next_param_int * 8u);
+ emit32(mc, rv_addi(RV_T0, RV_S0, off));
emit32(mc, rv_sd(RV_T0, ap, 0));
}