commit b871959ddf3e4188f507041b2f9f7181d2662750
parent 42ce19068af007468976b1b1cfcbfbde2594abc5
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Sat, 9 May 2026 15:15:37 -0700
test/cg: register Groups J, K, L (varargs, atomics, intrinsics)
Sets the test contract for the next round of CGTarget surface area:
varargs (`va_start_`/`va_arg_`/`va_end_`/`va_copy_`), atomics (load,
store, every `AtomicOp` rmw, cas success/failure, fence), and
intrinsics (popcount/ctz/clz/bswap, mem*, hint kinds, checked arith).
Cases drive the live `TargetABI` and CGTarget interfaces; each fails at
emit-time with the backend's existing "not implemented" diagnostic
until the aa64 lowerings land.
Diffstat:
6 files changed, 1392 insertions(+), 3 deletions(-)
diff --git a/test/cg/CORPUS.md b/test/cg/CORPUS.md
@@ -239,13 +239,93 @@ can deref).
| `i09_alloca_preserves_locals` | · | named `int` locals before+after alloca; both readable post-alloca | 42 |
| `i10_alloca_after_named_local`| · | alloca after a fixed local — frame layout must keep both addressable | 42 |
+## Group J — varargs
+
+Drives `va_start_`, `va_arg_`, `va_end_`, `va_copy_` on `CGTarget` and
+the ABI's vararg classification (`abi_va_list_type` + the
+`vararg_*_offset` fields on `ABIFuncInfo`). Each case pairs a variadic
+helper (`int f(int n, ...)`) with a `test_main` caller; the helper
+allocates an `ap` of `abi_va_list_type` size in a local slot and passes
+its address to `va_start_`/`va_arg_`. AArch64 PCS routes int and FP var
+args through separate save areas, so spill cases exist for each.
+
+| Case | Status | Body | Expected |
+|---|---|---|---|
+| `j01_va_int_sum_3` | · | `int sum(int n, ...)`; `sum(3, 1, 2, 3)` (basic va_start/va_arg/va_end) | 6 |
+| `j02_va_zero_args` | · | `sum(0)` — va_start/va_end with zero va_arg calls | 0 |
+| `j03_va_int_spill` | · | `sum(10, 1..10)` — 10 var ints (>7 in GPR save area; rest spill) | 55 |
+| `j04_va_int64` | · | `sum_ll(2, 21LL, 21LL)` — i64 var args; low 32 of sum | 42 |
+| `j05_va_double_sum` | · | `int sumd(int n, ...){ftoi_s of fp accumulator}`; `sumd(3, 1.5, 2.0, 3.5)` | 7 |
+| `j06_va_double_spill` | · | `sumd(9, 0.5×9)` — exhaust FP save area; last spills | 4 |
+| `j07_va_mixed_int_dbl` | · | `int f(int n, ...)` called with `(int, double, int, double)` var args; each double truncated to int before summing | 42 |
+| `j08_va_copy` | · | `va_copy(b, a)`; consume first arg from each — equal halves of `42` | 42 |
+| `j09_va_two_fixed` | · | `int f(int a, int b, ...) { return a+b+va_arg(); }` — second fixed slot | 42 |
+
+## Group K — atomics
+
+Exercises `atomic_load`, `atomic_store`, `atomic_rmw` (every `AtomicOp`
+kind), `atomic_cas` (success and failure paths), and `fence`. Each case
+stores into an `FSF_ADDR_TAKEN` i32/i64 local, performs one atomic op
+via the helper's address operand, then reads back via plain load to
+verify the post-state. `MemOrder` is varied across cases so that a
+backend which hard-codes one ordering is still driven through the
+others. A successful CAS returns the prior value; failure leaves memory unchanged.
+
+| Case | Status | Body | Expected |
+|---|---|---|---|
+| `k01_atomic_load_relaxed` | · | `int x=42; r=atomic_load(&x, RELAXED); return r;` | 42 |
+| `k02_atomic_store_load_acq` | · | `atomic_store(&x, 42, RELEASE); r=atomic_load(&x, ACQUIRE);` | 42 |
+| `k03_atomic_load_seq_cst` | · | `atomic_load(&x, SEQ_CST)` — full barrier ordering | 42 |
+| `k04_atomic_rmw_add` | · | `x=40; prior=rmw(ADD,&x,2,SEQ_CST); return atomic_load(&x);` post-state | 42 |
+| `k05_atomic_rmw_xchg` | · | `x=99; rmw(XCHG,&x,42); return load(&x);` | 42 |
+| `k06_atomic_rmw_and` | · | `x=0xFF; rmw(AND,&x,0x2A); return load(&x);` | 42 |
+| `k07_atomic_rmw_or` | · | `x=0x20; rmw(OR,&x,0x0A); return load(&x);` | 42 |
+| `k08_atomic_rmw_xor` | · | `x=0xFF; rmw(XOR,&x,0xD5); return load(&x);` (= 0x2A) | 42 |
+| `k09_atomic_rmw_sub` | · | `x=44; rmw(SUB,&x,2); return load(&x);` | 42 |
+| `k10_atomic_rmw_nand` | · | `x=0xFF; rmw(NAND,&x,0xD5);` post-state low 8 = `~(0xFF&0xD5)&0xFF = 0x2A` | 42 |
+| `k11_atomic_cas_success` | · | `x=10; cas(&x,exp=10,des=42)→ok=1;` post-load | 42 |
+| `k12_atomic_cas_failure` | · | `x=10; cas(&x,exp=99,des=42)→ok=0;` post-load (unchanged) | 10 |
+| `k13_atomic_load_i64` | · | i64 atomic load of `0x1_0000_002A`; low 8 | 42 |
+| `k14_atomic_rmw_prior` | · | return `prior` from `rmw(ADD,&x=40,2)` (not post-state) → 40 | 40 |
+| `k15_fence_seq_cst` | · | `fence(SEQ_CST)` between two plain stores+loads; no observable race | 42 |
+
+## Group L — intrinsics
+
+Drives `CGTarget.intrinsic` across every `IntrinKind` group. Bit ops
+return their result in a single REG dst. `MEMCPY`/`MEMMOVE`/`MEMSET`
+take three address/byte/n args and write through memory. Hint kinds
+(`PREFETCH`, `EXPECT`, `UNREACHABLE`, `TRAP`, `ASSUME_ALIGNED`) are
+emitted on a path the test then steps over; the oracle is the post-hint
+control flow. Checked-arith intrinsics return `(result, overflow_flag)`
+in two REG dsts; cases observe each independently.
+
+| Case | Status | Body | Expected |
+|---|---|---|---|
+| `l01_popcount_u32` | · | `popcount(0x000000FF) → 8` | 8 |
+| `l02_popcount_u64` | · | `popcount((u64)-1) → 64` | 64 |
+| `l03_ctz_u32` | · | `ctz(0x80) → 7` | 7 |
+| `l04_clz_u32` | · | `clz(0x000000FF) → 24` (32-bit) | 24 |
+| `l05_bswap16` | · | `bswap16(0x1234) → 0x3412`; low 8 | 18 |
+| `l06_bswap32` | · | `bswap32(0x11223344) → 0x44332211`; low 8 | 17 |
+| `l07_bswap64` | · | `bswap64(0x1122334455667788) → 0x8877665544332211`; low 8 | 17 |
+| `l08_memcpy_4` | · | i32 src=42; `memcpy(&dst,&src,4)`; return dst | 42 |
+| `l09_memmove_overlap` | · | `int a[5]={1,2,3,4,5}; memmove(a+1,a,16); return a[4];` (overlap-safe) | 4 |
+| `l10_memset_zero` | · | `int b[4]; memset(b,0,16); return b[2];` | 0 |
+| `l11_memset_ff` | · | `int b; memset(&b,0xFF,4); return b;` low 8 | 255 |
+| `l12_expect_taken` | · | `if (__builtin_expect(x==1,1)) return 42;` with `x=1` | 42 |
+| `l13_unreachable_live` | · | `if (x) return 42; else __builtin_unreachable();` with `x=1` | 42 |
+| `l14_trap_live` | · | `if (x) return 42; else __builtin_trap();` with `x=1` — trap path unreached | 42 |
+| `l15_prefetch_noop` | · | `__builtin_prefetch(p); *p = 42; return *p;` — hint must not corrupt p | 42 |
+| `l16_assume_aligned` | · | `p = assume_aligned(p,8); *p=42; return *p;` — hint must round-trip p | 42 |
+| `l17_add_overflow_no` | · | `add_overflow(20,22,&r) → ovf=0`; return `r` | 42 |
+| `l18_add_overflow_yes` | · | `add_overflow(INT_MAX,1,&r) → ovf=1`; return `ovf` | 1 |
+| `l19_sub_overflow_yes` | · | `sub_overflow(INT_MIN,1,&r) → ovf=1`; return `ovf` | 1 |
+| `l20_mul_overflow_no` | · | `mul_overflow(6,7,&r) → ovf=0`; return `r` | 42 |
+
## Deferred groups
| Group | Theme |
|---|---|
-| J | varargs |
-| K | atomics |
-| L | intrinsics |
| M | inline asm |
| N | TLS |
| O | sections + globals |
diff --git a/test/cg/harness/cases.c b/test/cg/harness/cases.c
@@ -139,6 +139,53 @@ void build_i08_vla_param_sum(CgTestCtx*);
void build_i09_alloca_preserves_locals(CgTestCtx*);
void build_i10_alloca_after_named_local(CgTestCtx*);
+void build_j01_va_int_sum_3(CgTestCtx*);
+void build_j02_va_zero_args(CgTestCtx*);
+void build_j03_va_int_spill(CgTestCtx*);
+void build_j04_va_int64(CgTestCtx*);
+void build_j05_va_double_sum(CgTestCtx*);
+void build_j06_va_double_spill(CgTestCtx*);
+void build_j07_va_mixed_int_dbl(CgTestCtx*);
+void build_j08_va_copy(CgTestCtx*);
+void build_j09_va_two_fixed(CgTestCtx*);
+
+void build_k01_atomic_load_relaxed(CgTestCtx*);
+void build_k02_atomic_store_load_acq(CgTestCtx*);
+void build_k03_atomic_load_seq_cst(CgTestCtx*);
+void build_k04_atomic_rmw_add(CgTestCtx*);
+void build_k05_atomic_rmw_xchg(CgTestCtx*);
+void build_k06_atomic_rmw_and(CgTestCtx*);
+void build_k07_atomic_rmw_or(CgTestCtx*);
+void build_k08_atomic_rmw_xor(CgTestCtx*);
+void build_k09_atomic_rmw_sub(CgTestCtx*);
+void build_k10_atomic_rmw_nand(CgTestCtx*);
+void build_k11_atomic_cas_success(CgTestCtx*);
+void build_k12_atomic_cas_failure(CgTestCtx*);
+void build_k13_atomic_load_i64(CgTestCtx*);
+void build_k14_atomic_rmw_prior(CgTestCtx*);
+void build_k15_fence_seq_cst(CgTestCtx*);
+
+void build_l01_popcount_u32(CgTestCtx*);
+void build_l02_popcount_u64(CgTestCtx*);
+void build_l03_ctz_u32(CgTestCtx*);
+void build_l04_clz_u32(CgTestCtx*);
+void build_l05_bswap16(CgTestCtx*);
+void build_l06_bswap32(CgTestCtx*);
+void build_l07_bswap64(CgTestCtx*);
+void build_l08_memcpy_4(CgTestCtx*);
+void build_l09_memmove_overlap(CgTestCtx*);
+void build_l10_memset_zero(CgTestCtx*);
+void build_l11_memset_ff(CgTestCtx*);
+void build_l12_expect_taken(CgTestCtx*);
+void build_l13_unreachable_live(CgTestCtx*);
+void build_l14_trap_live(CgTestCtx*);
+void build_l15_prefetch_noop(CgTestCtx*);
+void build_l16_assume_aligned(CgTestCtx*);
+void build_l17_add_overflow_no(CgTestCtx*);
+void build_l18_add_overflow_yes(CgTestCtx*);
+void build_l19_sub_overflow_yes(CgTestCtx*);
+void build_l20_mul_overflow_no(CgTestCtx*);
+
/* ---- registry ---- */
const CgCase cg_cases[] = {
@@ -274,6 +321,56 @@ const CgCase cg_cases[] = {
{ "i08_vla_param_sum", build_i08_vla_param_sum, 45, CG_CASE_DEFAULT },
{ "i09_alloca_preserves_locals", build_i09_alloca_preserves_locals, 42, CG_CASE_DEFAULT },
{ "i10_alloca_after_named_local", build_i10_alloca_after_named_local, 42, CG_CASE_DEFAULT },
+
+ /* Group J — varargs */
+ { "j01_va_int_sum_3", build_j01_va_int_sum_3, 6, CG_CASE_DEFAULT },
+ { "j02_va_zero_args", build_j02_va_zero_args, 0, CG_CASE_DEFAULT },
+ { "j03_va_int_spill", build_j03_va_int_spill, 55, CG_CASE_DEFAULT },
+ { "j04_va_int64", build_j04_va_int64, 42, CG_CASE_DEFAULT },
+ { "j05_va_double_sum", build_j05_va_double_sum, 7, CG_CASE_DEFAULT },
+ { "j06_va_double_spill", build_j06_va_double_spill, 4, CG_CASE_DEFAULT },
+ { "j07_va_mixed_int_dbl", build_j07_va_mixed_int_dbl, 42, CG_CASE_DEFAULT },
+ { "j08_va_copy", build_j08_va_copy, 42, CG_CASE_DEFAULT },
+ { "j09_va_two_fixed", build_j09_va_two_fixed, 42, CG_CASE_DEFAULT },
+
+ /* Group K — atomics */
+ { "k01_atomic_load_relaxed", build_k01_atomic_load_relaxed, 42, CG_CASE_DEFAULT },
+ { "k02_atomic_store_load_acq", build_k02_atomic_store_load_acq, 42, CG_CASE_DEFAULT },
+ { "k03_atomic_load_seq_cst", build_k03_atomic_load_seq_cst, 42, CG_CASE_DEFAULT },
+ { "k04_atomic_rmw_add", build_k04_atomic_rmw_add, 42, CG_CASE_DEFAULT },
+ { "k05_atomic_rmw_xchg", build_k05_atomic_rmw_xchg, 42, CG_CASE_DEFAULT },
+ { "k06_atomic_rmw_and", build_k06_atomic_rmw_and, 42, CG_CASE_DEFAULT },
+ { "k07_atomic_rmw_or", build_k07_atomic_rmw_or, 42, CG_CASE_DEFAULT },
+ { "k08_atomic_rmw_xor", build_k08_atomic_rmw_xor, 42, CG_CASE_DEFAULT },
+ { "k09_atomic_rmw_sub", build_k09_atomic_rmw_sub, 42, CG_CASE_DEFAULT },
+ { "k10_atomic_rmw_nand", build_k10_atomic_rmw_nand, 42, CG_CASE_DEFAULT },
+ { "k11_atomic_cas_success", build_k11_atomic_cas_success, 42, CG_CASE_DEFAULT },
+ { "k12_atomic_cas_failure", build_k12_atomic_cas_failure, 10, CG_CASE_DEFAULT },
+ { "k13_atomic_load_i64", build_k13_atomic_load_i64, 42, CG_CASE_DEFAULT },
+ { "k14_atomic_rmw_prior", build_k14_atomic_rmw_prior, 40, CG_CASE_DEFAULT },
+ { "k15_fence_seq_cst", build_k15_fence_seq_cst, 42, CG_CASE_DEFAULT },
+
+ /* Group L — intrinsics */
+ { "l01_popcount_u32", build_l01_popcount_u32, 8, CG_CASE_DEFAULT },
+ { "l02_popcount_u64", build_l02_popcount_u64, 64, CG_CASE_DEFAULT },
+ { "l03_ctz_u32", build_l03_ctz_u32, 7, CG_CASE_DEFAULT },
+ { "l04_clz_u32", build_l04_clz_u32, 24, CG_CASE_DEFAULT },
+ { "l05_bswap16", build_l05_bswap16, 18, CG_CASE_DEFAULT },
+ { "l06_bswap32", build_l06_bswap32, 17, CG_CASE_DEFAULT },
+ { "l07_bswap64", build_l07_bswap64, 17, CG_CASE_DEFAULT },
+ { "l08_memcpy_4", build_l08_memcpy_4, 42, CG_CASE_DEFAULT },
+ { "l09_memmove_overlap", build_l09_memmove_overlap, 4, CG_CASE_DEFAULT },
+ { "l10_memset_zero", build_l10_memset_zero, 0, CG_CASE_DEFAULT },
+ { "l11_memset_ff", build_l11_memset_ff, 255, CG_CASE_DEFAULT },
+ { "l12_expect_taken", build_l12_expect_taken, 42, CG_CASE_DEFAULT },
+ { "l13_unreachable_live", build_l13_unreachable_live, 42, CG_CASE_DEFAULT },
+ { "l14_trap_live", build_l14_trap_live, 42, CG_CASE_DEFAULT },
+ { "l15_prefetch_noop", build_l15_prefetch_noop, 42, CG_CASE_DEFAULT },
+ { "l16_assume_aligned", build_l16_assume_aligned, 42, CG_CASE_DEFAULT },
+ { "l17_add_overflow_no", build_l17_add_overflow_no, 42, CG_CASE_DEFAULT },
+ { "l18_add_overflow_yes", build_l18_add_overflow_yes, 1, CG_CASE_DEFAULT },
+ { "l19_sub_overflow_yes", build_l19_sub_overflow_yes, 1, CG_CASE_DEFAULT },
+ { "l20_mul_overflow_no", build_l20_mul_overflow_no, 42, CG_CASE_DEFAULT },
};
const unsigned cg_cases_count = sizeof(cg_cases) / sizeof(cg_cases[0]);
diff --git a/test/cg/harness/cases_j.c b/test/cg/harness/cases_j.c
@@ -0,0 +1,584 @@
+/* Group J — varargs.
+ * See CORPUS.md for the case list and expected values. */
+
+#include "cg_test.h"
+
+#include "core/pool.h"
+#include "core/arena.h"
+
+#include <string.h>
+
+/* ============================================================
+ * Group J: varargs
+ *
+ * Drives va_start_/va_arg_/va_end_/va_copy_ on CGTarget plus the ABI's
+ * variadic classification (abi_func_info on a type_func with variadic=1
+ * carries vararg_gp_offset/vararg_fp_offset/vararg_overflow_offset). The
+ * standard cgtest_begin_func/cgtest_call helpers hardcode variadic=0 so
+ * this file mirrors the variadic-aware paths locally.
+ *
+ * Each test_main calls a variadic helper. The helper allocates an `ap`
+ * local of abi_va_list_type size (FSF_ADDR_TAKEN), invokes va_start_
+ * with &ap, runs n va_arg_'s, calls va_end_, and returns the
+ * accumulator.
+ * ============================================================ */
+
+/* ---- variadic-aware helpers ---- */
+
+/* Mirrors cgtest_begin_func_at but builds fn_type with variadic=1.
+ * The caller passes the count of fixed (named) params; var args are
+ * appended at the call site. */
+static CgTestFn* j_begin_va_func(CgTestCtx* ctx, const char* name,
+ const Type* ret_ty,
+ const Type* const* fixed_param_types,
+ u32 nfixed)
+{
+ CgTestFn* tf = arena_new(ctx->c->tu, CgTestFn);
+ memset(tf, 0, sizeof *tf);
+ tf->ctx = ctx;
+ tf->ret_ty = ret_ty;
+
+ const Type** ptypes = NULL;
+ if (nfixed) {
+ ptypes = arena_array(ctx->c->tu, const Type*, nfixed);
+ for (u32 i = 0; i < nfixed; ++i) ptypes[i] = fixed_param_types[i];
+ }
+ tf->fn_type = type_func(ctx->pool, ret_ty, ptypes, (u16)nfixed, 1);
+ tf->abi_info = abi_func_info(ctx->c->abi, tf->fn_type);
+ tf->sym = cgtest_decl_func(ctx, name);
+
+ CGParamDesc* pds = NULL;
+ if (nfixed) {
+ tf->params = arena_array(ctx->c->tu, CgTestParam, nfixed);
+ memset(tf->params, 0, sizeof(CgTestParam) * nfixed);
+ pds = arena_array(ctx->c->tu, CGParamDesc, nfixed);
+ memset(pds, 0, sizeof(CGParamDesc) * nfixed);
+ for (u32 i = 0; i < nfixed; ++i) {
+ tf->params[i].type = ptypes[i];
+ tf->params[i].abi = &tf->abi_info->params[i];
+ pds[i].index = i;
+ pds[i].type = ptypes[i];
+ pds[i].slot = FRAME_SLOT_NONE;
+ pds[i].abi = &tf->abi_info->params[i];
+ pds[i].incoming = tf->abi_info->params[i].parts;
+ pds[i].nincoming= tf->abi_info->params[i].nparts;
+ }
+ }
+ tf->nparams = nfixed;
+
+ tf->fd.sym = tf->sym;
+ tf->fd.text_section_id = ctx->text_sec;
+ tf->fd.group_id = OBJ_GROUP_NONE;
+ tf->fd.fn_type = tf->fn_type;
+ tf->fd.abi = tf->abi_info;
+ tf->fd.params = pds;
+ tf->fd.nparams = nfixed;
+
+ ctx->target->func_begin(ctx->target, &tf->fd);
+
+ for (u32 i = 0; i < nfixed; ++i) {
+ FrameSlotDesc fsd = {
+ .type = ptypes[i],
+ .size = abi_sizeof (ctx->c->abi, ptypes[i]),
+ .align = abi_alignof(ctx->c->abi, ptypes[i]),
+ .kind = FS_PARAM,
+ .flags = FSF_NONE,
+ };
+ FrameSlot s = ctx->target->frame_slot(ctx->target, &fsd);
+ tf->params[i].slot = s;
+ pds[i].slot = s;
+ ctx->target->param(ctx->target, &pds[i]);
+ }
+ return tf;
+}
+
+/* Direct call to a variadic callee. fn_type built with variadic=1; the
+ * abi info reports per-arg classification including the ABI's
+ * vararg-vs-fixed split. */
+static void j_call_va(CgTestFn* caller, ObjSymId callee_sym,
+ const Type* ret_ty,
+ const Type* const* arg_types,
+ const CgTestArg* args,
+ u32 nargs, u32 nfixed,
+ Operand ret_storage)
+{
+ CgTestCtx* ctx = caller->ctx;
+ const Type** ptypes = NULL;
+ if (nargs) {
+ ptypes = arena_array(ctx->c->tu, const Type*, nargs);
+ for (u32 i = 0; i < nargs; ++i) ptypes[i] = arg_types[i];
+ }
+ /* type_func with variadic=1; nparams is the fixed count. nfixed must
+ * match the helper's named-param count even though we pass nargs
+ * Type pointers — abi_func_info reads its variadic flag from the
+ * Type and handles per-arg classification via ABIFuncInfo.params[]. */
+ const Type* fn_ty = type_func(ctx->pool, ret_ty, ptypes, (u16)nfixed, 1);
+ const ABIFuncInfo* info = abi_func_info(ctx->c->abi, fn_ty);
+
+ CGABIValue* avs = NULL;
+ if (nargs) {
+ avs = arena_array(ctx->c->tu, CGABIValue, nargs);
+ memset(avs, 0, sizeof(CGABIValue) * nargs);
+ for (u32 i = 0; i < nargs; ++i) {
+ CGABIValue* av = &avs[i];
+ av->type = arg_types[i];
+ av->abi = (i < info->nparams) ? &info->params[i] : NULL;
+ switch (args[i].kind) {
+ case CGT_ARG_IMM:
+ av->storage = IMM_op(args[i].v.imm, arg_types[i]); break;
+ case CGT_ARG_REG:
+ av->storage = REG_op(args[i].v.reg, arg_types[i]); break;
+ default:
+ av->storage = LOCAL_op(args[i].v.slot, arg_types[i]); break;
+ }
+ }
+ }
+
+ CGCallDesc desc; memset(&desc, 0, sizeof desc);
+ desc.fn_type = fn_ty;
+ desc.abi = info;
+ desc.callee = GLOBAL_op(callee_sym, 0);
+ desc.args = avs;
+ desc.nargs = nargs;
+ desc.ret.type = ret_ty;
+ desc.ret.abi = &info->ret;
+ desc.ret.storage = ret_storage;
+ ctx->target->call(ctx->target, &desc);
+}
+
+/* ---- shared helpers ---- */
+
+/* Allocate an ap local of abi_va_list_type and addr_of into a register. */
+typedef struct VaApRegs { FrameSlot slot; Reg ap_addr; const Type* ap_ty; } VaApRegs;
+
+static VaApRegs j_alloc_ap(CgTestFn* tf)
+{
+ CgTestCtx* ctx = tf->ctx;
+ const Type* ap_ty = abi_va_list_type(ctx->c->abi, ctx->pool);
+ const Type* ap_pty = T_ptr(ctx, ap_ty);
+ FrameSlot ap_slot = cgtest_local(tf, ap_ty, FSF_ADDR_TAKEN);
+ Reg ap_addr = ctx->target->alloc_reg(ctx->target, RC_INT, ap_pty);
+ ctx->target->addr_of(ctx->target, REG_op(ap_addr, ap_pty),
+ LOCAL_op(ap_slot, ap_ty));
+ return (VaApRegs){ ap_slot, ap_addr, ap_ty };
+}
+
+/* Build helper: acc_ty sum(int n, ...) { va_start(ap); acc_ty s=0; for(i=0;i<n;i++)
+ * s += va_arg(ap, va_ty); va_end(ap); return s; } — returns the helper's symbol. */
+static ObjSymId j_build_int_sum_helper(CgTestCtx* ctx, const char* name,
+ const Type* va_ty, const Type* acc_ty)
+{
+ const Type* I32 = T_i32(ctx);
+ const Type* params[] = { I32 };
+ CgTestFn* tf = j_begin_va_func(ctx, name, acc_ty, params, 1);
+ CGTarget* T = ctx->target;
+
+ Reg n = T->alloc_reg(T, RC_INT, I32);
+ cgtest_load_local(tf, REG_op(n, I32), cgtest_param_slot(tf, 0), I32);
+
+ VaApRegs ap = j_alloc_ap(tf);
+ T->va_start_(T, REG_op(ap.ap_addr, T_ptr(ctx, ap.ap_ty)));
+
+ /* Accumulator and loop counter both start at 0. */
+ FrameSlot ss = cgtest_local(tf, acc_ty, FSF_NONE);
+ cgtest_store_local(tf, ss, IMM_op(0, acc_ty), acc_ty);
+ FrameSlot is = cgtest_local(tf, I32, FSF_NONE);
+ cgtest_store_local(tf, is, IMM_op(0, I32), I32);
+
+ Label top = T->label_new(T);
+ Label end = T->label_new(T);
+ T->label_place(T, top);
+ Reg ir = T->alloc_reg(T, RC_INT, I32);
+ cgtest_load_local(tf, REG_op(ir, I32), is, I32);
+ T->cmp_branch(T, CMP_GE_S, REG_op(ir, I32), REG_op(n, I32), end);
+
+ Reg v = T->alloc_reg(T, RC_INT, va_ty);
+ T->va_arg_(T, REG_op(v, va_ty), REG_op(ap.ap_addr, T_ptr(ctx, ap.ap_ty)), va_ty);
+
+ Reg sr = T->alloc_reg(T, RC_INT, acc_ty);
+ cgtest_load_local(tf, REG_op(sr, acc_ty), ss, acc_ty);
+ T->binop(T, BO_IADD, REG_op(sr, acc_ty), REG_op(sr, acc_ty), REG_op(v, va_ty));
+ cgtest_store_local(tf, ss, REG_op(sr, acc_ty), acc_ty);
+
+ T->binop(T, BO_IADD, REG_op(ir, I32), REG_op(ir, I32), IMM_op(1, I32));
+ cgtest_store_local(tf, is, REG_op(ir, I32), I32);
+ T->jump(T, top);
+ T->label_place(T, end);
+
+ T->va_end_(T, REG_op(ap.ap_addr, T_ptr(ctx, ap.ap_ty)));
+ Reg out = T->alloc_reg(T, RC_INT, acc_ty);
+ cgtest_load_local(tf, REG_op(out, acc_ty), ss, acc_ty);
+ cgtest_ret_reg(tf, out, acc_ty);
+ cgtest_end(tf);
+ return tf->sym;
+}
+
+/* Build helper: int sumd(int n, ...) — f64 accumulator, CV_FTOI_S before return. */
+static ObjSymId j_build_double_sum_helper(CgTestCtx* ctx, const char* name)
+{
+ const Type* I32 = T_i32(ctx);
+ const Type* F64 = T_f64(ctx);
+ const Type* params[] = { I32 };
+ CgTestFn* tf = j_begin_va_func(ctx, name, I32, params, 1);
+ CGTarget* T = ctx->target;
+
+ Reg n = T->alloc_reg(T, RC_INT, I32);
+ cgtest_load_local(tf, REG_op(n, I32), cgtest_param_slot(tf, 0), I32);
+
+ VaApRegs ap = j_alloc_ap(tf);
+ T->va_start_(T, REG_op(ap.ap_addr, T_ptr(ctx, ap.ap_ty)));
+
+ FrameSlot ss = cgtest_local(tf, F64, FSF_NONE);
+ Reg zero = T->alloc_reg(T, RC_FP, F64);
+ /* Materialize 0.0 by converting an integer 0 (simpler than a u64 bitcast). */
+ Reg iz = T->alloc_reg(T, RC_INT, I32);
+ T->load_imm(T, REG_op(iz, I32), 0);
+ T->convert(T, CV_ITOF_S, REG_op(zero, F64), REG_op(iz, I32));
+ cgtest_store_local(tf, ss, REG_op(zero, F64), F64);
+
+ FrameSlot is = cgtest_local(tf, I32, FSF_NONE);
+ cgtest_store_local(tf, is, IMM_op(0, I32), I32);
+
+ Label top = T->label_new(T);
+ Label end = T->label_new(T);
+ T->label_place(T, top);
+ Reg ir = T->alloc_reg(T, RC_INT, I32);
+ cgtest_load_local(tf, REG_op(ir, I32), is, I32);
+ T->cmp_branch(T, CMP_GE_S, REG_op(ir, I32), REG_op(n, I32), end);
+
+ Reg v = T->alloc_reg(T, RC_FP, F64);
+ T->va_arg_(T, REG_op(v, F64), REG_op(ap.ap_addr, T_ptr(ctx, ap.ap_ty)), F64);
+
+ Reg sr = T->alloc_reg(T, RC_FP, F64);
+ cgtest_load_local(tf, REG_op(sr, F64), ss, F64);
+ T->binop(T, BO_FADD, REG_op(sr, F64), REG_op(sr, F64), REG_op(v, F64));
+ cgtest_store_local(tf, ss, REG_op(sr, F64), F64);
+
+ T->binop(T, BO_IADD, REG_op(ir, I32), REG_op(ir, I32), IMM_op(1, I32));
+ cgtest_store_local(tf, is, REG_op(ir, I32), I32);
+ T->jump(T, top);
+ T->label_place(T, end);
+
+ T->va_end_(T, REG_op(ap.ap_addr, T_ptr(ctx, ap.ap_ty)));
+ Reg final = T->alloc_reg(T, RC_FP, F64);
+ cgtest_load_local(tf, REG_op(final, F64), ss, F64);
+ Reg ir32 = T->alloc_reg(T, RC_INT, I32);
+ T->convert(T, CV_FTOI_S, REG_op(ir32, I32), REG_op(final, F64));
+ cgtest_ret_reg(tf, ir32, I32);
+ cgtest_end(tf);
+ return tf->sym;
+}
+
+/* ---- cases ---- */
+
+/* j01_va_int_sum_3 — sum(3, 1, 2, 3) → 6. */
+void build_j01_va_int_sum_3(CgTestCtx* ctx)
+{
+ const Type* I32 = T_i32(ctx);
+ ObjSymId sum = j_build_int_sum_helper(ctx, "j01_sum", I32, I32);
+
+ CgTestFn* tf = cgtest_begin_main(ctx, I32);
+ Reg dst = ctx->target->alloc_reg(ctx->target, RC_INT, I32);
+ const Type* atypes[] = { I32, I32, I32, I32 };
+ CgTestArg args[] = {
+ { .kind = CGT_ARG_IMM, .type = I32, .v.imm = 3 },
+ { .kind = CGT_ARG_IMM, .type = I32, .v.imm = 1 },
+ { .kind = CGT_ARG_IMM, .type = I32, .v.imm = 2 },
+ { .kind = CGT_ARG_IMM, .type = I32, .v.imm = 3 },
+ };
+ j_call_va(tf, sum, I32, atypes, args, 4, 1, REG_op(dst, I32));
+ cgtest_ret_reg(tf, dst, I32);
+ cgtest_end(tf);
+}
+
+/* j02_va_zero_args — sum(0); va_start/va_end with no va_arg → 0. */
+void build_j02_va_zero_args(CgTestCtx* ctx)
+{
+ const Type* I32 = T_i32(ctx);
+ ObjSymId sum = j_build_int_sum_helper(ctx, "j02_sum", I32, I32);
+
+ CgTestFn* tf = cgtest_begin_main(ctx, I32);
+ Reg dst = ctx->target->alloc_reg(ctx->target, RC_INT, I32);
+ const Type* atypes[] = { I32 };
+ CgTestArg args[] = { { .kind = CGT_ARG_IMM, .type = I32, .v.imm = 0 } };
+ j_call_va(tf, sum, I32, atypes, args, 1, 1, REG_op(dst, I32));
+ cgtest_ret_reg(tf, dst, I32);
+ cgtest_end(tf);
+}
+
+/* j03_va_int_spill — sum(10, 1..10) → 55. Exhausts AArch64 GPR save area. */
+void build_j03_va_int_spill(CgTestCtx* ctx)
+{
+ const Type* I32 = T_i32(ctx);
+ ObjSymId sum = j_build_int_sum_helper(ctx, "j03_sum", I32, I32);
+
+ CgTestFn* tf = cgtest_begin_main(ctx, I32);
+ Reg dst = ctx->target->alloc_reg(ctx->target, RC_INT, I32);
+ const Type* atypes[11] = { I32, I32, I32, I32, I32, I32, I32, I32, I32, I32, I32 };
+ CgTestArg args[11];
+ args[0] = (CgTestArg){ .kind = CGT_ARG_IMM, .type = I32, .v.imm = 10 };
+ for (int i = 0; i < 10; ++i) {
+ args[i+1] = (CgTestArg){ .kind = CGT_ARG_IMM, .type = I32, .v.imm = i+1 };
+ }
+ j_call_va(tf, sum, I32, atypes, args, 11, 1, REG_op(dst, I32));
+ cgtest_ret_reg(tf, dst, I32);
+ cgtest_end(tf);
+}
+
+/* j04_va_int64 — sum_ll(2, 21LL, 21LL); low 32 of result → 42. */
+void build_j04_va_int64(CgTestCtx* ctx)
+{
+ const Type* I32 = T_i32(ctx);
+ const Type* I64 = T_i64(ctx);
+ ObjSymId sum = j_build_int_sum_helper(ctx, "j04_sum_ll", I64, I64);
+
+ CgTestFn* tf = cgtest_begin_main(ctx, I32);
+ CGTarget* T = ctx->target;
+ Reg r64 = T->alloc_reg(T, RC_INT, I64);
+ const Type* atypes[] = { I32, I64, I64 };
+ CgTestArg args[] = {
+ { .kind = CGT_ARG_IMM, .type = I32, .v.imm = 2 },
+ { .kind = CGT_ARG_IMM, .type = I64, .v.imm = 21 },
+ { .kind = CGT_ARG_IMM, .type = I64, .v.imm = 21 },
+ };
+ j_call_va(tf, sum, I64, atypes, args, 3, 1, REG_op(r64, I64));
+ /* Truncate to i32. */
+ Reg r32 = T->alloc_reg(T, RC_INT, I32);
+ T->convert(T, CV_TRUNC, REG_op(r32, I32), REG_op(r64, I64));
+ cgtest_ret_reg(tf, r32, I32);
+ cgtest_end(tf);
+}
+
+/* ---- helpers for fp + double-arg passing ---- */
+
+/* Emit a load_const of an f64 constant given as 8 raw little-endian bytes;
+ * returns the freshly allocated FP reg that receives it. */
+static Reg j_load_f64(CgTestCtx* ctx, const u8* bytes_le8)
+{
+ const Type* F64 = T_f64(ctx);
+ Reg r = ctx->target->alloc_reg(ctx->target, RC_FP, F64);
+ ConstBytes cb = { .type = F64, .bytes = bytes_le8, .size = 8, .align = 8 };
+ ctx->target->load_const(ctx->target, REG_op(r, F64), cb);
+ return r;
+}
+
+/* j05_va_double_sum — sumd(3, 1.5, 2.0, 3.5) → 7. */
+void build_j05_va_double_sum(CgTestCtx* ctx)
+{
+ const Type* I32 = T_i32(ctx);
+ const Type* F64 = T_f64(ctx);
+ ObjSymId sumd = j_build_double_sum_helper(ctx, "j05_sumd");
+
+ CgTestFn* tf = cgtest_begin_main(ctx, I32);
+ /* 1.5, 2.0, 3.5 as little-endian double bytes. */
+ static const u8 D15[8] = {0,0,0,0,0,0,0xF8,0x3F};
+ static const u8 D20[8] = {0,0,0,0,0,0,0x00,0x40};
+ static const u8 D35[8] = {0,0,0,0,0,0,0x0C,0x40};
+ Reg r1 = j_load_f64(ctx, D15);
+ Reg r2 = j_load_f64(ctx, D20);
+ Reg r3 = j_load_f64(ctx, D35);
+ Reg dst = ctx->target->alloc_reg(ctx->target, RC_INT, I32);
+
+ const Type* atypes[] = { I32, F64, F64, F64 };
+ CgTestArg args[] = {
+ { .kind = CGT_ARG_IMM, .type = I32, .v.imm = 3 },
+ { .kind = CGT_ARG_REG, .type = F64, .v.reg = r1 },
+ { .kind = CGT_ARG_REG, .type = F64, .v.reg = r2 },
+ { .kind = CGT_ARG_REG, .type = F64, .v.reg = r3 },
+ };
+ j_call_va(tf, sumd, I32, atypes, args, 4, 1, REG_op(dst, I32));
+ cgtest_ret_reg(tf, dst, I32);
+ cgtest_end(tf);
+}
+
+/* j06_va_double_spill — sumd(9, 0.5×9) → 4 (after ftoi_s of 4.5). */
+void build_j06_va_double_spill(CgTestCtx* ctx)
+{
+ const Type* I32 = T_i32(ctx);
+ const Type* F64 = T_f64(ctx);
+ ObjSymId sumd = j_build_double_sum_helper(ctx, "j06_sumd");
+
+ CgTestFn* tf = cgtest_begin_main(ctx, I32);
+ static const u8 D05[8] = {0,0,0,0,0,0,0xE0,0x3F}; /* 0.5 */
+ Reg dst = ctx->target->alloc_reg(ctx->target, RC_INT, I32);
+
+ const Type* atypes[10] = { I32, F64, F64, F64, F64, F64, F64, F64, F64, F64 };
+ CgTestArg args[10];
+ args[0] = (CgTestArg){ .kind = CGT_ARG_IMM, .type = I32, .v.imm = 9 };
+ for (int i = 0; i < 9; ++i) {
+ Reg r = j_load_f64(ctx, D05);
+ args[i+1] = (CgTestArg){ .kind = CGT_ARG_REG, .type = F64, .v.reg = r };
+ }
+ j_call_va(tf, sumd, I32, atypes, args, 10, 1, REG_op(dst, I32));
+ cgtest_ret_reg(tf, dst, I32);
+ cgtest_end(tf);
+}
+
+/* helper for j07: int f(int n, ...) — n is the only fixed param (unread here);
+ * reads var args (int a, double b, int c, double d), returns a+(int)b+c+(int)d. */
+static ObjSymId j_build_j07_helper(CgTestCtx* ctx)
+{
+ const Type* I32 = T_i32(ctx);
+ const Type* F64 = T_f64(ctx);
+ const Type* params[] = { I32 };
+ CgTestFn* tf = j_begin_va_func(ctx, "j07_f", I32, params, 1);
+ CGTarget* T = ctx->target;
+
+ VaApRegs ap = j_alloc_ap(tf);
+ T->va_start_(T, REG_op(ap.ap_addr, T_ptr(ctx, ap.ap_ty)));
+
+ Reg a = T->alloc_reg(T, RC_INT, I32);
+ Reg c = T->alloc_reg(T, RC_INT, I32);
+ Reg b = T->alloc_reg(T, RC_FP, F64);
+ Reg d = T->alloc_reg(T, RC_FP, F64);
+ T->va_arg_(T, REG_op(a, I32), REG_op(ap.ap_addr, T_ptr(ctx, ap.ap_ty)), I32);
+ T->va_arg_(T, REG_op(b, F64), REG_op(ap.ap_addr, T_ptr(ctx, ap.ap_ty)), F64);
+ T->va_arg_(T, REG_op(c, I32), REG_op(ap.ap_addr, T_ptr(ctx, ap.ap_ty)), I32);
+ T->va_arg_(T, REG_op(d, F64), REG_op(ap.ap_addr, T_ptr(ctx, ap.ap_ty)), F64);
+
+ Reg ib = T->alloc_reg(T, RC_INT, I32);
+ Reg id = T->alloc_reg(T, RC_INT, I32);
+ T->convert(T, CV_FTOI_S, REG_op(ib, I32), REG_op(b, F64));
+ T->convert(T, CV_FTOI_S, REG_op(id, I32), REG_op(d, F64));
+ Reg s = T->alloc_reg(T, RC_INT, I32);
+ T->binop(T, BO_IADD, REG_op(s, I32), REG_op(a, I32), REG_op(ib, I32));
+ T->binop(T, BO_IADD, REG_op(s, I32), REG_op(s, I32), REG_op(c, I32));
+ T->binop(T, BO_IADD, REG_op(s, I32), REG_op(s, I32), REG_op(id, I32));
+
+ T->va_end_(T, REG_op(ap.ap_addr, T_ptr(ctx, ap.ap_ty)));
+ cgtest_ret_reg(tf, s, I32);
+ cgtest_end(tf);
+ return tf->sym;
+}
+
+/* j07_va_mixed_int_dbl — f(_, 10, 16.0, 8, 8.0) → 42.
+ * Constants are chosen so the sum lands on 42: 10 + (int)16.0 + 8 + (int)8.0.
+ */
+void build_j07_va_mixed_int_dbl(CgTestCtx* ctx)
+{
+ const Type* I32 = T_i32(ctx);
+ const Type* F64 = T_f64(ctx);
+ ObjSymId f = j_build_j07_helper(ctx);
+
+ CgTestFn* tf = cgtest_begin_main(ctx, I32);
+ static const u8 D16[8] = {0,0,0,0,0,0,0x30,0x40}; /* 16.0 */
+ static const u8 D08[8] = {0,0,0,0,0,0,0x20,0x40}; /* 8.0 */
+ Reg b = j_load_f64(ctx, D16);
+ Reg d = j_load_f64(ctx, D08);
+ Reg dst = ctx->target->alloc_reg(ctx->target, RC_INT, I32);
+
+ const Type* atypes[] = { I32, I32, F64, I32, F64 };
+ CgTestArg args[] = {
+ { .kind = CGT_ARG_IMM, .type = I32, .v.imm = 0 /* unused n */ },
+ { .kind = CGT_ARG_IMM, .type = I32, .v.imm = 10 },
+ { .kind = CGT_ARG_REG, .type = F64, .v.reg = b },
+ { .kind = CGT_ARG_IMM, .type = I32, .v.imm = 8 },
+ { .kind = CGT_ARG_REG, .type = F64, .v.reg = d },
+ };
+ j_call_va(tf, f, I32, atypes, args, 5, 1, REG_op(dst, I32));
+ cgtest_ret_reg(tf, dst, I32);
+ cgtest_end(tf);
+}
+
+/* helper for j08: int f(int n, ...) { va_list a, b; va_start(a); va_copy(b,a);
+ * int x = va_arg(a, int); int y = va_arg(b, int); return x + y; }
+ * Both ap and bp see the same first var arg, so x == y. */
+static ObjSymId j_build_j08_helper(CgTestCtx* ctx)
+{
+ const Type* I32 = T_i32(ctx);
+ const Type* params[] = { I32 };
+ CgTestFn* tf = j_begin_va_func(ctx, "j08_f", I32, params, 1);
+ CGTarget* T = ctx->target;
+
+ /* Two va_list locals + their addresses. */
+ const Type* ap_ty = abi_va_list_type(ctx->c->abi, ctx->pool);
+ const Type* ap_pty = T_ptr(ctx, ap_ty);
+ FrameSlot ap = cgtest_local(tf, ap_ty, FSF_ADDR_TAKEN);
+ FrameSlot bp = cgtest_local(tf, ap_ty, FSF_ADDR_TAKEN);
+ Reg a_addr = T->alloc_reg(T, RC_INT, ap_pty);
+ Reg b_addr = T->alloc_reg(T, RC_INT, ap_pty);
+ T->addr_of(T, REG_op(a_addr, ap_pty), LOCAL_op(ap, ap_ty));
+ T->addr_of(T, REG_op(b_addr, ap_pty), LOCAL_op(bp, ap_ty));
+
+ T->va_start_(T, REG_op(a_addr, ap_pty));
+ T->va_copy_ (T, REG_op(b_addr, ap_pty), REG_op(a_addr, ap_pty));
+
+ Reg x = T->alloc_reg(T, RC_INT, I32);
+ Reg y = T->alloc_reg(T, RC_INT, I32);
+ T->va_arg_(T, REG_op(x, I32), REG_op(a_addr, ap_pty), I32);
+ T->va_arg_(T, REG_op(y, I32), REG_op(b_addr, ap_pty), I32);
+
+ Reg s = T->alloc_reg(T, RC_INT, I32);
+ T->binop(T, BO_IADD, REG_op(s, I32), REG_op(x, I32), REG_op(y, I32));
+
+ T->va_end_(T, REG_op(a_addr, ap_pty));
+ T->va_end_(T, REG_op(b_addr, ap_pty));
+ cgtest_ret_reg(tf, s, I32);
+ cgtest_end(tf);
+ return tf->sym;
+}
+
+/* j08_va_copy — f(_, 21) → 21+21 = 42 (both va_lists see arg 0). */
+void build_j08_va_copy(CgTestCtx* ctx)
+{
+ const Type* I32 = T_i32(ctx);
+ ObjSymId f = j_build_j08_helper(ctx);
+
+ CgTestFn* tf = cgtest_begin_main(ctx, I32);
+ Reg dst = ctx->target->alloc_reg(ctx->target, RC_INT, I32);
+ const Type* atypes[] = { I32, I32 };
+ CgTestArg args[] = {
+ { .kind = CGT_ARG_IMM, .type = I32, .v.imm = 0 },
+ { .kind = CGT_ARG_IMM, .type = I32, .v.imm = 21 },
+ };
+ j_call_va(tf, f, I32, atypes, args, 2, 1, REG_op(dst, I32));
+ cgtest_ret_reg(tf, dst, I32);
+ cgtest_end(tf);
+}
+
+/* helper for j09: int f(int a, int b, ...) { va_list ap; va_start(ap, b);
+ * int c = va_arg(ap, int); va_end(ap); return a + b + c; } */
+static ObjSymId j_build_j09_helper(CgTestCtx* ctx)
+{
+ const Type* I32 = T_i32(ctx);
+ const Type* params[] = { I32, I32 };
+ CgTestFn* tf = j_begin_va_func(ctx, "j09_f", I32, params, 2);
+ CGTarget* T = ctx->target;
+
+ Reg a = T->alloc_reg(T, RC_INT, I32);
+ Reg b = T->alloc_reg(T, RC_INT, I32);
+ cgtest_load_local(tf, REG_op(a, I32), cgtest_param_slot(tf, 0), I32);
+ cgtest_load_local(tf, REG_op(b, I32), cgtest_param_slot(tf, 1), I32);
+
+ VaApRegs ap = j_alloc_ap(tf);
+ T->va_start_(T, REG_op(ap.ap_addr, T_ptr(ctx, ap.ap_ty)));
+ Reg c = T->alloc_reg(T, RC_INT, I32);
+ T->va_arg_(T, REG_op(c, I32), REG_op(ap.ap_addr, T_ptr(ctx, ap.ap_ty)), I32);
+ T->va_end_(T, REG_op(ap.ap_addr, T_ptr(ctx, ap.ap_ty)));
+
+ Reg s = T->alloc_reg(T, RC_INT, I32);
+ T->binop(T, BO_IADD, REG_op(s, I32), REG_op(a, I32), REG_op(b, I32));
+ T->binop(T, BO_IADD, REG_op(s, I32), REG_op(s, I32), REG_op(c, I32));
+ cgtest_ret_reg(tf, s, I32);
+ cgtest_end(tf);
+ return tf->sym;
+}
+
+/* j09_va_two_fixed — f(10, 15, 17) → 42. */
+void build_j09_va_two_fixed(CgTestCtx* ctx)
+{
+ const Type* I32 = T_i32(ctx);
+ ObjSymId f = j_build_j09_helper(ctx);
+
+ CgTestFn* tf = cgtest_begin_main(ctx, I32);
+ Reg dst = ctx->target->alloc_reg(ctx->target, RC_INT, I32);
+ const Type* atypes[] = { I32, I32, I32 };
+ CgTestArg args[] = {
+ { .kind = CGT_ARG_IMM, .type = I32, .v.imm = 10 },
+ { .kind = CGT_ARG_IMM, .type = I32, .v.imm = 15 },
+ { .kind = CGT_ARG_IMM, .type = I32, .v.imm = 17 },
+ };
+ j_call_va(tf, f, I32, atypes, args, 3, 2, REG_op(dst, I32));
+ cgtest_ret_reg(tf, dst, I32);
+ cgtest_end(tf);
+}
diff --git a/test/cg/harness/cases_k.c b/test/cg/harness/cases_k.c
@@ -0,0 +1,209 @@
+/* Group K — atomics.
+ * See CORPUS.md for the case list and expected values. */
+
+#include "cg_test.h"
+
+/* ============================================================
+ * Group K: atomics
+ *
+ * Drives atomic_load / atomic_store / atomic_rmw / atomic_cas / fence
+ * on CGTarget across every AtomicOp and several MemOrders. Every case
+ * uses an FSF_ADDR_TAKEN i32 (or i64 for k13) local as the atomic
+ * object: store-into via plain store sets the prior state, the atomic
+ * op is then dispatched against the address, and a plain load after
+ * reads the post-state for the oracle. The MF_ATOMIC flag rides along
+ * the MemAccess so the backend can route to ldar/stlr-class encodings.
+ * ============================================================ */
+
+/* Shared prelude: begins main, creates one addr-taken i32 local holding
+ * `init` (written with a plain store), and puts that local's address in a register. */
+typedef struct KCtx { CgTestFn* tf; FrameSlot x; Reg p_addr; } KCtx;
+
+static KCtx k_open_i32(CgTestCtx* ctx, i64 init)
+{
+    const Type* I32 = T_i32(ctx);
+    const Type* PI32 = T_ptr(ctx, I32);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    CGTarget* T = ctx->target;
+    FrameSlot slot = cgtest_local(fn, I32, FSF_ADDR_TAKEN);
+    cgtest_store_local(fn, slot, IMM_op(init, I32), I32);
+    Reg addr = T->alloc_reg(T, RC_INT, PI32);
+    T->addr_of(T, REG_op(addr, PI32), LOCAL_op(slot, I32));
+    return (KCtx){ .tf = fn, .x = slot, .p_addr = addr };
+}
+
+/* Access descriptor for the i32 cases: size 4, align 4, MF_ATOMIC, ALIAS_LOCAL. */
+static MemAccess k_ma32(CgTestCtx* ctx)
+{
+    return (MemAccess){
+        .type = T_i32(ctx),
+        .size = 4,
+        .align = 4,
+        .flags = MF_ATOMIC,
+        .alias.kind = ALIAS_LOCAL,
+    };
+}
+
+/* Epilogue for post-state cases: plain-load x, return it, and end the function. */
+static void k_close_load_x(KCtx* k)
+{
+    CgTestFn* fn = k->tf;
+    const Type* I32 = T_i32(fn->ctx);
+    Reg post = fn->ctx->target->alloc_reg(fn->ctx->target, RC_INT, I32);
+    cgtest_load_local(fn, REG_op(post, I32), k->x, I32);
+    cgtest_ret_reg(fn, post, I32);
+    cgtest_end(fn);
+}
+
+/* k01_atomic_load_relaxed — x starts at 42; return atomic_load(&x, RELAXED). */
+void build_k01_atomic_load_relaxed(CgTestCtx* ctx)
+{
+    KCtx k = k_open_i32(ctx, 42);
+    const Type* I32 = T_i32(ctx);
+    CGTarget* T = ctx->target;
+    Reg loaded = T->alloc_reg(T, RC_INT, I32);
+    T->atomic_load(T, REG_op(loaded, I32),
+                   REG_op(k.p_addr, T_ptr(ctx, I32)), k_ma32(ctx), MO_RELAXED);
+    cgtest_ret_reg(k.tf, loaded, I32);
+    cgtest_end(k.tf);
+}
+
+/* k02_atomic_store_load_acq — x starts at 0; atomic_store(&x, 42, RELEASE),
+ * then return atomic_load(&x, ACQUIRE). */
+void build_k02_atomic_store_load_acq(CgTestCtx* ctx)
+{
+    KCtx k = k_open_i32(ctx, 0);
+    const Type* I32 = T_i32(ctx);
+    const Type* PI32 = T_ptr(ctx, I32);
+    CGTarget* T = ctx->target;
+    T->atomic_store(T, REG_op(k.p_addr, PI32), IMM_op(42, I32), k_ma32(ctx), MO_RELEASE);
+    Reg loaded = T->alloc_reg(T, RC_INT, I32);
+    T->atomic_load(T, REG_op(loaded, I32), REG_op(k.p_addr, PI32),
+                   k_ma32(ctx), MO_ACQUIRE);
+    cgtest_ret_reg(k.tf, loaded, I32);
+    cgtest_end(k.tf);
+}
+
+/* k03_atomic_load_seq_cst — x starts at 42; return atomic_load(&x, SEQ_CST). */
+void build_k03_atomic_load_seq_cst(CgTestCtx* ctx)
+{
+    KCtx k = k_open_i32(ctx, 42);
+    const Type* I32 = T_i32(ctx);
+    CGTarget* T = ctx->target;
+    Reg loaded = T->alloc_reg(T, RC_INT, I32);
+    T->atomic_load(T, REG_op(loaded, I32),
+                   REG_op(k.p_addr, T_ptr(ctx, I32)), k_ma32(ctx), MO_SEQ_CST);
+    cgtest_ret_reg(k.tf, loaded, I32);
+    cgtest_end(k.tf);
+}
+
+/* Shared by k04..k10: one SEQ_CST rmw `op` with `val` on x=`init`; returns x reloaded afterwards. */
+static void k_rmw_post(CgTestCtx* ctx, AtomicOp op, i64 init, i64 val)
+{
+    KCtx k = k_open_i32(ctx, init);
+    const Type* I32 = T_i32(ctx);
+    CGTarget* T = ctx->target;
+    Reg old_val = T->alloc_reg(T, RC_INT, I32);   /* rmw result register; not returned here */
+    T->atomic_rmw(T, op, REG_op(old_val, I32), REG_op(k.p_addr, T_ptr(ctx, I32)),
+                  IMM_op(val, I32), k_ma32(ctx), MO_SEQ_CST);
+    k_close_load_x(&k);
+}
+
+void build_k04_atomic_rmw_add (CgTestCtx* ctx) { k_rmw_post(ctx, AO_ADD,  40,   2);    } /* init 40, operand 2 */
+void build_k05_atomic_rmw_xchg(CgTestCtx* ctx) { k_rmw_post(ctx, AO_XCHG, 99,   42);   } /* init 99, operand 42 */
+void build_k06_atomic_rmw_and (CgTestCtx* ctx) { k_rmw_post(ctx, AO_AND,  0xFF, 0x2A); } /* init 0xFF, operand 0x2A */
+void build_k07_atomic_rmw_or  (CgTestCtx* ctx) { k_rmw_post(ctx, AO_OR,   0x20, 0x0A); } /* init 0x20, operand 0x0A */
+void build_k08_atomic_rmw_xor (CgTestCtx* ctx) { k_rmw_post(ctx, AO_XOR,  0xFF, 0xD5); } /* init 0xFF, operand 0xD5 */
+void build_k09_atomic_rmw_sub (CgTestCtx* ctx) { k_rmw_post(ctx, AO_SUB,  44,   2);    } /* init 44, operand 2 */
+
+/* k10_atomic_rmw_nand — init 0xFF, operand 0xD5; low 8 bits of post-state: ~(0xFF & 0xD5) & 0xFF = 0x2A = 42. */
+void build_k10_atomic_rmw_nand(CgTestCtx* ctx) { k_rmw_post(ctx, AO_NAND, 0xFF, 0xD5); }
+
+/* k11_atomic_cas_success — x=10; cas(&x, exp=10, des=42); x is reloaded and returned (→ 42). */
+void build_k11_atomic_cas_success(CgTestCtx* ctx)
+{
+    KCtx k = k_open_i32(ctx, 10);
+    const Type* I32 = T_i32(ctx);
+    const Type* PI32 = T_ptr(ctx, I32);
+    CGTarget* T = ctx->target;
+    Reg old_val = T->alloc_reg(T, RC_INT, I32);
+    Reg success = T->alloc_reg(T, RC_INT, I32);
+    T->atomic_cas(T, REG_op(old_val, I32), REG_op(success, I32),
+                  REG_op(k.p_addr, PI32), IMM_op(10, I32), IMM_op(42, I32),
+                  k_ma32(ctx), MO_SEQ_CST, MO_RELAXED);
+    k_close_load_x(&k);
+}
+
+/* k12_atomic_cas_failure — x=10; cas(&x, exp=99, des=42); x is reloaded and returned (unchanged). */
+void build_k12_atomic_cas_failure(CgTestCtx* ctx)
+{
+    KCtx k = k_open_i32(ctx, 10);
+    const Type* I32 = T_i32(ctx);
+    const Type* PI32 = T_ptr(ctx, I32);
+    CGTarget* T = ctx->target;
+    Reg old_val = T->alloc_reg(T, RC_INT, I32);
+    Reg success = T->alloc_reg(T, RC_INT, I32);
+    T->atomic_cas(T, REG_op(old_val, I32), REG_op(success, I32),
+                  REG_op(k.p_addr, PI32), IMM_op(99, I32), IMM_op(42, I32),
+                  k_ma32(ctx), MO_SEQ_CST, MO_RELAXED);
+    k_close_load_x(&k);
+}
+
+/* k13_atomic_load_i64 — i64 atomic load of 0x1_0000_002A, truncated to i32 for the return (42). */
+void build_k13_atomic_load_i64(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    const Type* I64 = T_i64(ctx);
+    const Type* PI64 = T_ptr(ctx, I64);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    CGTarget* T = ctx->target;
+
+    FrameSlot slot = cgtest_local(fn, I64, FSF_ADDR_TAKEN);
+    /* The initial value goes through a 64-bit register before the plain store. */
+    Reg seed = T->alloc_reg(T, RC_INT, I64);
+    T->load_imm(T, REG_op(seed, I64), 0x10000002Aull);
+    cgtest_store_local(fn, slot, REG_op(seed, I64), I64);
+
+    Reg addr = T->alloc_reg(T, RC_INT, PI64);
+    T->addr_of(T, REG_op(addr, PI64), LOCAL_op(slot, I64));
+
+    MemAccess ma64 = { .type = I64, .size = 8, .align = 8,
+                       .flags = MF_ATOMIC, .alias.kind = ALIAS_LOCAL };
+    Reg wide = T->alloc_reg(T, RC_INT, I64);
+    T->atomic_load(T, REG_op(wide, I64), REG_op(addr, PI64), ma64, MO_SEQ_CST);
+
+    Reg narrow = T->alloc_reg(T, RC_INT, I32);
+    T->convert(T, CV_TRUNC, REG_op(narrow, I32), REG_op(wide, I64));
+    cgtest_ret_reg(fn, narrow, I32);
+    cgtest_end(fn);
+}
+
+/* k14_atomic_rmw_prior — x=40, rmw add 2; return the rmw's result register
+ * (the prior value, 40) instead of reloading x. */
+void build_k14_atomic_rmw_prior(CgTestCtx* ctx)
+{
+    KCtx k = k_open_i32(ctx, 40);
+    const Type* I32 = T_i32(ctx);
+    CGTarget* T = ctx->target;
+    Reg old_val = T->alloc_reg(T, RC_INT, I32);
+    T->atomic_rmw(T, AO_ADD, REG_op(old_val, I32), REG_op(k.p_addr, T_ptr(ctx, I32)),
+                  IMM_op(2, I32), k_ma32(ctx), MO_SEQ_CST);
+    cgtest_ret_reg(k.tf, old_val, I32);
+    cgtest_end(k.tf);
+}
+
+/* k15_fence_seq_cst — x=0; relaxed atomic store of 42, seq_cst fence, relaxed atomic load; returns the load. */
+void build_k15_fence_seq_cst(CgTestCtx* ctx)
+{
+    KCtx k = k_open_i32(ctx, 0);
+    const Type* I32 = T_i32(ctx);
+    CGTarget* T = ctx->target;
+    T->atomic_store(T, REG_op(k.p_addr, T_ptr(ctx, I32)),
+                    IMM_op(42, I32), k_ma32(ctx), MO_RELAXED);
+    T->fence(T, MO_SEQ_CST);   /* sits between the store and the load below */
+    Reg r = T->alloc_reg(T, RC_INT, I32);
+    T->atomic_load(T, REG_op(r, I32), REG_op(k.p_addr, T_ptr(ctx, I32)),
+                   k_ma32(ctx), MO_RELAXED);
+    cgtest_ret_reg(k.tf, r, I32);
+    cgtest_end(k.tf);
+}
diff --git a/test/cg/harness/cases_l.c b/test/cg/harness/cases_l.c
@@ -0,0 +1,416 @@
+/* Group L — intrinsics.
+ * See CORPUS.md for the case list and expected values. */
+
+#include "cg_test.h"
+
+/* ============================================================
+ * Group L: compiler intrinsics
+ *
+ * Drives CGTarget.intrinsic across every IntrinKind. Operand shapes
+ * follow arch.h's documentation:
+ * POPCOUNT/CTZ/CLZ/BSWAP* : dsts[0] REG, args[0] REG
+ * MEMCPY/MEMMOVE : args = (dst_addr, src_addr, n_bytes)
+ * MEMSET : args = (dst_addr, byte_value, n_bytes)
+ * PREFETCH : args = (addr)
+ * ASSUME_ALIGNED : dsts[0] REG, args = (ptr, align)
+ * EXPECT : dsts[0] REG, args = (val, expected)
+ * UNREACHABLE / TRAP : no dsts, no args
+ * *_OVERFLOW : dsts[0] result, dsts[1] i1 ovf; args = (a, b)
+ * ============================================================ */
+
+/* helper: load `imm` into an `arg_ty` register, run intrinsic `kind` on it, and return the i32 dst reg. */
+static Reg l_bitop(CgTestCtx* ctx, IntrinKind kind,
+                   const Type* arg_ty, i64 imm)
+{
+    const Type* I32 = T_i32(ctx);
+    CGTarget* T = ctx->target;
+    Reg src = T->alloc_reg(T, RC_INT, arg_ty);
+    T->load_imm(T, REG_op(src, arg_ty), imm);
+    Reg dst = T->alloc_reg(T, RC_INT, I32);   /* result is typed i32 whatever arg_ty is */
+    Operand dsts[1] = { REG_op(dst, I32) };
+    Operand args[1] = { REG_op(src, arg_ty) };
+    T->intrinsic(T, kind, dsts, 1, args, 1);
+    return dst;
+}
+
+/* l01_popcount_u32 — popcount of the u32 value 0xFF → 8. */
+void build_l01_popcount_u32(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    const Type* U32 = T_u32(ctx);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    Reg bits_set = l_bitop(ctx, INTRIN_POPCOUNT, U32, 0xFF);
+    cgtest_ret_reg(fn, bits_set, I32);
+    cgtest_end(fn);
+}
+
+/* l02_popcount_u64 — popcount of the u64 loaded from immediate -1 → 64. */
+void build_l02_popcount_u64(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    const Type* U64 = T_u64(ctx);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    Reg bits_set = l_bitop(ctx, INTRIN_POPCOUNT, U64, -1);
+    cgtest_ret_reg(fn, bits_set, I32);
+    cgtest_end(fn);
+}
+
+/* l03_ctz_u32 — count trailing zeros of the u32 value 0x80 → 7. */
+void build_l03_ctz_u32(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    const Type* U32 = T_u32(ctx);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    Reg trailing = l_bitop(ctx, INTRIN_CTZ, U32, 0x80);
+    cgtest_ret_reg(fn, trailing, I32);
+    cgtest_end(fn);
+}
+
+/* l04_clz_u32 — count leading zeros of the u32 value 0xFF (32-bit width) → 24. */
+void build_l04_clz_u32(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    const Type* U32 = T_u32(ctx);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    Reg leading = l_bitop(ctx, INTRIN_CLZ, U32, 0xFF);
+    cgtest_ret_reg(fn, leading, I32);
+    cgtest_end(fn);
+}
+
+/* l05_bswap16 — byte-swap the u16 value 0x1234 → 0x3412; its low 8 bits are 0x12 = 18. */
+void build_l05_bswap16(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    const Type* U16 = T_u16(ctx);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    Reg swapped = l_bitop(ctx, INTRIN_BSWAP16, U16, 0x1234);
+    cgtest_ret_reg(fn, swapped, I32);
+    cgtest_end(fn);
+}
+
+/* l06_bswap32 — byte-swap the u32 value 0x11223344 → 0x44332211; its low 8 bits are 0x11 = 17. */
+void build_l06_bswap32(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    const Type* U32 = T_u32(ctx);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    Reg swapped = l_bitop(ctx, INTRIN_BSWAP32, U32, 0x11223344);
+    cgtest_ret_reg(fn, swapped, I32);
+    cgtest_end(fn);
+}
+
+/* l07_bswap64 — bswap64(0x1122334455667788) → 0x8877665544332211, truncated to i32; low 8 = 17. */
+void build_l07_bswap64(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    const Type* U64 = T_u64(ctx);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    CGTarget* T = ctx->target;
+
+    Reg input = T->alloc_reg(T, RC_INT, U64);
+    T->load_imm(T, REG_op(input, U64), 0x1122334455667788ll);
+    Reg swapped = T->alloc_reg(T, RC_INT, U64);
+    Operand outs[1] = { REG_op(swapped, U64) };
+    Operand ins[1] = { REG_op(input, U64) };
+    T->intrinsic(T, INTRIN_BSWAP64, outs, 1, ins, 1);
+
+    Reg narrow = T->alloc_reg(T, RC_INT, I32);
+    T->convert(T, CV_TRUNC, REG_op(narrow, I32), REG_op(swapped, U64));
+    cgtest_ret_reg(fn, narrow, I32);
+    cgtest_end(fn);
+}
+
+/* l08_memcpy_4 — two addr-taken i32 locals; src=42; memcpy(&dst,&src,4); return dst. */
+void build_l08_memcpy_4(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    const Type* I64 = T_i64(ctx);
+    const Type* PI32 = T_ptr(ctx, I32);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    CGTarget* T = ctx->target;
+
+    FrameSlot from = cgtest_local(fn, I32, FSF_ADDR_TAKEN);
+    FrameSlot to = cgtest_local(fn, I32, FSF_ADDR_TAKEN);
+    cgtest_store_local(fn, from, IMM_op(42, I32), I32);
+
+    Reg from_addr = T->alloc_reg(T, RC_INT, PI32);
+    Reg to_addr = T->alloc_reg(T, RC_INT, PI32);
+    T->addr_of(T, REG_op(from_addr, PI32), LOCAL_op(from, I32));
+    T->addr_of(T, REG_op(to_addr, PI32), LOCAL_op(to, I32));
+
+    Operand cp_args[3] = {
+        REG_op(to_addr, PI32),     /* destination first */
+        REG_op(from_addr, PI32),
+        IMM_op(4, I64),
+    };
+    T->intrinsic(T, INTRIN_MEMCPY, NULL, 0, cp_args, 3);
+
+    Reg copied = T->alloc_reg(T, RC_INT, I32);
+    cgtest_load_local(fn, REG_op(copied, I32), to, I32);
+    cgtest_ret_reg(fn, copied, I32);
+    cgtest_end(fn);
+}
+
+/* l09_memmove_overlap — int a[5]={1..5}; memmove(a+1,a,16); return a[4] → 4. */
+void build_l09_memmove_overlap(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    const Type* I64 = T_i64(ctx);
+    const Type* PI32 = T_ptr(ctx, I32);
+    const Type* PU8 = T_ptr(ctx, T_u8(ctx));
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    CGTarget* T = ctx->target;
+
+    /* Backing store for a[5]: a 20-byte alloca with 4-byte alignment. */
+    Reg base = T->alloc_reg(T, RC_INT, PI32);
+    T->alloca_(T, REG_op(base, PI32), IMM_op(20, I64), 4);
+
+    MemAccess elem = { .type = I32, .size = 4, .align = 4,
+                       .alias.kind = ALIAS_LOCAL };
+    for (int idx = 0; idx < 5; ++idx) {
+        T->store(T, IND_op(base, (i32)(idx*4), I32), IMM_op(idx+1, I32), elem);
+    }
+
+    /* Destination is base + 4 bytes (the address of a[1]), formed with an integer add. */
+    Reg shifted = T->alloc_reg(T, RC_INT, PU8);
+    T->binop(T, BO_IADD, REG_op(shifted, PU8), REG_op(base, PI32), IMM_op(4, I64));
+
+    Operand mm_args[3] = {
+        REG_op(shifted, PU8),
+        REG_op(base, PI32),
+        IMM_op(16, I64),
+    };
+    T->intrinsic(T, INTRIN_MEMMOVE, NULL, 0, mm_args, 3);
+
+    /* a[4] sits at byte offset 16 from base and receives the old a[3] (= 4). */
+    Reg last = T->alloc_reg(T, RC_INT, I32);
+    T->load(T, REG_op(last, I32), IND_op(base, 16, I32), elem);
+    cgtest_ret_reg(fn, last, I32);
+    cgtest_end(fn);
+}
+
+/* l10_memset_zero — int b[4]; memset(b,0,16); return b[2] → 0. */
+void build_l10_memset_zero(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    const Type* I64 = T_i64(ctx);
+    const Type* U8 = T_u8(ctx);
+    const Type* PI32 = T_ptr(ctx, I32);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    CGTarget* T = ctx->target;
+
+    Reg base = T->alloc_reg(T, RC_INT, PI32);
+    T->alloca_(T, REG_op(base, PI32), IMM_op(16, I64), 4);
+
+    /* Fill all four elements with 0xDEAD first so the zeroing is visible. */
+    MemAccess elem = { .type = I32, .size = 4, .align = 4, .alias.kind = ALIAS_LOCAL };
+    for (int idx = 0; idx < 4; ++idx) {
+        T->store(T, IND_op(base, (i32)(idx*4), I32), IMM_op(0xDEAD, I32), elem);
+    }
+
+    Operand ms_args[3] = {
+        REG_op(base, PI32),
+        IMM_op(0, U8),
+        IMM_op(16, I64),
+    };
+    T->intrinsic(T, INTRIN_MEMSET, NULL, 0, ms_args, 3);
+    Reg third = T->alloc_reg(T, RC_INT, I32);   /* b[2] lives at byte offset 8 */
+    T->load(T, REG_op(third, I32), IND_op(base, 8, I32), elem);
+    cgtest_ret_reg(fn, third, I32);
+    cgtest_end(fn);
+}
+
+/* l11_memset_ff — int b; memset(&b,0xFF,4); reload b → 0xFFFFFFFF; low 8 bits = 255. */
+void build_l11_memset_ff(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    const Type* I64 = T_i64(ctx);
+    const Type* U8 = T_u8(ctx);
+    const Type* PI32 = T_ptr(ctx, I32);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    CGTarget* T = ctx->target;
+
+    FrameSlot word = cgtest_local(fn, I32, FSF_ADDR_TAKEN);
+    Reg word_addr = T->alloc_reg(T, RC_INT, PI32);
+    T->addr_of(T, REG_op(word_addr, PI32), LOCAL_op(word, I32));
+
+    Operand ms_args[3] = {
+        REG_op(word_addr, PI32),
+        IMM_op(0xFF, U8),          /* fill byte */
+        IMM_op(4, I64),
+    };
+    T->intrinsic(T, INTRIN_MEMSET, NULL, 0, ms_args, 3);
+
+    Reg readback = T->alloc_reg(T, RC_INT, I32);
+    cgtest_load_local(fn, REG_op(readback, I32), word, I32);
+    cgtest_ret_reg(fn, readback, I32);
+    cgtest_end(fn);
+}
+
+/* l12_expect_taken — x = expect(1, 1); branch on x == 0 to the 99 return, otherwise return 42. */
+void build_l12_expect_taken(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    CGTarget* T = ctx->target;
+
+    Reg flag = T->alloc_reg(T, RC_INT, I32);
+    T->load_imm(T, REG_op(flag, I32), 1);
+
+    Reg hinted = T->alloc_reg(T, RC_INT, I32);
+    Operand outs[1] = { REG_op(hinted, I32) };
+    Operand ins[2] = { REG_op(flag, I32), IMM_op(1, I32) };
+    T->intrinsic(T, INTRIN_EXPECT, outs, 1, ins, 2);
+
+    Label not_taken = T->label_new(T);
+    T->cmp_branch(T, CMP_EQ, REG_op(hinted, I32), IMM_op(0, I32), not_taken);
+    cgtest_ret_imm(fn, 42, I32);
+    T->label_place(T, not_taken);
+    cgtest_ret_imm(fn, 99, I32);
+    cgtest_end(fn);
+}
+
+/* l13_unreachable_live — x=1; if x == 0 jump to an UNREACHABLE intrinsic, else return 42. */
+void build_l13_unreachable_live(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    CGTarget* T = ctx->target;
+
+    Reg flag = T->alloc_reg(T, RC_INT, I32);
+    T->load_imm(T, REG_op(flag, I32), 1);
+
+    Label never = T->label_new(T);
+    T->cmp_branch(T, CMP_EQ, REG_op(flag, I32), IMM_op(0, I32), never);
+    cgtest_ret_imm(fn, 42, I32);
+
+    T->label_place(T, never);
+    T->intrinsic(T, INTRIN_UNREACHABLE, NULL, 0, NULL, 0);
+    cgtest_end(fn);
+}
+
+/* l14_trap_live — x=1; if x == 0 jump to a TRAP intrinsic, else return 42. */
+void build_l14_trap_live(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    CGTarget* T = ctx->target;
+
+    Reg flag = T->alloc_reg(T, RC_INT, I32);
+    T->load_imm(T, REG_op(flag, I32), 1);
+
+    Label trap_at = T->label_new(T);
+    T->cmp_branch(T, CMP_EQ, REG_op(flag, I32), IMM_op(0, I32), trap_at);
+    cgtest_ret_imm(fn, 42, I32);
+
+    T->label_place(T, trap_at);
+    T->intrinsic(T, INTRIN_TRAP, NULL, 0, NULL, 0);
+    cgtest_end(fn);
+}
+
+/* l15_prefetch_noop — prefetch(&x); then plain-store 42 to x, reload x, return it. */
+void build_l15_prefetch_noop(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    const Type* PI32 = T_ptr(ctx, I32);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    CGTarget* T = ctx->target;
+
+    FrameSlot slot = cgtest_local(fn, I32, FSF_ADDR_TAKEN);
+    Reg addr = T->alloc_reg(T, RC_INT, PI32);
+    T->addr_of(T, REG_op(addr, PI32), LOCAL_op(slot, I32));
+
+    Operand hint_args[1] = { REG_op(addr, PI32) };
+    T->intrinsic(T, INTRIN_PREFETCH, NULL, 0, hint_args, 1);
+
+    cgtest_store_local(fn, slot, IMM_op(42, I32), I32);
+    Reg readback = T->alloc_reg(T, RC_INT, I32);
+    cgtest_load_local(fn, REG_op(readback, I32), slot, I32);
+    cgtest_ret_reg(fn, readback, I32);
+    cgtest_end(fn);
+}
+
+/* l16_assume_aligned — p2 = assume_aligned(p, 8) on an 8-byte alloca; *p2 = 42; return *p2. */
+void build_l16_assume_aligned(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    const Type* I64 = T_i64(ctx);
+    const Type* PI32 = T_ptr(ctx, I32);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    CGTarget* T = ctx->target;
+
+    Reg raw = T->alloc_reg(T, RC_INT, PI32);
+    T->alloca_(T, REG_op(raw, PI32), IMM_op(8, I64), 8);
+
+    Reg aligned = T->alloc_reg(T, RC_INT, PI32);
+    Operand outs[1] = { REG_op(aligned, PI32) };
+    Operand ins[2] = { REG_op(raw, PI32), IMM_op(8, I32) };
+    T->intrinsic(T, INTRIN_ASSUME_ALIGNED, outs, 1, ins, 2);
+
+    MemAccess word = { .type = I32, .size = 4, .align = 8,
+                       .alias.kind = ALIAS_LOCAL };
+    T->store(T, IND_op(aligned, 0, I32), IMM_op(42, I32), word);
+    Reg readback = T->alloc_reg(T, RC_INT, I32);
+    T->load(T, REG_op(readback, I32), IND_op(aligned, 0, I32), word);
+    cgtest_ret_reg(fn, readback, I32);
+    cgtest_end(fn);
+}
+
+/* Helper: run checked-arith intrinsic `kind` on i32 registers loaded with a and b;
+ * returns the value register when `which` is 0, the overflow register otherwise. */
+static Reg l_chkarith(CgTestCtx* ctx, IntrinKind kind, i64 a, i64 b,
+                      int which /*0=value,1=ovf*/)
+{
+    const Type* I32 = T_i32(ctx);
+    CGTarget* T = ctx->target;
+    Reg lhs = T->alloc_reg(T, RC_INT, I32);
+    Reg rhs = T->alloc_reg(T, RC_INT, I32);
+    T->load_imm(T, REG_op(lhs, I32), a);
+    T->load_imm(T, REG_op(rhs, I32), b);
+    Reg value_reg = T->alloc_reg(T, RC_INT, I32);
+    Reg ovf_reg = T->alloc_reg(T, RC_INT, I32);   /* overflow flag is carried in an i32 reg here */
+    Operand outs[2] = { REG_op(value_reg, I32), REG_op(ovf_reg, I32) };
+    Operand ins[2] = { REG_op(lhs, I32), REG_op(rhs, I32) };
+    T->intrinsic(T, kind, outs, 2, ins, 2);
+    return which ? ovf_reg : value_reg;
+}
+
+/* l17_add_overflow_no — add_overflow(20,22): returns the value output (42); ovf expected 0. */
+void build_l17_add_overflow_no(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    Reg sum = l_chkarith(ctx, INTRIN_ADD_OVERFLOW, 20, 22, 0);
+    cgtest_ret_reg(fn, sum, I32);
+    cgtest_end(fn);
+}
+
+/* l18_add_overflow_yes — add_overflow(INT_MAX,1): returns the overflow output (expected 1). */
+void build_l18_add_overflow_yes(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    Reg overflowed = l_chkarith(ctx, INTRIN_ADD_OVERFLOW, 0x7FFFFFFF, 1, 1);
+    cgtest_ret_reg(fn, overflowed, I32);
+    cgtest_end(fn);
+}
+
+/* l19_sub_overflow_yes — sub_overflow(INT_MIN,1): returns the overflow output (expected 1). */
+void build_l19_sub_overflow_yes(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    CgTestFn* tf = cgtest_begin_main(ctx, I32);
+    Reg r = l_chkarith(ctx, INTRIN_SUB_OVERFLOW, -(i64)0x80000000, 1, 1);   /* INT_MIN, negated in i64: no narrowing cast */
+    cgtest_ret_reg(tf, r, I32);
+    cgtest_end(tf);
+}
+
+/* l20_mul_overflow_no — mul_overflow(6,7): returns the value output (42); ovf expected 0. */
+void build_l20_mul_overflow_no(CgTestCtx* ctx)
+{
+    const Type* I32 = T_i32(ctx);
+    CgTestFn* fn = cgtest_begin_main(ctx, I32);
+    Reg product = l_chkarith(ctx, INTRIN_MUL_OVERFLOW, 6, 7, 0);
+    cgtest_ret_reg(fn, product, I32);
+    cgtest_end(fn);
+}
diff --git a/test/cg/run.sh b/test/cg/run.sh
@@ -142,6 +142,9 @@ if $CC $CFREE_CFLAGS \
"$TEST_DIR/harness/cases_g.c" \
"$TEST_DIR/harness/cases_h.c" \
"$TEST_DIR/harness/cases_i.c" \
+ "$TEST_DIR/harness/cases_j.c" \
+ "$TEST_DIR/harness/cases_k.c" \
+ "$TEST_DIR/harness/cases_l.c" \
"$LIB_AR" -o "$CG_RUNNER" 2>"$BUILD_DIR/cg-runner.err"; then
printf ' %s cg-runner\n' "$(color_grn built)"
else