commit 49ddbdeb2d484ac81bf4da4703f4b33f6f24de4a
parent d6525a7acdd2895269b80ec6084546262d3e2a34
Author: Ryan Sepassi <rsepassi@gmail.com>
Date: Fri, 29 May 2026 10:48:45 -0700
Add threaded-bytecode interpreter for the opt IR (cfree run --no-jit)
Implements a third execution path alongside the native backend and the
emulator: an interpreter that runs the optimizer IR directly, for
environments that cannot JIT. See doc/INTERPRETER.md.
Pipeline: opt_run_o1_interp runs the maximal target-independent subset of
the O1 path (stops before machinize/regalloc/MIR), and src/interp lowers the
resulting Func into fixed-width bytecode and runs it over an explicit,
swappable InterpStack (CALL pushes / RET pops / tail calls replace the
frame — never host C recursion). Dispatch is a portable switch (the
InterpInsn.handler slot is reserved for direct threading later).
cfree run --no-jit forces -O1, captures each function into an InterpProgram
via an interp sink on the Compiler, and executes the entry through the
engine with NO JIT execution fallback. The normal object/JIT-link still runs
so data globals / externs / function pointers resolve (host-identity:
abstract addresses are real host pointers; globals resolve via the JIT
image's symbol table + dlsym). Wasm entries get their instance/linear-memory
set up and run init+entry through the interpreter.
Coverage: imm/copy/load/store/addr-of (incl. memory-dest results),
binop/unop/cmp/convert (saturating ftoi), all branches + switch + indirect,
internal/external/indirect calls, scalar+sret returns, alloca, agg-copy/set,
atomics (single-thread serialized), and common intrinsics. Inline asm,
IR_LOCAL_STATIC_DATA_* (static locals / switch jump tables), va_arg,
bitfields, TLS, and stack-routed/f32/multi-reg-return external calls are
diagnosed (interp: ... not supported) rather than miscompiled. Emu
interpret-mode (Phase 4) is not yet wired.
New: include/cfree/interp.h, src/interp/interp.h,
src/interp/{interp_program,lower,engine,ffi,interp_stubs}.c,
CFREE_INTERP_ENABLED (config.h + mk/config.mk + config_assert.c + Makefile
gating + stub).
Tests: test/interp/interp_smoke_test.c (make test-interp); toy `I` path and
wasm `N` path (make test-interp-toy; defaults now include them). Differential
vs the JIT: toy 136/156 match exactly (0 diffs; rest cleanly SKIP), wasm
41/42 .wat, C incl. non-variadic libc FFI; full toy suite 1306 pass / 0 fail.
interp: complete non-emu features + harden FFI/TLS
Finish the threaded-bytecode interpreter for the optimizer IR so
`cfree run --no-jit` covers the full non-emu surface (validated
differentially against the JIT: toy I-path 300 pass / 0 fail, only the
inline-asm cases SKIP by design).
- Direct-threaded dispatch (computed goto) is the default, gated by
CFREE_INTERP_THREADED (config.h); shares one set of handler bodies with
a portable switch fallback via OP()/NEXT()/GO(). Auto-falls back to the
switch for the cfree self-host (no labels-as-values) and non-GNU
compilers.
- Static local data + dense -O1 switch jump tables: materialize each
IR_LOCAL_STATIC_DATA_* blob into an interp-private buffer, with
label-address entries holding bytecode pcs (not native code addresses).
- Variadics (interpreter-private va_list), bitfield load/store, and
thread-local addressing (IR_TLS_ADDR_OF via a host resolve_tls hook +
cfree_jit_tlv_resolve, which unwraps our own Mach-O TLV descriptors).
- True O(1) tail calls: relocate the callee frame onto the dead caller
and rewind the arenas (validated 5M deep).
- Expanded FFI: f32-by-value args and 1/2-register struct returns
(bit-exact vs JIT).
Adversarial-review fixes:
- ext_call: reject (not overflow) an aggregate/>8B scalar variadic-tail
arg; deliver a single-register return into a memory destination instead
of dropping it; forward fr->sret_ptr for an sret external tail call.
- branch handlers observe the g_mem_fault latch before acting on a
faulted selector.
- TLS: diagnose (not miscompile) non-Mach-O / foreign-descriptor cases;
cfree_jit_tlv_resolve verifies a descriptor is ours before calling
through it.
Adds test/toy/cases/141_threadlocal_mutate (R/I/C/W; L skipped via a new
<name>.link.skip hook, since defining a _Thread_local in a linked
executable needs PIE/crt TLS work).
Diffstat:
25 files changed, 4033 insertions(+), 50 deletions(-)
diff --git a/Makefile b/Makefile
@@ -141,6 +141,9 @@ ifneq ($(CFREE_EMU_ENABLED),1)
LIB_SRCS_ARCH_RV64 := $(filter-out %/emu.c,$(LIB_SRCS_ARCH_RV64))
LIB_SRCS_NONARCH += src/arch/emu_stubs.c
endif
+ifneq ($(CFREE_INTERP_ENABLED),1)
+LIB_SRCS_NONARCH += src/interp/interp_stubs.c
+endif
LIB_SRCS_OBJ_ELF = $(shell find src/obj/elf -name '*.c' 2>/dev/null)
LIB_SRCS_OBJ_MACHO = $(shell find src/obj/macho -name '*.c' 2>/dev/null)
@@ -162,6 +165,7 @@ LIB_SRCS_NONARCH += src/obj/archive_stubs.c
endif
LIB_SRCS_OPT = $(filter-out src/opt/pass_o2.c,$(shell find src/opt -name '*.c' 2>/dev/null))
+LIB_SRCS_INTERP = $(filter-out %/interp_stubs.c,$(shell find src/interp -name '*.c' 2>/dev/null))
LIB_SRCS_WASM_CORE = $(shell find src/wasm -name '*.c' 2>/dev/null)
LIB_SRCS_API_AR = src/api/archive.c
LIB_SRCS_API_DISASM = src/api/disasm.c
@@ -188,6 +192,9 @@ LIB_SRCS = $(LIB_SRCS_NONARCH)
ifeq ($(CFREE_OPT_ENABLED),1)
LIB_SRCS += $(LIB_SRCS_OPT)
endif
+ifeq ($(CFREE_INTERP_ENABLED),1)
+LIB_SRCS += $(LIB_SRCS_INTERP)
+endif
ifeq ($(CFREE_AR_ENABLED),1)
LIB_SRCS += $(LIB_SRCS_API_AR)
endif
diff --git a/doc/INTERPRETER.md b/doc/INTERPRETER.md
@@ -0,0 +1,374 @@
+# Threaded Bytecode Interpreter for the cfree IR
+
+## Context
+
+cfree currently has two ways to *run* IR: the native backend (lowers opt IR → machine
+code) and the emulator, which dynamic-binary-translates a guest ISA by decoding →
+lifting to CG IR → optimizing → **JIT**ing to host code (`src/emu/emu.c:380-505`).
+Both require allocating executable memory and emitting native instructions.
+
+We want a third execution path: a **threaded interpreter** that runs cfree IR directly,
+for environments that cannot JIT (no W^X-exempt mmap, no codegen for the host arch, etc.).
+It must serve two configurations behind one engine:
+
+1. **Host-identity** — run compiled C directly; abstract addresses *are* real host
+ pointers; external calls go to real linked symbols.
+2. **Emu/guest** — replace the emu's JIT step; pointers are guest VAs translated
+ through the existing `EmuAddrSpace`.
+
+Design decisions (confirmed with the user):
+- Consume the **optimizer IR** (`src/opt/ir.h` `Func`/`Block`/`Inst`), tapped on the
+ **O1 PReg path** — *after* target-independent passes, *before* register allocation /
+ MIR lowering. At this point `f->opt_reg_ssa == 0`, `OPK_REG` operands carry virtual
+ **PReg** ids, and there are no `IR_PHI` nodes. This gives an unbounded virtual-register
+ machine and lets us reuse opt's optimizations before interpreting. (O2/SSA is
+ deprecated/disabled — ignored.)
+- New **fixed-width, cache-friendly bytecode** with **direct threading** (labels-as-values
+ / computed goto): each record's first word is a pre-resolved `&&handler`.
+- **Explicit, first-class call stack.** The interpreter must NOT use host C recursion for
+ IR-level calls. The whole runtime is parameterized by an `InterpStack` (an explicit
+ heap/region-backed frame stack) so execution can be suspended and swapped between stacks
+ — the substrate for "virtual threads"/fibers. No scheduling *policy* is specified yet;
+ the requirement is that the *mechanism* be explicit and swap-ready from day one.
+
+## Tap point: `opt_run_o1_interp`
+
+`opt_run_o1_native` (`src/opt/opt.c:84-224`) is the model. We add a sibling
+`opt_run_o1_interp(Compiler*, const CgIrFunc*) -> Func*` (declared in `src/opt/opt.h`)
+that runs the **maximal target-independent subset** and stops before regalloc:
+
+Run: `opt_func_from_cg_ir` → `opt_build_cfg` → `opt_jump_cleanup(CFG)` → `opt_build_cfg`
+→ `opt_simplify_local` → `opt_try_tiny_inline` (+ rebuild idiom from `opt.c:119-121`)
+→ `opt_addr_xform_pregs` → `opt_promote_scalar_locals` → `opt_addr_of_global_cse`
+→ `opt_build_loop_tree` → `opt_live_blocks` + `opt_dead_def_elim_with_live` → **STOP**.
+
+Skip: `opt_machinize_native` (only captures phys-reg pools + resolves inline-asm
+constraints — irrelevant to a virtual-register interpreter), `opt_lower_loop_imm_operands`
++ `opt_hoist_loop_consts` (pure pessimization for an interpreter, which takes immediates
+directly; the first early-returns on `target==NULL` anyway), `opt_regalloc_locations`,
+`opt_lower_to_mir`, and all MIR passes. Leave `opt_reg_ssa==0`, `opt_rewritten==0`.
+
+**Phase-0 verification risk:** `opt_addr_xform_pregs`/`opt_promote_scalar_locals`/
+`opt_addr_of_global_cse` run *after* machinize in the native pipeline; confirm they have
+no dependency on machinize side-effects (expected: they are local/escape analysis only).
+Also confirm the `opt_verify` debug calls don't assert on `opt_has_target`; if they do,
+omit them from the interp entry (debug-only).
+
+## Control flow & operand model at the tap point (verified)
+
+- **Scopes are CFG no-ops.** `IR_SCOPE_BEGIN/ELSE/END` are treated as `return` by
+ `pass_native_emit.c:751`; control flow is driven entirely by `Block.succ[]` +
+ terminators. `IR_BREAK_TO`/`IR_CONTINUE_TO` carry their destination in `succ[0]`
+ (`pass_cfg.c:10-14`) → lower as `OP_BR succ[0]`. Every reachable block has explicit
+ succ edges (fallthrough materialized in `cg_ir_lower.c:1062-1095`), so the loader can
+ resolve all branches to bytecode pcs.
+- Terminators: `IR_BR`(succ0), `IR_CONDBR`(succ0=true/succ1=false), `IR_CMP_BRANCH`
+ (`extra.imm`=CmpOp, succ0=taken/succ1=fall), `IR_SWITCH` (`IRSwitchAux` cases+default —
+ interpret directly), `IR_INDIRECT_BRANCH` (`IRIndirectAux.targets[]`), `IR_RET`.
+- `OptOperand` (`ir.h:65-86`): `OPK_REG`→PReg id (`v.reg`); `OPK_IMM`→`v.imm`;
+ `OPK_LOCAL`→`v.frame_slot`; `OPK_GLOBAL`→`v.global.{sym,addend}`;
+ `OPK_INDIRECT`→`v.ind.{base,index,log2_scale,ofs}`. PRegs are `1..f->npregs`, typed by
+ `f->preg_type[]`/`f->preg_cls[]` (`RC_INT`/`RC_FP`/`RC_VEC`).
+- `IR_CALL` carries `IRCallAux` with the **semantic, un-ABI-lowered** `CGCallDesc`
+ (`desc.callee` Operand, `desc.args[]`/`desc.ret` as `CGABIValue`, `desc.abi`=`ABIFuncInfo*`).
+
+## New files & build wiring
+
+New dir `src/interp/`:
+- `interp.h` — `InterpProgram`/`InterpFunc`/`InterpFrame`/`InterpMem`, entry functions.
+- `bytecode.h` — `InterpInsn` record layout, opcode enum, side-table structs.
+- `lower.c` — `Func` (post `opt_run_o1_interp`) → `InterpFunc` (loader + threader).
+- `engine.c` — computed-goto dispatch loop + handlers; exports the dispatch table.
+- `mem.c` — `InterpMem` implementations (host-identity, emu-addr-space).
+- `ffi.c` — external-call marshaller + bounded thunk family.
+- `interp_program.c` — program lifecycle, sym→`InterpFunc` table, extern resolver.
+
+Public header `include/cfree/interp.h` (parallel to `include/cfree/jit.h`):
+`cfree_interp_program_new/_free`, `_add_func`, `_lookup`, `CfreeInterpMemVtable`, and the
+explicit-stack API: `cfree_interp_stack_new/_free`, `cfree_interp_call_on`
+(seed a stack with an entry frame + args) and `cfree_interp_resume` (run/resume a stack
+until it returns/traps/blocks). `_call` is a convenience wrapper that allocates a stack,
+seeds it, and resumes to completion.
+
+`include/cfree/config.h`: add `CFREE_INTERP_ENABLED` (near `CFREE_JIT_ENABLED`); requires
+`CFREE_OPT_ENABLED`. `Makefile`: mirror `LIB_SRCS_OPT` (`Makefile:164,188-190`) with
+`LIB_SRCS_INTERP` (every `src/interp/*.c` except `interp_stubs.c`) gated on the flag; add
+`interp_stubs.c` (mirror `disasm_stubs.c`, `Makefile:126`), built only when the flag is off.
+
+## Bytecode format
+
+Fixed-width record (target ~32 bytes), direct-threaded:
+```
+struct InterpInsn {
+ void* handler; // pre-resolved &&label (set at load); word 0 for computed goto
+ u16 opcode; // for dump/debug + non-GNU fallback
+ u16 flags; // BinOp/CmpOp/ConvKind tag, width, signedness fast bits
+ u32 dst; // dest PReg id
+ u32 a, b; // src PReg ids / pcs / side-table indices (opcode-specific)
+ i64 aux; // inline immediate or aux-table index
+};
+```
+Opcode set: one family per IROp, width/signedness-specialized where it speeds dispatch
+(`OP_BINOP_I32/I64/F32/F64` with BinOp in `flags` initially; specialize hot add/sub/mul
+later). `OP_LOAD_{8,16,32,64,F32,F64}` / `OP_STORE_*`, `OP_LOAD_IMM`, `OP_LOAD_CONST`,
+`OP_COPY`, `OP_ADDR_LOCAL`, `OP_ADDR_GLOBAL`, `OP_TLS_ADDR`, `OP_UNOP_*`, `OP_CMP_*`,
+`OP_CONVERT_*`, `OP_BR`, `OP_CONDBR`, `OP_CMP_BRANCH_*`, `OP_SWITCH`, `OP_INDIRECT_BR`,
+`OP_CALL_INTERNAL`, `OP_CALL_EXTERNAL`, `OP_RET[_VOID]`, `OP_ALLOCA`, `OP_AGG_COPY/SET`,
+`OP_BITFIELD_*`, `OP_VA_*`, `OP_ATOMIC_*`, `OP_FENCE`, `OP_INTRINSIC`, `OP_TRAP`, `OP_NOP`.
+
+## Explicit stack & frame model
+
+The runtime is parameterized by an explicit call stack so it can be suspended and swapped
+(virtual threads). Execution state lives in data structures, never in the host C stack.
+
+```
+typedef struct InterpFrame { // one IR-level activation
+ InterpFunc* fn;
+ InterpInsn* ip; // resume point (saved across calls/yields)
+ u64* regs; // register file slab [fn->npregs], owned by the stack
+ u32 mem_off; // offset of this frame's addressable bytes in the stack
+ u32 frame_bytes, alloca_top;
+ u8* sret_ptr; // aggregate-return destination, if any
+ u32 ret_dst; // caller PReg/slot to receive scalar return (resolved on push)
+} InterpFrame;
+
+typedef struct InterpStack { // a swappable execution context / fiber
+ InterpFrame* frames; u32 nframes, frames_cap; // explicit call stack (grows on CALL)
+ u8* regs_arena; u32 regs_top, regs_cap; // bump region for per-frame register files
+ u8* mem_arena; u32 mem_top, mem_cap; // addressable frame bytes (host-identity mode)
+ u64 guest_sp_base; // emu mode: frames carve guest stack instead
+ u64 scalar_ret; u8 ret_is_fp; // return shuttle between frames
+ u8 status; // RUNNING/DONE/TRAP/BLOCKED
+} InterpStack;
+```
+
+- **CALL pushes** a new `InterpFrame` onto `stack->frames` (bump-allocating its register
+ file from `regs_arena` and its addressable bytes from `mem_arena`/guest stack); **RET
+ pops** and writes the return into the caller frame's `ret_dst`. The dispatch loop runs
+ the *top* frame; it never calls itself for IR-level calls.
+- The engine entry is `interp_resume(InterpProgram*, InterpStack*) -> InterpRunStatus`.
+ It runs until the stack empties (DONE), traps, or blocks. **Swapping virtual threads =
+ calling `interp_resume` with a different `InterpStack`.** A suspension/yield point just
+ saves `ip` into the top frame and returns a status; resuming re-enters the loop on the
+ same stack. (External/native calls are the one place the host C stack is unavoidably used,
+ for the duration of the call — noted as a non-suspendable region.)
+- Memory-model split stays clean: `InterpStack` holds **control state** (frames, ips,
+ register files); **addressable bytes** come from `mem_arena` (host-identity) or the guest
+ address space (emu mode, frames carve `guest_sp_base`). The register file is always
+ host-side, so it never needs translation.
+- The host C stack stays O(1) regardless of IR call depth; deep IR recursion grows
+ `stack->frames`, not the host stack.
+
+Per-`InterpFunc` side tables: constant pool (wide imms + `ConstBytes`), call-target table
+`{internal InterpFunc* | external host_fp, CGCallDesc*, ABIFuncInfo*}`, switch tables
+(case→pc, default→pc resolved), `slot_off[nframe_slots]`+`frame_bytes`, and a transient
+`block_pc[nblocks]` used during branch fixup.
+
+## Lowering algorithm (`Func` → `InterpFunc`)
+
+1. Walk blocks in `f->emit_order`, skipping unreachable; record `block_pc[b]`; emit one
+ record per non-no-op `Inst` (SCOPE_*/NOP/PARAM_DECL emit nothing; assert no `IR_PHI`).
+2. PReg id *is* its register-file index; register file size = `f->npregs`.
+3. Bump-allocate `slot_off[]` over `f->frame_slots[]` honoring `align`/`size`;
+ `FS_ALLOCA` slots allocate dynamically at `OP_ALLOCA`.
+4. Encode operands per `OptOperand` union; choose width/signedness from
+ `MemAccess.size` / `abi_cg_type_info` (`abi.h:141`). `OPK_INDIRECT` needs an aux record.
+5. Second pass: rewrite branch/switch/indirect targets from block id → `block_pc`.
+6. Thread: set `record.handler = dispatch_table[opcode]` (table fetched once via an
+ engine "publish table" call).
+
+**Arena lifetime:** `InterpFunc` references small descriptors (`CGCallDesc`, switch aux)
+from the opt `Func` arena. Deep-copy them in `lower.c` (recommended, given the emu's
+per-block churn) rather than pinning the opt arena.
+
+## Engine (`engine.c`)
+
+`interp_resume(P, stack)` runs the top frame with a direct-threaded loop. `ip`/`regs`/
+`fr` are cached locals reloaded from `stack->frames[top]` after any push/pop;
+`#define NEXT() goto *(ip->handler)`. Representative handlers:
+- **BINOP int/fp:** read `regs[a]`,`regs[b]`, apply by `flags`, write `regs[dst]`; guard
+ div/rem-by-zero and FTOI overflow → `goto trap` (sets `stack->status=TRAP`, returns).
+- **LOAD/STORE:** compute abstract addr (`regs[base] + ofs [+ regs[index]<<scale]`), call
+ `P->mem.translate(ctx, addr, n, perms)`; never raw-deref — this is what makes the two
+ memory models swap cleanly.
+- **CMP_BRANCH:** compare, set `ip = code + (taken ? taken_pc : fall_pc)`.
+- **CALL_INTERNAL:** save `ip` into current frame; **push** a new `InterpFrame` (bump regs
+ + mem from the stack); copy `desc.args[i]` by value into callee param homes; record
+ `ret_dst`; reload cached locals from the new top frame; `NEXT()`. No host recursion.
+- **CALL_EXTERNAL:** `ffi_call_external` (below) — the only handler that uses the host C
+ stack, for the call's duration (non-suspendable region).
+- **RET:** shuttle scalar/aggregate into `stack->scalar_ret`/`sret_ptr`; **pop** the frame;
+ if the stack is now empty set `status=DONE` and return; else write the result into the
+ caller's `ret_dst`, reload cached locals, `NEXT()`.
+- **ALLOCA:** bump `fr->alloca_top`, return `frame_base + off`.
+- **CONVERT:** specialized by src/dst width → the corresponding C cast.
+
+## Pluggable memory (`mem.c`)
+
+`InterpMem { u8* (*translate)(void* ctx, u64 addr, u64 n, u8 perms); void* ctx; }`.
+- Host-identity: `return (u8*)(uintptr_t)addr`. Locals/allocas return real `&frame.mem[off]`;
+ globals resolve via `cfree_jit_lookup`/extern resolver.
+- Emu: `return emu_addr_space_ptr((EmuAddrSpace*)ctx, addr, n, perms)` (`emu.h:402`,
+ bounds+perm checked). Interpreter frames live in *guest* stack memory: carve
+ `frame_bytes` off the guest SP so `&local` stays a valid guest VA.
+
+## Call marshalling (`ffi.c`)
+
+- **Internal** (callee resolves to an `InterpFunc`): handled by the engine's push/pop on
+ the explicit `InterpStack` (above) — pure value semantics, no host ABI, no host recursion;
+ aggregates copied by `abi_cg_sizeof` bytes.
+- **External** (host function pointer): hand-rolled marshaller driven by `desc.abi`
+ (`ABIFuncInfo`, `abi.h:116`). Classify ret + args into int/fp/by-ref buckets; dispatch
+ through a finite family of typed function-pointer cast thunks keyed on
+ `(ret_class, n_int, n_fp)` up to a fixed cap (~8 int + 8 fp). Handle `has_sret`
+ (alloc aggregate, pass hidden pointer, copy back) and byval struct args (copy to host
+ buffer, pass pointer). Variadic supported when varargs fit the register thunk family;
+ `vararg_on_stack` (Apple ARM64) → diagnose, defer. Anything beyond the family →
+ diagnose `"interp: unsupported external call signature"` with the precise reason.
+
+## Integration with existing paths
+
+- **Standalone:** front end records CG IR as today → `opt_run_o1_interp` per func →
+ `lower.c` → register in `InterpProgram`; globals via extern resolver / `cfree_jit_lookup`;
+ `cfree_interp_lookup` + `cfree_interp_call` to run.
+- **Emu interpret-mode:** in `emu_translate_block` (`emu.c:380-505`) keep decode +
+ `arch->emu->lift_block` (`emu.c:464`) unchanged; instead of `cfree_link_session_jit`,
+ run the lifted CG IR through `opt_run_o1_interp` + `lower.c` into an `InterpFunc`, cache
+by `guest_pc` (mirror `emu_cache_insert`, `emu.c:500`). Dispatch calls `interp_resume` with
+ the emu-addr-space `InterpMem`; external/runtime symbols route via
+ `emu_runtime_extern_resolver` (`emu.c:480`). Gate on a new JIT-vs-INTERP `CfreeEmuMode`.
+
+## Phased implementation order
+
+- **Phase 0 — scaffolding:** `src/interp/` skeletons, config flag, Makefile + stubs,
+ `opt_run_o1_interp` (verify the pass-subset risks above). Build green.
+- **Phase 1 — minimal leaf int fn (milestone):** build the explicit `InterpStack`/
+ `InterpFrame` model and `interp_resume` loop up front (single-frame is enough here, but
+ the push/pop machinery and arenas land now — not retrofitted). Support `LOAD_IMM`,
+ `COPY`, int `BINOP`, scalar int `RET`, `BR`, params-in-PRegs. Run `int add(int,int)` /
+ `return 2+3;` host-identity by seeding a stack and resuming; assert results.
+- **Phase 2 — control flow & memory:** `CONDBR`, `CMP_BRANCH`, `SWITCH`, `LOAD`/`STORE`/
+ `ADDR_OF` (local+global) via vtable, frame-slot offsets, `ALLOCA`, `CONVERT`, `UNOP`,
+ `CMP`. Loops/switches/pointers work.
+- **Phase 3 — calls & FP:** internal call/return via stack push/pop (incl. aggregates;
+ exercises multi-frame depth on the explicit stack), external FFI thunks
+ (sret/byval/basic variadic), F32/F64 handlers.
+- **Phase 4 — emu integration:** emu-addr-space `InterpMem`, interpret-mode branch,
+ guest-stack frames, runtime-symbol routing; run a guest ISA smoke test interpreted.
+- **Phase 5 — long tail:** atomics (host atomics / single-thread serialize), bitfields,
+ `AGG_SET`, `TLS_ADDR`, `VA_*`, intrinsics. Diagnose-and-reject the rest.
+
+## Verification
+
+- **Unit harness:** new `test/interp/interp_smoke_test.c` mirroring
+ `test/opt/cg_ir_lower_test.c` (self-contained heap/diag sink, `EXPECT`): build tiny CG
+ IR, run `opt_run_o1_interp` + lower + `interp_resume`, assert return values. Register in
+ `test/test.mk`; add `test/interp/run.sh` paralleling `test/opt/run.sh`.
+- **Differential testing (highest value):** run the same CG IR through native JIT
+ (`cfree_link_session_jit`) and the interpreter over the existing opt corpus; assert
+ identical results/side effects. Reuses the optimizer front half; catches semantic drift.
+- **Emu mode:** mirror `test/emu/rv64_smoke_test.c`; compare interpret-mode vs JIT output.
+- **Tooling:** an `interp_dump` bytecode disassembler gated like `CFREE_DUMP`
+ (`opt.c:69-82`) for debugging lowering.
+
+## Unsupported / risks (diagnose, don't miscompile)
+
+- **Inline asm** (`IR_ASM_BLOCK`): needs machinize's constraint resolution we skip; no
+ portable interpretation → reject at lower time.
+- **Vectors** (`RC_VEC`): `u64` register file → reject vector pregs/types initially; widen
+ to 128-bit lanes later if needed.
+- **setjmp/longjmp:** host `setjmp` over `InterpFrame` chains feasible in identity mode;
+ emu mode needs interpreter-stack unwinding → defer, diagnose.
+- **FFI cap:** signatures beyond the thunk family / `vararg_on_stack` / exotic
+ `ABI_ARG_EXPAND` → diagnose with the reason.
+- **`opt_verify` w/o machinize:** if it asserts on `opt_has_target`, drop from interp entry.
+
+## Implementation status (initial landing)
+
+Implemented and validated differentially against the JIT:
+
+- **`opt_run_o1_interp`** (`src/opt/opt.c`): runs `opt_func_from_cg_ir` → cfg/jump-cleanup
+ → `opt_simplify_local` → `opt_addr_xform_pregs` → `opt_promote_scalar_locals`
+ → `opt_addr_of_global_cse` → loop tree → `opt_live_blocks` + `opt_dead_def_elim_with_live`,
+ then STOP. `opt_verify` is omitted (debug-only; some checks assume the machinized shape).
+- **`src/interp/`**: `interp.h`/`interp_program.c`/`lower.c`/`engine.c`/`ffi.c` (+ `interp_stubs.c`).
+ Public API in `include/cfree/interp.h`. Gated by `CFREE_INTERP_ENABLED`.
+- **Direct-threaded dispatch.** Each `InterpInsn` caches the `&&handler` of its opcode and
+ every handler tail-dispatches with `goto *in->handler`, so the branch predictor sees a
+ distinct indirect branch per opcode site. The same handler bodies compile as a portable
+ `switch` fallback via the `OP()`/`NEXT()`/`GO()` macros: the threaded path is gated on
+ `(defined(__GNUC__) || defined(__clang__)) && !defined(__cfree__)` (cfree's own C front end
+ has no labels-as-values, so the self-host build uses the switch), with the GNU label-as-value
+ warnings suppressed locally. The dispatch table is published from the in-function labels once,
+ then each function's records are threaded lazily on first entry. A build can force either path
+ via `-DCFREE_INTERP_THREADED=0|1`.
+- **Explicit `InterpStack`/`InterpFrame`** with offset-based register/mem arenas (so a deep
+ call can `realloc` an arena without invalidating cached frame pointers). CALL pushes / RET
+ pops; **tail calls** (terminator `IR_CALL` / `CG_CALL_TAIL`) relocate the freshly-built callee
+ frame down onto the dead caller's register/memory region and rewind the arenas, so a tail
+ loop runs in **true O(1)** interp + host stack space (validated to 5M deep).
+- **Ops**: imm/copy/load/store/addr-of (incl. memory-destination results via `write_dst`),
+ binop/unop/cmp/convert (width+sign from the op tag), br/condbr/cmp_branch/switch/
+ indirect-br/load-label-addr, internal+external/indirect call, ret (scalar + aggregate
+ sret), alloca, agg-copy/set, **bitfield load/store** (shift+mask extract / read-modify-write
+ insert, signed-field sign-extension), **`IR_TLS_ADDR_OF`** (thread-local address; see TLS
+ below), **variadics** (`va_start`/`va_arg`/`va_end`/`va_copy`), **atomics** (single-thread
+ serialized), and the common **intrinsics** (memcpy/move/set, popcount/ctz/clz/bswap,
+ checked-overflow, expect, trap). Float→int conversion **saturates** (NaN→0, clamp) — matches
+ Wasm `trunc_sat` and avoids the UB of a raw out-of-range cast.
+- **Static local data + switch jump tables** (`IR_LOCAL_STATIC_DATA_*`, `lower.c`): each
+ function-scope static blob (regular static locals, dense-switch jump tables, computed-goto
+ label arrays) is materialized into an interp-private, program-lifetime buffer at lower time
+ and its symbol bound to that buffer. `WRITE` records contribute literal bytes; `LABEL_ADDR`
+ records contribute the target block's **bytecode pc** (so an `IR_LOAD` + `IR_INDIRECT_BRANCH`
+ through a jump table lands on the right record, not a native code address baked in by the
+ parallel object/JIT path). This is what unblocks the dense `-O1` switch lowering.
+- **Variadics** (interpreter-private va_list): on an internal call to a variadic callee the
+ anonymous arguments are laid out into a contiguous buffer in the callee frame; `va_start`
+ seeds the va_list with a cursor over it, `va_arg` reads the typed slot and advances, `va_copy`
+ duplicates the cursor. The layout is self-consistent because the interpreter owns both the
+ call-site build and the va ops — independent of the target ABI's real va_list.
+- **Thread-local storage**: a thread-local's symbol does not denote its storage on every target
+ (a Mach-O symbol resolves to a TLV descriptor), so `IR_TLS_ADDR_OF` routes through the host
+ `resolve_tls` hook. In `--no-jit` the driver implements it via `cfree_jit_tlv_resolve`, which
+ unwraps **our own** Mach-O descriptor (it verifies `desc[+0]==&cfree_jit_tlv_thunk` and
+ `desc[+8]==jit->tls_ctx` first, so a foreign/dyld descriptor never becomes a wild call) and
+ calls the JIT image's per-thread block accessor to get the variable's real address. The
+ interpreter shares the same per-thread storage the JIT would use, so reads/writes are
+ consistent and persist across calls. Anything it can't resolve safely — a non-Mach-O image or
+ a foreign descriptor (e.g. an extern thread-local) — returns NULL and is **diagnosed**, never
+ treated as storage.
+- **`cfree run --no-jit`** (`driver/run.c`): forces `-O1` minimum, attaches an `InterpProgram`
+ so each function is captured while the normal object/JIT-link still runs (it lays out data
+ globals and resolves externs / function pointers). The entry executes **only** through the
+ interpreter — there is **no JIT execution fallback**; a non-interpretable entry is an error.
+ Globals/externs resolve by iterating the JIT image's symbol table (locals included,
+ tolerating the target's leading-underscore C mangling), then host `dlsym`; thread-locals
+ additionally route through `cfree_jit_tlv_resolve`. Wasm entries get their instance/linear-
+ memory set up and run `__cfree_wasm_init` + entry via the interpreter.
+- **FFI** (`ffi.c`): a maximal host-ABI prototype family — int args in `u64×8`, fp args in
+ `double×8` (or `float×8` when every fp arg is a 4-byte single; a float/double mix in one
+ signature is diagnosed). Int/fp args land in their register pools regardless of interleaving
+ on the supported ABIs. Handles sret, byval-by-pointer, and **multi-register struct returns**
+ (up to two registers, any int/fp class combination — struct-returning thunks steer the return
+ registers and the caller scatters each part into the aggregate). `vararg_on_stack` (Apple
+ ARM64 variadics) and 32-bit-fp struct-return fields are diagnosed/deferred.
+- **Tests**: `test/interp/interp_smoke_test.c` (unit, `make test-interp`); toy `I` path and
+ wasm `N` path (`make test-interp-toy`, default paths now include them). Differential result:
+ toy `I` 298/312 match the JIT exactly with 0 diffs (only the 7 inline-asm cases SKIP, by
+ design); wasm `N` `.wat`/`.wasm` match; C (incl. libc via FFI — f32 args, multi-register
+ struct returns) matches; `141_threadlocal_mutate` exercises TLS define+mutate.
+
+Not yet implemented (diagnosed → SKIP, not miscompiled): inline asm (by design; needs
+machinize's constraint resolution), FFI signatures beyond the register-thunk family
+(`vararg_on_stack` external variadics, 3+-register struct returns, 32-bit-fp struct-return
+fields, and an aggregate/>8-byte scalar in an external variadic-argument position — these have
+no per-call ABI classification, so they are rejected rather than marshalled), thread-locals on
+non-Mach-O images or via foreign/dyld descriptors (extern thread-locals), and **Phase 4 emu
+integration** (interpret-mode in `emu_translate_block`).
+
+Known limitations (correct results, not bugs): the per-site branch-prediction benefit of threaded
+dispatch only materializes in optimized builds — the `-O0`+sanitizer test build merges the dispatch
+sites (still computed-goto through the handler field, never a switch). The `g_mem_fault` latch is
+re-checked on straight-line ops and on branch selectors; full coverage matters once emu mode
+(where `translate` can fault) lands.
diff --git a/driver/run.c b/driver/run.c
@@ -1,5 +1,6 @@
#include <cfree/compile.h>
#include <cfree/core.h>
+#include <cfree/interp.h>
#include <cfree/jit.h>
#include <cfree/link.h>
#include <cfree/wasm.h>
@@ -29,6 +30,7 @@ typedef struct RunOptions {
size_t argv_bound;
int opt_level;
+ int no_jit; /* --no-jit: execute the entry via the IR interpreter */
int debug_info;
int metrics;
int bench_time;
@@ -264,6 +266,10 @@ void driver_help_run(void) {
"\n"
"COMPILE OPTIONS\n"
" -O0 -O1 -O2 Optimization level (default -O0)\n"
+ " --no-jit Execute the entry through the IR interpreter\n"
+ " instead of JIT-compiled native code (forces "
+ "-O1\n"
+ " minimum so the optimizer IR is available)\n"
" -g Emit DWARF debug info\n"
" --time, --metrics Emit scoped compile/link/JIT timing to stderr\n"
" --bench-time Emit parseable compile/JIT/execution timings\n"
@@ -445,6 +451,10 @@ static int run_parse(int argc, char** argv, RunOptions* o) {
o->metrics = 1;
continue;
}
+ if (driver_streq(a, "--no-jit")) {
+ o->no_jit = 1;
+ continue;
+ }
if (driver_streq(a, "-O0")) {
o->opt_level = 0;
continue;
@@ -595,7 +605,10 @@ static void run_fill_compile_opts(const RunOptions* o,
CfreeCCompileOptions* copts) {
CfreeCCompileOptions z = {0};
*copts = z;
- copts->code.opt_level = o->opt_level;
+ /* The interpreter consumes the O1 PReg-path IR; force at least -O1 so the
+ * optimizer runs and each function is captured into the InterpProgram. */
+ copts->code.opt_level =
+ (o->no_jit && o->opt_level < 1) ? 1 : o->opt_level;
copts->code.debug_info = o->debug_info;
driver_cflags_fill_pp(&o->cf, &copts->preprocess);
copts->diagnostics.warnings_are_errors = o->warnings_are_errors;
@@ -634,6 +647,62 @@ typedef struct RunWasmMemoryPrefix {
uint32_t flags;
} RunWasmMemoryPrefix;
+#define RUN_WASM_INSTANCE_BYTES (64u * 1024u)
+#define RUN_WASM_MEMORY_BYTES (16u * 1024u * 1024u)
+
+/* Allocate and wire a Wasm instance + linear memory and bind host imports.
+ * Shared by the JIT and interpreter paths.
+ *
+ * ro        run options; ro->env is the allocator environment
+ * compiler  source of the registered host imports
+ * jit       JIT image passed through to cfree_wasm_bind_host_imports
+ * inst_out  receives the zeroed instance block (RUN_WASM_INSTANCE_BYTES)
+ * mem_out   receives the zeroed memory block (RUN_WASM_MEMORY_BYTES)
+ *
+ * Returns 0 on success (caller owns the returned instance and memory and
+ * releases them with run_wasm_free_instance), nonzero on error (already
+ * freed and diagnosed; the out-parameters are left unwritten). */
+static int run_wasm_make_instance(RunOptions* ro, CfreeCompiler* compiler,
+ CfreeJit* jit, uint8_t** inst_out,
+ uint8_t** mem_out) {
+ uint8_t* instance = (uint8_t*)driver_alloc_zeroed(ro->env,
+ RUN_WASM_INSTANCE_BYTES);
+ uint8_t* memory =
+ (uint8_t*)driver_alloc_zeroed(ro->env, RUN_WASM_MEMORY_BYTES);
+ const CfreeWasmHostImport* h_imports = NULL;
+ size_t h_nimports = 0;
+ CfreeWasmResolveFn h_resolve = NULL;
+ void* h_user = NULL;
+ CfreeWasmHostImport canned[1];
+ extern int32_t run_test_host_add(CfreeWasmInstance*, int32_t, int32_t);
+ /* Either allocation may have failed; both pointers are handed to
+ * driver_free regardless (one of them may be NULL here). */
+ if (!instance || !memory) {
+ driver_errf(RUN_TOOL, "out of memory");
+ driver_free(ro->env, memory, RUN_WASM_MEMORY_BYTES);
+ driver_free(ro->env, instance, RUN_WASM_INSTANCE_BYTES);
+ return 1;
+ }
+ /* The instance block starts with an array of RunWasmMemoryPrefix records;
+ * point each of the first 8 at its own 1/8 slice of the memory block. */
+ for (uint32_t i = 0; i < 8u; ++i)
+ ((RunWasmMemoryPrefix*)instance)[i].data =
+ memory + i * (RUN_WASM_MEMORY_BYTES / 8u);
+ /* Host imports are bound here, before the caller runs __cfree_wasm_init, so
+ * the start function (if any) and any global initializers that call imports
+ * see populated slots. */
+ cfree_wasm_get_host_imports(compiler, &h_imports, &h_nimports, &h_resolve,
+ &h_user);
+ /* When invoked from the wasm-front test suite, the runner exposes a small
+ * canned set of host imports. The env var stays opt-in so production
+ * `cfree run` doesn't surface test scaffolding. */
+ if (h_nimports == 0 && !h_resolve && driver_getenv("CFREE_TEST_HOST_IMPORTS")) {
+ canned[0].module = "env";
+ canned[0].field = "host_add";
+ canned[0].func = (void*)(uintptr_t)run_test_host_add;
+ h_imports = canned;
+ h_nimports = 1;
+ }
+ if (cfree_wasm_bind_host_imports(compiler, jit, (CfreeWasmInstance*)instance,
+ h_imports, h_nimports, h_resolve,
+ h_user) != CFREE_OK) {
+ driver_errf(RUN_TOOL, "wasm host import bind failed");
+ driver_free(ro->env, memory, RUN_WASM_MEMORY_BYTES);
+ driver_free(ro->env, instance, RUN_WASM_INSTANCE_BYTES);
+ return 1;
+ }
+ /* Ownership transfers to the caller only on this success path. */
+ *inst_out = instance;
+ *mem_out = memory;
+ return 0;
+}
+
+/* Release the instance and memory blocks produced by run_wasm_make_instance,
+ * using the same fixed sizes they were allocated with. */
+static void run_wasm_free_instance(RunOptions* ro, uint8_t* instance,
+ uint8_t* memory) {
+ driver_free(ro->env, memory, RUN_WASM_MEMORY_BYTES);
+ driver_free(ro->env, instance, RUN_WASM_INSTANCE_BYTES);
+}
+
static int run_call_wasm_entry(RunOptions* ro, CfreeCompiler* compiler,
CfreeJit* jit, void* entry, int* rc_out) {
void* init_sym = cfree_jit_lookup(jit, CFREE_SLICE_LIT("__cfree_wasm_init"));
@@ -648,61 +717,127 @@ static int run_call_wasm_entry(RunOptions* ro, CfreeCompiler* compiler,
WasmMainFn fn;
} entry_u;
if (!init_sym) return 0;
- instance = (uint8_t*)driver_alloc_zeroed(ro->env, 64u * 1024u);
- memory = (uint8_t*)driver_alloc_zeroed(ro->env, 16u * 1024u * 1024u);
- if (!instance || !memory) {
- driver_errf(RUN_TOOL, "out of memory");
- driver_free(ro->env, memory, 16u * 1024u * 1024u);
- driver_free(ro->env, instance, 64u * 1024u);
+ if (run_wasm_make_instance(ro, compiler, jit, &instance, &memory) != 0) {
*rc_out = 1;
return 1;
}
- for (uint32_t i = 0; i < 8u; ++i)
- ((RunWasmMemoryPrefix*)instance)[i].data =
- memory + i * (16u * 1024u * 1024u / 8u);
- /* Resolve host imports before init so the start function (if any) and
- * any global initializers that call imports see populated slots. */
- {
- const CfreeWasmHostImport* h_imports = NULL;
- size_t h_nimports = 0;
- CfreeWasmResolveFn h_resolve = NULL;
- void* h_user = NULL;
- cfree_wasm_get_host_imports(compiler, &h_imports, &h_nimports, &h_resolve,
- &h_user);
- /* When invoked from the wasm-front test suite, the runner exposes a
- * small canned set of host imports. The env var stays opt-in so
- * production `cfree run` doesn't surface test scaffolding. */
- CfreeWasmHostImport canned[1];
- size_t ncanned = 0;
- extern int32_t run_test_host_add(CfreeWasmInstance*, int32_t, int32_t);
- if (h_nimports == 0 && !h_resolve &&
- driver_getenv("CFREE_TEST_HOST_IMPORTS")) {
- canned[0].module = "env";
- canned[0].field = "host_add";
- canned[0].func = (void*)(uintptr_t)run_test_host_add;
- ncanned = 1;
- h_imports = canned;
- h_nimports = ncanned;
- }
- if (cfree_wasm_bind_host_imports(
- compiler, jit, (CfreeWasmInstance*)instance, h_imports, h_nimports,
- h_resolve, h_user) != CFREE_OK) {
- driver_errf(RUN_TOOL, "wasm host import bind failed");
- driver_free(ro->env, memory, 16u * 1024u * 1024u);
- driver_free(ro->env, instance, 64u * 1024u);
- *rc_out = 1;
- return 1;
- }
- }
init_u.p = init_sym;
entry_u.p = entry;
init_u.fn((CfreeWasmInstance*)instance);
*rc_out = entry_u.fn(instance);
- driver_free(ro->env, memory, 16u * 1024u * 1024u);
- driver_free(ro->env, instance, 64u * 1024u);
+ run_wasm_free_instance(ro, instance, memory);
+ return 1;
+}
+
+/* Wasm entry through the interpreter (--no-jit). Sets up the instance like the
+ * JIT path, then runs __cfree_wasm_init and the entry as InterpFuncs, passing
+ * the instance pointer as the (single) argument. Returns 0 if the module is
+ * not a Wasm module (no interpretable __cfree_wasm_init); otherwise 1 with
+ * *rc_out set: the entry's result truncated to int on success, or 1 when the
+ * entry has no captured IR, instance setup fails, or either interpreter call
+ * does not finish with CFREE_INTERP_DONE. */
+static int run_call_wasm_entry_interp(RunOptions* ro, CfreeCompiler* compiler,
+ CfreeJit* jit, CfreeInterpProgram* interp,
+ int* rc_out) {
+ CfreeInterpFunc* init_fn =
+ cfree_interp_lookup(interp, CFREE_SLICE_LIT("__cfree_wasm_init"));
+ CfreeInterpFunc* entry_fn = cfree_interp_lookup(interp, cfree_slice_cstr(ro->entry));
+ uint8_t* instance;
+ uint8_t* memory;
+ uint64_t args[1];
+ int64_t ret = 0;
+ CfreeInterpStatus s;
+ if (!init_fn) return 0; /* not a Wasm module */
+ if (!entry_fn) {
+ driver_errf(RUN_TOOL, "interp: wasm entry %.*s has no interpretable IR",
+ CFREE_SLICE_ARG(cfree_slice_cstr(ro->entry)));
+ *rc_out = 1;
+ return 1;
+ }
+ /* The helper has already reported and freed on failure. */
+ if (run_wasm_make_instance(ro, compiler, jit, &instance, &memory) != 0) {
+ *rc_out = 1;
+ return 1;
+ }
+ /* Both calls receive the instance pointer as their only argument. The entry
+ * runs only if init completed; `ret` ends up holding the entry's result. */
+ args[0] = (uint64_t)(uintptr_t)instance;
+ s = cfree_interp_call_args(interp, init_fn, args, 1u, &ret);
+ if (s == CFREE_INTERP_DONE)
+ s = cfree_interp_call_args(interp, entry_fn, args, 1u, &ret);
+ if (s == CFREE_INTERP_DONE) {
+ *rc_out = (int)ret;
+ } else {
+ driver_errf(RUN_TOOL, "interp: could not execute wasm entry %.*s",
+ CFREE_SLICE_ARG(cfree_slice_cstr(ro->entry)));
+ *rc_out = 1;
+ }
+ run_wasm_free_instance(ro, instance, memory);
return 1;
}
+/* Host-identity symbol resolver for the interpreter (CfreeInterpHost
+ * resolve_sym callback).
+ *
+ * ctx   the CfreeJit* whose image is searched (may be NULL: dlsym only)
+ * name  symbol name as the interpreter recorded it
+ *
+ * Returns the symbol's address, or NULL when it cannot be resolved (including
+ * an empty or NULL name).
+ *
+ * cfree_jit_lookup only finds GLOBAL-bind symbols, but the toy/C frontends
+ * emit module-private data and helper functions with LOCAL bind, so the JIT
+ * image's full symbol table (locals included) is iterated instead. The
+ * interpreter's name and the image's name may differ by one leading `_`
+ * (Mach-O style mangling), so two forms are tried:
+ *   pass 0: the name exactly as given;
+ *   pass 1: the name with a single leading `_` stripped, if that is distinct.
+ * Running them as separate passes guarantees an EXACT match always wins over
+ * the stripped fallback (avoids a local `_g` masking a global `g`, or
+ * vice-versa). Symbols not defined in the image fall back to host dlsym,
+ * again trying the exact name first and then the stripped form. */
+static void* interp_jit_resolve(void* ctx, CfreeSlice name) {
+  CfreeJit* jit = (CfreeJit*)ctx;
+  CfreeSlice alt = name;
+  int have_alt = 0;
+  void* p = NULL;
+  int pass;
+  /* Nothing to match, and the stripped-form logic below reads name.s[0]. */
+  if (!name.s || name.len == 0) return NULL;
+  if (name.len > 1 && name.s[0] == '_') {
+    alt.s = name.s + 1;
+    alt.len = name.len - 1;
+    have_alt = 1;
+  }
+  for (pass = 0; jit && pass < 2 && !p; ++pass) {
+    CfreeSlice want = (pass == 0) ? name : alt;
+    CfreeJitSymIter* it = NULL;
+    CfreeJitSym s;
+    if (pass == 1 && !have_alt) break; /* no distinct stripped form */
+    if (cfree_jit_sym_iter_new(jit, &it) != CFREE_OK) break;
+    while (cfree_jit_sym_iter_next(it, &s) == CFREE_ITER_ITEM) {
+      size_t k;
+      if (s.name.len != want.len) continue;
+      for (k = 0; k < want.len && s.name.s[k] == want.s[k]; ++k) {
+      }
+      if (k == want.len) {
+        p = (void*)(uintptr_t)s.addr;
+        break;
+      }
+    }
+    cfree_jit_sym_iter_free(it);
+  }
+  if (!p) {
+    p = driver_dlsym_resolver(NULL, name);
+    /* Only retry when stripping produced a different name; the old check
+     * repeated the identical lookup for a bare "_". */
+    if (!p && have_alt) p = driver_dlsym_resolver(NULL, alt);
+  }
+  return p;
+}
+
+
+/* Thread-local resolver for the interpreter (CfreeInterpHost resolve_tls
+ * callback). The symbol is first resolved like any other; the resulting
+ * address is then handed to cfree_jit_tlv_resolve, which on Mach-O unwraps our
+ * own TLV descriptors to the calling thread's storage and returns NULL for
+ * anything it can't safely resolve (non-Mach-O images, or a foreign/dyld
+ * descriptor such as an extern thread-local). NULL is propagated in those
+ * cases so the engine diagnoses cleanly instead of treating a descriptor — or
+ * an unvalidated non-Mach-O symbol address — as the variable's storage.
+ * `addend` is applied only to a successfully resolved address. */
+static void* interp_jit_resolve_tls(void* ctx, CfreeSlice name, int64_t addend) {
+  void* descriptor = interp_jit_resolve(ctx, name);
+  void* storage = NULL;
+  if (descriptor) storage = cfree_jit_tlv_resolve((CfreeJit*)ctx, descriptor);
+  if (!storage) return NULL;
+  return (uint8_t*)storage + addend;
+}
+
int driver_run(int argc, char** argv) {
DriverEnv env;
RunOptions ro = {0};
@@ -710,6 +845,7 @@ int driver_run(int argc, char** argv) {
CfreeJitHost jhost;
CfreeCompiler* compiler = NULL;
CfreeJit* jit = NULL;
+ CfreeInterpProgram* interp = NULL;
RunMetrics metrics_storage;
RunMetrics* metrics = NULL;
void* sym;
@@ -756,6 +892,23 @@ int driver_run(int argc, char** argv) {
return 1;
}
+ /* For --no-jit, attach an InterpProgram so the optimizer captures each
+ * function's IR as it compiles. The native object/JIT image is still built
+ * (it lays out data globals and resolves externs/function pointers); only
+ * the entry's *execution* is routed through the interpreter. */
+ if (ro.no_jit) {
+ interp = cfree_interp_program_new(compiler);
+ if (!interp) {
+ driver_errf(RUN_TOOL, "failed to initialize interpreter");
+ driver_compiler_free(compiler);
+ run_metrics_finish(metrics);
+ run_options_release(&ro);
+ driver_env_fini(&env);
+ return 1;
+ }
+ cfree_interp_program_attach(interp, compiler);
+ }
+
if (ro.bench_time) bench_compile_start = driver_now_ns();
run_metrics_begin(metrics, "run.compile_and_jit");
rc = run_compile_and_jit(&ro, compiler, &jhost, &jit);
@@ -765,6 +918,7 @@ int driver_run(int argc, char** argv) {
if (ro.bench_time)
run_bench_time("compile_and_jit",
bench_compile_end - bench_compile_start);
+ cfree_interp_program_free(interp);
driver_compiler_free(compiler);
run_metrics_finish(metrics);
run_options_release(&ro);
@@ -778,6 +932,7 @@ int driver_run(int argc, char** argv) {
if (!sym) {
driver_errf(RUN_TOOL, "entry symbol not found: %.*s",
CFREE_SLICE_ARG(cfree_slice_cstr(ro.entry)));
+ cfree_interp_program_free(interp);
cfree_jit_free(jit);
driver_compiler_free(compiler);
run_metrics_finish(metrics);
@@ -795,18 +950,67 @@ int driver_run(int argc, char** argv) {
entry_fn = u.fn;
}
+ /* --no-jit: execute the entry through the IR interpreter. There is NO JIT
+ * fallback — if the entry was not captured as interpretable IR (e.g. it came
+ * from a precompiled .o, or uses an unsupported construct), that is an error.
+ * The native object/JIT image is still built, but only to lay out data
+ * globals and resolve externs/function pointers for the interpreter; the
+ * entry's code is never run as native. */
+ if (ro.no_jit) {
+ CfreeInterpHost host;
+ CfreeInterpFunc* ifn;
+ int64_t ret = 0;
+ CfreeInterpStatus s;
+ host.translate = NULL; /* host-identity: abstract addrs are host pointers */
+ host.resolve_sym = interp_jit_resolve;
+ host.resolve_tls = interp_jit_resolve_tls;
+ host.ctx = jit;
+ cfree_interp_program_set_host(interp, &host);
+ /* Wasm modules need their instance/linear-memory set up and a 2-call
+ * (init, entry) sequence with the instance pointer as the argument. */
+ if (run_call_wasm_entry_interp(&ro, compiler, jit, interp, &rc))
+ goto after_entry;
+ ifn = cfree_interp_lookup(interp, cfree_slice_cstr(ro.entry));
+ if (!ifn) {
+ driver_errf(RUN_TOOL,
+ "interp: entry %.*s has no interpretable IR (--no-jit "
+ "requires an IR-compiled entry; .o/.a inputs are not "
+ "supported)",
+ CFREE_SLICE_ARG(cfree_slice_cstr(ro.entry)));
+ rc = 1;
+ goto after_entry;
+ }
+ run_metrics_begin(metrics, "run.entry_call");
+ if (ro.bench_time) bench_exec_start = driver_now_ns();
+ s = cfree_interp_call(interp, ifn, (int)ro.prog_argc, ro.prog_argv, &ret);
+ if (ro.bench_time) bench_exec_end = driver_now_ns();
+ run_metrics_end(metrics, "run.entry_call");
+ if (s == CFREE_INTERP_DONE) {
+ rc = (int)ret;
+ } else {
+ /* The engine already emitted an "interp: ... not supported" / trap
+ * diagnostic; surface a nonzero status (no native fallback). */
+ driver_errf(RUN_TOOL, "interp: could not execute %.*s",
+ CFREE_SLICE_ARG(cfree_slice_cstr(ro.entry)));
+ rc = 1;
+ }
+ goto after_entry;
+ }
+
run_metrics_begin(metrics, "run.entry_call");
if (ro.bench_time) bench_exec_start = driver_now_ns();
if (!run_call_wasm_entry(&ro, compiler, jit, sym, &rc))
rc = entry_fn((int)ro.prog_argc, ro.prog_argv);
if (ro.bench_time) bench_exec_end = driver_now_ns();
run_metrics_end(metrics, "run.entry_call");
+after_entry:
if (ro.bench_time) {
run_bench_time("compile_and_jit", bench_compile_end - bench_compile_start);
run_bench_time("execution", bench_exec_end - bench_exec_start);
run_bench_time("total", bench_exec_end - bench_total_start);
}
+ cfree_interp_program_free(interp);
cfree_jit_free(jit);
driver_compiler_free(compiler);
run_metrics_finish(metrics);
diff --git a/include/cfree/config.h b/include/cfree/config.h
@@ -72,6 +72,21 @@
#define CFREE_DBG_ENABLED 1
#define CFREE_EMU_ENABLED 1
+/* Threaded-bytecode interpreter for the optimizer IR. Runs cfree IR
+ * directly (host-identity) or over the emu address space; requires the
+ * optimizer pipeline (it consumes the O1 PReg-path Func). */
+#define CFREE_INTERP_ENABLED 1
+
+/* Interpreter dispatch: direct-threaded (computed goto) by default. The engine
+ * additionally requires the host compiler to support labels-as-values, so it
+ * transparently falls back to a portable switch when built by cfree itself
+ * (no labels-as-values) or a non-GNU compiler — this flag only expresses the
+ * preference. Guarded so a build can force the switch with
+ * -DCFREE_INTERP_THREADED=0. */
+#ifndef CFREE_INTERP_THREADED
+#define CFREE_INTERP_THREADED 1
+#endif
+
/* cfree multi-call driver tools. These flags control both dispatch/help and
* the driver/<tool>.c objects included in the cfree binary. */
#define CFREE_TOOL_CC_ENABLED 1
diff --git a/include/cfree/interp.h b/include/cfree/interp.h
@@ -0,0 +1,108 @@
+#ifndef CFREE_INTERP_H
+#define CFREE_INTERP_H
+
+#include <cfree/core.h>
+
+/*
+ * Threaded-bytecode interpreter for the cfree optimizer IR.
+ *
+ * An InterpProgram is a collection of lowered functions (one per compiled
+ * function) plus a host/memory binding that resolves abstract addresses and
+ * global symbols. Two configurations share the engine:
+ *
+ * host-identity abstract addresses are real host pointers; globals resolve
+ * via the bound resolver (e.g. the JIT image + dlsym), and
+ * external calls dispatch to real host function pointers.
+ * emu/guest addresses are guest VAs translated through an EmuAddrSpace.
+ *
+ * `cfree run --no-jit` uses the host-identity configuration: it compiles and
+ * JIT-links as usual (so data globals/externs get real addresses), attaches an
+ * InterpProgram to the compiler so each function is also lowered to bytecode,
+ * then runs the entry through the engine instead of the native code.
+ */
+
+typedef struct CfreeInterpProgram CfreeInterpProgram;
+typedef struct CfreeInterpFunc CfreeInterpFunc;
+typedef struct CfreeInterpStack CfreeInterpStack;
+
+/* Status returned by the engine when running/resuming a stack. */
+typedef enum CfreeInterpStatus {
+ CFREE_INTERP_DONE = 0, /* stack ran to completion (top frame returned) */
+ CFREE_INTERP_TRAP = 1, /* a fault (div-by-zero, bad memory, unreachable) */
+ CFREE_INTERP_BLOCKED = 2, /* suspended at a yield point (reserved) */
+ CFREE_INTERP_ERROR = 3, /* unsupported operation / signature */
+} CfreeInterpStatus;
+
+/* Permission bits for the `perms` argument of CfreeInterpHost.translate.
+ * Each is a distinct bit, so they can be OR-ed together. */
+enum {
+ CFREE_INTERP_PERM_READ = 1 << 0,
+ CFREE_INTERP_PERM_WRITE = 1 << 1,
+ CFREE_INTERP_PERM_EXEC = 1 << 2,
+};
+
+/* Pluggable memory + symbol binding. `translate` maps an abstract address to a
+ * host byte pointer for a span of `n` bytes with the given permission bits
+ * (returns NULL on a fault). `resolve_sym` maps a global symbol name to a host
+ * address (data or function); used for OP_ADDR_GLOBAL and external-call
+ * targets. `resolve_tls` maps a thread-local symbol name (+addend) to the
+ * calling thread's address of that variable — distinct because a thread-local
+ * symbol does not resolve to its storage directly on every target (e.g. a
+ * Mach-O symbol resolves to a TLV descriptor). When NULL, the engine falls back
+ * to `resolve_sym` (correct only where the symbol *is* the storage). Any of
+ * these may be NULL for the trivial host-identity case, in which the engine
+ * treats abstract addresses as host pointers directly. */
+typedef struct CfreeInterpHost {
+ uint8_t* (*translate)(void* ctx, uint64_t addr, uint64_t n, int perms);
+ void* (*resolve_sym)(void* ctx, CfreeSlice name);
+ void* (*resolve_tls)(void* ctx, CfreeSlice name, int64_t addend);
+ void* ctx;
+} CfreeInterpHost;
+
+/* Create/destroy a program. The program borrows the compiler (its arenas back
+ * the lowered functions); it must not outlive the compiler. */
+CFREE_API CfreeInterpProgram* cfree_interp_program_new(CfreeCompiler*);
+CFREE_API void cfree_interp_program_free(CfreeInterpProgram*);
+
+/* Attach the program as the compiler's interp sink so that subsequent compiles
+ * lower each function into it (in addition to native emission). Pass NULL to
+ * detach. */
+CFREE_API void cfree_interp_program_attach(CfreeInterpProgram*, CfreeCompiler*);
+
+/* Bind the host/memory resolver. Borrowed; the caller owns `host->ctx`. */
+CFREE_API void cfree_interp_program_set_host(CfreeInterpProgram*,
+ const CfreeInterpHost*);
+
+/* Look up a captured function by (C, unmangled) name; NULL if absent. */
+CFREE_API CfreeInterpFunc* cfree_interp_lookup(CfreeInterpProgram*,
+ CfreeSlice name);
+
+/* Explicit-stack API (swap-ready substrate for fibers/virtual threads). */
+CFREE_API CfreeInterpStack* cfree_interp_stack_new(CfreeInterpProgram*);
+CFREE_API void cfree_interp_stack_free(CfreeInterpStack*);
+
+/* Seed `stack` with an entry frame for `fn`, binding up to two integer
+ * arguments (argc, argv) when the function declares them. */
+CFREE_API CfreeStatus cfree_interp_call_on(CfreeInterpStack* stack,
+ CfreeInterpFunc* fn, int argc,
+ char** argv);
+
+/* Run/resume `stack` until it returns, traps, or blocks. On DONE the entry's
+ * scalar result is written to *out_ret (if non-NULL). */
+CFREE_API CfreeInterpStatus cfree_interp_resume(CfreeInterpStack* stack,
+ int64_t* out_ret);
+
+/* Convenience: allocate a stack, seed the entry, resume to completion, free
+ * the stack. Returns the engine status; *out_ret gets the scalar result. */
+CFREE_API CfreeInterpStatus cfree_interp_call(CfreeInterpProgram*,
+ CfreeInterpFunc*, int argc,
+ char** argv, int64_t* out_ret);
+
+/* Like cfree_interp_call, but binds `nargs` raw register-width values to the
+ * entry's first parameters (e.g. a single instance pointer for a Wasm entry).
+ * Each arg is placed into the corresponding scalar parameter slot. */
+CFREE_API CfreeInterpStatus cfree_interp_call_args(CfreeInterpProgram*,
+ CfreeInterpFunc*,
+ const uint64_t* args,
+ uint32_t nargs,
+ int64_t* out_ret);
+
+#endif
diff --git a/include/cfree/jit.h b/include/cfree/jit.h
@@ -77,6 +77,12 @@ CFREE_API CfreeStatus cfree_jit_addr_to_sym(CfreeJit*, uint64_t addr,
CFREE_API uint64_t cfree_jit_runtime_to_image(CfreeJit*, uint64_t runtime_pc);
CFREE_API uint64_t cfree_jit_image_to_runtime(CfreeJit*, uint64_t image_vaddr);
+/* Resolve a thread-local variable's address for the calling thread, given the
+ * runtime address its symbol resolves to (a Mach-O TLV descriptor). Returns
+ * NULL when the image/target uses no TLV descriptor (e.g. non-Mach-O), letting
+ * callers diagnose rather than dereference a descriptor as data. */
+CFREE_API void* cfree_jit_tlv_resolve(CfreeJit*, void* descriptor);
+
typedef struct CfreeJitSymIter CfreeJitSymIter;
typedef struct CfreeJitSym {
diff --git a/mk/config.mk b/mk/config.mk
@@ -31,6 +31,7 @@ CFREE_LINK_ENABLED := $(call cfg_flag,CFREE_LINK_ENABLED)
CFREE_JIT_ENABLED := $(call cfg_flag,CFREE_JIT_ENABLED)
CFREE_DBG_ENABLED := $(call cfg_flag,CFREE_DBG_ENABLED)
CFREE_EMU_ENABLED := $(call cfg_flag,CFREE_EMU_ENABLED)
+CFREE_INTERP_ENABLED := $(call cfg_flag,CFREE_INTERP_ENABLED)
CFREE_TOOL_CC_ENABLED := $(call cfg_flag,CFREE_TOOL_CC_ENABLED)
CFREE_TOOL_CHECK_ENABLED := $(call cfg_flag,CFREE_TOOL_CHECK_ENABLED)
diff --git a/src/core/config_assert.c b/src/core/config_assert.c
@@ -40,6 +40,8 @@ CFREE_ASSERT_BOOL(CFREE_LINK_ENABLED);
CFREE_ASSERT_BOOL(CFREE_JIT_ENABLED);
CFREE_ASSERT_BOOL(CFREE_DBG_ENABLED);
CFREE_ASSERT_BOOL(CFREE_EMU_ENABLED);
+CFREE_ASSERT_BOOL(CFREE_INTERP_ENABLED);
+CFREE_ASSERT_BOOL(CFREE_INTERP_THREADED);
CFREE_ASSERT_BOOL(CFREE_TOOL_CC_ENABLED);
CFREE_ASSERT_BOOL(CFREE_TOOL_CHECK_ENABLED);
@@ -78,6 +80,8 @@ _Static_assert(!CFREE_EMU_ENABLED ||
CFREE_DISASM_ENABLED && CFREE_OBJ_ELF_ENABLED),
"CFREE_EMU_ENABLED requires JIT, link, disasm, and ELF "
"support");
+_Static_assert(!CFREE_INTERP_ENABLED || CFREE_OPT_ENABLED,
+ "CFREE_INTERP_ENABLED requires CFREE_OPT_ENABLED");
_Static_assert(!CFREE_TOOL_CC_ENABLED ||
(CFREE_LANG_C_ENABLED && CFREE_LINK_ENABLED &&
diff --git a/src/core/core.h b/src/core/core.h
@@ -129,6 +129,11 @@ struct CfreeCompiler {
size_t wasm_host_nimports;
void* wasm_host_resolve; /* CfreeWasmResolveFn */
void* wasm_host_user;
+ /* Optional InterpProgram sink (struct InterpProgram*). When non-NULL, the
+ * optimizer additionally lowers each function through opt_run_o1_interp into
+ * this program for the threaded interpreter. Set by cfree_interp_program_attach;
+ * borrowed (the caller owns the program). */
+ void* interp_sink;
/* Keep jmp_buf last: its size comes from the including C environment
* (host libc for some tests, rt/include for libcfree), and must not shift
* the offsets of the fields above across those builds. */
diff --git a/src/interp/engine.c b/src/interp/engine.c
@@ -0,0 +1,1614 @@
+/* The interpreter engine: an explicit-stack dispatch loop over the lowered
+ * bytecode. IR-level calls push/pop InterpFrames on the InterpStack instead of
+ * recursing on the host C stack, so execution can be suspended and resumed.
+ *
+ * Dispatch is a switch on the record opcode. (Direct threading via a computed
+ * goto is reserved for a later pass; the InterpInsn keeps a `handler` slot for
+ * it. A switch keeps the engine portable under -Wpedantic and self-host.) */
+
+#include <cfree/config.h> /* CFREE_INTERP_THREADED: dispatch default */
+#include <string.h>
+
+#include "abi/abi.h"
+#include "cg/cgtarget.h"
+#include "cg/type.h"
+#include "core/arena.h"
+#include "core/core.h"
+#include "core/diag.h"
+#include "interp/interp.h"
+
+#define PERM_R CFREE_INTERP_PERM_READ
+#define PERM_W CFREE_INTERP_PERM_WRITE
+
+/* Placeholder source location (file 0, line 0, column 0) used for the
+ * diagnostics this file emits. */
+static SrcLoc iloc(void) {
+  SrcLoc none;
+  none.col = 0;
+  none.line = 0;
+  none.file_id = 0;
+  return none;
+}
+
+/* ---- width / fp helpers ---- */
+
+/* Keep the low `w` bytes of `v`. A width of 0 or of 8 and above leaves the
+ * value untouched. */
+static u64 mask_w(u64 v, u32 w) {
+  u64 keep;
+  if (w == 0 || w >= 8) return v;
+  keep = (1ull << (w * 8u)) - 1ull;
+  return v & keep;
+}
+
+/* Sign-extend the low `w` bytes of `v` to 64 bits. A width of 0 or of 8 and
+ * above reinterprets `v` unchanged. */
+static i64 sext_w(u64 v, u32 w) {
+  if (w != 0 && w < 8) {
+    u32 nbits = w * 8u;
+    u64 sign = 1ull << (nbits - 1u);
+    u64 low = v & ((1ull << nbits) - 1ull);
+    /* xor-then-subtract propagates the sign bit through the upper bits */
+    return (i64)((low ^ sign) - sign);
+  }
+  return (i64)v;
+}
+
+/* Mask covering the low `width` *bits* (not bytes); any width of 64 or more
+ * yields all ones. */
+static u64 bits_mask(u32 width) {
+  if (width >= 64u) return ~0ull;
+  return (1ull << width) - 1ull;
+}
+
+/* Interpreter-private va_list layout: one cursor walks a contiguous buffer of
+ * the anonymous arguments. Values of up to 8 bytes take an 8-byte-aligned,
+ * 8-byte slot; larger values are 16-byte aligned and occupy their size rounded
+ * up to a multiple of 16. The interpreter owns both the call-site buffer build
+ * and va_start/va_arg, so the layout is self-consistent regardless of the
+ * target ABI's real va_list. */
+static u32 va_align_of(u32 size) {
+  if (size > 8u) return 16u;
+  return 8u;
+}
+static u32 va_stride_of(u32 size) {
+  if (size <= 8u) return 8u;
+  return (size + 15u) & ~15u;
+}
+
+/* Decode a floating-point value from its raw bit pattern: a width of 4 reads
+ * the low 32 bits as a float (widened to double); any other width reads all
+ * 64 bits as a double. */
+static double rd_f(u64 bits, u32 w) {
+  double wide;
+  if (w == 4) {
+    float narrow;
+    u32 lo = (u32)bits;
+    memcpy(&narrow, &lo, 4);
+    return (double)narrow;
+  }
+  memcpy(&wide, &bits, 8);
+  return wide;
+}
+
+/* Encode `d` as a raw bit pattern: a width of 4 narrows to float and returns
+ * its 32 bits zero-extended; any other width returns the double's 64 bits. */
+static u64 wr_f(double d, u32 w) {
+  u64 wide_bits;
+  if (w == 4) {
+    float narrow = (float)d;
+    u32 narrow_bits;
+    memcpy(&narrow_bits, &narrow, 4);
+    return narrow_bits;
+  }
+  memcpy(&wide_bits, &d, 8);
+  return wide_bits;
+}
+
+/* ---- memory access (always vtable-translated) ---- */
+
+static int g_mem_fault;
+
+/* Read a scalar of `size` bytes at abstract address `addr`, zero-extended to
+ * 64 bits (bytes are copied into the low-address end of the result).
+ *
+ * A size of 0 means a full 8-byte slot. Sizes above 8 are clamped to 8 so the
+ * copy can never overrun the 8-byte result; only the first 8 bytes of such an
+ * object are returned. The span passed to the translator is the span actually
+ * copied. On a translation failure the g_mem_fault latch is set and 0 is
+ * returned. */
+static u64 mem_read(InterpProgram* p, u64 addr, u32 size) {
+  u32 n = (size == 0 || size > 8u) ? 8u : size;
+  u8* host = interp_translate(p, addr, n, PERM_R);
+  u64 v = 0;
+  if (!host) {
+    g_mem_fault = 1;
+    return 0;
+  }
+  memcpy(&v, host, n);
+  return v;
+}
+
+/* Store the low `size` bytes of `v` at abstract address `addr`.
+ *
+ * A size of 0 means a full 8-byte slot. Sizes above 8 are clamped to 8 so the
+ * copy never reads past the 8-byte source value; bytes beyond the first 8 of
+ * such a destination are left untouched. The span passed to the translator is
+ * the span actually written. On a translation failure the g_mem_fault latch is
+ * set and nothing is written. */
+static void mem_write(InterpProgram* p, u64 addr, u32 size, u64 v) {
+  u32 n = (size == 0 || size > 8u) ? 8u : size;
+  u8* host = interp_translate(p, addr, n, PERM_W);
+  if (!host) {
+    g_mem_fault = 1;
+    return;
+  }
+  memcpy(host, &v, n);
+}
+
+/* Copy `n` bytes from abstract address `src` to `dst`; the ranges may overlap.
+ * Both spans are translated first (destination for write, source for read);
+ * if either translation fails the g_mem_fault latch is set and nothing is
+ * copied. */
+static void mem_copy(InterpProgram* p, u64 dst, u64 src, u32 n) {
+  u8* to = interp_translate(p, dst, n, PERM_W);
+  u8* from = interp_translate(p, src, n, PERM_R);
+  if (to && from) {
+    memmove(to, from, n);
+    return;
+  }
+  g_mem_fault = 1;
+}
+
+/* ---- operand access ---- */
+
+/* Address of the byte at offset `mem_off` in the stack's memory arena,
+ * expressed as a 64-bit integer. */
+static u64 frame_base(InterpStack* st, u32 mem_off) {
+  uintptr_t host = (uintptr_t)(st->mem_arena + mem_off);
+  return (u64)host;
+}
+
+/* addr_from_operand semantics: the abstract address an lvalue operand denotes.
+ *   register  - the register's value is the address
+ *   indirect  - base register, plus the optional index register shifted left
+ *               by log2_scale, plus the signed offset
+ *   global    - the symbol's base address plus its addend
+ *   local     - the frame's base plus the slot's offset
+ * Any other operand kind yields 0. */
+static u64 op_addr(InterpStack* st, InterpFunc* fn, u64* regs, u32 mem_off,
+                   const Operand* op) {
+  u64 addr = 0;
+  switch ((OptOperandKind)op->kind) {
+    case OPT_OPK_REG:
+      addr = regs[op->v.reg];
+      break;
+    case OPT_OPK_INDIRECT:
+      addr = regs[op->v.ind.base];
+      if (op->v.ind.index != (Reg)REG_NONE)
+        addr += regs[op->v.ind.index] << op->v.ind.log2_scale;
+      addr += (u64)(i64)op->v.ind.ofs;
+      break;
+    case OPT_OPK_GLOBAL:
+      addr = (u64)(uintptr_t)interp_global_base(fn, op->v.global.sym) +
+             (u64)op->v.global.addend;
+      break;
+    case OPT_OPK_LOCAL:
+      addr = frame_base(st, mem_off) + fn->slot_off[op->v.frame_slot];
+      break;
+    default:
+      break;
+  }
+  return addr;
+}
+
+/* loc_from_operand-as-value semantics: the scalar value of a value operand.
+ * Registers and immediates are returned directly. Memory operands (local,
+ * global, indirect) are loaded from their address using the ABI size of the
+ * operand's type, with a size of 0 read as 8 bytes. Any other kind yields 0. */
+static u64 op_value(InterpStack* st, InterpFunc* fn, u64* regs, u32 mem_off,
+                    const Operand* op) {
+  u64 where;
+  u32 nbytes;
+  switch ((OptOperandKind)op->kind) {
+    case OPT_OPK_IMM:
+      return (u64)op->v.imm;
+    case OPT_OPK_REG:
+      return regs[op->v.reg];
+    case OPT_OPK_INDIRECT:
+    case OPT_OPK_GLOBAL:
+    case OPT_OPK_LOCAL:
+      where = op_addr(st, fn, regs, mem_off, op);
+      nbytes = abi_cg_sizeof(fn->prog->c->abi, op->type);
+      if (!nbytes) nbytes = 8u;
+      return mem_read(fn->prog, where, nbytes);
+    default:
+      return 0;
+  }
+}
+
+/* write_loc semantics: store a scalar result into a destination operand. The
+ * destination is either a register or a memory location
+ * (OPK_LOCAL/GLOBAL/INDIRECT) — the optimizer leaves un-promoted (e.g.
+ * address-taken) destinations as memory. Memory stores use the ABI size of
+ * the operand's type, with a size of 0 written as 8 bytes. */
+static void write_dst(InterpStack* st, InterpFunc* fn, u64* regs, u32 mem_off,
+                      const Operand* op, u64 value) {
+  if (op->kind != OPK_REG) {
+    u64 where = op_addr(st, fn, regs, mem_off, op);
+    u32 nbytes = abi_cg_sizeof(fn->prog->c->abi, op->type);
+    if (!nbytes) nbytes = 8u;
+    mem_write(fn->prog, where, nbytes, value);
+    return;
+  }
+  regs[op->v.reg] = value;
+}
+
+/* pointer_addr_from_operand semantics: the address an aggregate pointer
+ * operand denotes. Used only by AGG_COPY/AGG_SET.
+ *   non-local operand       - same as op_addr
+ *   pointer-typed OPK_LOCAL - the slot *holds* the pointer, so load 8 bytes
+ *                             from the slot
+ *   any other OPK_LOCAL     - the local *is* the aggregate storage, so its
+ *                             frame home is the address */
+static u64 interp_ptr_addr(InterpStack* st, InterpFunc* fn, u64* regs,
+                           u32 mem_off, const Operand* op) {
+  u64 home;
+  if (op->kind != OPK_LOCAL) return op_addr(st, fn, regs, mem_off, op);
+  home = frame_base(st, mem_off) + fn->slot_off[op->v.frame_slot];
+  if (cg_type_is_ptr(fn->prog->c, op->type))
+    return mem_read(fn->prog, home, 8u);
+  return home;
+}
+
+/* Common compiler intrinsics. Returns 0 (and sets status) if unsupported. */
+static int interp_intrinsic(InterpStack* st, InterpFunc* fn, u64* regs,
+ u32 mem_off, InterpInsn* in);
+
+/* The register and addressable-memory arenas are FIXED reservations that never
+ * move: an OP_ADDR_OF materializes a local's address as an absolute host
+ * pointer into mem_arena, and that pointer can escape into a register or out to
+ * another local, so reallocating (moving) the arena would dangle it. Frames
+ * follow strict stack discipline (CALL bumps the top, RET rewinds it), so a
+ * generous fixed reservation suffices; overflow traps cleanly as a stack
+ * overflow rather than corrupting memory. */
+#define INTERP_REGS_RESERVE (8u * 1024u * 1024u)
+#define INTERP_MEM_RESERVE (8u * 1024u * 1024u)
+
+/* Reserve `size` bytes from a fixed arena whose current top is `*top` and
+ * whose capacity is `cap`, aligning the start to `align` (expected to be a
+ * power of two; 0 is treated as 1).
+ *
+ * Returns the offset of the reservation and advances `*top` past it, or
+ * returns 0xffffffff and leaves `*top` unchanged when the request does not
+ * fit. `arena` is unused (only offsets are computed).
+ *
+ * The arithmetic is done in 64 bits so neither the round-up nor the end
+ * offset can wrap. An alignment of 0 previously produced an all-zero mask,
+ * which rewound the offset to 0 and aliased live frames. */
+static u32 bump(u8* arena, u32* top, u32 cap, u32 size, u32 align) {
+  u64 a = align ? (u64)align : 1u;
+  u64 off = ((u64)*top + a - 1u) & ~(a - 1u);
+  u64 end = off + (u64)size;
+  (void)arena;
+  if (end > (u64)cap) return 0xffffffffu; /* overflow */
+  *top = (u32)end;
+  return (u32)off;
+}
+
+/* Push a fresh frame for fn; returns its index, or 0xffffffff on overflow
+ * (frame-array growth failed, or either arena is exhausted).
+ *
+ * The arenas never move, so existing frame pointers stay valid. The new frame
+ * gets a zeroed register file of npregs 8-byte slots (at least one) and a
+ * memory region of frame_bytes (16 when the function declares none) aligned
+ * to frame_align; its instruction pointer starts at the entry block, or at
+ * code[0] when the entry block has no recorded pc.
+ *
+ * On failure the arena tops are restored, so a reservation that succeeded in
+ * one arena is not leaked when the other arena is full. */
+static u32 frame_push(InterpStack* st, InterpFunc* fn) {
+  InterpFrame* fr;
+  u32 regs_off, mem_off;
+  u32 regs_bytes = (fn->npregs ? fn->npregs : 1u) * 8u;
+  u32 saved_regs_top = st->regs_top;
+  u32 saved_mem_top = st->mem_top;
+  if (st->nframes == st->frames_cap) {
+    /* Grow the frame array geometrically; only this array may move. */
+    Heap* h = st->prog->c->ctx->heap;
+    u32 ncap = st->frames_cap ? st->frames_cap * 2u : 32u;
+    InterpFrame* nf = (InterpFrame*)h->realloc(
+        h, st->frames, sizeof(InterpFrame) * st->frames_cap,
+        sizeof(InterpFrame) * ncap, _Alignof(InterpFrame));
+    if (!nf) return 0xffffffffu;
+    st->frames = nf;
+    st->frames_cap = ncap;
+  }
+  regs_off = bump(st->regs_arena, &st->regs_top, st->regs_cap, regs_bytes, 8u);
+  mem_off = bump(st->mem_arena, &st->mem_top, st->mem_cap,
+                 fn->frame_bytes ? fn->frame_bytes : 16u, fn->frame_align);
+  if (regs_off == 0xffffffffu || mem_off == 0xffffffffu) {
+    st->regs_top = saved_regs_top;
+    st->mem_top = saved_mem_top;
+    return 0xffffffffu;
+  }
+  fr = &st->frames[st->nframes];
+  memset(fr, 0, sizeof *fr);
+  fr->fn = fn;
+  fr->regs_off = regs_off;
+  fr->mem_off = mem_off;
+  fr->frame_bytes = fn->frame_bytes;
+  fr->alloca_top = fn->frame_bytes;
+  fr->ip = &fn->code[fn->block_pc[fn->f->entry] == INTERP_PC_NONE
+                         ? 0u
+                         : fn->block_pc[fn->f->entry]];
+  /* zero the register file */
+  memset(st->regs_arena + regs_off, 0, regs_bytes);
+  st->nframes++;
+  return st->nframes - 1u;
+}
+
+/* Mark the stack as stopped on an unsupported construct: records `what` as the
+ * trap reason, sets the ERROR status, and emits an error diagnostic (using
+ * "operation" when `what` is NULL). */
+static void unsupported(InterpStack* st, const char* what) {
+  const char* label = what ? what : "operation";
+  st->trap_reason = what;
+  st->status = CFREE_INTERP_ERROR;
+  diag_emit(st->prog->c->ctx->diag, CFREE_DIAG_ERROR, iloc(),
+            "interp: %s not supported", label);
+}
+
+/* Mark the stack as trapped: records `what` as the trap reason, sets the TRAP
+ * status, and emits an error diagnostic (using "fault" when `what` is NULL). */
+static void fault(InterpStack* st, const char* what) {
+  const char* label = what ? what : "fault";
+  st->trap_reason = what;
+  st->status = CFREE_INTERP_TRAP;
+  diag_emit(st->prog->c->ctx->diag, CFREE_DIAG_ERROR, iloc(), "interp: trap: %s",
+            label);
+}
+
+/* ---- integer/fp arithmetic ---- */
+
+/* Evaluate a binary operation on two raw operand values.
+ *
+ * binop  a BinOp code
+ * a, b   operand bit patterns
+ * w      operand width in bytes (0 or >= 8 is treated as a full 64 bits,
+ *        matching mask_w/sext_w)
+ * fp     nonzero for floating-point operands (decoded via rd_f/wr_f)
+ *
+ * Integer results are masked to `w` bytes. Division or remainder by zero
+ * raises a trap via fault(); an unrecognized opcode is reported via
+ * unsupported(); both return 0. Shift counts are reduced modulo the operand's
+ * bit width. */
+static u64 do_binop(InterpStack* st, u32 binop, u64 a, u64 b, u32 w, u8 fp) {
+  u32 sh_mask;
+  if (fp) {
+    double x = rd_f(a, w), y = rd_f(b, w), r = 0;
+    switch ((BinOp)binop) {
+      case BO_FADD: r = x + y; break;
+      case BO_FSUB: r = x - y; break;
+      case BO_FMUL: r = x * y; break;
+      case BO_FDIV: r = x / y; break;
+      default: unsupported(st, "fp binop"); return 0;
+    }
+    return wr_f(r, w);
+  }
+  /* Bit width minus one. Widths the helpers treat as 64-bit must use 63 here:
+   * `w * 8u - 1u` is 0xffffffff for w == 0 and 127 for w == 16, either of
+   * which would allow an undefined shift of 64 bits or more. */
+  sh_mask = (w == 0 || w >= 8) ? 63u : (w * 8u - 1u);
+  switch ((BinOp)binop) {
+    case BO_IADD: return mask_w(a + b, w);
+    case BO_ISUB: return mask_w(a - b, w);
+    case BO_IMUL: return mask_w(a * b, w);
+    case BO_SDIV: {
+      i64 x = sext_w(a, w), y = sext_w(b, w);
+      if (y == 0) { fault(st, "integer divide by zero"); return 0; }
+      /* INT_MIN / -1 overflows (UB / SIGFPE on x86) — wraps to INT_MIN. */
+      if (y == -1) return mask_w(0u - (u64)x, w);
+      return mask_w((u64)(x / y), w);
+    }
+    case BO_UDIV: {
+      u64 x = mask_w(a, w), y = mask_w(b, w);
+      if (y == 0) { fault(st, "integer divide by zero"); return 0; }
+      return mask_w(x / y, w);
+    }
+    case BO_SREM: {
+      i64 x = sext_w(a, w), y = sext_w(b, w);
+      if (y == 0) { fault(st, "integer divide by zero"); return 0; }
+      if (y == -1) return 0; /* INT_MIN % -1 == 0 (avoids the overflow UB) */
+      return mask_w((u64)(x % y), w);
+    }
+    case BO_UREM: {
+      u64 x = mask_w(a, w), y = mask_w(b, w);
+      if (y == 0) { fault(st, "integer divide by zero"); return 0; }
+      return mask_w(x % y, w);
+    }
+    case BO_AND: return mask_w(a & b, w);
+    case BO_OR: return mask_w(a | b, w);
+    case BO_XOR: return mask_w(a ^ b, w);
+    case BO_SHL: return mask_w(a << (b & sh_mask), w);
+    case BO_SHR_S: {
+      i64 x = sext_w(a, w);
+      return mask_w((u64)(x >> (b & sh_mask)), w);
+    }
+    case BO_SHR_U: return mask_w(mask_w(a, w) >> (b & sh_mask), w);
+    default: unsupported(st, "int binop"); return 0;
+  }
+}
+
+/* Evaluate comparison predicate `cmp` on raw operand bits of width w bytes; the result is 0 or 1. When fp is set or
+ * cmp >= CMP_LT_F the operands are first tried as doubles (via rd_f); predicates not handled there compare as integers. */
+static int do_cmp(InterpStack* st, u32 cmp, u64 a, u64 b, u32 w, u8 fp) {
+ const CmpOp pred = (CmpOp)cmp;
+ if (fp || (cmp >= CMP_LT_F)) {
+ const double lhs = rd_f(a, w), rhs = rd_f(b, w);
+ if (pred == CMP_EQ) return lhs == rhs;
+ if (pred == CMP_NE) return lhs != rhs;
+ if (pred == CMP_LT_F) return lhs < rhs;
+ if (pred == CMP_LE_F) return lhs <= rhs;
+ if (pred == CMP_GT_F) return lhs > rhs;
+ if (pred == CMP_GE_F) return lhs >= rhs;
+ } /* any other predicate falls through to the integer forms below */
+ switch (pred) {
+ case CMP_EQ: return mask_w(a, w) == mask_w(b, w);
+ case CMP_NE: return mask_w(a, w) != mask_w(b, w);
+ case CMP_LT_S: return sext_w(a, w) < sext_w(b, w);
+ case CMP_LE_S: return sext_w(a, w) <= sext_w(b, w);
+ case CMP_GT_S: return sext_w(b, w) < sext_w(a, w);
+ case CMP_GE_S: return sext_w(b, w) <= sext_w(a, w);
+ case CMP_LT_U: return mask_w(a, w) < mask_w(b, w);
+ case CMP_LE_U: return mask_w(a, w) <= mask_w(b, w);
+ case CMP_GT_U: return mask_w(b, w) < mask_w(a, w);
+ case CMP_GE_U: return mask_w(b, w) <= mask_w(a, w);
+ default: unsupported(st, "cmp"); return 0;
+ }
+}
+
+/* Saturating double -> integer conversion for a destination of wbytes bytes (0 or >8 is treated as 8).
+ * NaN yields 0 and out-of-range inputs clamp to the destination's minimum/maximum, which matches Wasm
+ * trunc_sat and sidesteps the undefined behaviour of casting a NaN or overflowing double in C (UBSan
+ * traps on it). In-range inputs take the ordinary truncating cast, so well-defined C conversions are
+ * unchanged. The power-of-two limit is built by repeated doubling because libcfree is freestanding and
+ * avoids <math.h>. */
+static u64 ftoi_sat(double d, u32 wbytes, int is_signed) {
+ double limit = 1.0; /* grows to 2^(bits-1) when signed, 2^bits when unsigned */
+ u32 bits, steps;
+ if (d != d) return 0; /* only NaN compares unequal to itself */
+ if (wbytes == 0 || wbytes > 8) wbytes = 8;
+ bits = wbytes * 8u;
+ for (steps = is_signed ? bits - 1u : bits; steps != 0; --steps) limit *= 2.0;
+ if (!is_signed) {
+ if (d < 0.0) return 0;
+ if (d < limit) return mask_w((u64)d, wbytes);
+ return mask_w(bits >= 64 ? ~0ull : (((u64)1 << bits) - 1u), wbytes);
+ }
+ if (d >= limit) {
+ /* at or above 2^(bits-1): clamp to the largest signed value */
+ return mask_w(bits >= 64 ? 0x7fffffffffffffffull
+ : (((u64)1 << (bits - 1u)) - 1u),
+ wbytes);
+ }
+ if (d < -limit) {
+ /* below -2^(bits-1): clamp to the smallest signed value */
+ return mask_w(bits >= 64 ? 0x8000000000000000ull
+ : ((u64)1 << (bits - 1u)),
+ wbytes);
+ }
+ return mask_w((u64)(i64)d, wbytes);
+}
+
+/* Apply conversion in->sub to raw bits v (source width in->w1 bytes, destination width in->w0 bytes); unknown kinds are diagnosed and return 0. */
+static u64 do_convert(InterpStack* st, InterpInsn* in, u64 v) {
+ u32 wd = in->w0, ws = in->w1;
+ switch ((ConvKind)in->sub) {
+ case CV_SEXT: return mask_w((u64)sext_w(v, ws), wd);
+ case CV_ZEXT: return mask_w(mask_w(v, ws), wd);
+ case CV_TRUNC: case CV_BITCAST: return mask_w(v, wd);
+ /* int -> f32 must round once: int -> double -> float double-rounds wide integers, so go straight to float (wr_f presumably narrows 4-byte results). */
+ case CV_ITOF_S: return wr_f(wd == 4u ? (double)(float)sext_w(v, ws) : (double)sext_w(v, ws), wd);
+ case CV_ITOF_U: return wr_f(wd == 4u ? (double)(float)mask_w(v, ws) : (double)mask_w(v, ws), wd);
+ case CV_FTOI_S: return ftoi_sat(rd_f(v, ws), wd, 1);
+ case CV_FTOI_U: return ftoi_sat(rd_f(v, ws), wd, 0);
+ case CV_FEXT: case CV_FTRUNC: return wr_f(rd_f(v, ws), wd);
+ default: unsupported(st, "convert"); return 0;
+ }
+}
+
+static u64 do_rmw(u32 op, u64 old, u64 val, u32 w) {
+ switch ((AtomicOp)op) { /* combine old and val per the atomic op; recognised ops are truncated to w bytes */
+ case AO_NAND: return mask_w(~old | ~val, w); /* De Morgan form of ~(old & val) */
+ case AO_XOR: return mask_w(val ^ old, w);
+ case AO_OR: return mask_w(val | old, w);
+ case AO_AND: return mask_w(val & old, w);
+ case AO_SUB: return mask_w(old - val, w);
+ case AO_ADD: return mask_w(val + old, w);
+ case AO_XCHG: return mask_w(val, w);
+ default: return old; /* unrecognised op: the old value comes back unchanged and unmasked */
+ }
+}
+
+static u64 do_unop(InterpStack* st, u32 unop, u64 a, u32 w, u8 fp) {
+ (void)fp; /* unused: the operator alone selects integer or float handling */
+ switch ((UnOp)unop) { /* unary operators on raw bits of width w bytes */
+ case UO_BNOT: return mask_w(~a, w);
+ case UO_NOT: return mask_w(a, w) != 0 ? 0u : 1u; /* logical not of the width-masked value */
+ case UO_NEG: return mask_w(~a + 1u, w); /* two's-complement negate, well-defined on unsigned */
+ case UO_FNEG: return wr_f(-rd_f(a, w), w);
+ default: unsupported(st, "unop"); return 0;
+ }
+}
+
+/* Bind call arguments into a freshly-pushed callee frame (value semantics): each of the first
+ * min(desc->nargs, callee nparams) arguments is evaluated in the caller's frame and stored to the matching
+ * parameter's home, which is either a callee register or a callee frame slot. */
+static void bind_args(InterpStack* st, u32 caller_idx, u32 callee_idx,
+ const OptCGCallDesc* desc) {
+ InterpProgram* prog = st->prog;
+ InterpFrame* from = &st->frames[caller_idx];
+ InterpFrame* to = &st->frames[callee_idx];
+ u64* src_regs = (u64*)(st->regs_arena + from->regs_off);
+ u64* dst_regs = (u64*)(st->regs_arena + to->regs_off);
+ u32 count = to->fn->f->nparams;
+ u32 k;
+ if (desc->nargs < count) count = desc->nargs;
+ for (k = 0; k < count; ++k) {
+ OptCGABIValue* arg = &desc->args[k];
+ IRParam* param = &to->fn->f->params[k];
+ u32 nbytes = abi_cg_sizeof(prog->c->abi, arg->type);
+ u64 home;
+ if (param->storage.kind == CG_LOCAL_STORAGE_REG) { /* register-homed parameter */
+ dst_regs[param->storage.v.reg] = op_value(st, from->fn, src_regs, from->mem_off, &arg->storage);
+ continue;
+ }
+ /* memory-homed parameter: its slot inside the callee's frame */
+ home = frame_base(st, to->mem_off) + to->fn->slot_off[param->storage.v.frame_slot];
+ if (cg_type_is_aggregate(prog->c, arg->type) || nbytes > 8u) {
+ /* aggregates and values wider than 8 bytes are copied from the argument's address */
+ mem_copy(prog, home, op_addr(st, from->fn, src_regs, from->mem_off, &arg->storage), nbytes);
+ } else {
+ /* scalars are stored by value; a size of 0 is written as 8 bytes */
+ mem_write(prog, home, nbytes ? nbytes : 8u, op_value(st, from->fn, src_regs, from->mem_off, &arg->storage));
+ }
+ }
+}
+
+/* Lay out the anonymous (variadic) arguments of an internal call into a
+ * contiguous buffer in the callee frame's addressable region, starting at its 16-aligned alloca_top; alloca_top is
+ * then advanced past the buffer so later allocas land above it. Records the buffer offset on the callee frame so
+ * IOP_VA_START can hand va_arg a cursor over it. Returns 0 on stack overflow, 1 otherwise (also when there are none). */
+static int build_varargs(InterpStack* st, u32 caller_idx, u32 callee_idx,
+ const OptCGCallDesc* desc) {
+ InterpProgram* p = st->prog;
+ InterpFrame* caller = &st->frames[caller_idx];
+ InterpFrame* callee = &st->frames[callee_idx];
+ InterpFunc* cfn = caller->fn;
+ InterpFunc* efn = callee->fn;
+ u64* cregs = (u64*)(st->regs_arena + caller->regs_off);
+ u32 nfixed = efn->f->nparams; /* arguments past the callee's declared parameters are the anonymous ones */
+ u32 cur = (callee->alloca_top + 15u) & ~15u; /* 16-align buffer start */
+ u32 buf_start = cur;
+ u32 i;
+ if (desc->nargs <= nfixed) return 1; /* no anonymous args */
+ for (i = nfixed; i < desc->nargs; ++i) {
+ OptCGABIValue* arg = &desc->args[i];
+ u32 size = abi_cg_sizeof(p->c->abi, arg->type);
+ u32 al = va_align_of(size);
+ u64 dst;
+ cur = (cur + al - 1u) & ~(al - 1u); /* round the cursor up to this argument's alignment */
+ if ((u64)callee->mem_off + cur + va_stride_of(size) > st->mem_cap) /* slot would run past the memory arena */
+ return 0;
+ dst = frame_base(st, callee->mem_off) + cur;
+ if (cg_type_is_aggregate(p->c, arg->type) || size > 8u) {
+ u64 src = op_addr(st, cfn, cregs, caller->mem_off, &arg->storage);
+ mem_copy(p, dst, src, size);
+ } else {
+ mem_write(p, dst, 8u, /* scalars of 8 bytes or less are always stored as a full 8-byte write */
+ op_value(st, cfn, cregs, caller->mem_off, &arg->storage));
+ }
+ cur += va_stride_of(size);
+ }
+ callee->has_varargs = 1;
+ callee->vararg_off = callee->mem_off + buf_start; /* arena-relative offset of the first anonymous argument */
+ callee->alloca_top = cur; /* later allocas start above the vararg buffer */
+ if (callee->mem_off + cur > st->mem_top) st->mem_top = callee->mem_off + cur; /* raise the arena high-water mark over the buffer */
+ return 1;
+}
+
+/* ---- external (host ABI) call marshalling ---- */
+
+/* Append v to the integer-register argument list (at most 8). Returns 0 on success; when the list is full nothing is stored, *why receives the reason and 1 is returned. */
+static int ffi_push_int(InterpFfiArgs* fa, u64 v, const char** why) {
+ if (fa->nint < 8u) {
+ fa->iargs[fa->nint] = v;
+ fa->nint++;
+ return 0;
+ }
+ *why = "external call: too many int args";
+ return 1;
+}
+
+/* Append one fp-register argument given as raw bits of `size` bytes. A 4-byte value is stored as a float in
+ * fargs_f and marks the call single-precision; any other size is stored as a double in fargs (size 0 reads as 8).
+ * Returns 0 on success, or 1 with *why set when 8 fp arguments are already recorded or the value's precision
+ * differs from the fp arguments recorded so far. */
+static int ffi_push_fp(InterpFfiArgs* fa, u64 bits, u32 size, const char** why) {
+ const int single = (size == 4u);
+ if (fa->nfp >= 8u) {
+ *why = "external call: too many fp args";
+ return 1;
+ }
+ /* Once an fp argument is recorded, later ones must share its precision. */
+ if (fa->nfp > 0u && (single ? !fa->args_fp_is_float : fa->args_fp_is_float)) {
+ *why = "external call: mixed float/double args";
+ return 1;
+ }
+ if (single) {
+ fa->args_fp_is_float = 1u;
+ fa->fargs_f[fa->nfp] = (float)rd_f(bits, 4u);
+ } else {
+ fa->fargs[fa->nfp] = rd_f(bits, size ? size : 8u);
+ }
+ fa->nfp++;
+ return 0;
+}
+
+/* Call the host function host_fp on behalf of frame fr through interp_ffi_invoke. Arguments are marshalled from
+ * desc into at most 8 integer and 8 fp register values; shapes the thunks cannot express are diagnosed via
+ * unsupported() and memory faults via fault(), both of which set st->status and make this return 0. The result is
+ * stored to desc->ret.storage when that names a register or a memory home, and the first result register is also
+ * returned so an external tail call can forward it. */
+static u64 ext_call(InterpStack* st, InterpFrame* fr, u64* regs, void* host_fp,
+ const OptCGCallDesc* desc) {
+ InterpProgram* p = st->prog;
+ const ABIFuncInfo* fi = desc->abi;
+ InterpFfiArgs fa;
+ const char* reason = NULL;
+ u32 i;
+
+ if (!fi) { unsupported(st, "external call without ABI info"); return 0; }
+ if (fi->vararg_on_stack && fi->variadic) { unsupported(st, "variadic external call (stack-routed)"); return 0; }
+ memset(&fa, 0, sizeof fa);
+ fa.fi = fi;
+
+ /* hidden struct return: pass the caller's aggregate-return slot directly.
+ * When the call is a tail call its result has no local home (ret.storage is
+ * void) — forward this frame's own sret destination instead. */
+ if (fi->has_sret) {
+ u32 rsz = abi_cg_sizeof(p->c->abi, desc->ret.type);
+ if (desc->ret.storage.kind == OPK_LOCAL ||
+ desc->ret.storage.kind == OPK_GLOBAL ||
+ desc->ret.storage.kind == OPK_INDIRECT) {
+ u64 dst = op_addr(st, fr->fn, regs, fr->mem_off, &desc->ret.storage);
+ fa.sret = interp_translate(p, dst, rsz, PERM_W);
+ } else {
+ fa.sret = fr->sret_ptr; /* tail call: deliver to our caller's sret slot */
+ }
+ if (!fa.sret) { unsupported(st, "sret destination"); return 0; }
+ fa.iargs[fa.nint++] = (u64)(uintptr_t)fa.sret;
+ fa.ret_is_void = 1;
+ }
+
+ for (i = 0; i < desc->nargs; ++i) {
+ OptCGABIValue* arg = &desc->args[i];
+ const ABIArgInfo* ai = (i < fi->nparams) ? &fi->params[i] : NULL;
+ if (ai && ai->kind == ABI_ARG_IGNORE) continue;
+ if (ai && ai->kind == ABI_ARG_INDIRECT) {
+ /* byval: pass a host pointer to the caller's copy; translate the whole aggregate and trap rather than pass NULL. */
+ u64 a = op_addr(st, fr->fn, regs, fr->mem_off, &arg->storage);
+ u32 asz = abi_cg_sizeof(p->c->abi, arg->type);
+ u8* h = interp_translate(p, a, asz ? asz : 1u, PERM_R);
+ if (!h) { fault(st, "invalid memory access"); return 0; }
+ if (ffi_push_int(&fa, (u64)(uintptr_t)h, &reason)) { unsupported(st, reason); return 0; }
+ continue;
+ }
+ if (ai && ai->kind == ABI_ARG_DIRECT && ai->nparts > 1) {
+ /* aggregate split across registers: read each part from memory. */
+ u64 base = op_addr(st, fr->fn, regs, fr->mem_off, &arg->storage);
+ u32 k;
+ for (k = 0; k < ai->nparts; ++k) {
+ const ABIArgPart* pt = &ai->parts[k];
+ u64 chunk = mem_read(p, base + pt->src_offset, pt->size);
+ int bad = (pt->cls == ABI_CLASS_FP)
+ ? ffi_push_fp(&fa, chunk, pt->size, &reason)
+ : ffi_push_int(&fa, chunk, &reason);
+ if (bad) { unsupported(st, reason); return 0; }
+ }
+ continue;
+ }
+ /* scalar (or variadic extra arg): route by type. The named-parameter
+ * aggregate/large cases are handled by the INDIRECT / multi-part branches
+ * above; a variadic-tail arg has no ABI classification (ai==NULL), so an
+ * aggregate or >8-byte scalar here can't be marshalled (and op_value's
+ * 8-byte read would overflow) — diagnose rather than corrupt. */
+ if (cg_type_is_aggregate(p->c, arg->type) ||
+ abi_cg_sizeof(p->c->abi, arg->type) > 8u) {
+ unsupported(st, "external call: aggregate/oversized variadic argument");
+ return 0;
+ }
+ {
+ ABITypeInfo ti = abi_cg_type_info(p->c->abi, arg->type);
+ u64 v = op_value(st, fr->fn, regs, fr->mem_off, &arg->storage);
+ int bad = (ti.scalar_kind == ABI_SC_FLOAT)
+ ? ffi_push_fp(&fa, v, ti.size ? ti.size : 8u, &reason)
+ : ffi_push_int(&fa, v, &reason);
+ if (bad) { unsupported(st, reason); return 0; }
+ }
+ }
+
+ /* Return classification from the ABI's own return descriptor (robust even
+ * when desc->ret.type is void, e.g. a tail call whose result is not stored
+ * into any caller local). A small struct can come back in up to two
+ * registers; each part's class steers which return register the thunk reads. */
+ if (!fi->has_sret) {
+ if (fi->ret.kind == ABI_ARG_IGNORE || fi->ret.nparts == 0) {
+ fa.ret_is_void = 1; fa.ret_nparts = 0;
+ } else if (fi->ret.nparts > 2) {
+ unsupported(st, "external call: 3+ register struct return");
+ return 0;
+ } else {
+ u32 k;
+ fa.ret_nparts = (u8)fi->ret.nparts;
+ for (k = 0; k < fi->ret.nparts; ++k) {
+ fa.ret_fp[k] = (fi->ret.parts[k].cls == ABI_CLASS_FP) ? 1u : 0u;
+ fa.ret_size[k] = fi->ret.parts[k].size ? fi->ret.parts[k].size : 8u;
+ /* A 4-byte fp return part is a single in the low half of an fp reg; the
+ * two-register thunks read fp parts as doubles, so diagnose it. */
+ if (fi->ret.nparts > 1u && fa.ret_fp[k] && fa.ret_size[k] == 4u) {
+ unsupported(st, "external call: 32-bit fp struct-return field");
+ return 0;
+ }
+ }
+ }
+ }
+
+ {
+ u64 out[2] = {0, 0};
+ /* Never run host code on operands produced by a faulting read: honour the memory-fault latch first. */
+ if (g_mem_fault) { fault(st, "invalid memory access"); return 0; }
+ if (interp_ffi_invoke(host_fp, &fa, out, &reason) != 0) {
+ unsupported(st, reason ? reason : "external call signature");
+ return 0;
+ }
+ if (fa.ret_is_void || fa.ret_nparts == 0) return 0;
+ /* Deliver the result. A register destination (OPK_REG) takes the low
+ * register; a memory destination (an address-taken result local, or a small
+ * aggregate returned in registers) receives each part's bytes scattered to
+ * its src_offset. A value-less tail call has no home — the low register is
+ * shuttled out as the scalar result. */
+ if (desc->ret.storage.kind == OPK_REG) {
+ if (fa.ret_nparts == 1) regs[desc->ret.storage.v.reg] = out[0];
+ } else if (desc->ret.storage.kind == OPK_LOCAL ||
+ desc->ret.storage.kind == OPK_GLOBAL ||
+ desc->ret.storage.kind == OPK_INDIRECT) {
+ u64 dst = op_addr(st, fr->fn, regs, fr->mem_off, &desc->ret.storage);
+ u32 k;
+ for (k = 0; k < fi->ret.nparts && k < 2u; ++k)
+ mem_write(p, dst + fi->ret.parts[k].src_offset, fa.ret_size[k], out[k]);
+ }
+ return out[0];
+ }
+}
+
+/* ---- engine ---- */
+
+/* Dispatch mechanism. With labels-as-values (GNU computed goto) the engine is
+ * direct-threaded: each InterpInsn caches the &&handler of its opcode and every
+ * handler tail-dispatches straight to the next via `goto *`, giving the branch
+ * predictor a distinct indirect branch per opcode site. This is the default
+ * (CFREE_INTERP_THREADED in <cfree/config.h>). cfree's own C front end does not
+ * implement labels-as-values, so the self-host build (__cfree__) and any non-GNU
+ * compiler transparently fall back to a portable `switch`; the two share one set
+ * of handler bodies through OP()/NEXT()/GO(). Force the choice with
+ * -DCFREE_INTERP_THREADED=0|1. */
+#if !defined(CFREE_INTERP_THREADED)
+/* Belt-and-braces: config.h normally defines this. Default on so a missed
+ * include still requests threaded dispatch (used only where the compiler supports it, see below). */
+# define CFREE_INTERP_THREADED 1
+#endif /* !defined(CFREE_INTERP_THREADED) */
+/* Effective dispatch: requested AND the compiler can compile labels-as-values. */
+#if CFREE_INTERP_THREADED && \
+ (defined(__GNUC__) || defined(__clang__)) && !defined(__cfree__)
+# define INTERP_DISPATCH_THREADED 1 /* computed-goto dispatch */
+#else /* threading not requested, or not a GNU-compatible compiler, or the self-host build */
+# define INTERP_DISPATCH_THREADED 0 /* portable switch dispatch */
+#endif /* effective dispatch selection */
+
+/* The opcode roster: one entry per InterpOp with a handler, used to publish the
+ * threaded dispatch table from the in-function &&labels. Must stay in sync with
+ * the OP(...) handlers below: an extra entry takes the address of an undefined label (a compile error); a missing
+ * one only leaves its handler label unused (reported under unused-label warnings) and its dispatch-table slot unset. */
+#define INTERP_OPS(X) \
+ X(IOP_NOP) X(IOP_LOAD_IMM) X(IOP_LOAD_CONST) X(IOP_COPY) X(IOP_COPY_AGG) \
+ X(IOP_LOAD) X(IOP_LOAD_AGG) X(IOP_STORE) X(IOP_STORE_AGG) X(IOP_ADDR_OF) \
+ X(IOP_TLS_ADDR) X(IOP_BINOP) X(IOP_UNOP) X(IOP_CMP) X(IOP_CONVERT) \
+ X(IOP_CALL) X(IOP_BR) X(IOP_CONDBR) X(IOP_CMP_BRANCH) X(IOP_SWITCH) \
+ X(IOP_INDIRECT_BR) X(IOP_LOAD_LABEL_ADDR) X(IOP_RET) X(IOP_RET_VOID) \
+ X(IOP_ALLOCA) X(IOP_AGG_COPY) X(IOP_AGG_SET) X(IOP_BITFIELD_LOAD) \
+ X(IOP_BITFIELD_STORE) X(IOP_VA_START) X(IOP_VA_ARG) X(IOP_VA_END) \
+ X(IOP_VA_COPY) X(IOP_ATOMIC_LOAD) X(IOP_ATOMIC_STORE) X(IOP_ATOMIC_RMW) \
+ X(IOP_ATOMIC_CAS) X(IOP_FENCE) X(IOP_INTRINSIC) X(IOP_TRAP)
+
+#if INTERP_DISPATCH_THREADED
+# define OP(name) L_##name /* a handler starts at a goto label named after its opcode */
+/* linear op: re-check the memory-fault latch (jumping to fault_mem if set), advance, dispatch the next insn */
+# define NEXT() \
+ do { \
+ if (g_mem_fault) goto fault_mem; \
+ ++ip; \
+ in = ip; \
+ I = in->inst; \
+ goto *in->handler; \
+ } while (0)
+/* branch op: ip already retargeted, dispatch without advancing (and without testing the fault latch) */
+# define GO() \
+ do { \
+ in = ip; \
+ I = in->inst; \
+ goto *in->handler; \
+ } while (0)
+# if defined(__clang__)
+# pragma clang diagnostic push
+# pragma clang diagnostic ignored "-Wgnu-label-as-value"
+# pragma clang diagnostic ignored "-Wpedantic"
+# elif defined(__GNUC__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wpedantic"
+# endif
+#else /* portable switch dispatch */
+# define OP(name) case name /* a handler starts at a case label of the dispatch switch */
+# define NEXT() break /* NOTE(review): no fault-latch test here; presumably done after the switch, outside this view */
+# define GO() continue /* restart the enclosing dispatch loop at the retargeted ip */
+#endif /* INTERP_DISPATCH_THREADED */
+
+CfreeInterpStatus interp_run_stack(InterpStack* st, int64_t* out_ret) {
+ InterpProgram* p = st->prog;
+ InterpFrame* fr;
+ InterpFunc* fn;
+ u64* regs;
+ u32 mem_off;
+ InterpInsn* ip;
+ InterpInsn* in = NULL;
+ const Inst* I = NULL;
+
+ if (st->nframes == 0) {
+ st->status = CFREE_INTERP_DONE;
+ if (out_ret) *out_ret = (int64_t)st->scalar_ret;
+ return CFREE_INTERP_DONE;
+ }
+
+#if INTERP_DISPATCH_THREADED
+ /* Per-function lazy threading: copy each opcode's handler into its record on
+ * first entry to the function (RELOAD runs whenever the top frame changes). */
+# define RELOAD() \
+ do { \
+ fr = &st->frames[st->nframes - 1u]; \
+ fn = fr->fn; \
+ regs = (u64*)(st->regs_arena + fr->regs_off); \
+ mem_off = fr->mem_off; \
+ ip = fr->ip; \
+ if (!fn->threaded) { \
+ u32 ti_; \
+ for (ti_ = 0; ti_ < fn->ncode; ++ti_) { \
+ u32 o_ = fn->code[ti_].op; \
+ fn->code[ti_].handler = \
+ g_dt[o_ < (u32)IOP__COUNT ? o_ : (u32)IOP_TRAP]; \
+ } \
+ fn->threaded = 1; \
+ } \
+ } while (0)
+#else
+# define RELOAD() \
+ do { \
+ fr = &st->frames[st->nframes - 1u]; \
+ fn = fr->fn; \
+ regs = (u64*)(st->regs_arena + fr->regs_off); \
+ mem_off = fr->mem_off; \
+ ip = fr->ip; \
+ } while (0)
+#endif
+
+#if INTERP_DISPATCH_THREADED
+ static void* g_dt[IOP__COUNT];
+ static int g_dt_ready = 0;
+ if (!g_dt_ready) {
+# define DT_ENTRY(name) g_dt[name] = &&L_##name;
+ INTERP_OPS(DT_ENTRY)
+# undef DT_ENTRY
+ g_dt_ready = 1;
+ }
+#endif
+
+ RELOAD();
+ if (!fn->ok) {
+ unsupported(st, fn->reject_reason ? fn->reject_reason : "function");
+ return (CfreeInterpStatus)st->status;
+ }
+ g_mem_fault = 0;
+
+#if INTERP_DISPATCH_THREADED
+ in = ip;
+ I = in->inst;
+ goto *in->handler;
+#else
+ for (;;) {
+ in = ip;
+ I = in->inst;
+ switch ((InterpOp)in->op) {
+#endif
+ OP(IOP_NOP):
+ NEXT();
+ OP(IOP_LOAD_IMM):
+ write_dst(st, fn, regs, mem_off, &I->opnds[0], (u64)in->imm);
+ NEXT();
+ OP(IOP_LOAD_CONST): {
+ ConstBytes cb = I->extra.cbytes;
+ u64 v = 0;
+ u32 n = cb.size > 8u ? 8u : cb.size;
+ if (cb.bytes && n) memcpy(&v, cb.bytes, n);
+ write_dst(st, fn, regs, mem_off, &I->opnds[0], v);
+ NEXT();
+ }
+ OP(IOP_COPY):
+ write_dst(st, fn, regs, mem_off, &I->opnds[0],
+ op_value(st, fn, regs, mem_off, &I->opnds[1]));
+ NEXT();
+ OP(IOP_COPY_AGG): {
+ u64 d = op_addr(st, fn, regs, mem_off, &I->opnds[0]);
+ u64 s = op_addr(st, fn, regs, mem_off, &I->opnds[1]);
+ mem_copy(p, d, s, abi_cg_sizeof(p->c->abi, I->opnds[0].type));
+ NEXT();
+ }
+ OP(IOP_LOAD): {
+ u64 a = op_addr(st, fn, regs, mem_off, &I->opnds[1]);
+ write_dst(st, fn, regs, mem_off, &I->opnds[0],
+ mem_read(p, a, in->w0 ? in->w0 : 8u));
+ NEXT();
+ }
+ OP(IOP_LOAD_AGG): {
+ u64 d = op_addr(st, fn, regs, mem_off, &I->opnds[0]);
+ u64 s = op_addr(st, fn, regs, mem_off, &I->opnds[1]);
+ mem_copy(p, d, s, abi_cg_sizeof(p->c->abi, I->opnds[0].type));
+ NEXT();
+ }
+ OP(IOP_STORE): {
+ u64 a = op_addr(st, fn, regs, mem_off, &I->opnds[0]);
+ u64 v = op_value(st, fn, regs, mem_off, &I->opnds[1]);
+ mem_write(p, a, in->w0 ? in->w0 : 8u, v);
+ NEXT();
+ }
+ OP(IOP_STORE_AGG): {
+ u64 d = op_addr(st, fn, regs, mem_off, &I->opnds[0]);
+ u64 s = op_addr(st, fn, regs, mem_off, &I->opnds[1]);
+ mem_copy(p, d, s, abi_cg_sizeof(p->c->abi, I->opnds[1].type));
+ NEXT();
+ }
+ OP(IOP_ADDR_OF):
+ write_dst(st, fn, regs, mem_off, &I->opnds[0],
+ op_addr(st, fn, regs, mem_off, &I->opnds[1]));
+ NEXT();
+ OP(IOP_BINOP): {
+ u64 r = do_binop(st, in->sub,
+ op_value(st, fn, regs, mem_off, &I->opnds[1]),
+ op_value(st, fn, regs, mem_off, &I->opnds[2]), in->w0,
+ in->fp0);
+ if (st->status) goto stop;
+ write_dst(st, fn, regs, mem_off, &I->opnds[0], r);
+ NEXT();
+ }
+ OP(IOP_UNOP): {
+ u64 r = do_unop(st, in->sub,
+ op_value(st, fn, regs, mem_off, &I->opnds[1]), in->w0,
+ in->fp0);
+ if (st->status) goto stop;
+ write_dst(st, fn, regs, mem_off, &I->opnds[0], r);
+ NEXT();
+ }
+ OP(IOP_CMP): {
+ u64 r = (u64)do_cmp(st, in->sub,
+ op_value(st, fn, regs, mem_off, &I->opnds[1]),
+ op_value(st, fn, regs, mem_off, &I->opnds[2]),
+ in->w0, in->fp0);
+ if (st->status) goto stop;
+ write_dst(st, fn, regs, mem_off, &I->opnds[0], r);
+ NEXT();
+ }
+ OP(IOP_CONVERT): {
+ u64 r =
+ do_convert(st, in, op_value(st, fn, regs, mem_off, &I->opnds[1]));
+ if (st->status) goto stop;
+ write_dst(st, fn, regs, mem_off, &I->opnds[0], r);
+ NEXT();
+ }
+ OP(IOP_ALLOCA): {
+ u64 size = op_value(st, fn, regs, mem_off, &I->opnds[1]);
+ u32 align = in->imm ? (u32)in->imm : 16u;
+ u32 off = (fr->alloca_top + align - 1u) & ~(align - 1u);
+ if ((u64)fr->mem_off + off + size > st->mem_cap) {
+ fault(st, "alloca: stack overflow");
+ goto stop;
+ }
+ write_dst(st, fn, regs, mem_off, &I->opnds[0],
+ frame_base(st, fr->mem_off) + off);
+ fr->alloca_top = off + (u32)size;
+ /* Advance the global high-water so a nested call's frame is allocated
+ * ABOVE this live alloca region (otherwise it would alias it). */
+ if (fr->mem_off + fr->alloca_top > st->mem_top)
+ st->mem_top = fr->mem_off + fr->alloca_top;
+ NEXT();
+ }
+ OP(IOP_BR):
+ ip = &fn->code[in->t0];
+ GO();
+ OP(IOP_CONDBR): {
+ u64 c = op_value(st, fn, regs, mem_off, &I->opnds[0]);
+ /* A faulting selector would otherwise branch on garbage: branch ops
+ * skip the straight-line fault re-check, so test the latch here. */
+ if (g_mem_fault) { fault(st, "invalid memory access"); goto stop; }
+ ip = &fn->code[c ? in->t0 : in->t1];
+ GO();
+ }
+ OP(IOP_CMP_BRANCH): {
+ int taken = do_cmp(st, in->sub,
+ op_value(st, fn, regs, mem_off, &I->opnds[0]),
+ op_value(st, fn, regs, mem_off, &I->opnds[1]),
+ in->w0 ? in->w0 : 8u, in->fp0);
+ if (st->status) goto stop;
+ if (g_mem_fault) { fault(st, "invalid memory access"); goto stop; }
+ ip = &fn->code[taken ? in->t0 : in->t1];
+ GO();
+ }
+ OP(IOP_SWITCH): {
+ InterpSwitch* sw = &fn->switches[in->t0];
+ u64 sel = op_value(st, fn, regs, mem_off, &I->opnds[0]);
+ u32 ci;
+ u32 target = sw->default_pc;
+ u32 selw = (u32)abi_cg_sizeof(p->c->abi, sw->sel_type);
+ if (g_mem_fault) { fault(st, "invalid memory access"); goto stop; }
+ for (ci = 0; ci < sw->ncases; ++ci) {
+ if (mask_w(sel, selw) == mask_w(sw->aux->cases[ci].value, selw)) {
+ target = sw->case_pc[ci];
+ break; /* leaves the case-search loop, not the dispatch */
+ }
+ }
+ if (target == INTERP_PC_NONE) { fault(st, "switch: no target"); goto stop; }
+ ip = &fn->code[target];
+ GO();
+ }
+ OP(IOP_LOAD_LABEL_ADDR):
+ /* encode target pc as the label address */
+ write_dst(st, fn, regs, mem_off, &I->opnds[0], (u64)in->t0);
+ NEXT();
+ OP(IOP_INDIRECT_BR): {
+ u64 target = op_value(st, fn, regs, mem_off, &I->opnds[0]);
+ if (g_mem_fault) { fault(st, "invalid memory access"); goto stop; }
+ if (target >= fn->ncode) { fault(st, "indirect branch out of range"); goto stop; }
+ ip = &fn->code[target];
+ GO();
+ }
+ OP(IOP_CALL): {
+ IRCallAux* aux = (IRCallAux*)I->extra.aux;
+ OptCGCallDesc* desc = &aux->desc;
+ InterpFunc* callee = NULL;
+ void* host_fp = NULL;
+ if (desc->callee.kind == OPK_GLOBAL) {
+ callee = interp_func_for_sym(p, desc->callee.v.global.sym);
+ if (callee && !callee->ok) {
+ /* A known internal callee we cannot interpret: propagate its
+ * reason rather than silently calling the native version (the
+ * --no-jit contract is that execution never falls back to JIT). */
+ unsupported(st, callee->reject_reason ? callee->reject_reason
+ : "callee");
+ goto stop;
+ }
+ if (!callee)
+ host_fp = interp_global_base(fn, desc->callee.v.global.sym);
+ } else if (desc->callee.kind == OPK_REG) {
+ host_fp = (void*)(uintptr_t)regs[desc->callee.v.reg];
+ /* If the function pointer targets a TU-internal function, interpret
+ * it (don't run its native code) so --no-jit truly never executes
+ * JITed code. External pointers fall through to the FFI path. */
+ callee = interp_func_for_addr(p, host_fp);
+ if (callee && !callee->ok) {
+ unsupported(st, callee->reject_reason ? callee->reject_reason
+ : "callee");
+ goto stop;
+ }
+ }
+ if (callee) {
+ /* internal call: push a frame and bind args. */
+ u32 caller_idx = st->nframes - 1u;
+ u32 callee_idx;
+ if (!in->tail) fr->ip = ip + 1; /* resume after a non-tail call */
+ callee_idx = frame_push(st, callee);
+ if (callee_idx == 0xffffffffu) { fault(st, "call: stack overflow"); goto stop; }
+ bind_args(st, caller_idx, callee_idx, desc);
+ if (!build_varargs(st, caller_idx, callee_idx, desc)) {
+ fault(st, "call: stack overflow");
+ goto stop;
+ }
+ if (g_mem_fault) { fault(st, "invalid memory access"); goto stop; }
+ {
+ InterpFrame* cf = &st->frames[callee_idx];
+ InterpFrame* caller = &st->frames[caller_idx];
+ if (in->tail) {
+ /* True O(1) tail call: the callee's result IS this function's
+ * result, so inherit the tail-caller's return target and relocate
+ * the freshly-built callee frame down onto the (now dead) caller's
+ * register/memory region, rewinding the arenas. A tail loop then
+ * runs in constant interp+host stack space instead of growing the
+ * fixed reservation each iteration.
+ *
+ * Safe because the callee has not executed yet: no absolute
+ * pointers into its own frame exist (va_start runs later; an arg
+ * holding &caller_local would be UB, the caller being about to
+ * return). bind_args/build_varargs already copied every argument
+ * value out of the caller, so overwriting the caller is fine. */
+ u32 dst_regs = caller->regs_off;
+ u32 dst_mem = caller->mem_off;
+ u32 nregs_bytes = (callee->npregs ? callee->npregs : 1u) * 8u;
+ u32 mem_used = cf->alloca_top; /* static frame + vararg buffer */
+ cf->ret_wanted = caller->ret_wanted;
+ cf->ret_dst = caller->ret_dst;
+ cf->sret_ptr = caller->sret_ptr;
+ if (cf->regs_off != dst_regs)
+ memmove(st->regs_arena + dst_regs,
+ st->regs_arena + cf->regs_off, nregs_bytes);
+ if (cf->mem_off != dst_mem) {
+ memmove(st->mem_arena + dst_mem, st->mem_arena + cf->mem_off,
+ mem_used);
+ if (cf->has_varargs) cf->vararg_off -= (cf->mem_off - dst_mem);
+ }
+ cf->regs_off = dst_regs;
+ cf->mem_off = dst_mem;
+ *caller = *cf;
+ st->nframes = caller_idx + 1u;
+ st->regs_top = dst_regs + nregs_bytes;
+ st->mem_top = dst_mem + mem_used;
+ } else if (desc->ret.storage.kind == OPK_REG) {
+ cf->ret_wanted = 1;
+ cf->ret_dst = desc->ret.storage.v.reg;
+ } else if (desc->ret.storage.kind == OPK_LOCAL) {
+ /* aggregate return: callee writes into the caller's slot */
+ u64 a = frame_base(st, caller->mem_off) +
+ caller->fn->slot_off[desc->ret.storage.v.frame_slot];
+ cf->sret_ptr = interp_translate(p, a, 1, PERM_W);
+ }
+ }
+ RELOAD();
+ GO();
+ }
+ if (!host_fp) { unsupported(st, "unresolved call target"); goto stop; }
+ {
+ u64 callret = ext_call(st, fr, regs, host_fp, desc);
+ if (st->status) goto stop;
+ if (in->tail) {
+ /* External tail call: the call's result is this function's
+ * result (desc.ret.storage may be empty for a tail call). */
+ u64 rv = callret;
+ u8 want = fr->ret_wanted;
+ u32 rdst = fr->ret_dst;
+ st->regs_top = fr->regs_off;
+ st->mem_top = fr->mem_off;
+ st->nframes--;
+ st->scalar_ret = rv;
+ if (st->nframes == 0) {
+ st->status = CFREE_INTERP_DONE;
+ if (out_ret) *out_ret = (int64_t)rv;
+ return CFREE_INTERP_DONE;
+ }
+ if (want) {
+ InterpFrame* caller = &st->frames[st->nframes - 1u];
+ u64* cregs = (u64*)(st->regs_arena + caller->regs_off);
+ cregs[rdst] = rv;
+ }
+ RELOAD();
+ GO();
+ }
+ }
+ NEXT();
+ }
+ OP(IOP_RET):
+ OP(IOP_RET_VOID): {
+ u8 is_fp = 0;
+ u64 rv = 0;
+ u8* sret = fr->sret_ptr;
+ if (in->op == IOP_RET) {
+ IRRetAux* aux = (IRRetAux*)I->extra.aux;
+ OptCGABIValue* val = &aux->val;
+ if (cg_type_is_aggregate(p->c, val->type) ||
+ abi_cg_sizeof(p->c->abi, val->type) > 8u) {
+ if (sret) {
+ u64 src = op_addr(st, fn, regs, mem_off, &val->storage);
+ u8* s = interp_translate(p, src,
+ abi_cg_sizeof(p->c->abi, val->type),
+ PERM_R);
+ if (s) memcpy(sret, s, abi_cg_sizeof(p->c->abi, val->type));
+ }
+ } else {
+ ABITypeInfo ti = abi_cg_type_info(p->c->abi, val->type);
+ u32 sz = abi_cg_sizeof(p->c->abi, val->type);
+ rv = op_value(st, fn, regs, mem_off, &val->storage);
+ is_fp = (ti.scalar_kind == ABI_SC_FLOAT) ? 1u : 0u;
+ /* A scalar result whose caller destination is a memory slot (an
+ * address-taken result local) is delivered via sret_ptr, not a
+ * register — write it there. */
+ if (sret) memcpy(sret, &rv, sz ? (sz > 8u ? 8u : sz) : 8u);
+ }
+ }
+ /* The popped (callee) frame records where its scalar result lands in
+ * the caller — capture before popping, then rewind the arenas to the
+ * frame's bases (strict stack discipline). */
+ {
+ u8 want = fr->ret_wanted;
+ u32 dst = fr->ret_dst;
+ st->regs_top = fr->regs_off;
+ st->mem_top = fr->mem_off;
+ st->nframes--;
+ st->scalar_ret = rv;
+ st->ret_is_fp = is_fp;
+ if (st->nframes == 0) {
+ st->status = CFREE_INTERP_DONE;
+ if (out_ret) *out_ret = (int64_t)rv;
+ return CFREE_INTERP_DONE;
+ }
+ if (want) {
+ InterpFrame* caller = &st->frames[st->nframes - 1u];
+ u64* cregs = (u64*)(st->regs_arena + caller->regs_off);
+ cregs[dst] = rv;
+ }
+ }
+ RELOAD();
+ GO();
+ }
+ OP(IOP_INTRINSIC): {
+ if (!interp_intrinsic(st, fn, regs, mem_off, in)) goto stop;
+ NEXT();
+ }
+ OP(IOP_FENCE):
+ NEXT(); /* single-thread: no-op */
+ OP(IOP_AGG_SET):
+ OP(IOP_AGG_COPY): {
+ /* AGG_COPY/SET use pointer-deref addressing (pointer_addr_from_operand):
+ * a LOCAL holding a pointer is dereferenced; otherwise it is the slot. */
+ u64 d = interp_ptr_addr(st, fn, regs, mem_off, &I->opnds[0]);
+ if (in->op == IOP_AGG_COPY) {
+ IRAggAux* aux = (IRAggAux*)I->extra.aux;
+ u64 s = interp_ptr_addr(st, fn, regs, mem_off, &I->opnds[1]);
+ mem_copy(p, d, s, aux ? aux->access.size : 0u);
+ } else {
+ IRAggAux* aux = (IRAggAux*)I->extra.aux;
+ u64 byte = op_value(st, fn, regs, mem_off, &I->opnds[1]);
+ u32 n = aux ? aux->access.size : 0u;
+ u8* h = interp_translate(p, d, n, PERM_W);
+ if (h) memset(h, (int)(byte & 0xffu), n); else g_mem_fault = 1;
+ }
+ NEXT();
+ }
+ OP(IOP_TLS_ADDR): {
+ /* A thread-local's symbol does not resolve to its storage on every
+ * target (a Mach-O symbol resolves to a TLV descriptor), so route
+ * through interp_tls_addr / the host resolve_tls hook, which returns the
+ * running thread's address of the variable (already +addend). */
+ IRTlsAux* aux = (IRTlsAux*)I->extra.aux;
+ void* addr = aux ? interp_tls_addr(fn, aux->sym, aux->addend) : NULL;
+ if (!addr) { unsupported(st, "unresolved thread-local symbol"); goto stop; }
+ write_dst(st, fn, regs, mem_off, &I->opnds[0], (u64)(uintptr_t)addr);
+ NEXT();
+ }
+ OP(IOP_BITFIELD_LOAD): {
+ /* opnds[1] is the record address; the field bits live in the storage
+ * unit at record + storage_offset. Extract by shift+mask (target uses
+ * little-endian bit numbering), sign-extending signed fields. */
+ IRBitFieldAux* aux = (IRBitFieldAux*)I->extra.aux;
+ u64 rec, raw, v = 0;
+ u32 ssz, width;
+ if (!aux) { unsupported(st, "bitfield access"); goto stop; }
+ rec = op_addr(st, fn, regs, mem_off, &I->opnds[1]);
+ ssz = aux->access.storage.size ? aux->access.storage.size : 4u;
+ width = aux->access.bit_width;
+ if (width) {
+ raw = mem_read(p, rec + aux->access.storage_offset, ssz);
+ v = (raw >> aux->access.bit_offset) & bits_mask(width);
+ if (aux->access.signed_ && width < 64u &&
+ (v & (1ull << (width - 1u))))
+ v |= ~bits_mask(width);
+ }
+ write_dst(st, fn, regs, mem_off, &I->opnds[0], v);
+ NEXT();
+ }
+ OP(IOP_BITFIELD_STORE): {
+ /* opnds[0] = record address, opnds[1] = source value. Read-modify-write
+ * the storage unit: clear the field bits, then OR in the masked, shifted
+ * source. A zero-width field is a layout barrier — no store. */
+ IRBitFieldAux* aux = (IRBitFieldAux*)I->extra.aux;
+ u64 rec, addr, ones, fmask, src, raw;
+ u32 ssz, width;
+ if (!aux) { unsupported(st, "bitfield access"); goto stop; }
+ width = aux->access.bit_width;
+ if (width == 0) NEXT();
+ rec = op_addr(st, fn, regs, mem_off, &I->opnds[0]);
+ ssz = aux->access.storage.size ? aux->access.storage.size : 4u;
+ addr = rec + aux->access.storage_offset;
+ ones = bits_mask(width);
+ fmask = ones << aux->access.bit_offset;
+ src = op_value(st, fn, regs, mem_off, &I->opnds[1]);
+ raw = mem_read(p, addr, ssz);
+ raw = (raw & ~fmask) | ((src & ones) << aux->access.bit_offset);
+ mem_write(p, addr, ssz, raw);
+ NEXT();
+ }
+ OP(IOP_VA_START): {
+ /* opnds[0] is the va_list object's address (a pointer value). Seed it
+ * with a cursor over this frame's anonymous-argument buffer. */
+ u64 ap = op_value(st, fn, regs, mem_off, &I->opnds[0]);
+ u64 cursor = fr->has_varargs ? frame_base(st, fr->vararg_off) : 0u;
+ mem_write(p, ap, 8u, cursor);
+ NEXT();
+ }
+ OP(IOP_VA_END):
+ NEXT(); /* nothing to release in the cursor model */
+ OP(IOP_VA_COPY): {
+ /* opnds = [dst va_list addr, src va_list addr]: duplicate the cursor. */
+ u64 d = op_value(st, fn, regs, mem_off, &I->opnds[0]);
+ u64 s = op_value(st, fn, regs, mem_off, &I->opnds[1]);
+ mem_write(p, d, 8u, mem_read(p, s, 8u));
+ NEXT();
+ }
+ OP(IOP_VA_ARG): {
+ /* opnds[0] = dst (type drives the read width), opnds[1] = va_list addr.
+ * Align the cursor, read the slot, advance, store the cursor back. */
+ CfreeCgTypeId ty = I->opnds[0].type;
+ u64 ap = op_value(st, fn, regs, mem_off, &I->opnds[1]);
+ u64 cursor = mem_read(p, ap, 8u);
+ u32 size = abi_cg_sizeof(p->c->abi, ty);
+ u32 al = va_align_of(size);
+ cursor = (cursor + al - 1u) & ~((u64)al - 1u);
+ if (cg_type_is_aggregate(p->c, ty) || size > 8u) {
+ u64 dstaddr = op_addr(st, fn, regs, mem_off, &I->opnds[0]);
+ mem_copy(p, dstaddr, cursor, size);
+ } else {
+ write_dst(st, fn, regs, mem_off, &I->opnds[0],
+ mem_read(p, cursor, size ? size : 8u));
+ }
+ mem_write(p, ap, 8u, cursor + va_stride_of(size));
+ NEXT();
+ }
+ /* Atomics: single-threaded interpreter, so the operation is serialized
+ * and the memory order is irrelevant (treated as seq-cst). */
+ OP(IOP_ATOMIC_LOAD): {
+ u64 a = op_value(st, fn, regs, mem_off, &I->opnds[1]);
+ write_dst(st, fn, regs, mem_off, &I->opnds[0],
+ mem_read(p, a, in->w0 ? in->w0 : 8u));
+ NEXT();
+ }
+ OP(IOP_ATOMIC_STORE): {
+ u64 a = op_value(st, fn, regs, mem_off, &I->opnds[0]);
+ mem_write(p, a, in->w0 ? in->w0 : 8u,
+ op_value(st, fn, regs, mem_off, &I->opnds[1]));
+ NEXT();
+ }
+ OP(IOP_ATOMIC_RMW): {
+ u32 w = in->w0 ? in->w0 : 8u;
+ u64 a = op_value(st, fn, regs, mem_off, &I->opnds[1]);
+ u64 old = mem_read(p, a, w);
+ u64 v = op_value(st, fn, regs, mem_off, &I->opnds[2]);
+ mem_write(p, a, w, do_rmw(in->sub, old, v, w));
+ write_dst(st, fn, regs, mem_off, &I->opnds[0], old);
+ NEXT();
+ }
+ OP(IOP_ATOMIC_CAS): {
+ u32 w = in->w0 ? in->w0 : 8u;
+ u64 a = op_value(st, fn, regs, mem_off, &I->opnds[2]);
+ u64 expected = op_value(st, fn, regs, mem_off, &I->opnds[3]);
+ u64 desired = op_value(st, fn, regs, mem_off, &I->opnds[4]);
+ u64 old = mem_read(p, a, w);
+ u64 ok = (mask_w(old, w) == mask_w(expected, w));
+ if (ok) mem_write(p, a, w, desired);
+ write_dst(st, fn, regs, mem_off, &I->opnds[0], old); /* prior */
+ write_dst(st, fn, regs, mem_off, &I->opnds[1], ok); /* ok flag */
+ NEXT();
+ }
+ OP(IOP_TRAP):
+ unsupported(st, fn->reject_reason ? fn->reject_reason : "operation");
+ goto stop;
+#if !INTERP_DISPATCH_THREADED
+ default:
+ unsupported(st, "opcode");
+ goto stop;
+ }
+ if (g_mem_fault) { fault(st, "invalid memory access"); goto stop; }
+ ip++;
+ }
+#else
+fault_mem:
+ fault(st, "invalid memory access");
+ /* fall through to stop */
+#endif
+
+stop:
+ fr->ip = ip;
+ return (CfreeInterpStatus)st->status;
+#undef RELOAD
+}
+#if INTERP_DISPATCH_THREADED
+# if defined(__clang__)
+# pragma clang diagnostic pop
+# elif defined(__GNUC__)
+# pragma GCC diagnostic pop
+# endif
+#endif
+
+/* ---- intrinsics ---- */
+
+ /* Population count of the low `w` bytes of `v`; all 64 bits when w >= 8. */
+ static u64 ipopcount(u64 v, u32 w) {
+   u64 live = v;
+   u64 count = 0;
+   if (w < 8) live &= (1ull << (w * 8u)) - 1ull;
+   /* Each round clears the lowest set bit, so the loop runs once per 1-bit. */
+   for (; live != 0; live &= live - 1ull) count++;
+   return count;
+ }
+ /* Count trailing zero bits of `v` viewed as a `w`-byte integer. A value whose
+  * low w*8 bits are all zero yields the bit width itself. */
+ static u64 ictz(u64 v, u32 w) {
+   const u32 nbits = w * 8u;
+   const u64 live = (nbits < 64) ? ((1ull << nbits) - 1ull) : ~0ull;
+   u64 zeros;
+   if ((v & live) == 0) return nbits;
+   /* A set bit exists inside the live range, so this terminates there. */
+   for (zeros = 0; (v & 1u) == 0; v >>= 1) zeros++;
+   return zeros;
+ }
+ /* Count leading zero bits of `v` viewed as a `w`-byte integer. A zero value
+  * yields the bit width itself.
+  *
+  * Only 64 bits of payload are carried, so for w > 8 the value is treated as
+  * zero-extended: the bits above 64 all count as leading zeros. The probe bit
+  * is computed only after the zero check and from a width clamped to 64, so
+  * w == 0 and w > 8 no longer evaluate an out-of-range shift. */
+ static u64 iclz(u64 v, u32 w) {
+   u32 bits = w * 8u;
+   u32 span = (bits > 64u) ? 64u : bits; /* bits actually held in `v` */
+   u64 n = 0;
+   u64 top;
+   v &= (span >= 64u) ? ~0ull : ((1ull << span) - 1ull);
+   if (v == 0) return bits;
+   /* v != 0 implies span >= 1, so the shift count is within 0..63. */
+   top = 1ull << (span - 1u);
+   while (!(v & top)) { n++; v <<= 1; }
+   return n + (u64)(bits - span);
+ }
+ /* Reverse the order of the low `nbytes` bytes of `v`; higher bytes of the
+  * result are zero. */
+ static u64 ibswap(u64 v, u32 nbytes) {
+   u64 swapped = 0;
+   u32 remaining = nbytes;
+   while (remaining-- > 0) {
+     /* Peel the lowest source byte and push it in at the bottom of the result. */
+     swapped = (swapped << 8) | (v & 0xffu);
+     v >>= 8;
+   }
+   return swapped;
+ }
+
+ /* Execute one IOP_INTRINSIC record.
+  *
+  * The intrinsic kind, argument operands and destination operands come from the
+  * instruction's IRIntrinAux. Returns 1 when execution should continue and 0
+  * when the interpreter must stop (a fault or an unsupported intrinsic was
+  * reported on `st`).
+  *
+  * NOTE(review): DST0 evaluates to 0 when the first destination is absent or is
+  * not a register, so the unguarded `regs[DST0] = ...` stores land in regs[0].
+  * Presumably PReg 0 is a reserved "none" slot; confirm, and confirm that a
+  * non-register destination cannot reach these cases. */
+ static int interp_intrinsic(InterpStack* st, InterpFunc* fn, u64* regs,
+                             u32 mem_off, InterpInsn* in) {
+   InterpProgram* p = st->prog;
+   IRIntrinAux* aux = (IRIntrinAux*)in->inst->extra.aux;
+   Compiler* c = p->c;
+   if (!aux) { unsupported(st, "intrinsic"); return 0; }
+ #define ARGV(i) op_value(st, fn, regs, mem_off, &aux->args[i])
+ #define AWID(i) ((u32)abi_cg_sizeof(c->abi, aux->args[i].type))
+ #define DST0 (aux->ndst > 0 && aux->dsts[0].kind == OPK_REG ? aux->dsts[0].v.reg : 0u)
+   switch (aux->kind) {
+     case INTRIN_MEMCPY:
+     case INTRIN_MEMMOVE: {
+       u64 d = ARGV(0), s = ARGV(1), n = ARGV(2);
+       /* mem_copy is called with a 32-bit length; diagnose a larger request
+        * instead of silently copying only the low 32 bits of the size. */
+       if (n > 0xffffffffull) {
+         unsupported(st, "memcpy larger than 4 GiB");
+         return 0;
+       }
+       mem_copy(p, d, s, (u32)n);
+       if (aux->ndst > 0 && aux->dsts[0].kind == OPK_REG) regs[DST0] = d;
+       return 1;
+     }
+     case INTRIN_MEMSET: {
+       u64 d = ARGV(0), byte = ARGV(1), n = ARGV(2);
+       /* Validate exactly the range that memset writes below; interp_translate
+        * takes a 64-bit length, so the size must not be narrowed here. */
+       u8* h = interp_translate(p, d, n, PERM_W);
+       if (!h) { fault(st, "memset: invalid memory"); return 0; }
+       memset(h, (int)(byte & 0xffu), (size_t)n);
+       if (aux->ndst > 0 && aux->dsts[0].kind == OPK_REG) regs[DST0] = d;
+       return 1;
+     }
+     case INTRIN_POPCOUNT:
+       regs[DST0] = ipopcount(ARGV(0), AWID(0));
+       return 1;
+     case INTRIN_CTZ:
+       regs[DST0] = ictz(ARGV(0), AWID(0));
+       return 1;
+     case INTRIN_CLZ:
+       regs[DST0] = iclz(ARGV(0), AWID(0));
+       return 1;
+     case INTRIN_BSWAP16:
+       regs[DST0] = ibswap(ARGV(0), 2);
+       return 1;
+     case INTRIN_BSWAP32:
+       regs[DST0] = ibswap(ARGV(0), 4);
+       return 1;
+     case INTRIN_BSWAP64:
+       regs[DST0] = ibswap(ARGV(0), 8);
+       return 1;
+     case INTRIN_EXPECT:
+       /* The hint has no runtime effect: the result is the first argument. */
+       regs[DST0] = ARGV(0);
+       return 1;
+     case INTRIN_ASSUME_ALIGNED:
+       if (aux->ndst > 0 && aux->dsts[0].kind == OPK_REG) regs[DST0] = ARGV(0);
+       return 1;
+     case INTRIN_PREFETCH:
+       return 1; /* no-op in the interpreter */
+     case INTRIN_TRAP:
+       fault(st, "__builtin_trap");
+       return 0;
+     case INTRIN_UNREACHABLE:
+       fault(st, "unreachable");
+       return 0;
+     case INTRIN_SADD_OVERFLOW:
+     case INTRIN_UADD_OVERFLOW:
+     case INTRIN_SSUB_OVERFLOW:
+     case INTRIN_USUB_OVERFLOW:
+     case INTRIN_SMUL_OVERFLOW:
+     case INTRIN_UMUL_OVERFLOW: {
+       /* dsts[0] receives the wrapped result, dsts[1] the overflow flag. The
+        * operation width is the byte size of the first argument's type. */
+       u32 w = AWID(0);
+       u64 a = ARGV(0), b = ARGV(1);
+       u64 res = 0;
+       int ovf = 0;
+       switch (aux->kind) {
+         /* For w<8 the operands fit in i64/u64 so the exact result is available
+          * and a re-narrow comparison detects overflow; for w==8 there is no
+          * wider type, so detect via sign/carry logic (the re-narrow trick would
+          * always read "no overflow"). */
+         case INTRIN_SADD_OVERFLOW: {
+           i64 x = sext_w(a, w), y = sext_w(b, w);
+           u64 r = (u64)x + (u64)y;
+           res = mask_w(r, w);
+           ovf = (w < 8) ? (sext_w(res, w) != x + y)
+                         : (int)((((u64)x ^ r) & ((u64)y ^ r)) >> 63);
+           break;
+         }
+         case INTRIN_UADD_OVERFLOW: {
+           u64 x = mask_w(a, w), y = mask_w(b, w), r = x + y;
+           res = mask_w(r, w);
+           ovf = (res != r) || (mask_w(r, w) < x);
+           break;
+         }
+         case INTRIN_SSUB_OVERFLOW: {
+           i64 x = sext_w(a, w), y = sext_w(b, w);
+           u64 r = (u64)x - (u64)y;
+           res = mask_w(r, w);
+           ovf = (w < 8) ? (sext_w(res, w) != x - y)
+                         : (int)((((u64)x ^ (u64)y) & ((u64)x ^ r)) >> 63);
+           break;
+         }
+         case INTRIN_USUB_OVERFLOW: {
+           ovf = mask_w(a, w) < mask_w(b, w);
+           res = mask_w(mask_w(a, w) - mask_w(b, w), w);
+           break;
+         }
+         case INTRIN_SMUL_OVERFLOW: {
+           i64 x = sext_w(a, w), y = sext_w(b, w);
+           u64 r = (u64)x * (u64)y;
+           res = mask_w(r, w);
+           if (w < 8) {
+             ovf = (sext_w(res, w) != x * y);
+           } else if (x == 0 || y == 0) {
+             ovf = 0;
+           } else if ((x == -1 && (u64)y == 0x8000000000000000ull) ||
+                      (y == -1 && (u64)x == 0x8000000000000000ull)) {
+             ovf = 1; /* INT64_MIN * -1 */
+           } else {
+             /* Divide back: a wrapped product does not recover `y`. */
+             ovf = ((i64)r / x != y);
+           }
+           break;
+         }
+         case INTRIN_UMUL_OVERFLOW: {
+           u64 x = mask_w(a, w), y = mask_w(b, w), r = x * y;
+           res = mask_w(r, w);
+           ovf = (w < 8) ? (r != res) : (x != 0 && r / x != y);
+           break;
+         }
+         default: break;
+       }
+       if (aux->ndst > 0 && aux->dsts[0].kind == OPK_REG)
+         regs[aux->dsts[0].v.reg] = res;
+       if (aux->ndst > 1 && aux->dsts[1].kind == OPK_REG)
+         regs[aux->dsts[1].v.reg] = (u64)ovf;
+       return 1;
+     }
+     default:
+       unsupported(st, "intrinsic");
+       return 0;
+   }
+ #undef ARGV
+ #undef AWID
+ #undef DST0
+ }
+
+/* ---- public stack API ---- */
+
+ /* Allocate an execution stack bound to program `pp`: the zeroed InterpStack
+  * plus its two fixed-size arenas (register files and addressable frame
+  * bytes). Returns NULL when `pp` is NULL or any allocation fails; nothing is
+  * leaked on the failure path. */
+ CfreeInterpStack* cfree_interp_stack_new(CfreeInterpProgram* pp) {
+   InterpProgram* prog = (InterpProgram*)pp;
+   InterpStack* stack;
+   Heap* heap;
+   u8* regs;
+   u8* mem;
+
+   if (prog == NULL) return NULL;
+   heap = prog->c->ctx->heap;
+
+   stack = (InterpStack*)heap->alloc(heap, sizeof(*stack), _Alignof(InterpStack));
+   if (stack == NULL) return NULL;
+   memset(stack, 0, sizeof *stack);
+   stack->prog = prog;
+
+   /* Fixed, non-relocating arenas (see bump()/INTERP_*_RESERVE). */
+   regs = (u8*)heap->alloc(heap, INTERP_REGS_RESERVE, 16u);
+   mem = (u8*)heap->alloc(heap, INTERP_MEM_RESERVE, 16u);
+   stack->regs_arena = regs;
+   stack->mem_arena = mem;
+
+   if (regs != NULL && mem != NULL) {
+     stack->regs_cap = INTERP_REGS_RESERVE;
+     stack->mem_cap = INTERP_MEM_RESERVE;
+     return (CfreeInterpStack*)stack;
+   }
+
+   /* Partial failure: release whichever arena did get allocated. */
+   if (regs != NULL) heap->free(heap, regs, INTERP_REGS_RESERVE);
+   if (mem != NULL) heap->free(heap, mem, INTERP_MEM_RESERVE);
+   heap->free(heap, stack, sizeof *stack);
+   return NULL;
+ }
+
+ /* Release a stack created by cfree_interp_stack_new: the frame array, both
+  * arenas, then the stack object itself. A NULL stack is ignored. */
+ void cfree_interp_stack_free(CfreeInterpStack* s) {
+   InterpStack* stack = (InterpStack*)s;
+   Heap* heap;
+
+   if (stack == NULL) return;
+   heap = stack->prog->c->ctx->heap;
+
+   if (stack->frames != NULL)
+     heap->free(heap, stack->frames, sizeof(InterpFrame) * stack->frames_cap);
+   if (stack->regs_arena != NULL)
+     heap->free(heap, stack->regs_arena, stack->regs_cap);
+   if (stack->mem_arena != NULL)
+     heap->free(heap, stack->mem_arena, stack->mem_cap);
+   heap->free(heap, stack, sizeof *stack);
+ }
+
+ /* Store `value` into parameter `i` of the entry function `fn`, whose frame is
+  * st->frames[idx]. Indices at or beyond the function's parameter count are
+  * ignored. A register-resident parameter is written into the frame's register
+  * file; any other storage kind is treated as a frame slot and written through
+  * mem_write.
+  * NOTE(review): the memory path always writes 8 bytes, whatever the
+  * parameter's type; confirm the slot is at least that large. */
+ static void bind_entry_param(InterpStack* st, InterpFunc* fn, u32 idx, u32 i,
+                              u64 value) {
+   InterpFrame* frame;
+   IRParam* param;
+
+   if (i >= fn->f->nparams) return;
+   frame = &st->frames[idx];
+   param = &fn->f->params[i];
+
+   if (param->storage.kind != CG_LOCAL_STORAGE_REG) {
+     u64 slot_addr = frame_base(st, frame->mem_off) +
+                     fn->slot_off[param->storage.v.frame_slot];
+     mem_write(st->prog, slot_addr, 8u, value);
+     return;
+   }
+   ((u64*)(st->regs_arena + frame->regs_off))[param->storage.v.reg] = value;
+ }
+
+ /* Push an activation of `ff` onto stack `s` and bind its first two parameters
+  * to (argc, argv). Nothing is executed here; cfree_interp_resume runs it.
+  * Returns CFREE_INVALID for a NULL stack or function, CFREE_NOMEM when the
+  * frame cannot be pushed, otherwise CFREE_OK. */
+ CfreeStatus cfree_interp_call_on(CfreeInterpStack* s, CfreeInterpFunc* ff,
+                                  int argc, char** argv) {
+   InterpStack* stack = (InterpStack*)s;
+   InterpFunc* entry = (InterpFunc*)ff;
+   u32 frame_idx;
+
+   if (stack == NULL || entry == NULL) return CFREE_INVALID;
+
+   frame_idx = frame_push(stack, entry);
+   if (frame_idx == 0xffffffffu) return CFREE_NOMEM;
+
+   /* bind_entry_param ignores indices the entry function does not declare. */
+   bind_entry_param(stack, entry, frame_idx, 0u, (u64)(unsigned)argc);
+   bind_entry_param(stack, entry, frame_idx, 1u, (u64)(uintptr_t)argv);
+   return CFREE_OK;
+ }
+
+ /* Run stack `s` via interp_run_stack and return its status; `out_ret` is
+  * passed through unchanged. A NULL stack yields CFREE_INTERP_ERROR. */
+ CfreeInterpStatus cfree_interp_resume(CfreeInterpStack* s, int64_t* out_ret) {
+   InterpStack* stack = (InterpStack*)s;
+   return stack ? interp_run_stack(stack, out_ret) : CFREE_INTERP_ERROR;
+ }
+
+ /* One-shot convenience: create a temporary stack, push `ff` with (argc, argv),
+  * run it, and free the stack. Any setup failure is reported as
+  * CFREE_INTERP_ERROR; otherwise the status from cfree_interp_resume is
+  * returned. */
+ CfreeInterpStatus cfree_interp_call(CfreeInterpProgram* pp, CfreeInterpFunc* ff,
+                                     int argc, char** argv, int64_t* out_ret) {
+   CfreeInterpStatus status = CFREE_INTERP_ERROR;
+   CfreeInterpStack* stack = cfree_interp_stack_new(pp);
+
+   if (stack == NULL) return status;
+   if (cfree_interp_call_on(stack, ff, argc, argv) == CFREE_OK)
+     status = cfree_interp_resume(stack, out_ret);
+   cfree_interp_stack_free(stack);
+   return status;
+ }
+
+ /* One-shot call of `ff` with `nargs` raw 64-bit arguments taken from `args`.
+  * A temporary stack is created, the arguments are bound to the entry
+  * function's parameters in order (extras beyond its parameter count are
+  * ignored by bind_entry_param), the function is run, and the stack is freed.
+  *
+  * Returns CFREE_INTERP_ERROR when `ff` is NULL, when `args` is NULL while
+  * nargs > 0, or when the stack or frame cannot be set up; otherwise the
+  * status from interp_run_stack. Inputs are validated before anything is
+  * allocated, so a bad call neither allocates the arenas nor reads through a
+  * NULL `args`. */
+ CfreeInterpStatus cfree_interp_call_args(CfreeInterpProgram* pp,
+                                          CfreeInterpFunc* ff,
+                                          const uint64_t* args, uint32_t nargs,
+                                          int64_t* out_ret) {
+   InterpFunc* fn = (InterpFunc*)ff;
+   InterpStack* st;
+   CfreeInterpStatus rc;
+   u32 idx, i;
+
+   if (!fn) return CFREE_INTERP_ERROR;
+   if (nargs > 0u && !args) return CFREE_INTERP_ERROR;
+
+   st = (InterpStack*)cfree_interp_stack_new(pp);
+   if (!st) return CFREE_INTERP_ERROR;
+
+   idx = frame_push(st, fn);
+   if (idx == 0xffffffffu) {
+     cfree_interp_stack_free((CfreeInterpStack*)st);
+     return CFREE_INTERP_ERROR;
+   }
+   for (i = 0; i < nargs; ++i) bind_entry_param(st, fn, idx, i, args[i]);
+   rc = interp_run_stack(st, out_ret);
+   cfree_interp_stack_free((CfreeInterpStack*)st);
+   return rc;
+ }
diff --git a/src/interp/ffi.c b/src/interp/ffi.c
@@ -0,0 +1,150 @@
+/* External (host-ABI) call marshaller.
+ *
+ * The engine has already classified arguments into integer-register and
+ * fp-register slots per the call's ABIFuncInfo (sret pointer first, byval as a
+ * pointer, aggregate-in-regs split into chunks). On the supported ABIs (SysV
+ * x64, AAPCS64, RV64 LP64D) integer and fp arguments are assigned from
+ * independent register sequences, so calling through one maximal prototype
+ * `T(u64 x8, <fp> x8)` places the first `nint` integers and `nfp` fp values in
+ * the correct registers regardless of their original interleaving; unused
+ * trailing slots are zero and ignored by the callee.
+ *
+ * Two fp-argument shapes are needed because a 32-bit `float` and a 64-bit
+ * `double` occupy the fp register differently (single vs double precision): the
+ * engine picks `args_fp_is_float` when every fp arg is a 4-byte single, and the
+ * dispatcher selects the `float`-parameter thunk family. A call mixing float
+ * and double fp args within one signature is rejected upstream.
+ *
+ * Returns mirror this: a value comes back in one or two registers, classified
+ * into int/fp parts. Two-register returns dispatch through struct-returning
+ * thunks whose field types steer the ABI to the right return registers; the
+ * caller copies each part's bytes into the aggregate destination.
+ *
+ * The casts deliberately mismatch the callee's real prototype — the classic
+ * libffi-lite trick. That trips clang's -fsanitize=function, so the dispatcher
+ * opts out of that one check (clang only; cfree self-host never enables it). */
+
+#include <string.h>
+
+#include "core/core.h"
+#include "interp/interp.h"
+
+ /* 8 integer + 8 fp register slots, fp as double or as single.
+  * IPARAMS/DPARAMS/FPARAMS are the parameter-type lists used to build the
+  * thunk function-pointer types below. */
+ #define IPARAMS u64, u64, u64, u64, u64, u64, u64, u64
+ #define DPARAMS double, double, double, double, double, double, double, double
+ #define FPARAMS float, float, float, float, float, float, float, float
+
+ /* Matching argument lists: IVALS expands to the 8 integer slots of an
+  * InterpFfiArgs, DVALS to its 8 double slots (fargs), FVALS to its 8 float
+  * slots (fargs_f). All eight are always passed, whatever nint/nfp say. */
+ #define IVALS(a) \
+ (a)->iargs[0], (a)->iargs[1], (a)->iargs[2], (a)->iargs[3], (a)->iargs[4], \
+ (a)->iargs[5], (a)->iargs[6], (a)->iargs[7]
+ #define DVALS(a) \
+ (a)->fargs[0], (a)->fargs[1], (a)->fargs[2], (a)->fargs[3], (a)->fargs[4], \
+ (a)->fargs[5], (a)->fargs[6], (a)->fargs[7]
+ #define FVALS(a) \
+ (a)->fargs_f[0], (a)->fargs_f[1], (a)->fargs_f[2], (a)->fargs_f[3], \
+ (a)->fargs_f[4], (a)->fargs_f[5], (a)->fargs_f[6], (a)->fargs_f[7]
+
+ /* Two-register return shapes; the field types pick the return-register classes
+  * (e.g. {u64,u64}=RAX,RDX / X0,X1; {u64,double}=RAX,XMM0 / X0,V0; etc).
+  * The suffix letters give the class of each part in order: i = integer,
+  * d = double. interp_ffi_invoke copies field `a` to out[0] and `b` to out[1]. */
+ typedef struct {
+ u64 a, b;
+ } R_ii;
+ typedef struct {
+ u64 a;
+ double b;
+ } R_id;
+ typedef struct {
+ double a;
+ u64 b;
+ } R_di;
+ typedef struct {
+ double a, b;
+ } R_dd;
+
+ /* Thunk types are named t_<ret>_<fpargs>: <ret> is the return shape
+  * (i = u64, d = double, f = float, or a two-letter R_xx struct) and <fpargs>
+  * says how the eight fp slots are typed (d = double, f = float). */
+ /* double-fp-arg thunks, by return shape */
+ typedef u64 (*t_i_d)(IPARAMS, DPARAMS);
+ typedef double (*t_d_d)(IPARAMS, DPARAMS);
+ typedef float (*t_f_d)(IPARAMS, DPARAMS);
+ typedef R_ii (*t_ii_d)(IPARAMS, DPARAMS);
+ typedef R_id (*t_id_d)(IPARAMS, DPARAMS);
+ typedef R_di (*t_di_d)(IPARAMS, DPARAMS);
+ typedef R_dd (*t_dd_d)(IPARAMS, DPARAMS);
+ /* float-fp-arg thunks, scalar returns only (mixed signatures are rare) */
+ typedef u64 (*t_i_f)(IPARAMS, FPARAMS);
+ typedef double (*t_d_f)(IPARAMS, FPARAMS);
+ typedef float (*t_f_f)(IPARAMS, FPARAMS);
+
+ /* Call the host function `fp` with the register-slot values in `a`, through a
+  * function-pointer cast chosen from the return classification and from
+  * whether the fp arguments are singles or doubles.
+  *
+  * out[0]/out[1] are zeroed first, then receive the raw contents of the return
+  * register(s): integer parts as-is, a double as its 8 bytes, a float as its
+  * 4 bytes zero-extended. Returns 0 on success. Returns 1 with *reason set
+  * (and without calling `fp`) when the target is NULL, more than 8 integer or
+  * 8 fp slots are requested, the return is classified into more than two
+  * parts, or a two-register return is combined with float fp arguments.
+  *
+  * The function-sanitizer opt-out is needed because every call here goes
+  * through a prototype that deliberately differs from the callee's. */
+ #if defined(__clang__)
+ __attribute__((no_sanitize("function")))
+ #endif
+ int
+ interp_ffi_invoke(void* fp, const InterpFfiArgs* a, u64 out[2],
+                   const char** reason) {
+   int flt;
+   out[0] = 0;
+   out[1] = 0;
+   if (!fp) {
+     *reason = "external call target is null";
+     return 1;
+   }
+   if (a->nint > 8u || a->nfp > 8u) {
+     *reason = "external call has too many register arguments";
+     return 1;
+   }
+   /* Only 0, 1 or 2 return parts have a thunk; anything larger would
+    * otherwise be treated as a two-register return and lose data. */
+   if (!a->ret_is_void && a->ret_nparts > 2u) {
+     *reason = "external call: unsupported multi-register return";
+     return 1;
+   }
+   flt = a->args_fp_is_float;
+
+   /* void return: call through the u64-returning thunk and drop the result */
+   if (a->ret_is_void || a->ret_nparts == 0u) {
+     if (flt)
+       ((t_i_f)fp)(IVALS(a), FVALS(a));
+     else
+       ((t_i_d)fp)(IVALS(a), DVALS(a));
+     return 0;
+   }
+
+   /* single-register return */
+   if (a->ret_nparts == 1u) {
+     if (a->ret_fp[0]) {
+       if (a->ret_size[0] == 4u) {
+         float r = flt ? ((t_f_f)fp)(IVALS(a), FVALS(a))
+                       : ((t_f_d)fp)(IVALS(a), DVALS(a));
+         u32 b;
+         memcpy(&b, &r, 4);
+         out[0] = b;
+       } else {
+         double r = flt ? ((t_d_f)fp)(IVALS(a), FVALS(a))
+                        : ((t_d_d)fp)(IVALS(a), DVALS(a));
+         memcpy(&out[0], &r, 8);
+       }
+     } else {
+       out[0] = flt ? ((t_i_f)fp)(IVALS(a), FVALS(a))
+                    : ((t_i_d)fp)(IVALS(a), DVALS(a));
+     }
+     return 0;
+   }
+
+   /* two-register return. A float fp arg combined with a struct return is rare;
+    * require double fp args here (engine also rejects 4-byte fp return parts). */
+   if (flt) {
+     *reason = "external call: float args with multi-register return";
+     return 1;
+   }
+   if (!a->ret_fp[0] && !a->ret_fp[1]) {
+     R_ii r = ((t_ii_d)fp)(IVALS(a), DVALS(a));
+     out[0] = r.a;
+     out[1] = r.b;
+   } else if (!a->ret_fp[0] && a->ret_fp[1]) {
+     R_id r = ((t_id_d)fp)(IVALS(a), DVALS(a));
+     out[0] = r.a;
+     memcpy(&out[1], &r.b, 8);
+   } else if (a->ret_fp[0] && !a->ret_fp[1]) {
+     R_di r = ((t_di_d)fp)(IVALS(a), DVALS(a));
+     memcpy(&out[0], &r.a, 8);
+     out[1] = r.b;
+   } else {
+     R_dd r = ((t_dd_d)fp)(IVALS(a), DVALS(a));
+     memcpy(&out[0], &r.a, 8);
+     memcpy(&out[1], &r.b, 8);
+   }
+   return 0;
+ }
diff --git a/src/interp/interp.h b/src/interp/interp.h
@@ -0,0 +1,249 @@
+#ifndef CFREE_INTERP_INTERNAL_H
+#define CFREE_INTERP_INTERNAL_H
+
+/* Internal interface for the threaded-bytecode interpreter (src/interp).
+ *
+ * The loader (lower.c) turns a post-opt_run_o1_interp Func into an InterpFunc:
+ * a flat array of fixed-width InterpInsn records plus side tables. The engine
+ * (engine.c) runs the records over an explicit, swappable InterpStack so that
+ * IR-level calls never use host C recursion. Memory + symbol resolution is
+ * pluggable (host-identity or emu/guest) via CfreeInterpHost. */
+
+#include <cfree/interp.h>
+
+#include "abi/abi.h"
+#include "cg/cgtarget.h"
+#include "core/core.h"
+#include "core/slice.h"
+#include "obj/obj.h"
+#include "opt/ir.h"
+
+ /* One opcode family per IROp, width/aggregate-specialized where it changes the
+  * handler. The opcode drives the engine's dispatch switch.
+  * Values are consecutive from IOP_NOP = 0, so the final enumerator
+  * IOP__COUNT equals the number of opcodes. */
+ typedef enum InterpOp {
+ IOP_NOP = 0,
+ IOP_LOAD_IMM,
+ IOP_LOAD_CONST,
+ IOP_COPY, /* scalar register/memory move */
+ IOP_COPY_AGG, /* aggregate / >8B byte copy */
+ IOP_LOAD, /* scalar load */
+ IOP_LOAD_AGG, /* aggregate load (byte copy) */
+ IOP_STORE, /* scalar store */
+ IOP_STORE_AGG,
+ IOP_ADDR_OF,
+ IOP_TLS_ADDR,
+ IOP_BINOP,
+ IOP_UNOP,
+ IOP_CMP,
+ IOP_CONVERT,
+ IOP_CALL,
+ IOP_BR,
+ IOP_CONDBR,
+ IOP_CMP_BRANCH,
+ IOP_SWITCH,
+ IOP_INDIRECT_BR,
+ IOP_LOAD_LABEL_ADDR,
+ IOP_RET,
+ IOP_RET_VOID,
+ IOP_ALLOCA,
+ IOP_AGG_COPY,
+ IOP_AGG_SET,
+ IOP_BITFIELD_LOAD,
+ IOP_BITFIELD_STORE,
+ IOP_VA_START,
+ IOP_VA_ARG,
+ /* The engine's IOP_VA_END handler does nothing (cursor-model va_list). */
+ IOP_VA_END,
+ IOP_VA_COPY,
+ IOP_ATOMIC_LOAD,
+ IOP_ATOMIC_STORE,
+ IOP_ATOMIC_RMW,
+ IOP_ATOMIC_CAS,
+ /* The engine's IOP_FENCE handler does nothing (single-threaded). */
+ IOP_FENCE,
+ IOP_INTRINSIC,
+ IOP_TRAP, /* unreachable/unsupported-at-runtime guard */
+ IOP__COUNT,
+ } InterpOp;
+
+ /* Fixed-width, cache-friendly record. Hot fields (dst preg, resolved branch
+  * pcs, immediate, widths) are cached; `inst` retains the source instruction so
+  * handlers can read full operand detail (types, MemAccess, aux, call desc)
+  * generically. Direct threading via `handler` is reserved for a future pass;
+  * the engine currently dispatches on `op`.
+  * NOTE(review): engine.c has an INTERP_DISPATCH_THREADED build switch, so the
+  * "reserved for a future pass" wording may be out of date; confirm. */
+ typedef struct InterpInsn {
+ void* handler; /* reserved: &&label for direct threading */
+ const Inst* inst; /* source instruction (arena-resident, c->tu lifetime) */
+ u32 op; /* InterpOp */
+ u32 sub; /* sub-op tag: BinOp/UnOp/CmpOp/ConvKind/AtomicOp/MemOrder */
+ u32 dst; /* dest PReg id (cache of opnds[0].v.reg); 0 if none */
+ u32 t0; /* resolved pc of succ[0] / switch-table index / src reg */
+ u32 t1; /* resolved pc of succ[1] */
+ i64 imm; /* immediate cache (LOAD_IMM etc.) */
+ /* The atomic handlers in the engine treat w0 == 0 as "8 bytes". */
+ u16 w0; /* primary width in bytes (result/load/store) */
+ u16 w1; /* secondary width in bytes (CONVERT source) */
+ u8 fp0; /* result/operand is floating point */
+ u8 fp1; /* CONVERT source is floating point */
+ u8 tail; /* IOP_CALL: this is a tail call (terminator, no return) */
+ u8 pad;
+ } InterpInsn;
+
+ /* Side table for IOP_SWITCH: case values and pre-resolved target pcs.
+  * Case values stay in `aux`; only the branch targets are duplicated here as
+  * code pcs. Presumably case_pc[i] pairs with aux->cases[i]; confirm in
+  * lower.c. */
+ typedef struct InterpSwitch {
+ const IRSwitchAux* aux; /* cases[].value (block ids ignored; we use *_pc) */
+ u32* case_pc; /* ncases entries: case i -> code pc */
+ u32 ncases;
+ u32 default_pc;
+ CfreeCgTypeId sel_type;
+ } InterpSwitch;
+
+typedef struct InterpProgram InterpProgram;
+
+ /* A global symbol referenced by a function: name resolved at lower time (the
+  * obj is available then), host address resolved lazily at run time (after the
+  * JIT image is linked).
+  * interp_global_base sets `resolved` after its first lookup even when that
+  * lookup yields NULL, so a failed resolution is cached and not retried. */
+ typedef struct InterpGlobal {
+ ObjSymId sym;
+ Slice name;
+ void* cached; /* resolved host base address, or NULL until first use */
+ u8 resolved;
+ } InterpGlobal;
+
+ /* One lowered function: the bytecode plus the side tables the engine needs to
+  * run it. Instances are looked up by `sym` (interp_func_for_sym) or by `name`
+  * (cfree_interp_lookup, interp_func_for_addr). */
+ typedef struct InterpFunc {
+ InterpProgram* prog;
+ Func* f; /* source opt Func (arena = compiler->tu) */
+ ObjSymId sym;
+ Slice name; /* unmangled C name (borrowed from the global pool) */
+
+ /* Globals this function references; searched linearly by symbol id in
+  * interp_global_base and interp_tls_addr. */
+ InterpGlobal* globals;
+ u32 nglobals;
+
+ InterpInsn* code;
+ u32 ncode;
+ u32* block_pc; /* block id -> code pc; INTERP_PC_NONE if unreachable */
+ u32 nblocks;
+
+ u32* slot_off; /* frame slot id -> byte offset within the frame */
+ u32 nslots;
+ u32 frame_align; /* max alignment of any static slot (>=8) */
+ u32 frame_bytes; /* static frame size; alloca grows beyond at runtime */
+ u32 npregs; /* register-file size (PReg ids index it) */
+
+ InterpSwitch* switches;
+ u32 nswitches;
+
+ /* NOTE(review): the meaning of `threaded` is not visible in this header;
+  * presumably it marks code prepared for direct-threaded dispatch. */
+ u8 threaded;
+ u8 ok; /* 0 if lowering rejected an unsupported op */
+ const char* reject_reason; /* set when !ok */
+ } InterpFunc;
+
+#define INTERP_PC_NONE 0xffffffffu
+
+ /* The set of captured functions for one compiler, plus the optional host
+  * hooks used for address translation and symbol/TLS resolution. */
+ struct InterpProgram {
+ /* Owning compiler; its ctx->heap backs every allocation made here. */
+ Compiler* c;
+ /* Heap-allocated table of captured functions. ip_register appends to it and
+  * doubles the capacity (starting at 16) when it is full. */
+ InterpFunc** funcs;
+ u32 nfuncs;
+ u32 funcs_cap;
+ /* Copy of the host vtable from cfree_interp_program_set_host; meaningful
+  * only while have_host is non-zero. */
+ CfreeInterpHost host;
+ u8 have_host;
+ };
+
+ /* One IR-level activation. Execution state lives here, never on the host C
+  * stack — CALL pushes, RET pops.
+  * On return the engine rewinds the stack's regs_top/mem_top to this frame's
+  * regs_off/mem_off and, when ret_wanted is set, stores the scalar result into
+  * register ret_dst of the caller's register file. */
+ typedef struct InterpFrame {
+ InterpFunc* fn;
+ InterpInsn* ip; /* resume point */
+ u32 regs_off; /* register file offset in regs_arena (npregs u64s) */
+ u32 mem_off; /* this frame's addressable bytes offset in mem_arena */
+ u32 frame_bytes; /* static + alloca high-water */
+ u32 alloca_top; /* current frame byte usage (>= static frame_bytes) */
+ u8* sret_ptr; /* aggregate-return destination, or NULL */
+ u32 ret_dst; /* caller PReg to receive a scalar return */
+ u8 ret_wanted; /* caller wants a scalar result in ret_dst */
+ u8 ret_is_fp;
+ u8 has_varargs; /* variadic callee: a vararg buffer was laid out */
+ u32 vararg_off; /* mem_arena offset of this frame's vararg buffer */
+ } InterpFrame;
+
+ /* A swappable execution context. interp_resume runs the TOP frame.
+  * Created by cfree_interp_stack_new, which allocates both arenas at their
+  * fixed INTERP_*_RESERVE sizes; released by cfree_interp_stack_free. */
+ struct CfreeInterpStack {
+ InterpProgram* prog;
+ /* Activation records; frames[nframes - 1] is the running frame. frames_cap
+  * is the allocated element count (used when freeing the array). */
+ InterpFrame* frames;
+ u32 nframes;
+ u32 frames_cap;
+ u8* regs_arena; /* bump region for per-frame register files (bytes) */
+ u32 regs_top;
+ u32 regs_cap;
+ u8* mem_arena; /* addressable frame bytes (host-identity) */
+ u32 mem_top;
+ u32 mem_cap;
+ u64 scalar_ret; /* return shuttle between frames */
+ u8 ret_is_fp;
+ u8 status; /* CfreeInterpStatus */
+ const char* trap_reason;
+ };
+ typedef struct CfreeInterpStack InterpStack;
+
+/* ---- interp_program.c ---- */
+/* Called by the optimizer (src/opt/opt.c) for each compiled function when an
+ * interp sink is attached: lowers `f` and registers it by sym + name. `obj`
+ * is the builder the function was recorded into, used to resolve referenced
+ * global symbol names while it is still alive. */
+void interp_capture_func(void* program, Func* f, ObjSymId sym, const char* name,
+ u32 name_len, const ObjBuilder* obj);
+/* Resolve an internal call target (defined-in-TU function) by symbol. */
+InterpFunc* interp_func_for_sym(InterpProgram*, ObjSymId);
+/* Reverse-map a resolved host code address back to an interpretable function
+ * (so an indirect call through a function pointer to a TU-internal function is
+ * still interpreted, not run as native). NULL if the address is not one. */
+InterpFunc* interp_func_for_addr(InterpProgram*, void* addr);
+/* Resolve the host base address of a function-referenced global (lazily caches
+ * through interp_resolve_sym). NULL if unresolved. */
+void* interp_global_base(InterpFunc*, ObjSymId);
+/* Resolve a thread-local variable's address (+addend) for the running thread.
+ * Routes through the host's resolve_tls hook (a thread-local symbol may resolve
+ * to a TLV descriptor rather than storage); falls back to treating it as a
+ * plain global when no hook is bound. NULL if unresolved. */
+void* interp_tls_addr(InterpFunc*, ObjSymId, i64 addend);
+
+/* ---- lower.c ---- */
+InterpFunc* interp_lower(InterpProgram*, Func*, ObjSymId, Slice name,
+ const ObjBuilder* obj);
+
+/* ---- engine.c ---- */
+CfreeInterpStatus interp_run_stack(InterpStack*, int64_t* out_ret);
+/* Translate an abstract address to a host pointer (host-identity if no host
+ * vtable is bound). Returns NULL on a fault. */
+u8* interp_translate(InterpProgram*, u64 addr, u64 n, int perms);
+/* Resolve a global/extern symbol name to a host address (NULL if unresolved). */
+void* interp_resolve_sym(InterpProgram*, Slice name);
+
+/* ---- ffi.c ---- */
+ /* Description of one external (host-ABI) call, filled in by the engine and
+  * consumed by interp_ffi_invoke. Argument values are already classified into
+  * integer and fp register slots; the return is described as 0, 1 or 2
+  * register parts. interp_ffi_invoke returns 0 on success, non-zero (with
+  * *reason set) when the signature is outside the supported thunk family. */
+ typedef struct InterpFfiArgs {
+ /* Not read by interp_ffi_invoke itself. */
+ const ABIFuncInfo* fi;
+ /* int/fp register-slot values, already laid out by the engine. */
+ u64 iargs[8];
+ u32 nint;
+ double fargs[8]; /* fp args when passed as doubles */
+ float fargs_f[8]; /* fp args when passed as 32-bit singles */
+ u32 nfp;
+ u8 args_fp_is_float; /* 1: every fp arg is a 4-byte single (use fargs_f) */
+ /* Not read by interp_ffi_invoke itself. */
+ void* sret; /* hidden struct-return buffer, or NULL */
+ u8 ret_is_void;
+ /* Register-return classification (only when !sret): the value comes back in
+  * ret_nparts (0=void, 1, or 2) registers; out[k] receives each register's raw
+  * 64-bit contents and the engine reassembles the aggregate. */
+ u8 ret_nparts;
+ u8 ret_fp[2]; /* part k is fp-class (returned in XMM/V) */
+ u32 ret_size[2]; /* bytes carried by part k */
+ } InterpFfiArgs;
+
+/* Invokes `fp` through a cast thunk matching the classified signature. Fills
+ * out[0] (and out[1] for a two-register return) with raw register contents.
+ * Returns non-zero (with *reason) for an unsupported signature. */
+int interp_ffi_invoke(void* fp, const InterpFfiArgs*, u64 out[2],
+ const char** reason);
+
+#endif
diff --git a/src/interp/interp_program.c b/src/interp/interp_program.c
@@ -0,0 +1,177 @@
+/* InterpProgram lifecycle, capture (called from the optimizer), symbol/name
+ * lookup, and the pluggable memory/symbol resolution used by the engine. */
+
+#include <string.h>
+
+#include "core/arena.h"
+#include "core/core.h"
+#include "core/slice.h"
+#include "interp/interp.h"
+
+ /* Allocate `n` zero-filled bytes with alignment `align` from the compiler's
+  * heap. Allocation failure is reported through compiler_panic. */
+ static void* ip_alloc(InterpProgram* p, size_t n, size_t align) {
+   Heap* heap = p->c->ctx->heap;
+   void* mem = heap->alloc(heap, n, align);
+   if (mem == NULL)
+     compiler_panic(p->c, (SrcLoc){0, 0, 0}, "interp: out of memory");
+   return memset(mem, 0, n);
+ }
+
+ /* Return an `n`-byte allocation to the compiler's heap; NULL is ignored. */
+ static void ip_free(InterpProgram* p, void* q, size_t n) {
+   Heap* heap = p->c->ctx->heap;
+   if (q == NULL) return;
+   heap->free(heap, q, n);
+ }
+
+ /* Create an empty, zero-initialised program bound to compiler `cc`. Returns
+  * NULL when `cc` is NULL or the allocation fails. */
+ CfreeInterpProgram* cfree_interp_program_new(CfreeCompiler* cc) {
+   Compiler* compiler = (Compiler*)cc;
+   InterpProgram* prog;
+   Heap* heap;
+
+   if (compiler == NULL) return NULL;
+   heap = compiler->ctx->heap;
+   prog = (InterpProgram*)heap->alloc(heap, sizeof(*prog),
+                                      _Alignof(InterpProgram));
+   if (prog != NULL) {
+     memset(prog, 0, sizeof(*prog));
+     prog->c = compiler;
+   }
+   return (CfreeInterpProgram*)prog;
+ }
+
+ /* Destroy a program created by cfree_interp_program_new. NULL is ignored.
+  *
+  * If the owning compiler still has this program attached as its interp sink,
+  * it is detached first so a later compile on that compiler cannot capture
+  * into freed memory. */
+ void cfree_interp_program_free(CfreeInterpProgram* pp) {
+   InterpProgram* p = (InterpProgram*)pp;
+   Heap* h;
+   if (!p) return;
+   h = p->c->ctx->heap;
+   if (p->c->interp_sink == (void*)p) p->c->interp_sink = NULL;
+   if (p->funcs) ip_free(p, p->funcs, sizeof(p->funcs[0]) * p->funcs_cap);
+   /* InterpFunc bodies live on c->tu (compiler-lifetime arena); only the
+    * function-pointer table and the program struct are heap-owned. */
+   h->free(h, p, sizeof(*p));
+ }
+
+ /* Make `pp` the interp sink of compiler `cc`. Passing a NULL program
+  * detaches the current sink. A NULL compiler is ignored. */
+ void cfree_interp_program_attach(CfreeInterpProgram* pp, CfreeCompiler* cc) {
+   Compiler* compiler = (Compiler*)cc;
+   if (compiler != NULL) compiler->interp_sink = pp;
+ }
+
+ /* Install (by copy) or clear the host hook table of program `pp`. A NULL
+  * `host` only clears the have_host flag; the previously stored copy is left
+  * in place but unused. A NULL program is ignored. */
+ void cfree_interp_program_set_host(CfreeInterpProgram* pp,
+                                    const CfreeInterpHost* host) {
+   InterpProgram* prog = (InterpProgram*)pp;
+   if (prog == NULL) return;
+   if (host == NULL) {
+     prog->have_host = 0;
+     return;
+   }
+   prog->host = *host;
+   prog->have_host = 1;
+ }
+
+ /* Append `fn` to the program's function table, growing the table first when
+  * it is full (16 entries initially, doubling afterwards). */
+ static void ip_register(InterpProgram* p, InterpFunc* fn) {
+   if (p->nfuncs == p->funcs_cap) {
+     u32 grown_cap = (p->funcs_cap != 0) ? p->funcs_cap * 2u : 16u;
+     InterpFunc** grown =
+         ip_alloc(p, sizeof(grown[0]) * grown_cap, _Alignof(InterpFunc*));
+     if (p->nfuncs != 0) memcpy(grown, p->funcs, sizeof(grown[0]) * p->nfuncs);
+     if (p->funcs != NULL)
+       ip_free(p, p->funcs, sizeof(p->funcs[0]) * p->funcs_cap);
+     p->funcs_cap = grown_cap;
+     p->funcs = grown;
+   }
+   p->funcs[p->nfuncs] = fn;
+   p->nfuncs++;
+ }
+
+ /* Capture hook: lower `f` into an InterpFunc and register it with `program`.
+  * Does nothing when the program or function is NULL, or when interp_lower
+  * returns NULL. */
+ void interp_capture_func(void* program, Func* f, ObjSymId sym, const char* name,
+                          u32 name_len, const ObjBuilder* obj) {
+   InterpProgram* prog = (InterpProgram*)program;
+   InterpFunc* lowered;
+   Slice fn_name;
+
+   if (prog == NULL || f == NULL) return;
+   fn_name.s = name;
+   fn_name.len = name_len;
+   lowered = interp_lower(prog, f, sym, fn_name, obj);
+   if (lowered == NULL) return;
+   ip_register(prog, lowered);
+ }
+
+ /* Linear search of the registered functions for one whose symbol id is `sym`.
+  * Returns NULL for a NULL program, for OBJ_SYM_NONE, or when nothing matches. */
+ InterpFunc* interp_func_for_sym(InterpProgram* p, ObjSymId sym) {
+   u32 idx = 0;
+   if (p == NULL || sym == OBJ_SYM_NONE) return NULL;
+   while (idx < p->nfuncs) {
+     InterpFunc* cand = p->funcs[idx++];
+     if (cand->sym == sym) return cand;
+   }
+   return NULL;
+ }
+
+ /* Find the registered function whose name resolves (via interp_resolve_sym)
+  * to host address `addr`. Functions that failed lowering or have no name are
+  * skipped. Each candidate costs one symbol resolution. Returns NULL when
+  * nothing matches or an argument is NULL. */
+ InterpFunc* interp_func_for_addr(InterpProgram* p, void* addr) {
+   u32 idx;
+   if (p == NULL || addr == NULL) return NULL;
+   for (idx = 0; idx < p->nfuncs; ++idx) {
+     InterpFunc* cand = p->funcs[idx];
+     if (cand->ok && cand->name.s &&
+         interp_resolve_sym(p, cand->name) == addr)
+       return cand;
+   }
+   return NULL;
+ }
+
+ /* Host base address of the global `sym` as referenced by `fn`. The first
+  * lookup goes through interp_resolve_sym and its result (NULL included) is
+  * cached on the InterpGlobal. Returns NULL when `fn` is NULL, when `sym` is
+  * not among the function's globals, or when resolution failed. */
+ void* interp_global_base(InterpFunc* fn, ObjSymId sym) {
+   InterpGlobal* hit = NULL;
+   u32 idx;
+
+   if (fn == NULL) return NULL;
+   for (idx = 0; idx < fn->nglobals; ++idx) {
+     if (fn->globals[idx].sym == sym) {
+       hit = &fn->globals[idx];
+       break;
+     }
+   }
+   if (hit == NULL) return NULL;
+
+   if (!hit->resolved) {
+     hit->cached = interp_resolve_sym(fn->prog, hit->name);
+     hit->resolved = 1;
+   }
+   return hit->cached;
+ }
+
+ /* Address of thread-local `sym` (+addend) as referenced by `fn`.
+  * With a host resolve_tls hook bound, the symbol is looked up by name among
+  * the function's globals and handed to the hook; an unknown symbol yields
+  * NULL. Without a hook the symbol is treated as a plain global. */
+ void* interp_tls_addr(InterpFunc* fn, ObjSymId sym, i64 addend) {
+   InterpProgram* prog = fn ? fn->prog : NULL;
+   void* plain_base;
+   u32 idx;
+
+   if (prog == NULL) return NULL;
+
+   if (!(prog->have_host && prog->host.resolve_tls)) {
+     /* No TLS resolver bound: treat as a plain global (correct only where the
+      * symbol already denotes the variable's storage). */
+     plain_base = interp_global_base(fn, sym);
+     if (plain_base == NULL) return NULL;
+     return (u8*)plain_base + (u64)addend;
+   }
+
+   /* Resolve by symbol name (the host maps a thread-local name + addend to the
+    * running thread's storage, unwrapping a TLV descriptor where needed). */
+   for (idx = 0; idx < fn->nglobals; ++idx) {
+     InterpGlobal* g = &fn->globals[idx];
+     if (g->sym == sym)
+       return prog->host.resolve_tls(prog->host.ctx, g->name, addend);
+   }
+   return NULL;
+ }
+
+ /* Compare a stored symbol name with the query `q`. They match when they are
+  * byte-identical, or when the stored name is the query with one extra leading
+  * underscore (Mach-O / COFF C mangling; ELF stores the bare name). Returns 1
+  * on a match, 0 otherwise, including for a stored name with no data. */
+ static int name_matches(Slice stored, CfreeSlice q) {
+   const char* body;
+   if (stored.s == NULL) return 0;
+   if (stored.len == q.len) {
+     body = stored.s; /* exact-length candidate */
+   } else if (stored.len == q.len + 1u && stored.s[0] == '_') {
+     body = stored.s + 1; /* skip the mangling underscore */
+   } else {
+     return 0;
+   }
+   return memcmp(body, q.s, q.len) == 0;
+ }
+
+ /* Find a registered function by name, using name_matches so one leading
+  * underscore on the stored name is tolerated. The first match in
+  * registration order wins. Returns NULL for a NULL program, a query with no
+  * data, or no match. */
+ CfreeInterpFunc* cfree_interp_lookup(CfreeInterpProgram* pp, CfreeSlice name) {
+   InterpProgram* prog = (InterpProgram*)pp;
+   u32 idx = 0;
+   if (prog == NULL || name.s == NULL) return NULL;
+   while (idx < prog->nfuncs) {
+     InterpFunc* cand = prog->funcs[idx++];
+     if (name_matches(cand->name, name)) return (CfreeInterpFunc*)cand;
+   }
+   return NULL;
+ }
+
+ /* Map abstract address `addr` (a range of `n` bytes, accessed with `perms`)
+  * to a host pointer. With a host translate hook bound the hook decides;
+  * otherwise the address is already a host pointer and `n`/`perms` are not
+  * checked. */
+ u8* interp_translate(InterpProgram* p, u64 addr, u64 n, int perms) {
+   int hooked = (p != NULL) && p->have_host && (p->host.translate != NULL);
+   if (!hooked) {
+     /* Host-identity: abstract addresses are real host pointers. */
+     return (u8*)(uintptr_t)addr;
+   }
+   return p->host.translate(p->host.ctx, addr, n, perms);
+ }
+
+ /* Resolve a symbol name to a host address through the host's resolve_sym
+  * hook. Returns NULL when no program, host, or hook is available. */
+ void* interp_resolve_sym(InterpProgram* p, Slice name) {
+   if (p == NULL || !p->have_host || p->host.resolve_sym == NULL) return NULL;
+   return p->host.resolve_sym(p->host.ctx, name);
+ }
diff --git a/src/interp/interp_stubs.c b/src/interp/interp_stubs.c
@@ -0,0 +1,75 @@
+/* Stubs for the public interpreter API when CFREE_INTERP_ENABLED == 0.
+ * Mirrors src/arch/disasm_stubs.c: the only callers of a disabled interpreter
+ * are external (the `cfree run --no-jit` path, embedders), so the public
+ * cfree_interp_* surface returns NULL / CFREE_UNSUPPORTED. This file is added
+ * to the build only in the disabled branch (see Makefile). */
+
+#include <cfree/interp.h>
+
+/* Disabled-interpreter stub: ignores the compiler and always returns NULL. */
+CfreeInterpProgram* cfree_interp_program_new(CfreeCompiler* c) {
+  (void)c;
+  return NULL;
+}
+
+/* Disabled-interpreter stub: nothing was allocated, so there is nothing to
+ * release. */
+void cfree_interp_program_free(CfreeInterpProgram* p) {
+  (void)p;
+}
+
+/* Disabled-interpreter stub: attaching is a no-op. */
+void cfree_interp_program_attach(CfreeInterpProgram* p, CfreeCompiler* c) {
+  (void)c;
+  (void)p;
+}
+
+/* Disabled-interpreter stub: the host binding is ignored. */
+void cfree_interp_program_set_host(CfreeInterpProgram* p,
+                                   const CfreeInterpHost* host) {
+  (void)host;
+  (void)p;
+}
+
+/* Disabled-interpreter stub: no functions are ever captured, so every lookup
+ * returns NULL. */
+CfreeInterpFunc* cfree_interp_lookup(CfreeInterpProgram* p, CfreeSlice name) {
+  (void)name;
+  (void)p;
+  return NULL;
+}
+
+/* Disabled-interpreter stub: ignores the program and always returns NULL. */
+CfreeInterpStack* cfree_interp_stack_new(CfreeInterpProgram* p) {
+  (void)p;
+  return NULL;
+}
+
+/* Disabled-interpreter stub: no stack was allocated, so there is nothing to
+ * release. */
+void cfree_interp_stack_free(CfreeInterpStack* s) {
+  (void)s;
+}
+
+/* Disabled-interpreter stub: no call is set up; always reports
+ * CFREE_UNSUPPORTED. */
+CfreeStatus cfree_interp_call_on(CfreeInterpStack* s, CfreeInterpFunc* fn,
+                                 int argc, char** argv) {
+  (void)argv;
+  (void)argc;
+  (void)fn;
+  (void)s;
+  return CFREE_UNSUPPORTED;
+}
+
+/* Disabled-interpreter stub: nothing runs, *out_ret is left untouched, and the
+ * result is always CFREE_INTERP_ERROR. */
+CfreeInterpStatus cfree_interp_resume(CfreeInterpStack* s, int64_t* out_ret) {
+  (void)out_ret;
+  (void)s;
+  return CFREE_INTERP_ERROR;
+}
+
+/* Disabled-interpreter stub: nothing runs, *out_ret is left untouched, and the
+ * result is always CFREE_INTERP_ERROR. */
+CfreeInterpStatus cfree_interp_call(CfreeInterpProgram* p, CfreeInterpFunc* fn,
+                                    int argc, char** argv, int64_t* out_ret) {
+  (void)out_ret;
+  (void)argv;
+  (void)argc;
+  (void)fn;
+  (void)p;
+  return CFREE_INTERP_ERROR;
+}
+
+/* Disabled-interpreter stub: nothing runs, *out_ret is left untouched, and the
+ * result is always CFREE_INTERP_ERROR. */
+CfreeInterpStatus cfree_interp_call_args(CfreeInterpProgram* p,
+                                         CfreeInterpFunc* fn,
+                                         const uint64_t* args, uint32_t nargs,
+                                         int64_t* out_ret) {
+  (void)out_ret;
+  (void)nargs;
+  (void)args;
+  (void)fn;
+  (void)p;
+  return CFREE_INTERP_ERROR;
+}
diff --git a/src/interp/lower.c b/src/interp/lower.c
@@ -0,0 +1,526 @@
+/* Func (post opt_run_o1_interp) -> InterpFunc bytecode loader.
+ *
+ * Walks blocks in f->emit_order, emits one fixed-width InterpInsn per non-no-op
+ * Inst, bump-allocates frame-slot offsets, resolves branch targets from block
+ * ids to code pcs, and collects the global symbols the function references (so
+ * their names survive past the obj's lifetime). Unsupported ops are lowered to
+ * IOP_TRAP and the function is flagged rejected with a reason — the engine then
+ * reports a clean "interp: <op> not supported" rather than miscompiling. */
+
+#include <string.h>
+
+#include "abi/abi.h"
+/* cg/ir.h must precede any header that pulls opt/ir.h: opt/ir.h aliases
+ * `Operand`/`CGCallDesc`/... to their Opt* forms via macros, so the semantic
+ * cg structs (CgIrLocalStatic*Aux, reused verbatim as the opt aux pointers)
+ * have to be parsed first. This mirrors opt/opt.h's include order. */
+#include "cg/ir.h"
+#include "cg/cgtarget.h"
+#include "cg/type.h"
+#include "core/arena.h"
+#include "core/core.h"
+#include "core/pool.h"
+#include "core/slice.h"
+#include "interp/interp.h"
+#include "obj/obj.h"
+#include "opt/ir.h"
+
+/* Per-function lowering context threaded through the helpers in this file.
+ * interp_lower zero-initializes it and fills every field except `reject`. */
+typedef struct Lower {
+  InterpProgram* p;      /* program the function is lowered for */
+  Compiler* c;           /* compiler owning the ABI info and the name pool */
+  Func* f;               /* IR function being lowered */
+  const ObjBuilder* obj; /* symbol-name source; note_global is a no-op if NULL */
+  InterpFunc* fn;        /* bytecode function under construction */
+  const char* reject;    /* set by map_op when it meets an unsupported op */
+} Lower;
+
+/* Report the width and float-ness of a CG type: *w receives the ABI size
+ * narrowed to u16, *fp receives 1 when the ABI scalar kind is float and 0
+ * otherwise. */
+static void type_wf(Compiler* c, CfreeCgTypeId t, u16* w, u8* fp) {
+  const ABITypeInfo info = abi_cg_type_info(c->abi, t);
+  if (info.scalar_kind == ABI_SC_FLOAT)
+    *fp = 1u;
+  else
+    *fp = 0u;
+  *w = (u16)info.size;
+}
+
+/* Returns 1 when the type is an aggregate or its ABI size exceeds 8 bytes,
+ * else 0. map_op uses this to select the *_AGG handler variants. */
+static int is_agg_or_large(Compiler* c, CfreeCgTypeId t) {
+  if (cg_type_is_aggregate(c, t)) return 1;
+  return abi_cg_sizeof(c->abi, t) > 8u;
+}
+
+/* Says whether an IR op gets a bytecode record. Returns 0 for the ops that
+ * lower to nothing (NOP, CONST_I, CONST_BYTES, PHI, PARAM_DECL and the three
+ * SCOPE markers) and 1 for every other op. */
+static int inst_emits(u16 op) {
+  const IROp ir = (IROp)op;
+  const int silent = ir == IR_NOP || ir == IR_CONST_I ||
+                     ir == IR_CONST_BYTES || ir == IR_PHI ||
+                     ir == IR_PARAM_DECL || ir == IR_SCOPE_BEGIN ||
+                     ir == IR_SCOPE_ELSE || ir == IR_SCOPE_END;
+  return !silent;
+}
+
+/* Map an IROp to its InterpOp, choosing aggregate-specialized handlers. On an
+ * unsupported op, records a reject reason and returns IOP_TRAP.
+ *
+ * lw - lowering context; only lw->c is read, and lw->reject is written on the
+ *      reject paths.
+ * in - instruction to classify. COPY/LOAD inspect operand 0's type and STORE
+ *      inspects operand 1's type to pick the aggregate variant; RET inspects
+ *      its aux record to pick RET vs RET_VOID. */
+static InterpOp map_op(Lower* lw, const Inst* in) {
+  Compiler* c = lw->c;
+  switch ((IROp)in->op) {
+    case IR_LOAD_IMM:
+      return IOP_LOAD_IMM;
+    case IR_LOAD_CONST:
+      return IOP_LOAD_CONST;
+    case IR_COPY:
+      return is_agg_or_large(c, in->opnds[0].type) ? IOP_COPY_AGG : IOP_COPY;
+    case IR_LOAD:
+      return is_agg_or_large(c, in->opnds[0].type) ? IOP_LOAD_AGG : IOP_LOAD;
+    case IR_STORE:
+      return is_agg_or_large(c, in->opnds[1].type) ? IOP_STORE_AGG : IOP_STORE;
+    case IR_ADDR_OF:
+      return IOP_ADDR_OF;
+    case IR_TLS_ADDR_OF:
+      return IOP_TLS_ADDR;
+    case IR_AGG_COPY:
+      return IOP_AGG_COPY;
+    case IR_AGG_SET:
+      return IOP_AGG_SET;
+    case IR_BITFIELD_LOAD:
+      return IOP_BITFIELD_LOAD;
+    case IR_BITFIELD_STORE:
+      return IOP_BITFIELD_STORE;
+    case IR_BINOP:
+      return IOP_BINOP;
+    case IR_UNOP:
+      return IOP_UNOP;
+    case IR_CMP:
+      return IOP_CMP;
+    case IR_CONVERT:
+      return IOP_CONVERT;
+    case IR_CALL:
+      return IOP_CALL;
+    /* All three unconditional-jump forms share one handler. */
+    case IR_BR:
+    case IR_BREAK_TO:
+    case IR_CONTINUE_TO:
+      return IOP_BR;
+    case IR_CONDBR:
+      return IOP_CONDBR;
+    case IR_CMP_BRANCH:
+      return IOP_CMP_BRANCH;
+    case IR_SWITCH:
+      return IOP_SWITCH;
+    case IR_INDIRECT_BRANCH:
+      return IOP_INDIRECT_BR;
+    case IR_LOAD_LABEL_ADDR:
+      return IOP_LOAD_LABEL_ADDR;
+    case IR_RET: {
+      /* A missing aux record, or one with present == 0, is a void return. */
+      IRRetAux* aux = (IRRetAux*)in->extra.aux;
+      return (aux && aux->present) ? IOP_RET : IOP_RET_VOID;
+    }
+    case IR_ALLOCA:
+      return IOP_ALLOCA;
+    case IR_VA_START:
+      return IOP_VA_START;
+    case IR_VA_ARG:
+      return IOP_VA_ARG;
+    case IR_VA_END:
+      return IOP_VA_END;
+    case IR_VA_COPY:
+      return IOP_VA_COPY;
+    case IR_ATOMIC_LOAD:
+      return IOP_ATOMIC_LOAD;
+    case IR_ATOMIC_STORE:
+      return IOP_ATOMIC_STORE;
+    case IR_ATOMIC_RMW:
+      return IOP_ATOMIC_RMW;
+    case IR_ATOMIC_CAS:
+      return IOP_ATOMIC_CAS;
+    case IR_FENCE:
+      return IOP_FENCE;
+    case IR_INTRINSIC:
+      return IOP_INTRINSIC;
+    case IR_ASM_BLOCK:
+      lw->reject = "inline asm";
+      return IOP_TRAP;
+    case IR_LOCAL_STATIC_DATA_BEGIN:
+    case IR_LOCAL_STATIC_DATA_WRITE:
+    case IR_LOCAL_STATIC_DATA_LABEL_ADDR:
+    case IR_LOCAL_STATIC_DATA_END:
+      /* Function-scope static data (incl. dense-switch jump tables and
+       * computed-goto label arrays) is materialized into an interp-private
+       * blob at lower time (lower_static_blobs) and the blob's symbol is
+       * resolved to that buffer; the stream ops themselves are pure markers. */
+      return IOP_NOP;
+    default:
+      /* Anything not listed above is rejected rather than guessed at. */
+      lw->reject = "unhandled IR op";
+      return IOP_TRAP;
+  }
+}
+
+/* Remember a global symbol this function references, once per symbol. The
+ * symbol's name is copied out of the compiler's global pool now, while the obj
+ * is still alive; the entry starts unresolved with no cached address. Does
+ * nothing for OBJ_SYM_NONE, when no obj is attached, when the symbol is already
+ * recorded, or when the obj has no such symbol. */
+static void note_global(Lower* lw, ObjSymId sym) {
+  InterpFunc* fn = lw->fn;
+  InterpGlobal* slot;
+  const ObjSym* info;
+  u32 idx = 0;
+  if (sym == OBJ_SYM_NONE || lw->obj == NULL) return;
+  while (idx < fn->nglobals) {
+    if (fn->globals[idx].sym == sym) return; /* already recorded */
+    ++idx;
+  }
+  info = obj_symbol_get(lw->obj, sym);
+  if (info == NULL) return;
+  slot = &fn->globals[fn->nglobals];
+  slot->sym = sym;
+  slot->name = pool_slice(lw->c->global, info->name);
+  slot->cached = NULL;
+  slot->resolved = 0;
+  fn->nglobals++;
+}
+
+/* Record the symbol behind an operand when the operand is a global reference;
+ * every other operand kind is ignored. */
+static void note_operand_globals(Lower* lw, const Operand* op) {
+  if (op->kind != OPK_GLOBAL) return;
+  note_global(lw, op->v.global.sym);
+}
+
+/* Record every global an instruction references: each of its operands, plus
+ * the callee operand of a call and the symbol of a TLS address-of, both of
+ * which live in the aux record rather than in the operand list. */
+static void note_inst_globals(Lower* lw, const Inst* in) {
+  u32 n = 0;
+  while (n < in->nopnds) {
+    note_operand_globals(lw, &in->opnds[n]);
+    ++n;
+  }
+  switch ((IROp)in->op) {
+    case IR_CALL:
+      if (in->extra.aux) {
+        IRCallAux* call = (IRCallAux*)in->extra.aux;
+        note_operand_globals(lw, &call->desc.callee);
+      }
+      break;
+    case IR_TLS_ADDR_OF:
+      if (in->extra.aux) {
+        IRTlsAux* tls = (IRTlsAux*)in->extra.aux;
+        note_global(lw, tls->sym);
+      }
+      break;
+    default:
+      break;
+  }
+}
+
+/* Look up the code pc assigned to a block id. Ids outside the function's block
+ * range yield INTERP_PC_NONE; in-range ids yield whatever the block_pc table
+ * holds. */
+static u32 block_pc_of(InterpFunc* fn, u32 block) {
+  return block < fn->nblocks ? fn->block_pc[block] : INTERP_PC_NONE;
+}
+
+/* Bind a global symbol to an address that is already known, so lazy name
+ * resolution is bypassed; used for interp-private static blobs. An existing
+ * entry for the symbol is updated in place; otherwise a new nameless entry is
+ * appended. OBJ_SYM_NONE is ignored. */
+static void register_blob_global(InterpFunc* fn, ObjSymId sym, void* ptr) {
+  InterpGlobal* entry = NULL;
+  u32 idx;
+  if (sym == OBJ_SYM_NONE) return;
+  for (idx = 0; idx < fn->nglobals && !entry; ++idx)
+    if (fn->globals[idx].sym == sym) entry = &fn->globals[idx];
+  if (!entry) {
+    entry = &fn->globals[fn->nglobals];
+    entry->sym = sym;
+    entry->name.s = NULL;
+    entry->name.len = 0;
+    fn->nglobals++;
+  }
+  entry->cached = ptr;
+  entry->resolved = 1;
+}
+
+/* Byte size of the static blob whose BEGIN record sits at bl->insts[begin]:
+ * the sum of the WRITE lengths and LABEL_ADDR widths that follow it, up to the
+ * matching END or the end of the block, whichever comes first. */
+static u32 static_blob_size(const Block* bl, u32 begin) {
+  u32 bytes = 0u;
+  u32 j;
+  for (j = begin + 1u; j < bl->ninsts; ++j) {
+    const Inst* rec = &bl->insts[j];
+    IROp op = (IROp)rec->op;
+    if (op == IR_LOCAL_STATIC_DATA_END) break;
+    if (op == IR_LOCAL_STATIC_DATA_WRITE) {
+      CgIrLocalStaticWriteAux* w = (CgIrLocalStaticWriteAux*)rec->extra.aux;
+      if (w) bytes += (u32)w->len;
+    } else if (op == IR_LOCAL_STATIC_DATA_LABEL_ADDR) {
+      CgIrLocalStaticLabelAux* la = (CgIrLocalStaticLabelAux*)rec->extra.aux;
+      if (la) bytes += la->width;
+    }
+  }
+  return bytes;
+}
+
+/* Fill `buf` with the contents of the blob that starts at bl->insts[begin].
+ * WRITE records copy their bytes (or zero-fill when they carry no data);
+ * LABEL_ADDR records store the target block's bytecode pc little-endian in
+ * `width` bytes, zero-padding anything beyond the eighth byte. */
+static void static_blob_fill(InterpFunc* fn, const Block* bl, u32 begin,
+                             u8* buf) {
+  u32 off = 0u;
+  u32 j;
+  for (j = begin + 1u; j < bl->ninsts; ++j) {
+    const Inst* rec = &bl->insts[j];
+    IROp op = (IROp)rec->op;
+    if (op == IR_LOCAL_STATIC_DATA_END) break;
+    if (op == IR_LOCAL_STATIC_DATA_WRITE) {
+      CgIrLocalStaticWriteAux* w = (CgIrLocalStaticWriteAux*)rec->extra.aux;
+      u32 len = w ? (u32)w->len : 0u;
+      if (len) {
+        if (w->has_data && w->data)
+          memcpy(buf + off, w->data, len);
+        else
+          memset(buf + off, 0, len);
+      }
+      off += len;
+    } else if (op == IR_LOCAL_STATIC_DATA_LABEL_ADDR) {
+      CgIrLocalStaticLabelAux* la = (CgIrLocalStaticLabelAux*)rec->extra.aux;
+      u32 width = la ? la->width : 0u;
+      u64 val = la ? (u64)block_pc_of(fn, (u32)la->target) : 0u;
+      u32 bi;
+      for (bi = 0u; bi < width; ++bi)
+        buf[off + bi] = (bi < 8u) ? (u8)(val >> (bi * 8u)) : 0u;
+      off += width;
+    }
+  }
+}
+
+/* Turn each function-scope static data blob (regular static locals,
+ * dense-switch jump tables, computed-goto label arrays) into an interp-private
+ * buffer and bind the blob's symbol to that buffer.
+ *
+ * The buffer is sized from the records between a BEGIN and its END, then
+ * filled: WRITE records supply literal bytes (zeros when has_data == 0), and
+ * LABEL_ADDR records supply a *bytecode pc* for the target block. The
+ * interpreter addresses code by InterpInsn index, so a table the program later
+ * walks with IR_LOAD + IR_INDIRECT_BRANCH has to hold interp pcs rather than
+ * the native code-label addresses the parallel object/JIT path bakes in.
+ * (cfree forbids data-symbol relocations inside function-local statics, so
+ * WRITE bytes and code-label pcs are the only possible contents.) The
+ * BEGIN/WRITE/LABEL_ADDR/END stream ops themselves lower to IOP_NOP; they are
+ * consumed entirely here.
+ *
+ * Must run after block placement, so block pcs are known, and after the
+ * globals table is allocated. The blob symbol is also referenced through an
+ * OPK_GLOBAL operand, so the table has room for it and Pass B's note_global
+ * dedups against the entry added here. */
+static void lower_static_blobs(Lower* lw) {
+  InterpFunc* fn = lw->fn;
+  Func* f = lw->f;
+  Arena* a = f->arena ? f->arena : lw->c->tu;
+  u32 b, k;
+  for (b = 0; b < f->nblocks; ++b) {
+    const Block* bl = &f->blocks[b];
+    for (k = 0; k < bl->ninsts; ++k) {
+      const Inst* in = &bl->insts[k];
+      CgIrLocalStaticBeginAux* beg;
+      u32 total;
+      u8* buf;
+      if ((IROp)in->op != IR_LOCAL_STATIC_DATA_BEGIN) continue;
+      beg = (CgIrLocalStaticBeginAux*)in->extra.aux;
+      total = static_blob_size(bl, k);
+      /* An empty blob gets no storage; its symbol is bound to NULL. */
+      buf = total ? arena_array(a, u8, total) : NULL;
+      static_blob_fill(fn, bl, k, buf);
+      register_blob_global(fn, beg ? beg->desc.sym : OBJ_SYM_NONE, buf);
+    }
+  }
+}
+
+/* Lower one optimized Func into an InterpFunc bytecode record stream.
+ *
+ * p    - owning program; its compiler (p->c) supplies ABI type info.
+ * f    - function to lower; blocks are visited in f->emit_order.
+ * sym  - stored verbatim as fn->sym.
+ * name - stored verbatim as fn->name.
+ * obj  - used to look up the names of referenced symbols; when NULL no
+ *        referenced globals are recorded by name.
+ *
+ * Returns the new InterpFunc, allocated from f->arena (or the compiler's tu
+ * arena when f has none). fn->ok starts at 1 and is cleared, with
+ * fn->reject_reason set, the first time an instruction lowers to IOP_TRAP. */
+InterpFunc* interp_lower(InterpProgram* p, Func* f, ObjSymId sym, Slice name,
+                         const ObjBuilder* obj) {
+  Compiler* c = p->c;
+  Arena* a = f->arena ? f->arena : c->tu;
+  Lower lw;
+  InterpFunc* fn;
+  u32 i, b, pc;
+  u32 ncode = 0, nopnd_total = 0;
+  u32 max_slot_id = 0;
+  u32 off;
+  u32 nswitch = 0, swi = 0;
+
+  memset(&lw, 0, sizeof lw);
+  lw.p = p;
+  lw.c = c;
+  lw.f = f;
+  lw.obj = obj;
+
+  fn = arena_znew(a, InterpFunc);
+  fn->prog = p;
+  fn->f = f;
+  fn->sym = sym;
+  fn->name = name;
+  /* Never report zero pregs, even for a function that uses none. */
+  fn->npregs = f->npregs ? f->npregs : 1u;
+  fn->nblocks = f->nblocks;
+  fn->ok = 1;
+  lw.fn = fn;
+
+  /* block_pc, default unreachable. */
+  fn->block_pc = arena_array(a, u32, f->nblocks ? f->nblocks : 1u);
+  for (b = 0; b < f->nblocks; ++b) fn->block_pc[b] = INTERP_PC_NONE;
+
+  /* Pass A: place blocks in emit_order, count records + operands + switches. */
+  for (i = 0; i < f->emit_order_n; ++i) {
+    b = f->emit_order[i];
+    if (b >= f->nblocks) continue;
+    if (fn->block_pc[b] != INTERP_PC_NONE) continue; /* placed already */
+    fn->block_pc[b] = ncode;
+    {
+      Block* bl = &f->blocks[b];
+      u32 k;
+      for (k = 0; k < bl->ninsts; ++k) {
+        const Inst* in = &bl->insts[k];
+        if (!inst_emits(in->op)) continue;
+        ncode++;
+        nopnd_total += in->nopnds;
+        if ((IROp)in->op == IR_SWITCH) nswitch++;
+      }
+    }
+  }
+  /* The operand total is counted but not consumed anywhere below. */
+  (void)nopnd_total;
+
+  fn->code = arena_zarray(a, InterpInsn, ncode ? ncode : 1u);
+  fn->ncode = ncode;
+  fn->switches = nswitch ? arena_zarray(a, InterpSwitch, nswitch) : NULL;
+  fn->nswitches = nswitch;
+  /* Over-allocate the globals table: at most one per operand + one per call. */
+  {
+    u32 cap = 0;
+    for (i = 0; i < f->nblocks; ++i) {
+      Block* bl = &f->blocks[i];
+      u32 k;
+      for (k = 0; k < bl->ninsts; ++k) cap += bl->insts[k].nopnds + 1u;
+    }
+    fn->globals = arena_zarray(a, InterpGlobal, cap ? cap : 1u);
+    fn->nglobals = 0;
+  }
+
+  /* Frame slots: bump-allocate non-alloca slots honoring align. */
+  for (i = 0; i < f->nframe_slots; ++i)
+    if (f->frame_slots[i].id > max_slot_id) max_slot_id = f->frame_slots[i].id;
+  /* slot_off is indexed by slot id, so size it by the largest id seen. */
+  fn->nslots = max_slot_id + 1u;
+  fn->slot_off = arena_zarray(a, u32, fn->nslots);
+  fn->frame_align = 16u;
+  off = 0;
+  for (i = 0; i < f->nframe_slots; ++i) {
+    IRFrameSlot* fs = &f->frame_slots[i];
+    u32 align = fs->align ? fs->align : 1u;
+    u32 size = fs->size;
+    if (fs->kind == FS_ALLOCA) continue; /* dynamic, allocated at OP_ALLOCA */
+    if (align > fn->frame_align) fn->frame_align = align;
+    /* Mask-based round-up; NOTE(review): assumes align is a power of two. */
+    off = (off + align - 1u) & ~(align - 1u);
+    fn->slot_off[fs->id] = off;
+    /* A zero-sized slot still advances the offset by one byte. */
+    off += size ? size : 1u;
+  }
+  /* Round the frame up to a multiple of 16 bytes. */
+  fn->frame_bytes = (off + 15u) & ~15u;
+
+  /* Materialize function-scope static data into interp-private buffers and bind
+   * their symbols (block pcs are now known for any label-address tables). */
+  lower_static_blobs(&lw);
+
+  /* Pass B: emit records, resolving branch targets to pcs. */
+  pc = 0;
+  for (i = 0; i < f->emit_order_n; ++i) {
+    Block* bl;
+    u32 k;
+    b = f->emit_order[i];
+    if (b >= f->nblocks) continue;
+    if (fn->block_pc[b] != pc) {
+      /* A block placed at a different pc (duplicate in emit_order) — skip its
+       * second appearance to keep pc aligned with Pass A. */
+      if (fn->block_pc[b] != INTERP_PC_NONE && fn->block_pc[b] < pc) continue;
+    }
+    bl = &f->blocks[b];
+    for (k = 0; k < bl->ninsts; ++k) {
+      const Inst* in = &bl->insts[k];
+      InterpInsn* rec;
+      if (!inst_emits(in->op)) continue;
+      note_inst_globals(&lw, in);
+      rec = &fn->code[pc++];
+      rec->inst = in;
+      rec->op = (u32)map_op(&lw, in);
+      /* dst is the register of operand 0 when that operand is a register,
+       * else 0. */
+      rec->dst = (in->nopnds > 0 && in->opnds[0].kind == OPK_REG)
+                     ? in->opnds[0].v.reg
+                     : 0u;
+      switch ((InterpOp)rec->op) {
+        case IOP_LOAD_IMM:
+          rec->imm = in->extra.imm;
+          type_wf(c, in->opnds[0].type, &rec->w0, &rec->fp0);
+          break;
+        case IOP_COPY:
+        case IOP_LOAD:
+        case IOP_ADDR_OF:
+          type_wf(c, in->opnds[0].type, &rec->w0, &rec->fp0);
+          /* For loads the memory access size overrides the type's width. */
+          if ((IROp)in->op == IR_LOAD) rec->w0 = (u16)in->extra.mem.size;
+          break;
+        case IOP_STORE:
+          rec->w0 = (u16)in->extra.mem.size;
+          type_wf(c, in->opnds[1].type, &rec->w1, &rec->fp1);
+          rec->fp0 = rec->fp1;
+          break;
+        case IOP_BINOP:
+        case IOP_UNOP:
+          rec->sub = (u32)in->extra.imm;
+          type_wf(c, in->opnds[0].type, &rec->w0, &rec->fp0);
+          break;
+        case IOP_CMP:
+          rec->sub = (u32)in->extra.imm;
+          /* Width/float-ness come from operand 1, not operand 0. */
+          type_wf(c, in->opnds[1].type, &rec->w0, &rec->fp0);
+          break;
+        case IOP_CONVERT:
+          rec->sub = (u32)in->extra.imm;
+          type_wf(c, in->opnds[0].type, &rec->w0, &rec->fp0);
+          type_wf(c, in->opnds[1].type, &rec->w1, &rec->fp1);
+          break;
+        case IOP_ALLOCA:
+          rec->imm = in->extra.imm; /* alignment */
+          break;
+        case IOP_CALL: {
+          /* A realized tail call is the block terminator: the CGCallDesc has
+           * CG_CALL_TAIL, or equivalently the call is the last emitting inst
+           * of a successor-less block (it returns the callee's result
+           * directly). Detect both. */
+          IRCallAux* aux = (IRCallAux*)in->extra.aux;
+          u8 is_tail = aux && (aux->desc.flags & CG_CALL_TAIL);
+          if (!is_tail && bl->nsucc == 0) {
+            u32 j;
+            u8 last = 1;
+            for (j = k + 1u; j < bl->ninsts; ++j)
+              if (inst_emits(bl->insts[j].op)) { last = 0; break; }
+            is_tail = last;
+          }
+          rec->tail = is_tail;
+          break;
+        }
+        case IOP_BR:
+          /* With no successor the out-of-range id resolves to
+           * INTERP_PC_NONE. */
+          rec->t0 = block_pc_of(fn, bl->nsucc > 0 ? bl->succ[0] : 0xffffffffu);
+          break;
+        case IOP_CONDBR:
+        case IOP_CMP_BRANCH:
+          if ((IROp)in->op == IR_CMP_BRANCH) {
+            rec->sub = (u32)in->extra.imm;
+            type_wf(c, in->opnds[0].type, &rec->w0, &rec->fp0);
+          }
+          rec->t0 = block_pc_of(fn, bl->nsucc > 0 ? bl->succ[0] : 0xffffffffu);
+          rec->t1 = block_pc_of(fn, bl->nsucc > 1 ? bl->succ[1] : 0xffffffffu);
+          break;
+        case IOP_SWITCH: {
+          IRSwitchAux* aux = (IRSwitchAux*)in->extra.aux;
+          InterpSwitch* sw = &fn->switches[swi];
+          u32 ci;
+          /* For a switch, t0 is an index into fn->switches, not a pc. */
+          rec->t0 = swi;
+          swi++;
+          sw->aux = aux;
+          sw->sel_type = aux ? aux->selector_type : 0;
+          sw->ncases = aux ? aux->ncases : 0;
+          sw->case_pc =
+              sw->ncases ? arena_array(a, u32, sw->ncases) : NULL;
+          for (ci = 0; ci < sw->ncases; ++ci)
+            sw->case_pc[ci] = block_pc_of(fn, aux->cases[ci].block);
+          sw->default_pc =
+              aux ? block_pc_of(fn, aux->default_block) : INTERP_PC_NONE;
+          break;
+        }
+        case IOP_LOAD_LABEL_ADDR:
+          rec->t0 = block_pc_of(fn, (u32)in->extra.imm);
+          break;
+        case IOP_RET:
+        case IOP_RET_VOID:
+          break;
+        case IOP_ATOMIC_RMW: {
+          IRAtomicAux* aux = (IRAtomicAux*)in->extra.aux;
+          rec->sub = aux ? aux->op : 0u;
+          rec->w0 = aux ? (u16)aux->mem.size : 8u;
+          break;
+        }
+        case IOP_ATOMIC_LOAD:
+        case IOP_ATOMIC_STORE: {
+          IRAtomicAux* aux = (IRAtomicAux*)in->extra.aux;
+          rec->w0 = aux ? (u16)aux->mem.size : 8u;
+          break;
+        }
+        case IOP_ATOMIC_CAS: {
+          IRCasAux* aux = (IRCasAux*)in->extra.aux;
+          rec->w0 = aux ? (u16)aux->mem.size : 8u;
+          break;
+        }
+        case IOP_TRAP:
+          /* Only the first rejection is recorded as the function's reason. */
+          if (fn->ok) {
+            fn->ok = 0;
+            fn->reject_reason = lw.reject ? lw.reject : "unsupported op";
+          }
+          break;
+        default:
+          break;
+      }
+    }
+  }
+
+  return fn;
+}
diff --git a/src/link/link_jit.c b/src/link/link_jit.c
@@ -761,6 +761,38 @@ void* cfree_jit_lookup(CfreeJit* jit, CfreeSlice name) {
return (void*)vaddr_to_runtime(jit->image, jit->segs, s->vaddr);
}
+/* Resolve a Mach-O TLV descriptor to the calling thread's variable address.
+ *
+ * jit        - JIT image the descriptor is expected to belong to.
+ * descriptor - address of the descriptor (read at offsets 0, 8 and 16).
+ *
+ * Returns the variable's address, or NULL on any refusal listed below. */
+void* cfree_jit_tlv_resolve(CfreeJit* jit, void* descriptor) {
+  /* Mach-O thread-local access goes through a TLV descriptor, not the data:
+   * the symbol resolves to a 24-byte descriptor [thunk, ctx, image-offset]
+   * (see src/jit/tlv_thunk.h). JITed code calls thunk(desc); from C we can run
+   * the thunk's logic directly — `get_block` is the ctx's first word and is a
+   * normal C function (the asm thunk only exists to preserve caller-saved regs
+   * for the JITed access sequence). Returns the calling thread's address of the
+   * variable, or NULL if this is not one of our descriptors (non-Mach-O image,
+   * no in-image TLV ctx, or a *foreign* descriptor — e.g. an extern thread-local
+   * resolved through dyld — which we must not dereference/call into). */
+  u8* desc = (u8*)descriptor;
+  void* slot0;
+  void* ctx;
+  void* (*get_block)(void*);
+  u64 offset;
+  u8* base;
+  if (!jit || !desc) return NULL;
+  if (jit->c->target.obj != CFREE_OBJ_MACHO || !jit->tls_ctx) return NULL;
+  /* Ownership check: our descriptors carry &cfree_jit_tlv_thunk at +0 and this
+   * image's tls_ctx at +8 (jit_patch_tlv_descriptors). Refuse anything else so
+   * a foreign descriptor never becomes a wild indirect call. */
+  memcpy(&slot0, desc + 0u, sizeof slot0);
+  memcpy(&ctx, desc + 8u, sizeof ctx);
+  if (slot0 != (void*)&cfree_jit_tlv_thunk || ctx != jit->tls_ctx) return NULL;
+  /* Only now, with ownership established, is ctx read and called through. */
+  memcpy(&get_block, ctx, sizeof get_block);
+  if (!get_block) return NULL;
+  memcpy(&offset, desc + 16u, sizeof offset);
+  base = (u8*)get_block(ctx);
+  if (!base) return NULL;
+  return base + offset;
+}
+
uint64_t cfree_jit_generation(CfreeJit* jit) {
return jit ? jit->generation : 0;
}
diff --git a/src/opt/opt.c b/src/opt/opt.c
@@ -1,3 +1,4 @@
+#include <cfree/config.h>
#include <string.h>
#include "abi/abi.h"
@@ -7,8 +8,11 @@
#include "cg/type.h"
#include "core/arena.h"
#include "core/core.h"
+#include "core/diag.h"
#include "core/hashmap.h"
#include "core/metrics.h"
+#include "core/pool.h"
+#include "core/slice.h"
#include "core/strbuf.h"
#include "debug/debug.h"
#include "opt/opt_internal.h"
@@ -223,6 +227,75 @@ static void opt_run_o1_native(OptImpl* o, Func* f) {
metrics_scope_end(o->c, "opt.o1.total");
}
+/* Sibling of opt_run_o1_native for the threaded interpreter. Runs the maximal
+ * target-independent subset and stops before opt_machinize_native: no regalloc,
+ * no MIR, no native emission. The result is a Func with virtual PRegs that the
+ * interpreter loader consumes directly. opt_verify is intentionally omitted —
+ * it is a debug aid and some checks assume the machinize-completed shape.
+ *
+ * c  - compiler owning the metrics scope, heap and diagnostics sink.
+ * cg - CG IR function to convert and optimize.
+ *
+ * Returns the optimized Func. When the CFREE_DUMP_INTERP environment variable
+ * is set, the resulting IR is also emitted as a note diagnostic. */
+Func* opt_run_o1_interp(Compiler* c, const CgIrFunc* cg) {
+  Func* f;
+  OptLiveInfo live;
+  metrics_scope_begin(c, "opt.interp.total");
+  f = opt_func_from_cg_ir(c, cg);
+  opt_build_cfg(f);
+  opt_jump_cleanup(f, OPT_JUMP_CLEANUP_CFG);
+  /* Rebuild the CFG after jump cleanup changed it. */
+  opt_build_cfg(f);
+  opt_simplify_local(f);
+  /* Target-independent local/escape optimizations. These run after machinize
+   * in the native pipeline but depend only on the PReg/frame-slot view, not on
+   * physical-register pools, so they are safe here and shrink the work the
+   * interpreter does (promote scalar locals to PRegs, fold addr-of-local,
+   * CSE addr-of-global). */
+  opt_addr_xform_pregs(f);
+  opt_promote_scalar_locals(f);
+  opt_addr_of_global_cse(f);
+  opt_build_loop_tree(f);
+  memset(&live, 0, sizeof live);
+  opt_live_blocks(f, &live);
+  opt_dead_def_elim_with_live(f, &live);
+  metrics_scope_end(c, "opt.interp.total");
+  {
+    extern char* getenv(const char*);
+    if (getenv("CFREE_DUMP_INTERP")) {
+      CfreeWriter* w = NULL;
+      size_t len = 0;
+      const uint8_t* bytes;
+      cfree_writer_mem(c->ctx->heap, &w);
+      /* The dump is best-effort debugging output: if the memory writer could
+       * not be created, skip it instead of handing a NULL writer to the
+       * dumper. */
+      if (w) {
+        opt_ir_dump(f, w);
+        bytes = cfree_writer_mem_bytes(w, &len);
+        /* Never pass a NULL pointer to %.*s, even with a zero precision. */
+        if (!bytes) len = 0;
+        diag_emit(c->ctx->diag, CFREE_DIAG_NOTE, (SrcLoc){0, 0, 0},
+                  "INTERP IR:\n%.*s", (int)len,
+                  bytes ? (const char*)bytes : "");
+        /* NOTE(review): the writer is not released on this path — confirm
+         * whether a memory writer must be closed/freed after use. */
+      }
+    }
+  }
+  return f;
+}
+
+#if CFREE_INTERP_ENABLED
+/* Defined in src/interp/lower.c. Lowers a post-opt_run_o1_interp Func into the
+ * program's bytecode and registers it by symbol (ObjSymId, for internal calls)
+ * and by unmangled C name (for entry lookup). Declared here (rather than
+ * including the interp header into opt) to keep the dependency one-way. */
+void interp_capture_func(void* program, Func* f, ObjSymId sym, const char* name,
+ u32 name_len, const ObjBuilder* obj);
+
+/* When an interpreter sink is installed on the compiler, run the interpreter
+ * pipeline on `cg` and hand the resulting Func to the sink together with the
+ * function's symbol and name. Does nothing when no sink is installed.
+ *
+ * The name is left as SLICE_NULL when the function has no symbol, or when the
+ * object has no record for that symbol. */
+static void opt_maybe_capture_interp(OptImpl* o, const CgIrFunc* cg) {
+  ObjSymId sym;
+  const ObjSym* info;
+  Slice name;
+  if (!o->c->interp_sink) return;
+  sym = cg->desc.sym;
+  name = SLICE_NULL;
+  /* Guard the lookup result: a missing symbol record must not be
+   * dereferenced (note_global in the loader treats NULL as possible too). */
+  info = (sym != OBJ_SYM_NONE) ? obj_symbol_get(o->target->obj, sym) : NULL;
+  if (info) name = pool_slice(o->c->global, info->name);
+  interp_capture_func(o->c->interp_sink, opt_run_o1_interp(o->c, cg), sym,
+                      name.s, (u32)name.len, o->target->obj);
+}
+#else
+/* Interpreter compiled out: capturing is a no-op. */
+static void opt_maybe_capture_interp(OptImpl* o, const CgIrFunc* cg) {
+  (void)cg;
+  (void)o;
+}
+#endif
+
static void opt_dbg_dump_cg(OptImpl* o, const CgIrFunc* f) {
extern char* getenv(const char*);
CfreeWriter* w = NULL;
@@ -251,6 +324,7 @@ static void opt_on_func(void* user, CgIrFunc* cg_func) {
f = opt_func_from_cg_ir(o->c, cg_func);
metrics_scope_end(o->c, "opt.o1.cg_ir_lower");
opt_run_o1_native(o, f);
+ opt_maybe_capture_interp(o, cg_func);
}
static int opt_func_is_root(OptImpl* o, const CgIrFunc* f) {
@@ -487,6 +561,7 @@ static void opt_emit_reachable_aarch64(OptImpl* o, const CgIrModule* module) {
f = opt_func_from_cg_ir(o->c, module->funcs[i]);
metrics_scope_end(o->c, "opt.o1.cg_ir_lower");
opt_run_o1_native(o, f);
+ opt_maybe_capture_interp(o, module->funcs[i]);
}
opt_refresh_or_prune_aliases(o, module, &index, reachable);
ObjSymSet_fini(&data_seen);
diff --git a/src/opt/opt.h b/src/opt/opt.h
@@ -12,6 +12,13 @@
CgTarget* opt_cgtarget_new(Compiler*, CgTarget* target, int level);
Func* opt_func_from_cg_ir(Compiler*, const CgIrFunc*);
+/* Interpreter tap: run the maximal target-independent subset of the O1 pipeline
+ * (everything in opt_run_o1_native up to, but excluding, opt_machinize_native /
+ * regalloc / MIR / native emit) and return the resulting Func for the threaded
+ * bytecode interpreter to consume. At this point opt_reg_ssa==0, OPK_REG
+ * operands carry virtual PReg ids, and there are no IR_PHI nodes. */
+Func* opt_run_o1_interp(Compiler*, const CgIrFunc*);
+
/* ----- intra-procedural passes (run per retained Func at finalize on -O2)
* ----- */
void opt_build_cfg(Func*);
diff --git a/test/interp/interp_smoke_test.c b/test/interp/interp_smoke_test.c
@@ -0,0 +1,250 @@
+/* Unit smoke test for the threaded-bytecode interpreter.
+ *
+ * Mirrors test/opt/cg_ir_lower_test.c: a self-contained heap/diag harness that
+ * builds tiny CG IR by hand, runs it through opt_run_o1_interp + interp_lower,
+ * executes it on an InterpStack, and asserts the returned value. This exercises
+ * the loader + engine directly (the broad differential coverage against the JIT
+ * lives in test/toy/run.sh's I-path). */
+
+#include <cfree/core.h>
+#include <cfree/interp.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "cg/ir.h"
+#include "interp/interp.h"
+#include "opt/opt.h"
+
+#undef Operand
+#undef CGFuncDesc
+#undef CGParamDesc
+#undef CGCallDesc
+#undef CGLocalStorage
+
+/* Test heap: allocate n bytes with malloc. The heap handle and the alignment
+ * are ignored; a zero-byte request yields NULL without calling malloc. */
+static void* h_alloc(CfreeHeap* h, size_t n, size_t a) {
+  (void)a;
+  (void)h;
+  if (n == 0) return NULL;
+  return malloc(n);
+}
+/* Test heap: resize p to n bytes with realloc. The heap handle, the old size
+ * and the alignment are ignored. */
+static void* h_realloc(CfreeHeap* h, void* p, size_t o, size_t n, size_t a) {
+  void* grown = realloc(p, n);
+  (void)a;
+  (void)o;
+  (void)h;
+  return grown;
+}
+/* Test heap: release p with free. The heap handle and the size are ignored. */
+static void h_free(CfreeHeap* h, void* p, size_t n) {
+  free(p);
+  (void)n;
+  (void)h;
+}
+static CfreeHeap g_heap = {h_alloc, h_realloc, h_free, NULL};
+
+static int g_fails;
+static int g_checks;
+
+/* Diagnostics sink for the test harness: prints "<kind>: <message>" on its own
+ * line to stderr. The sink handle and the source location are ignored.
+ *
+ * k   - diagnostic kind; kinds outside the four labelled below print as "diag"
+ *       instead of indexing past the end of the label table.
+ * fmt - printf-style format, expanded with `ap`. */
+static void diag_sink(CfreeDiagSink* s, CfreeDiagKind k, CfreeSrcLoc loc,
+                      const char* fmt, va_list ap) {
+  static const char* const names[] = {"note", "warning", "error", "fatal"};
+  const size_t nnames = sizeof names / sizeof names[0];
+  /* The size_t cast also sends negative kinds to the fallback label. */
+  const char* label = ((size_t)k < nnames) ? names[k] : "diag";
+  (void)s;
+  (void)loc;
+  fprintf(stderr, "%s: ", label);
+  vfprintf(stderr, fmt, ap);
+  fputc('\n', stderr);
+}
+static CfreeDiagSink g_diag = {diag_sink, NULL, 0, 0};
+
+/* Soft assertion: counts every check in g_checks; when `cond` is false it also
+ * bumps g_fails and prints "FAIL file:line: " followed by the printf-style
+ * message to stderr. Execution continues either way. */
+#define EXPECT(cond, ...)                                  \
+  do {                                                     \
+    ++g_checks;                                            \
+    if (!(cond)) {                                         \
+      ++g_fails;                                           \
+      fprintf(stderr, "FAIL %s:%d: ", __FILE__, __LINE__); \
+      fprintf(stderr, __VA_ARGS__);                        \
+      fputc('\n', stderr);                                 \
+    }                                                      \
+  } while (0)
+
+/* Per-test fixture: set up by tc_init, torn down by tc_fini. */
+typedef struct TestCtx {
+  CfreeContext ctx;  /* heap / diag / now handed to the compiler */
+  Compiler* c;       /* compiler created by tc_init, freed by tc_fini */
+  CfreeCgTypeId i32; /* builtin i32 type id */
+  CfreeCgTypeId i64; /* builtin i64 type id */
+} TestCtx;
+
+/* Build a fresh fixture: a context wired to the test heap and diag sink, a
+ * compiler for a 64-bit ARM / macOS / Mach-O target with 8-byte pointers, and
+ * the builtin i32/i64 type ids. Aborts the process if the compiler cannot be
+ * created. */
+static void tc_init(TestCtx* tc) {
+  CfreeCgBuiltinTypes builtins;
+  CfreeTarget tgt;
+
+  memset(&tgt, 0, sizeof tgt);
+  tgt.ptr_align = 8;
+  tgt.ptr_size = 8;
+  tgt.obj = CFREE_OBJ_MACHO;
+  tgt.os = CFREE_OS_MACOS;
+  tgt.arch = CFREE_ARCH_ARM_64;
+
+  memset(tc, 0, sizeof *tc);
+  tc->ctx.now = -1;
+  tc->ctx.diag = &g_diag;
+  tc->ctx.heap = &g_heap;
+
+  if (cfree_compiler_new(tgt, &tc->ctx, (CfreeCompiler**)&tc->c) == CFREE_OK &&
+      tc->c) {
+    builtins = cfree_cg_builtin_types(tc->c);
+    tc->i32 = builtins.id[CFREE_CG_BUILTIN_I32];
+    tc->i64 = builtins.id[CFREE_CG_BUILTIN_I64];
+    return;
+  }
+  fprintf(stderr, "fatal: compiler allocation failed\n");
+  abort();
+}
+
+/* Tear down a fixture: free the compiler and clear the pointer to it. */
+static void tc_fini(TestCtx* tc) {
+  Compiler* owned = tc->c;
+  cfree_compiler_free(owned);
+  tc->c = NULL;
+}
+
+/* Build a zero-initialized operand that names a local of the given type. */
+static Operand local_op(CGLocal local, CfreeCgTypeId type) {
+  Operand opnd;
+  memset(&opnd, 0, sizeof opnd);
+  opnd.v.local = local;
+  opnd.type = type;
+  opnd.kind = OPK_LOCAL;
+  return opnd;
+}
+/* Build a zero-initialized immediate operand with the given value and type. */
+static Operand imm_op(i64 value, CfreeCgTypeId type) {
+  Operand opnd;
+  memset(&opnd, 0, sizeof opnd);
+  opnd.v.imm = value;
+  opnd.type = type;
+  opnd.kind = OPK_IMM;
+  return opnd;
+}
+/* Add a local of the given type to `f`, always described as 8 bytes with
+ * 8-byte alignment, and return its handle. */
+static CGLocal add_local(CgIrFunc* f, CfreeCgTypeId type) {
+  CGLocalDesc desc;
+  memset(&desc, 0, sizeof desc);
+  desc.align = 8;
+  desc.size = 8;
+  desc.type = type;
+  return cg_ir_func_add_local(f, &desc, 0, 0);
+}
+/* Emit one instruction with opcode `op` at a zero source location and attach a
+ * copy of the `n` operands in `ops`, duplicated into the function's arena.
+ * Returns the new instruction so the caller can fill in `extra`. */
+static CgIrInst* emit_ops(CgIrFunc* f, CgIrOp op, const Operand* ops, u32 n) {
+  CgIrInst* inst = cg_ir_emit(f, op, (SrcLoc){0, 0, 0});
+  inst->nopnds = n;
+  inst->opnds = cg_ir_dup_operands(f->arena, ops, n);
+  return inst;
+}
+
+/* Run a hand-built leaf CgIrFunc through the interp and return its scalar.
+ *
+ * Creates a throwaway program, runs the interpreter pipeline on `cg`, lowers
+ * the result to bytecode and calls it with no arguments. *out receives the
+ * returned value when the call completes. The program is freed before
+ * returning.
+ *
+ * Returns the interpreter status, or CFREE_INTERP_ERROR without running
+ * anything when the program or the lowered function could not be created, so
+ * the caller's EXPECT reports a failure instead of the test crashing. */
+static CfreeInterpStatus run_leaf(TestCtx* tc, CgIrFunc* cg, int64_t* out) {
+  CfreeInterpProgram* prog = cfree_interp_program_new(tc->c);
+  Func* f;
+  InterpFunc* fn;
+  CfreeInterpStatus s;
+  /* interp_lower reads through the program pointer, so never pass it NULL. */
+  if (!prog) return CFREE_INTERP_ERROR;
+  f = opt_run_o1_interp(tc->c, cg);
+  fn = interp_lower((InterpProgram*)prog, f, OBJ_SYM_NONE, SLICE_NULL, NULL);
+  s = fn ? cfree_interp_call(prog, (CfreeInterpFunc*)fn, 0, NULL, out)
+         : CFREE_INTERP_ERROR;
+  cfree_interp_program_free(prog);
+  return s;
+}
+
+/* Create an empty CG IR function with one result of type `ret_type`. The
+ * one-element result-type array lives in the compiler's tu arena, and
+ * `ret_type` is also used as the descriptor's fn_type. */
+static CgIrFunc* new_func(TestCtx* tc, CfreeCgTypeId ret_type) {
+  CGFuncDesc desc;
+  CfreeCgTypeId* results = arena_array(tc->c->tu, CfreeCgTypeId, 1);
+  results[0] = ret_type;
+  memset(&desc, 0, sizeof desc);
+  desc.nresults = 1;
+  desc.result_types = results;
+  desc.fn_type = ret_type;
+  return cg_ir_func_new(tc->c, &desc);
+}
+
+/* Append a RET instruction that returns the single local `v`. The aux record
+ * and its one-element value list are allocated from the function's arena. */
+static void ret_local(CgIrFunc* cg, CGLocal v) {
+  CgIrRetAux* info = arena_znew(cg->arena, CgIrRetAux);
+  CgIrInst* inst;
+  info->nvalues = 1;
+  info->values = cg_ir_dup_locals(cg->arena, &v, 1);
+  inst = cg_ir_emit(cg, CG_IR_RET, (SrcLoc){0, 0, 0});
+  inst->extra.aux = info;
+}
+
+/* fn() : i64 { a = 2; a = a + 3; return a; } => 5
+ * Exercises LOAD_IMM, an integer-add BINOP with an immediate operand, and a
+ * scalar RET. */
+static void interp_runs_arithmetic(void) {
+  TestCtx tc;
+  CgIrFunc* cg;
+  CGLocal a;
+  int64_t ret = -1; /* sentinel: must be overwritten by the call */
+  CfreeInterpStatus s;
+  tc_init(&tc);
+  cg = new_func(&tc, tc.i64);
+  a = add_local(cg, tc.i64);
+  {
+    /* a = 2 */
+    Operand o[] = {local_op(a, tc.i64)};
+    CgIrInst* li = emit_ops(cg, CG_IR_LOAD_IMM, o, 1);
+    li->extra.imm = 2;
+  }
+  {
+    /* a = a + 3 (operand 0 is the destination) */
+    Operand o[] = {local_op(a, tc.i64), local_op(a, tc.i64),
+                   imm_op(3, tc.i64)};
+    CgIrInst* bi = emit_ops(cg, CG_IR_BINOP, o, 3);
+    bi->extra.imm = BO_IADD;
+  }
+  ret_local(cg, a);
+  s = run_leaf(&tc, cg, &ret);
+  EXPECT(s == CFREE_INTERP_DONE, "arithmetic: status %d", (int)s);
+  EXPECT(ret == 5, "arithmetic: expected 5, got %lld", (long long)ret);
+  tc_fini(&tc);
+}
+
+/* fn() : i64 { a = 7; if (a == 7) a = 11; return a; } => 11
+ * Exercises CMP_BRANCH + a join block + fallthrough succ edges. */
+static void interp_runs_branch(void) {
+  TestCtx tc;
+  CgIrFunc* cg;
+  CGLocal a;
+  Label done;
+  int64_t ret = -1; /* sentinel: must be overwritten by the call */
+  CfreeInterpStatus s;
+  tc_init(&tc);
+  cg = new_func(&tc, tc.i64);
+  a = add_local(cg, tc.i64);
+  done = cg_ir_func_add_label(cg);
+  {
+    /* a = 7 */
+    Operand o[] = {local_op(a, tc.i64)};
+    CgIrInst* li = emit_ops(cg, CG_IR_LOAD_IMM, o, 1);
+    li->extra.imm = 7;
+  }
+  {
+    /* branch to `done` when a != 7 (i.e. skip the assignment) */
+    Operand o[] = {local_op(a, tc.i64), imm_op(7, tc.i64)};
+    CgIrInst* br = emit_ops(cg, CG_IR_CMP_BRANCH, o, 2);
+    CgIrCmpBranchAux* aux = arena_znew(cg->arena, CgIrCmpBranchAux);
+    aux->op = CMP_NE;
+    aux->target = done;
+    br->extra.aux = aux;
+  }
+  {
+    /* fallthrough path: a = 11 */
+    Operand o[] = {local_op(a, tc.i64)};
+    CgIrInst* li = emit_ops(cg, CG_IR_LOAD_IMM, o, 1);
+    li->extra.imm = 11;
+  }
+  {
+    /* place the `done` label: the join point both paths reach */
+    CgIrInst* label = cg_ir_emit(cg, CG_IR_LABEL, (SrcLoc){0, 0, 0});
+    label->extra.imm = (i64)done;
+    cg_ir_func_note_label_place(cg, done, (SrcLoc){0, 0, 0});
+  }
+  ret_local(cg, a);
+  s = run_leaf(&tc, cg, &ret);
+  EXPECT(s == CFREE_INTERP_DONE, "branch: status %d", (int)s);
+  EXPECT(ret == 11, "branch: expected 11, got %lld", (long long)ret);
+  tc_fini(&tc);
+}
+
+/* Run every smoke test, then print a summary. Exits 0 with the check count on
+ * stdout when nothing failed, or 1 with the failure ratio on stderr. */
+int main(void) {
+  interp_runs_arithmetic();
+  interp_runs_branch();
+  if (g_fails == 0) {
+    printf("interp-smoke: %d checks, 0 failures\n", g_checks);
+    return 0;
+  }
+  fprintf(stderr, "interp-smoke: %d/%d failed\n", g_fails, g_checks);
+  return 1;
+}
diff --git a/test/test.mk b/test/test.mk
@@ -63,6 +63,8 @@ TEST_TARGETS = \
test-dwarf \
test-elf \
test-emu \
+ test-interp \
+ test-interp-toy \
test-ir-recorder \
test-isa \
test-lib-deps \
@@ -120,6 +122,7 @@ DEFAULT_TEST_TARGETS = \
test-rv64-jit \
test-rv64-tls-link \
test-emu \
+ test-interp \
test-x64-inline \
test-x64-dbg \
test-rt-headers \
@@ -304,6 +307,23 @@ test-link-reloc-uleb128: $(RELOC_ULEB128_TEST_BIN)
$(RELOC_ULEB128_TEST_BIN): test/link/reloc_uleb128_unit.c $(LIB_OBJS)
@mkdir -p $(dir $@)
$(CC) $(TEST_HOST_CFLAGS) -Isrc test/link/reloc_uleb128_unit.c $(LIB_OBJS) -o $@
+# test-interp: threaded-bytecode interpreter unit smoke test. Builds tiny CG IR
+# by hand, runs opt_run_o1_interp + interp_lower + the engine, asserts the
+# returned value. Reaches internal opt/interp symbols -> links $(LIB_OBJS) and
+# needs -Isrc (mirrors test-opt).
+INTERP_SMOKE_TEST_BIN = build/test/interp_smoke_test
+
+test-interp: $(INTERP_SMOKE_TEST_BIN)
+ $(INTERP_SMOKE_TEST_BIN)
+
+$(INTERP_SMOKE_TEST_BIN): test/interp/interp_smoke_test.c $(LIB_OBJS)
+ @mkdir -p $(dir $@)
+ $(CC) $(TEST_HOST_CFLAGS) -Isrc test/interp/interp_smoke_test.c $(LIB_OBJS) -o $@
+
+# test-interp-toy: toy suite restricted to path I (cfree run --no-jit) through
+# CFREE_TEST_PATHS=I; must match golden exit codes, unimplemented ops SKIP.
+test-interp-toy: bin
+ @CFREE=$(abspath $(BIN)) CFREE_TEST_PATHS=I bash test/toy/run.sh
CG_API_TEST_BIN = build/test/cg_api_test
CG_SWITCH_TEST_BIN = build/test/cg_switch_test
diff --git a/test/toy/cases/141_threadlocal_mutate.expected b/test/toy/cases/141_threadlocal_mutate.expected
@@ -0,0 +1 @@
+43
+\ No newline at end of file
diff --git a/test/toy/cases/141_threadlocal_mutate.link.skip b/test/toy/cases/141_threadlocal_mutate.link.skip
@@ -0,0 +1 @@
+defining a _Thread_local and linking it into a standalone executable needs PIE/crt TLS setup (see PIE start.c limitation); R/I/C/W cover the semantics
+\ No newline at end of file
diff --git a/test/toy/cases/141_threadlocal_mutate.toy b/test/toy/cases/141_threadlocal_mutate.toy
@@ -0,0 +1,17 @@
+// Thread-local storage: define a mutable _Thread_local, mutate it across calls,
+// and read it back. Single-threaded, so the interpreter resolves the TLV to the
+// same storage the JIT does; both must agree.
+var @[.threadlocal] counter: i64 = 40;
+
+fn bump(): i64 {
+ counter = counter + 1;
+ return counter;
+}
+
+fn __user_main(): i64 {
+ bump(); // 41
+ bump(); // 42
+ return bump(); // 43
+}
+
+fn main(): i32 { return __user_main() as i32; }
diff --git a/test/toy/run.sh b/test/toy/run.sh
@@ -3,6 +3,10 @@
#
# Paths per case:
# R cfree run -O{level} case.toy
+# I cfree run --no-jit -O{level} case.toy -> execute via the IR interpreter
+# instead of JIT native code; asserts the same exit code as the golden.
+# Ops the interpreter does not yet implement SKIP (greppable
+# "interp: <feature> not supported"), like paths C/W.
# L cfree cc -O{level} -c case.toy -> cfree ld case.o -> native executable
# X cfree cc -O{level} -target -> cfree ld -> exec_target for Linux cross targets
# C cfree cc --emit=c case.toy -> host cc -> native exec. Exercises the
@@ -25,14 +29,15 @@
# <name>.cbackend.skip opts the case out of path C (with reason),
# without affecting other paths
# <name>.wasm.skip opts the case out of path W (with reason)
+# <name>.link.skip opts the case out of path L (with reason)
# err/<name>.expected expected diagnostic substring for compile-fail cases
#
# Filtering:
# ./run.sh [name_filter] [paths]
-# CFREE_TEST_FILTER / CFREE_TEST_PATHS, where paths is a subset of "RLXCW".
+# CFREE_TEST_FILTER / CFREE_TEST_PATHS, where paths is a subset of "RLXCWI".
# X is opt-in cross-arch cc+ld+exec for aa64, x64, and rv64.
# C and W run only at O0 even when included with other opt levels.
-# Default paths are "RLCW"; override with CFREE_TEST_PATHS.
+# Default paths are "RLCWI"; override with CFREE_TEST_PATHS.
# CFREE_OPT_LEVELS selects optimization levels.
set -u
@@ -43,12 +48,13 @@ BUILD_DIR="$ROOT/build/test/toy"
CFREE="${CFREE:-$ROOT/build/cfree}"
FILTER="${1:-${CFREE_TEST_FILTER:-}}"
-PATHS="${2:-${CFREE_TEST_PATHS:-RLCW}}"
+PATHS="${2:-${CFREE_TEST_PATHS:-RLCWI}}"
case "$PATHS" in *R*) RUN_R=1;; *) RUN_R=0;; esac
case "$PATHS" in *L*) RUN_L=1;; *) RUN_L=0;; esac
case "$PATHS" in *X*) RUN_X=1;; *) RUN_X=0;; esac
case "$PATHS" in *C*) RUN_C=1;; *) RUN_C=0;; esac
case "$PATHS" in *W*) RUN_W=1;; *) RUN_W=0;; esac
+case "$PATHS" in *I*) RUN_I=1;; *) RUN_I=0;; esac
TOY_CROSS_ARCHS="${CFREE_TOY_CROSS_ARCHS:-aa64 x64 rv64}"
TOY_OPT_LEVELS="${CFREE_OPT_LEVELS:-0 1}"
HOST_CC="${CC:-cc}"
@@ -110,12 +116,36 @@ run_case_run() {
check_rc "$name/R-O$opt" "$rc" "$expected" "$err"
}
+# Path I (interpreter): run the case with `cfree run --no-jit`, which executes
+# it on the IR interpreter instead of JIT-compiled native code, and require the
+# golden exit code (the one the JIT R-path also matches). When stderr carries a
+# greppable "interp: <feature> not supported" diagnostic the case is a SKIP, not
+# a FAIL, mirroring how paths C/W treat phased-rollout panics.
+run_case_interp() {
+  local name="$1" src="$2" expected="$3" work="$4" opt="$5"
+  local tag="$name/I-O$opt" rc unsupported
+  local out="$work/interp.out" err="$work/interp.err"
+  "$CFREE" run --no-jit "-O$opt" "$src" > "$out" 2> "$err"
+  rc=$?
+  unsupported=$(grep -oE 'interp: .*not supported' "$err" 2>/dev/null | head -n1 || true)
+  if [ -z "$unsupported" ]; then
+    check_rc "$tag" "$rc" "$expected" "$err"
+  else
+    note_skip "$tag" "$unsupported"
+  fi
+}
+
run_case_link() {
local name="$1" src="$2" expected="$3" work="$4" opt="$5"
local obj="$work/$name.o" exe="$work/$name.exe"
local cc_err="$work/cc.err" ld_err="$work/ld.err" out="$work/exe.out"
local err="$work/exe.err" dump="$work/objdump.out" dump_err="$work/objdump.err" rc
local dump_exp="${src%.toy}.objdump"
+ local link_skip="${src%.toy}.link.skip"
+ if [ -e "$link_skip" ]; then
+ note_skip "$name/L-O$opt" "$(head -n1 "$link_skip")"
+ return
+ fi
if ! "$CFREE" cc "-O$opt" -c "$src" -o "$obj" > "$work/cc.out" 2> "$cc_err"; then
note_fail "$name/L-O$opt"
@@ -454,6 +484,9 @@ for src in "${cases[@]}"; do
if [ $RUN_R -eq 1 ]; then
run_case_run "$name" "$src" "$expected" "$work" "$opt"
fi
+ if [ $RUN_I -eq 1 ]; then
+ run_case_interp "$name" "$src" "$expected" "$work" "$opt"
+ fi
if [ $RUN_L -eq 1 ]; then
run_case_link "$name" "$src" "$expected" "$work" "$opt"
fi
diff --git a/test/wasm/run.sh b/test/wasm/run.sh
@@ -17,15 +17,17 @@ TEST_OBJ="${CFREE_TEST_OBJ:-macho}"
# Path filtering. Default runs the legacy set (everything except C):
# W wat2wasm
# D cfree run (JIT)
+# N cfree run --no-jit (IR interpreter; SKIPs ops it does not implement)
# O cfree cc -c (object output)
# J jit-runner against the produced obj
# E link + native exec
# C --emit=c + host cc + native exec (C-source backend; see doc/CBACKEND.md)
# C is opt-in because Phase 1 of the C backend skips most wasm cases; the
# combined `make test-cbackend` invokes this runner with CFREE_TEST_PATHS=C.
-PATHS="${CFREE_TEST_PATHS:-WDOJE}"
+PATHS="${CFREE_TEST_PATHS:-WDNOJE}"
case "$PATHS" in *W*) RUN_W=1;; *) RUN_W=0;; esac
case "$PATHS" in *D*) RUN_D=1;; *) RUN_D=0;; esac
+case "$PATHS" in *N*) RUN_N=1;; *) RUN_N=0;; esac
case "$PATHS" in *O*) RUN_O=1;; *) RUN_O=0;; esac
case "$PATHS" in *J*) RUN_J=1;; *) RUN_J=0;; esac
case "$PATHS" in *E*) RUN_E=1;; *) RUN_E=0;; esac
@@ -190,6 +192,27 @@ run_expect_rc() {
fi
}
+# Interpreter (--no-jit) counterpart of run_expect_rc: runs the given command
+# with stdout/stderr saved under $BUILD_DIR. A greppable "interp: <feature> not
+# supported" diagnostic on stderr means SKIP, not FAIL (mirrors the toy I-path).
+run_expect_rc_interp() {
+  local label=$1 expected=$2 rc unsupported
+  shift 2
+  local errf="$BUILD_DIR/${label//\//_}.err"
+  "$@" >"${errf%.err}.out" 2>"$errf"
+  rc=$?
+  unsupported=$(grep -oE 'interp: .*not supported' "$errf" 2>/dev/null | head -n1 || true)
+  if [ -n "$unsupported" ]; then
+    note_skip "$label" "$unsupported"
+    return
+  fi
+  if [ "$rc" -eq "$expected" ]; then
+    note_pass "$label"
+  else
+    note_fail "$label expected $expected got $rc"
+  fi
+}
+
run_expect_zero() {
local label=$1
shift
@@ -267,6 +290,13 @@ for wat in "$CASES_DIR"/*.wat; do
run_expect_rc "$name/D-wasm" "$expected" "$CFREE_BIN" run -e test_main "$wasm"
fi
+ if [ "$RUN_N" -eq 1 ]; then
+ run_expect_rc_interp "$name/N-wat" "$expected" "$CFREE_BIN" run --no-jit \
+ -e test_main "$wat"
+ run_expect_rc_interp "$name/N-wasm" "$expected" "$CFREE_BIN" run --no-jit \
+ -e test_main "$wasm"
+ fi
+
if [ "$RUN_O" -eq 1 ]; then
run_expect_zero "$name/O-wat" "$CFREE_BIN" cc -target "$target_triple" -c \
"$wat" -o "$wat_obj"